diff --git a/.gitmodules b/.gitmodules
index 64372bf3cc41964cc9fbc298d80f154fc6f9dee1..79827d19945515b283d73c2fa3312ddf097d6b8b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -54,6 +54,3 @@
 [submodule "ext/openxr"]
 	path = ext/openxr
 	url = https://github.com/KhronosGroup/OpenXR-SDK.git
-[submodule "ext/cudart"]
-	path = ext/cudart
-	url = https://gitlab.com/nvidia/headers/cuda-individual/cudart.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 15ab7f4724bf65b19c97ec5dbb0b0b69959c5a25..7b0e9914efa1edfaae3e38e2582e1213171ecc26 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -194,6 +194,7 @@ target_include_directories(lava.base PUBLIC
         $<BUILD_INTERFACE:${LIBLAVA_EXT_DIR}/VulkanMemoryAllocator/include>
         $<BUILD_INTERFACE:${LIBLAVA_EXT_DIR}/volk>
         $<BUILD_INTERFACE:${LIBLAVA_INC_DIR}/nvenc_headers>
+        $<BUILD_INTERFACE:${LIBLAVA_EXT_DIR}/cudart/include>
         )
 
 target_link_libraries(lava.base
@@ -500,6 +501,7 @@ set(EXT_INCLUDE_DIRS
         ${LIBLAVA_EXT_DIR}/argh
         ${LIBLAVA_EXT_DIR}/imgui
         ${LIBLAVA_INC_DIR}/nvenc_headers
+        ${LIBLAVA_EXT_DIR}/cudart/include
         )
 
 foreach(DIR ${EXT_INCLUDE_DIRS})
@@ -801,6 +803,7 @@ message("=======================================================================
         target_link_libraries(${NAME} lava::app assimp::assimp)
         target_include_directories(${NAME} PUBLIC ${OPENVR_HEADER_DIR})
         target_link_libraries(${NAME} "${CMAKE_SOURCE_DIR}/ext/openvr/lib/win64/openvr_api.lib")
+        target_link_libraries(${NAME} "${CMAKE_SOURCE_DIR}/ext/cudart/lib/x64/cuda.lib")
 
         if(MSVC)
             source_group("" FILES ${SRC_DIR}/main.cpp)
diff --git a/ext/cudart b/ext/cudart
deleted file mode 160000
index b92c3c0351d6d4402b81101ffcd6660474d1306b..0000000000000000000000000000000000000000
--- a/ext/cudart
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit b92c3c0351d6d4402b81101ffcd6660474d1306b
diff --git a/ext/cudart/bin/cudart32_110.dll b/ext/cudart/bin/cudart32_110.dll
new file mode 100644
index 0000000000000000000000000000000000000000..20cf2aee49aed62ae6d5ccad7bbbf5d4f3474eca
Binary files /dev/null and b/ext/cudart/bin/cudart32_110.dll differ
diff --git a/ext/cudart/bin/cudart64_110.dll b/ext/cudart/bin/cudart64_110.dll
new file mode 100644
index 0000000000000000000000000000000000000000..b36d7a3e29ff90d1e43460489ec78d3b78b5fda9
Binary files /dev/null and b/ext/cudart/bin/cudart64_110.dll differ
diff --git a/ext/cudart/include/CL/cl.h b/ext/cudart/include/CL/cl.h
new file mode 100644
index 0000000000000000000000000000000000000000..290495a68f1be43916c468b9a3881c6972085e89
--- /dev/null
+++ b/ext/cudart/include/CL/cl.h
@@ -0,0 +1,1578 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_H
+#define __OPENCL_CL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl_platform.h>
+#else
+#include <CL/cl_platform.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************/
+
+typedef struct _cl_platform_id *    cl_platform_id;
+typedef struct _cl_device_id *      cl_device_id;
+typedef struct _cl_context *        cl_context;
+typedef struct _cl_command_queue *  cl_command_queue;
+typedef struct _cl_mem *            cl_mem;
+typedef struct _cl_program *        cl_program;
+typedef struct _cl_kernel *         cl_kernel;
+typedef struct _cl_event *          cl_event;
+typedef struct _cl_sampler *        cl_sampler;
+
+typedef cl_uint             cl_bool;                     /* WARNING!  Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */
+typedef cl_ulong            cl_bitfield;
+typedef cl_ulong            cl_properties;
+typedef cl_bitfield         cl_device_type;
+typedef cl_uint             cl_platform_info;
+typedef cl_uint             cl_device_info;
+typedef cl_bitfield         cl_device_fp_config;
+typedef cl_uint             cl_device_mem_cache_type;
+typedef cl_uint             cl_device_local_mem_type;
+typedef cl_bitfield         cl_device_exec_capabilities;
+typedef cl_bitfield         cl_device_svm_capabilities;
+typedef cl_bitfield         cl_command_queue_properties;
+typedef intptr_t            cl_device_partition_property;
+typedef cl_bitfield         cl_device_affinity_domain;
+
+typedef intptr_t            cl_context_properties;
+typedef cl_uint             cl_context_info;
+typedef cl_properties       cl_queue_properties;
+typedef cl_uint             cl_command_queue_info;
+typedef cl_uint             cl_channel_order;
+typedef cl_uint             cl_channel_type;
+typedef cl_bitfield         cl_mem_flags;
+typedef cl_bitfield         cl_svm_mem_flags;
+typedef cl_uint             cl_mem_object_type;
+typedef cl_uint             cl_mem_info;
+typedef cl_bitfield         cl_mem_migration_flags;
+typedef cl_uint             cl_image_info;
+typedef cl_uint             cl_buffer_create_type;
+typedef cl_uint             cl_addressing_mode;
+typedef cl_uint             cl_filter_mode;
+typedef cl_uint             cl_sampler_info;
+typedef cl_bitfield         cl_map_flags;
+typedef intptr_t            cl_pipe_properties;
+typedef cl_uint             cl_pipe_info;
+typedef cl_uint             cl_program_info;
+typedef cl_uint             cl_program_build_info;
+typedef cl_uint             cl_program_binary_type;
+typedef cl_int              cl_build_status;
+typedef cl_uint             cl_kernel_info;
+typedef cl_uint             cl_kernel_arg_info;
+typedef cl_uint             cl_kernel_arg_address_qualifier;
+typedef cl_uint             cl_kernel_arg_access_qualifier;
+typedef cl_bitfield         cl_kernel_arg_type_qualifier;
+typedef cl_uint             cl_kernel_work_group_info;
+typedef cl_uint             cl_kernel_sub_group_info;
+typedef cl_uint             cl_event_info;
+typedef cl_uint             cl_command_type;
+typedef cl_uint             cl_profiling_info;
+typedef cl_properties       cl_sampler_properties;
+typedef cl_uint             cl_kernel_exec_info;
+typedef cl_bitfield         cl_device_atomic_capabilities;
+typedef cl_bitfield         cl_device_device_enqueue_capabilities;
+typedef cl_uint             cl_khronos_vendor_id;
+typedef cl_properties       cl_mem_properties;
+typedef cl_uint             cl_version;
+
+typedef struct _cl_image_format {
+    cl_channel_order        image_channel_order;
+    cl_channel_type         image_channel_data_type;
+} cl_image_format;
+
+typedef struct _cl_image_desc {
+    cl_mem_object_type      image_type;
+    size_t                  image_width;
+    size_t                  image_height;
+    size_t                  image_depth;
+    size_t                  image_array_size;
+    size_t                  image_row_pitch;
+    size_t                  image_slice_pitch;
+    cl_uint                 num_mip_levels;
+    cl_uint                 num_samples;
+#ifdef __GNUC__
+    __extension__   /* Prevents warnings about anonymous union in -pedantic builds */
+#endif
+    union {
+      cl_mem                  buffer;
+      cl_mem                  mem_object;
+    };
+} cl_image_desc;
+
+typedef struct _cl_buffer_region {
+    size_t                  origin;
+    size_t                  size;
+} cl_buffer_region;
+
+#define CL_NAME_VERSION_MAX_NAME_SIZE 64
+
+typedef struct _cl_name_version {
+    cl_version              version;
+    char                    name[CL_NAME_VERSION_MAX_NAME_SIZE];
+} cl_name_version;
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_SUCCESS                                  0
+#define CL_DEVICE_NOT_FOUND                         -1
+#define CL_DEVICE_NOT_AVAILABLE                     -2
+#define CL_COMPILER_NOT_AVAILABLE                   -3
+#define CL_MEM_OBJECT_ALLOCATION_FAILURE            -4
+#define CL_OUT_OF_RESOURCES                         -5
+#define CL_OUT_OF_HOST_MEMORY                       -6
+#define CL_PROFILING_INFO_NOT_AVAILABLE             -7
+#define CL_MEM_COPY_OVERLAP                         -8
+#define CL_IMAGE_FORMAT_MISMATCH                    -9
+#define CL_IMAGE_FORMAT_NOT_SUPPORTED               -10
+#define CL_BUILD_PROGRAM_FAILURE                    -11
+#define CL_MAP_FAILURE                              -12
+#define CL_MISALIGNED_SUB_BUFFER_OFFSET             -13
+#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14
+#define CL_COMPILE_PROGRAM_FAILURE                  -15
+#define CL_LINKER_NOT_AVAILABLE                     -16
+#define CL_LINK_PROGRAM_FAILURE                     -17
+#define CL_DEVICE_PARTITION_FAILED                  -18
+#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE            -19
+
+#define CL_INVALID_VALUE                            -30
+#define CL_INVALID_DEVICE_TYPE                      -31
+#define CL_INVALID_PLATFORM                         -32
+#define CL_INVALID_DEVICE                           -33
+#define CL_INVALID_CONTEXT                          -34
+#define CL_INVALID_QUEUE_PROPERTIES                 -35
+#define CL_INVALID_COMMAND_QUEUE                    -36
+#define CL_INVALID_HOST_PTR                         -37
+#define CL_INVALID_MEM_OBJECT                       -38
+#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR          -39
+#define CL_INVALID_IMAGE_SIZE                       -40
+#define CL_INVALID_SAMPLER                          -41
+#define CL_INVALID_BINARY                           -42
+#define CL_INVALID_BUILD_OPTIONS                    -43
+#define CL_INVALID_PROGRAM                          -44
+#define CL_INVALID_PROGRAM_EXECUTABLE               -45
+#define CL_INVALID_KERNEL_NAME                      -46
+#define CL_INVALID_KERNEL_DEFINITION                -47
+#define CL_INVALID_KERNEL                           -48
+#define CL_INVALID_ARG_INDEX                        -49
+#define CL_INVALID_ARG_VALUE                        -50
+#define CL_INVALID_ARG_SIZE                         -51
+#define CL_INVALID_KERNEL_ARGS                      -52
+#define CL_INVALID_WORK_DIMENSION                   -53
+#define CL_INVALID_WORK_GROUP_SIZE                  -54
+#define CL_INVALID_WORK_ITEM_SIZE                   -55
+#define CL_INVALID_GLOBAL_OFFSET                    -56
+#define CL_INVALID_EVENT_WAIT_LIST                  -57
+#define CL_INVALID_EVENT                            -58
+#define CL_INVALID_OPERATION                        -59
+#define CL_INVALID_GL_OBJECT                        -60
+#define CL_INVALID_BUFFER_SIZE                      -61
+#define CL_INVALID_MIP_LEVEL                        -62
+#define CL_INVALID_GLOBAL_WORK_SIZE                 -63
+#define CL_INVALID_PROPERTY                         -64
+#define CL_INVALID_IMAGE_DESCRIPTOR                 -65
+#define CL_INVALID_COMPILER_OPTIONS                 -66
+#define CL_INVALID_LINKER_OPTIONS                   -67
+#define CL_INVALID_DEVICE_PARTITION_COUNT           -68
+#define CL_INVALID_PIPE_SIZE                        -69
+#define CL_INVALID_DEVICE_QUEUE                     -70
+#define CL_INVALID_SPEC_ID                          -71
+#define CL_MAX_SIZE_RESTRICTION_EXCEEDED            -72
+
+/* OpenCL Version */
+#define CL_VERSION_1_0                              1
+#define CL_VERSION_1_1                              1
+#define CL_VERSION_1_2                              1
+#define CL_VERSION_2_0                              1
+#define CL_VERSION_2_1                              1
+#define CL_VERSION_2_2                              1
+#define CL_VERSION_3_0                              1
+
+/* cl_bool */
+#define CL_FALSE                                    0
+#define CL_TRUE                                     1
+#define CL_BLOCKING                                 CL_TRUE
+#define CL_NON_BLOCKING                             CL_FALSE
+
+/* cl_platform_info */
+#define CL_PLATFORM_PROFILE                         0x0900
+#define CL_PLATFORM_VERSION                         0x0901
+#define CL_PLATFORM_NAME                            0x0902
+#define CL_PLATFORM_VENDOR                          0x0903
+#define CL_PLATFORM_EXTENSIONS                      0x0904
+#define CL_PLATFORM_HOST_TIMER_RESOLUTION           0x0905
+#define CL_PLATFORM_NUMERIC_VERSION                 0x0906
+#define CL_PLATFORM_EXTENSIONS_WITH_VERSION         0x0907
+
+/* cl_device_type - bitfield */
+#define CL_DEVICE_TYPE_DEFAULT                      (1 << 0)
+#define CL_DEVICE_TYPE_CPU                          (1 << 1)
+#define CL_DEVICE_TYPE_GPU                          (1 << 2)
+#define CL_DEVICE_TYPE_ACCELERATOR                  (1 << 3)
+#define CL_DEVICE_TYPE_CUSTOM                       (1 << 4)
+#define CL_DEVICE_TYPE_ALL                          0xFFFFFFFF
+
+/* cl_device_info */
+#define CL_DEVICE_TYPE                                   0x1000
+#define CL_DEVICE_VENDOR_ID                              0x1001
+#define CL_DEVICE_MAX_COMPUTE_UNITS                      0x1002
+#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS               0x1003
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE                    0x1004
+#define CL_DEVICE_MAX_WORK_ITEM_SIZES                    0x1005
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR            0x1006
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT           0x1007
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT             0x1008
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG            0x1009
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT           0x100A
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE          0x100B
+#define CL_DEVICE_MAX_CLOCK_FREQUENCY                    0x100C
+#define CL_DEVICE_ADDRESS_BITS                           0x100D
+#define CL_DEVICE_MAX_READ_IMAGE_ARGS                    0x100E
+#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS                   0x100F
+#define CL_DEVICE_MAX_MEM_ALLOC_SIZE                     0x1010
+#define CL_DEVICE_IMAGE2D_MAX_WIDTH                      0x1011
+#define CL_DEVICE_IMAGE2D_MAX_HEIGHT                     0x1012
+#define CL_DEVICE_IMAGE3D_MAX_WIDTH                      0x1013
+#define CL_DEVICE_IMAGE3D_MAX_HEIGHT                     0x1014
+#define CL_DEVICE_IMAGE3D_MAX_DEPTH                      0x1015
+#define CL_DEVICE_IMAGE_SUPPORT                          0x1016
+#define CL_DEVICE_MAX_PARAMETER_SIZE                     0x1017
+#define CL_DEVICE_MAX_SAMPLERS                           0x1018
+#define CL_DEVICE_MEM_BASE_ADDR_ALIGN                    0x1019
+#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE               0x101A
+#define CL_DEVICE_SINGLE_FP_CONFIG                       0x101B
+#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE                  0x101C
+#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE              0x101D
+#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE                  0x101E
+#define CL_DEVICE_GLOBAL_MEM_SIZE                        0x101F
+#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE               0x1020
+#define CL_DEVICE_MAX_CONSTANT_ARGS                      0x1021
+#define CL_DEVICE_LOCAL_MEM_TYPE                         0x1022
+#define CL_DEVICE_LOCAL_MEM_SIZE                         0x1023
+#define CL_DEVICE_ERROR_CORRECTION_SUPPORT               0x1024
+#define CL_DEVICE_PROFILING_TIMER_RESOLUTION             0x1025
+#define CL_DEVICE_ENDIAN_LITTLE                          0x1026
+#define CL_DEVICE_AVAILABLE                              0x1027
+#define CL_DEVICE_COMPILER_AVAILABLE                     0x1028
+#define CL_DEVICE_EXECUTION_CAPABILITIES                 0x1029
+#define CL_DEVICE_QUEUE_PROPERTIES                       0x102A    /* deprecated */
+#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES               0x102A
+#define CL_DEVICE_NAME                                   0x102B
+#define CL_DEVICE_VENDOR                                 0x102C
+#define CL_DRIVER_VERSION                                0x102D
+#define CL_DEVICE_PROFILE                                0x102E
+#define CL_DEVICE_VERSION                                0x102F
+#define CL_DEVICE_EXTENSIONS                             0x1030
+#define CL_DEVICE_PLATFORM                               0x1031
+#define CL_DEVICE_DOUBLE_FP_CONFIG                       0x1032
+/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG which is already defined in "cl_ext.h" */
+#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF            0x1034
+#define CL_DEVICE_HOST_UNIFIED_MEMORY                    0x1035   /* deprecated */
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR               0x1036
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT              0x1037
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT                0x1038
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG               0x1039
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT              0x103A
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE             0x103B
+#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF               0x103C
+#define CL_DEVICE_OPENCL_C_VERSION                       0x103D
+#define CL_DEVICE_LINKER_AVAILABLE                       0x103E
+#define CL_DEVICE_BUILT_IN_KERNELS                       0x103F
+#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE                  0x1040
+#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE                   0x1041
+#define CL_DEVICE_PARENT_DEVICE                          0x1042
+#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES              0x1043
+#define CL_DEVICE_PARTITION_PROPERTIES                   0x1044
+#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN              0x1045
+#define CL_DEVICE_PARTITION_TYPE                         0x1046
+#define CL_DEVICE_REFERENCE_COUNT                        0x1047
+#define CL_DEVICE_PREFERRED_INTEROP_USER_SYNC            0x1048
+#define CL_DEVICE_PRINTF_BUFFER_SIZE                     0x1049
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT                  0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT           0x104B
+#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS              0x104C
+#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE               0x104D
+#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES             0x104E
+#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE         0x104F
+#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE               0x1050
+#define CL_DEVICE_MAX_ON_DEVICE_QUEUES                   0x1051
+#define CL_DEVICE_MAX_ON_DEVICE_EVENTS                   0x1052
+#define CL_DEVICE_SVM_CAPABILITIES                       0x1053
+#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE   0x1054
+#define CL_DEVICE_MAX_PIPE_ARGS                          0x1055
+#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS           0x1056
+#define CL_DEVICE_PIPE_MAX_PACKET_SIZE                   0x1057
+#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT    0x1058
+#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT      0x1059
+#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT       0x105A
+#define CL_DEVICE_IL_VERSION                             0x105B
+#define CL_DEVICE_MAX_NUM_SUB_GROUPS                     0x105C
+#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D
+#define CL_DEVICE_NUMERIC_VERSION                        0x105E
+#define CL_DEVICE_EXTENSIONS_WITH_VERSION                0x1060
+#define CL_DEVICE_ILS_WITH_VERSION                       0x1061
+#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION          0x1062
+#define CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES             0x1063
+#define CL_DEVICE_ATOMIC_FENCE_CAPABILITIES              0x1064
+#define CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT         0x1065
+#define CL_DEVICE_OPENCL_C_ALL_VERSIONS                  0x1066
+#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE     0x1067
+#define CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT 0x1068
+#define CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT          0x1069
+/* 0x106A to 0x106E - Reserved for upcoming KHR extension */
+#define CL_DEVICE_OPENCL_C_FEATURES                      0x106F
+#define CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES            0x1070
+#define CL_DEVICE_PIPE_SUPPORT                           0x1071
+#define CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED      0x1072
+
+/* cl_device_fp_config - bitfield */
+#define CL_FP_DENORM                                (1 << 0)
+#define CL_FP_INF_NAN                               (1 << 1)
+#define CL_FP_ROUND_TO_NEAREST                      (1 << 2)
+#define CL_FP_ROUND_TO_ZERO                         (1 << 3)
+#define CL_FP_ROUND_TO_INF                          (1 << 4)
+#define CL_FP_FMA                                   (1 << 5)
+#define CL_FP_SOFT_FLOAT                            (1 << 6)
+#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT         (1 << 7)
+
+/* cl_device_mem_cache_type */
+#define CL_NONE                                     0x0
+#define CL_READ_ONLY_CACHE                          0x1
+#define CL_READ_WRITE_CACHE                         0x2
+
+/* cl_device_local_mem_type */
+#define CL_LOCAL                                    0x1
+#define CL_GLOBAL                                   0x2
+
+/* cl_device_exec_capabilities - bitfield */
+#define CL_EXEC_KERNEL                              (1 << 0)
+#define CL_EXEC_NATIVE_KERNEL                       (1 << 1)
+
+/* cl_command_queue_properties - bitfield */
+#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE      (1 << 0)
+#define CL_QUEUE_PROFILING_ENABLE                   (1 << 1)
+#define CL_QUEUE_ON_DEVICE                          (1 << 2)
+#define CL_QUEUE_ON_DEVICE_DEFAULT                  (1 << 3)
+
+/* cl_context_info */
+#define CL_CONTEXT_REFERENCE_COUNT                  0x1080
+#define CL_CONTEXT_DEVICES                          0x1081
+#define CL_CONTEXT_PROPERTIES                       0x1082
+#define CL_CONTEXT_NUM_DEVICES                      0x1083
+
+/* cl_context_properties */
+#define CL_CONTEXT_PLATFORM                         0x1084
+#define CL_CONTEXT_INTEROP_USER_SYNC                0x1085
+
+/* cl_device_partition_property */
+#define CL_DEVICE_PARTITION_EQUALLY                 0x1086
+#define CL_DEVICE_PARTITION_BY_COUNTS               0x1087
+#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END      0x0
+#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN      0x1088
+
+/* cl_device_affinity_domain */
+#define CL_DEVICE_AFFINITY_DOMAIN_NUMA               (1 << 0)
+#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE           (1 << 1)
+#define CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE           (1 << 2)
+#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE           (1 << 3)
+#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE           (1 << 4)
+#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5)
+
+/* cl_device_svm_capabilities */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER           (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER             (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM             (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS                       (1 << 3)
+
+/* cl_command_queue_info */
+#define CL_QUEUE_CONTEXT                            0x1090
+#define CL_QUEUE_DEVICE                             0x1091
+#define CL_QUEUE_REFERENCE_COUNT                    0x1092
+#define CL_QUEUE_PROPERTIES                         0x1093
+#define CL_QUEUE_SIZE                               0x1094
+#define CL_QUEUE_DEVICE_DEFAULT                     0x1095
+#define CL_QUEUE_PROPERTIES_ARRAY                   0x1098
+
+/* cl_mem_flags and cl_svm_mem_flags - bitfield */
+#define CL_MEM_READ_WRITE                           (1 << 0)
+#define CL_MEM_WRITE_ONLY                           (1 << 1)
+#define CL_MEM_READ_ONLY                            (1 << 2)
+#define CL_MEM_USE_HOST_PTR                         (1 << 3)
+#define CL_MEM_ALLOC_HOST_PTR                       (1 << 4)
+#define CL_MEM_COPY_HOST_PTR                        (1 << 5)
+/* reserved                                         (1 << 6)    */
+#define CL_MEM_HOST_WRITE_ONLY                      (1 << 7)
+#define CL_MEM_HOST_READ_ONLY                       (1 << 8)
+#define CL_MEM_HOST_NO_ACCESS                       (1 << 9)
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER                (1 << 10)   /* used by cl_svm_mem_flags only */
+#define CL_MEM_SVM_ATOMICS                          (1 << 11)   /* used by cl_svm_mem_flags only */
+#define CL_MEM_KERNEL_READ_AND_WRITE                (1 << 12)
+
+/* cl_mem_migration_flags - bitfield */
+#define CL_MIGRATE_MEM_OBJECT_HOST                  (1 << 0)
+#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED     (1 << 1)
+
+/* cl_channel_order */
+#define CL_R                                        0x10B0
+#define CL_A                                        0x10B1
+#define CL_RG                                       0x10B2
+#define CL_RA                                       0x10B3
+#define CL_RGB                                      0x10B4
+#define CL_RGBA                                     0x10B5
+#define CL_BGRA                                     0x10B6
+#define CL_ARGB                                     0x10B7
+#define CL_INTENSITY                                0x10B8
+#define CL_LUMINANCE                                0x10B9
+#define CL_Rx                                       0x10BA
+#define CL_RGx                                      0x10BB
+#define CL_RGBx                                     0x10BC
+#define CL_DEPTH                                    0x10BD
+#define CL_DEPTH_STENCIL                            0x10BE
+#define CL_sRGB                                     0x10BF
+#define CL_sRGBx                                    0x10C0
+#define CL_sRGBA                                    0x10C1
+#define CL_sBGRA                                    0x10C2
+#define CL_ABGR                                     0x10C3
+
+/* cl_channel_type */
+#define CL_SNORM_INT8                               0x10D0
+#define CL_SNORM_INT16                              0x10D1
+#define CL_UNORM_INT8                               0x10D2
+#define CL_UNORM_INT16                              0x10D3
+#define CL_UNORM_SHORT_565                          0x10D4
+#define CL_UNORM_SHORT_555                          0x10D5
+#define CL_UNORM_INT_101010                         0x10D6
+#define CL_SIGNED_INT8                              0x10D7
+#define CL_SIGNED_INT16                             0x10D8
+#define CL_SIGNED_INT32                             0x10D9
+#define CL_UNSIGNED_INT8                            0x10DA
+#define CL_UNSIGNED_INT16                           0x10DB
+#define CL_UNSIGNED_INT32                           0x10DC
+#define CL_HALF_FLOAT                               0x10DD
+#define CL_FLOAT                                    0x10DE
+#define CL_UNORM_INT24                              0x10DF
+#define CL_UNORM_INT_101010_2                       0x10E0
+
+/* cl_mem_object_type */
+#define CL_MEM_OBJECT_BUFFER                        0x10F0
+#define CL_MEM_OBJECT_IMAGE2D                       0x10F1
+#define CL_MEM_OBJECT_IMAGE3D                       0x10F2
+#define CL_MEM_OBJECT_IMAGE2D_ARRAY                 0x10F3
+#define CL_MEM_OBJECT_IMAGE1D                       0x10F4
+#define CL_MEM_OBJECT_IMAGE1D_ARRAY                 0x10F5
+#define CL_MEM_OBJECT_IMAGE1D_BUFFER                0x10F6
+#define CL_MEM_OBJECT_PIPE                          0x10F7
+
+/* cl_mem_info */
+#define CL_MEM_TYPE                                 0x1100
+#define CL_MEM_FLAGS                                0x1101
+#define CL_MEM_SIZE                                 0x1102
+#define CL_MEM_HOST_PTR                             0x1103
+#define CL_MEM_MAP_COUNT                            0x1104
+#define CL_MEM_REFERENCE_COUNT                      0x1105
+#define CL_MEM_CONTEXT                              0x1106
+#define CL_MEM_ASSOCIATED_MEMOBJECT                 0x1107
+#define CL_MEM_OFFSET                               0x1108
+#define CL_MEM_USES_SVM_POINTER                     0x1109
+#define CL_MEM_PROPERTIES                           0x110A
+
+/* cl_image_info */
+#define CL_IMAGE_FORMAT                             0x1110
+#define CL_IMAGE_ELEMENT_SIZE                       0x1111
+#define CL_IMAGE_ROW_PITCH                          0x1112
+#define CL_IMAGE_SLICE_PITCH                        0x1113
+#define CL_IMAGE_WIDTH                              0x1114
+#define CL_IMAGE_HEIGHT                             0x1115
+#define CL_IMAGE_DEPTH                              0x1116
+#define CL_IMAGE_ARRAY_SIZE                         0x1117
+#define CL_IMAGE_BUFFER                             0x1118
+#define CL_IMAGE_NUM_MIP_LEVELS                     0x1119
+#define CL_IMAGE_NUM_SAMPLES                        0x111A
+
+/* cl_pipe_info */
+#define CL_PIPE_PACKET_SIZE                         0x1120
+#define CL_PIPE_MAX_PACKETS                         0x1121
+#define CL_PIPE_PROPERTIES                          0x1122
+
+/* cl_addressing_mode */
+#define CL_ADDRESS_NONE                             0x1130
+#define CL_ADDRESS_CLAMP_TO_EDGE                    0x1131
+#define CL_ADDRESS_CLAMP                            0x1132
+#define CL_ADDRESS_REPEAT                           0x1133
+#define CL_ADDRESS_MIRRORED_REPEAT                  0x1134
+
+/* cl_filter_mode */
+#define CL_FILTER_NEAREST                           0x1140
+#define CL_FILTER_LINEAR                            0x1141
+
+/* cl_sampler_info */
+#define CL_SAMPLER_REFERENCE_COUNT                  0x1150
+#define CL_SAMPLER_CONTEXT                          0x1151
+#define CL_SAMPLER_NORMALIZED_COORDS                0x1152
+#define CL_SAMPLER_ADDRESSING_MODE                  0x1153
+#define CL_SAMPLER_FILTER_MODE                      0x1154
+/* These enumerants are for the cl_khr_mipmap_image extension.
+   They have since been added to cl_ext.h with an appropriate
+   KHR suffix, but are left here for backwards compatibility. */
+#define CL_SAMPLER_MIP_FILTER_MODE                  0x1155
+#define CL_SAMPLER_LOD_MIN                          0x1156
+#define CL_SAMPLER_LOD_MAX                          0x1157
+#define CL_SAMPLER_PROPERTIES                       0x1158
+
+/* cl_map_flags - bitfield */
+#define CL_MAP_READ                                 (1 << 0)
+#define CL_MAP_WRITE                                (1 << 1)
+#define CL_MAP_WRITE_INVALIDATE_REGION              (1 << 2)
+
+/* cl_program_info */
+#define CL_PROGRAM_REFERENCE_COUNT                  0x1160
+#define CL_PROGRAM_CONTEXT                          0x1161
+#define CL_PROGRAM_NUM_DEVICES                      0x1162
+#define CL_PROGRAM_DEVICES                          0x1163
+#define CL_PROGRAM_SOURCE                           0x1164
+#define CL_PROGRAM_BINARY_SIZES                     0x1165
+#define CL_PROGRAM_BINARIES                         0x1166
+#define CL_PROGRAM_NUM_KERNELS                      0x1167
+#define CL_PROGRAM_KERNEL_NAMES                     0x1168
+#define CL_PROGRAM_IL                               0x1169
+#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT       0x116A
+#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT       0x116B
+
+/* cl_program_build_info */
+#define CL_PROGRAM_BUILD_STATUS                     0x1181
+#define CL_PROGRAM_BUILD_OPTIONS                    0x1182
+#define CL_PROGRAM_BUILD_LOG                        0x1183
+#define CL_PROGRAM_BINARY_TYPE                      0x1184
+#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185
+
+/* cl_program_binary_type */
+#define CL_PROGRAM_BINARY_TYPE_NONE                 0x0
+#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT      0x1
+#define CL_PROGRAM_BINARY_TYPE_LIBRARY              0x2
+#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE           0x4
+
+/* cl_build_status */
+#define CL_BUILD_SUCCESS                            0
+#define CL_BUILD_NONE                               -1
+#define CL_BUILD_ERROR                              -2
+#define CL_BUILD_IN_PROGRESS                        -3
+
+/* cl_kernel_info */
+#define CL_KERNEL_FUNCTION_NAME                     0x1190
+#define CL_KERNEL_NUM_ARGS                          0x1191
+#define CL_KERNEL_REFERENCE_COUNT                   0x1192
+#define CL_KERNEL_CONTEXT                           0x1193
+#define CL_KERNEL_PROGRAM                           0x1194
+#define CL_KERNEL_ATTRIBUTES                        0x1195
+
+/* cl_kernel_arg_info */
+#define CL_KERNEL_ARG_ADDRESS_QUALIFIER             0x1196
+#define CL_KERNEL_ARG_ACCESS_QUALIFIER              0x1197
+#define CL_KERNEL_ARG_TYPE_NAME                     0x1198
+#define CL_KERNEL_ARG_TYPE_QUALIFIER                0x1199
+#define CL_KERNEL_ARG_NAME                          0x119A
+
+/* cl_kernel_arg_address_qualifier */
+#define CL_KERNEL_ARG_ADDRESS_GLOBAL                0x119B
+#define CL_KERNEL_ARG_ADDRESS_LOCAL                 0x119C
+#define CL_KERNEL_ARG_ADDRESS_CONSTANT              0x119D
+#define CL_KERNEL_ARG_ADDRESS_PRIVATE               0x119E
+
+/* cl_kernel_arg_access_qualifier */
+#define CL_KERNEL_ARG_ACCESS_READ_ONLY              0x11A0
+#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY             0x11A1
+#define CL_KERNEL_ARG_ACCESS_READ_WRITE             0x11A2
+#define CL_KERNEL_ARG_ACCESS_NONE                   0x11A3
+
+/* cl_kernel_arg_type_qualifier */
+#define CL_KERNEL_ARG_TYPE_NONE                     0
+#define CL_KERNEL_ARG_TYPE_CONST                    (1 << 0)
+#define CL_KERNEL_ARG_TYPE_RESTRICT                 (1 << 1)
+#define CL_KERNEL_ARG_TYPE_VOLATILE                 (1 << 2)
+#define CL_KERNEL_ARG_TYPE_PIPE                     (1 << 3)
+
+/* cl_kernel_work_group_info */
+#define CL_KERNEL_WORK_GROUP_SIZE                   0x11B0
+#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE           0x11B1
+#define CL_KERNEL_LOCAL_MEM_SIZE                    0x11B2
+#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3
+#define CL_KERNEL_PRIVATE_MEM_SIZE                  0x11B4
+#define CL_KERNEL_GLOBAL_WORK_SIZE                  0x11B5
+
+/* cl_kernel_sub_group_info */
+#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE    0x2033
+#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE       0x2034
+#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT    0x11B8
+#define CL_KERNEL_MAX_NUM_SUB_GROUPS                0x11B9
+#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS            0x11BA
+
+/* cl_kernel_exec_info */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS                0x11B6
+#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM   0x11B7
+
+/* cl_event_info */
+#define CL_EVENT_COMMAND_QUEUE                      0x11D0
+#define CL_EVENT_COMMAND_TYPE                       0x11D1
+#define CL_EVENT_REFERENCE_COUNT                    0x11D2
+#define CL_EVENT_COMMAND_EXECUTION_STATUS           0x11D3
+#define CL_EVENT_CONTEXT                            0x11D4
+
+/* cl_command_type */
+#define CL_COMMAND_NDRANGE_KERNEL                   0x11F0
+#define CL_COMMAND_TASK                             0x11F1
+#define CL_COMMAND_NATIVE_KERNEL                    0x11F2
+#define CL_COMMAND_READ_BUFFER                      0x11F3
+#define CL_COMMAND_WRITE_BUFFER                     0x11F4
+#define CL_COMMAND_COPY_BUFFER                      0x11F5
+#define CL_COMMAND_READ_IMAGE                       0x11F6
+#define CL_COMMAND_WRITE_IMAGE                      0x11F7
+#define CL_COMMAND_COPY_IMAGE                       0x11F8
+#define CL_COMMAND_COPY_IMAGE_TO_BUFFER             0x11F9
+#define CL_COMMAND_COPY_BUFFER_TO_IMAGE             0x11FA
+#define CL_COMMAND_MAP_BUFFER                       0x11FB
+#define CL_COMMAND_MAP_IMAGE                        0x11FC
+#define CL_COMMAND_UNMAP_MEM_OBJECT                 0x11FD
+#define CL_COMMAND_MARKER                           0x11FE
+#define CL_COMMAND_ACQUIRE_GL_OBJECTS               0x11FF
+#define CL_COMMAND_RELEASE_GL_OBJECTS               0x1200
+#define CL_COMMAND_READ_BUFFER_RECT                 0x1201
+#define CL_COMMAND_WRITE_BUFFER_RECT                0x1202
+#define CL_COMMAND_COPY_BUFFER_RECT                 0x1203
+#define CL_COMMAND_USER                             0x1204
+#define CL_COMMAND_BARRIER                          0x1205
+#define CL_COMMAND_MIGRATE_MEM_OBJECTS              0x1206
+#define CL_COMMAND_FILL_BUFFER                      0x1207
+#define CL_COMMAND_FILL_IMAGE                       0x1208
+#define CL_COMMAND_SVM_FREE                         0x1209
+#define CL_COMMAND_SVM_MEMCPY                       0x120A
+#define CL_COMMAND_SVM_MEMFILL                      0x120B
+#define CL_COMMAND_SVM_MAP                          0x120C
+#define CL_COMMAND_SVM_UNMAP                        0x120D
+#define CL_COMMAND_SVM_MIGRATE_MEM                  0x120E
+
+/* command execution status */
+#define CL_COMPLETE                                 0x0
+#define CL_RUNNING                                  0x1
+#define CL_SUBMITTED                                0x2
+#define CL_QUEUED                                   0x3
+
+/* cl_buffer_create_type */
+#define CL_BUFFER_CREATE_TYPE_REGION                0x1220
+
+/* cl_profiling_info */
+#define CL_PROFILING_COMMAND_QUEUED                 0x1280
+#define CL_PROFILING_COMMAND_SUBMIT                 0x1281
+#define CL_PROFILING_COMMAND_START                  0x1282
+#define CL_PROFILING_COMMAND_END                    0x1283
+#define CL_PROFILING_COMMAND_COMPLETE               0x1284
+
+/* cl_device_atomic_capabilities - bitfield */
+#define CL_DEVICE_ATOMIC_ORDER_RELAXED          (1 << 0)
+#define CL_DEVICE_ATOMIC_ORDER_ACQ_REL          (1 << 1)
+#define CL_DEVICE_ATOMIC_ORDER_SEQ_CST          (1 << 2)
+#define CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM        (1 << 3)
+#define CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP       (1 << 4)
+#define CL_DEVICE_ATOMIC_SCOPE_DEVICE           (1 << 5)
+#define CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES      (1 << 6)
+
+/* cl_device_device_enqueue_capabilities - bitfield */
+#define CL_DEVICE_QUEUE_SUPPORTED               (1 << 0)
+#define CL_DEVICE_QUEUE_REPLACEABLE_DEFAULT     (1 << 1)
+
+/* cl_khronos_vendor_id */
+#define CL_KHRONOS_VENDOR_ID_CODEPLAY               0x10004
+
+/* cl_version */
+#define CL_VERSION_MAJOR_BITS (10)
+#define CL_VERSION_MINOR_BITS (10)
+#define CL_VERSION_PATCH_BITS (12)
+
+#define CL_VERSION_MAJOR_MASK ((1 << CL_VERSION_MAJOR_BITS) - 1)
+#define CL_VERSION_MINOR_MASK ((1 << CL_VERSION_MINOR_BITS) - 1)
+#define CL_VERSION_PATCH_MASK ((1 << CL_VERSION_PATCH_BITS) - 1)
+
+#define CL_VERSION_MAJOR(version) \
+  ((version) >> (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS))
+
+#define CL_VERSION_MINOR(version) \
+  (((version) >> CL_VERSION_PATCH_BITS) & CL_VERSION_MINOR_MASK)
+
+#define CL_VERSION_PATCH(version) ((version) & CL_VERSION_PATCH_MASK)
+
+#define CL_MAKE_VERSION(major, minor, patch)                      \
+  ((((major) & CL_VERSION_MAJOR_MASK)                             \
+       << (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS)) |      \
+   (((minor) & CL_VERSION_MINOR_MASK) << CL_VERSION_PATCH_BITS) | \
+   ((patch) & CL_VERSION_PATCH_MASK))
+
+/********************************************************************************************************/
+
+/* Platform API */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformIDs(cl_uint          num_entries,
+                 cl_platform_id * platforms,
+                 cl_uint *        num_platforms) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPlatformInfo(cl_platform_id   platform,
+                  cl_platform_info param_name,
+                  size_t           param_value_size,
+                  void *           param_value,
+                  size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+/* Device APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceIDs(cl_platform_id   platform,
+               cl_device_type   device_type,
+               cl_uint          num_entries,
+               cl_device_id *   devices,
+               cl_uint *        num_devices) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceInfo(cl_device_id    device,
+                cl_device_info  param_name,
+                size_t          param_value_size,
+                void *          param_value,
+                size_t *        param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateSubDevices(cl_device_id                         in_device,
+                   const cl_device_partition_property * properties,
+                   cl_uint                              num_devices,
+                   cl_device_id *                       out_devices,
+                   cl_uint *                            num_devices_ret) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetDefaultDeviceCommandQueue(cl_context           context,
+                               cl_device_id         device,
+                               cl_command_queue     command_queue) CL_API_SUFFIX__VERSION_2_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceAndHostTimer(cl_device_id    device,
+                        cl_ulong*       device_timestamp,
+                        cl_ulong*       host_timestamp) CL_API_SUFFIX__VERSION_2_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetHostTimer(cl_device_id device,
+               cl_ulong *   host_timestamp) CL_API_SUFFIX__VERSION_2_1;
+
+/* Context APIs */
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContext(const cl_context_properties * properties,
+                cl_uint              num_devices,
+                const cl_device_id * devices,
+                void (CL_CALLBACK * pfn_notify)(const char * errinfo,
+                                                const void * private_info,
+                                                size_t       cb,
+                                                void *       user_data),
+                void *               user_data,
+                cl_int *             errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_context CL_API_CALL
+clCreateContextFromType(const cl_context_properties * properties,
+                        cl_device_type      device_type,
+                        void (CL_CALLBACK * pfn_notify)(const char * errinfo,
+                                                        const void * private_info,
+                                                        size_t       cb,
+                                                        void *       user_data),
+                        void *              user_data,
+                        cl_int *            errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseContext(cl_context context) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetContextInfo(cl_context         context,
+                 cl_context_info    param_name,
+                 size_t             param_value_size,
+                 void *             param_value,
+                 size_t *           param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetContextDestructorCallback(cl_context         context,
+                               void (CL_CALLBACK* pfn_notify)(cl_context context,
+                                                              void* user_data),
+                               void*              user_data) CL_API_SUFFIX__VERSION_3_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueTask(cl_command_queue  command_queue,
+              cl_kernel         kernel,
+              cl_uint           num_events_in_wait_list,
+              const cl_event *  event_wait_list,
+              cl_event *        event) CL_API_SUFFIX__VERSION_1_0;
+
+/* Command Queue APIs */
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueue(cl_context                     context,
+                     cl_device_id                   device,
+                     cl_command_queue_properties    properties,
+                     cl_int *                       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueueWithProperties(cl_context               context,
+                                   cl_device_id             device,
+                                   const cl_queue_properties *    properties,
+                                   cl_int *                 errcode_ret) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetCommandQueueInfo(cl_command_queue      command_queue,
+                      cl_command_queue_info param_name,
+                      size_t                param_value_size,
+                      void *                param_value,
+                      size_t *              param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+/* Memory Object APIs */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBuffer(cl_context   context,
+               cl_mem_flags flags,
+               size_t       size,
+               void *       host_ptr,
+               cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateSubBuffer(cl_mem                   buffer,
+                  cl_mem_flags             flags,
+                  cl_buffer_create_type    buffer_create_type,
+                  const void *             buffer_create_info,
+                  cl_int *                 errcode_ret) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImage(cl_context              context,
+              cl_mem_flags            flags,
+              const cl_image_format * image_format,
+              const cl_image_desc *   image_desc,
+              void *                  host_ptr,
+              cl_int *                errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreatePipe(cl_context                 context,
+             cl_mem_flags               flags,
+             cl_uint                    pipe_packet_size,
+             cl_uint                    pipe_max_packets,
+             const cl_pipe_properties * properties,
+             cl_int *                   errcode_ret) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBufferWithProperties(cl_context                context,
+                             const cl_mem_properties * properties,
+                             cl_mem_flags              flags,
+                             size_t                    size,
+                             void *                    host_ptr,
+                             cl_int *                  errcode_ret) CL_API_SUFFIX__VERSION_3_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImageWithProperties(cl_context                context,
+                            const cl_mem_properties * properties,
+                            cl_mem_flags              flags,
+                            const cl_image_format *   image_format,
+                            const cl_image_desc *     image_desc,
+                            void *                    host_ptr,
+                            cl_int *                  errcode_ret) CL_API_SUFFIX__VERSION_3_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSupportedImageFormats(cl_context           context,
+                           cl_mem_flags         flags,
+                           cl_mem_object_type   image_type,
+                           cl_uint              num_entries,
+                           cl_image_format *    image_formats,
+                           cl_uint *            num_image_formats) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetMemObjectInfo(cl_mem           memobj,
+                   cl_mem_info      param_name,
+                   size_t           param_value_size,
+                   void *           param_value,
+                   size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetImageInfo(cl_mem           image,
+               cl_image_info    param_name,
+               size_t           param_value_size,
+               void *           param_value,
+               size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetPipeInfo(cl_mem           pipe,
+              cl_pipe_info     param_name,
+              size_t           param_value_size,
+              void *           param_value,
+              size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetMemObjectDestructorCallback(cl_mem memobj,
+                                 void (CL_CALLBACK * pfn_notify)(cl_mem memobj,
+                                                                 void * user_data),
+                                 void * user_data) CL_API_SUFFIX__VERSION_1_1;
+
+/* SVM Allocation APIs */
+extern CL_API_ENTRY void * CL_API_CALL
+clSVMAlloc(cl_context       context,
+           cl_svm_mem_flags flags,
+           size_t           size,
+           cl_uint          alignment) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY void CL_API_CALL
+clSVMFree(cl_context        context,
+          void *            svm_pointer) CL_API_SUFFIX__VERSION_2_0;
+
+/* Sampler APIs */
+extern CL_API_ENTRY cl_sampler CL_API_CALL
+clCreateSampler(cl_context          context,
+                cl_bool             normalized_coords,
+                cl_addressing_mode  addressing_mode,
+                cl_filter_mode      filter_mode,
+                cl_int *            errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_sampler CL_API_CALL
+clCreateSamplerWithProperties(cl_context                     context,
+                              const cl_sampler_properties *  sampler_properties,
+                              cl_int *                       errcode_ret) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSamplerInfo(cl_sampler         sampler,
+                 cl_sampler_info    param_name,
+                 size_t             param_value_size,
+                 void *             param_value,
+                 size_t *           param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+/* Program Object APIs */
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithSource(cl_context        context,
+                          cl_uint           count,
+                          const char **     strings,
+                          const size_t *    lengths,
+                          cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBinary(cl_context                     context,
+                          cl_uint                        num_devices,
+                          const cl_device_id *           device_list,
+                          const size_t *                 lengths,
+                          const unsigned char **         binaries,
+                          cl_int *                       binary_status,
+                          cl_int *                       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithBuiltInKernels(cl_context            context,
+                                  cl_uint               num_devices,
+                                  const cl_device_id *  device_list,
+                                  const char *          kernel_names,
+                                  cl_int *              errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithIL(cl_context    context,
+                     const void*    il,
+                     size_t         length,
+                     cl_int*        errcode_ret) CL_API_SUFFIX__VERSION_2_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clBuildProgram(cl_program           program,
+               cl_uint              num_devices,
+               const cl_device_id * device_list,
+               const char *         options,
+               void (CL_CALLBACK *  pfn_notify)(cl_program program,
+                                                void * user_data),
+               void *               user_data) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCompileProgram(cl_program           program,
+                 cl_uint              num_devices,
+                 const cl_device_id * device_list,
+                 const char *         options,
+                 cl_uint              num_input_headers,
+                 const cl_program *   input_headers,
+                 const char **        header_include_names,
+                 void (CL_CALLBACK *  pfn_notify)(cl_program program,
+                                                  void * user_data),
+                 void *               user_data) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clLinkProgram(cl_context           context,
+              cl_uint              num_devices,
+              const cl_device_id * device_list,
+              const char *         options,
+              cl_uint              num_input_programs,
+              const cl_program *   input_programs,
+              void (CL_CALLBACK *  pfn_notify)(cl_program program,
+                                               void * user_data),
+              void *               user_data,
+              cl_int *             errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_2_2_DEPRECATED cl_int CL_API_CALL
+clSetProgramReleaseCallback(cl_program          program,
+                            void (CL_CALLBACK * pfn_notify)(cl_program program,
+                                                            void * user_data),
+                            void *              user_data) CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetProgramSpecializationConstant(cl_program  program,
+                                   cl_uint     spec_id,
+                                   size_t      spec_size,
+                                   const void* spec_value) CL_API_SUFFIX__VERSION_2_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clUnloadPlatformCompiler(cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramInfo(cl_program         program,
+                 cl_program_info    param_name,
+                 size_t             param_value_size,
+                 void *             param_value,
+                 size_t *           param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetProgramBuildInfo(cl_program            program,
+                      cl_device_id          device,
+                      cl_program_build_info param_name,
+                      size_t                param_value_size,
+                      void *                param_value,
+                      size_t *              param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+/* Kernel Object APIs */
+extern CL_API_ENTRY cl_kernel CL_API_CALL
+clCreateKernel(cl_program      program,
+               const char *    kernel_name,
+               cl_int *        errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateKernelsInProgram(cl_program     program,
+                         cl_uint        num_kernels,
+                         cl_kernel *    kernels,
+                         cl_uint *      num_kernels_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_kernel CL_API_CALL
+clCloneKernel(cl_kernel     source_kernel,
+              cl_int*       errcode_ret) CL_API_SUFFIX__VERSION_2_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainKernel(cl_kernel    kernel) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseKernel(cl_kernel   kernel) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArg(cl_kernel    kernel,
+               cl_uint      arg_index,
+               size_t       arg_size,
+               const void * arg_value) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgSVMPointer(cl_kernel    kernel,
+                         cl_uint      arg_index,
+                         const void * arg_value) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelExecInfo(cl_kernel            kernel,
+                    cl_kernel_exec_info  param_name,
+                    size_t               param_value_size,
+                    const void *         param_value) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelInfo(cl_kernel       kernel,
+                cl_kernel_info  param_name,
+                size_t          param_value_size,
+                void *          param_value,
+                size_t *        param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelArgInfo(cl_kernel       kernel,
+                   cl_uint         arg_indx,
+                   cl_kernel_arg_info  param_name,
+                   size_t          param_value_size,
+                   void *          param_value,
+                   size_t *        param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelWorkGroupInfo(cl_kernel                  kernel,
+                         cl_device_id               device,
+                         cl_kernel_work_group_info  param_name,
+                         size_t                     param_value_size,
+                         void *                     param_value,
+                         size_t *                   param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelSubGroupInfo(cl_kernel                   kernel,
+                        cl_device_id                device,
+                        cl_kernel_sub_group_info    param_name,
+                        size_t                      input_value_size,
+                        const void*                 input_value,
+                        size_t                      param_value_size,
+                        void*                       param_value,
+                        size_t*                     param_value_size_ret) CL_API_SUFFIX__VERSION_2_1;
+
+/* Event Object APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clWaitForEvents(cl_uint             num_events,
+                const cl_event *    event_list) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventInfo(cl_event         event,
+               cl_event_info    param_name,
+               size_t           param_value_size,
+               void *           param_value,
+               size_t *         param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateUserEvent(cl_context    context,
+                  cl_int *      errcode_ret) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetUserEventStatus(cl_event   event,
+                     cl_int     execution_status) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetEventCallback(cl_event    event,
+                   cl_int      command_exec_callback_type,
+                   void (CL_CALLBACK * pfn_notify)(cl_event event,
+                                                   cl_int   event_command_status,
+                                                   void *   user_data),
+                   void *      user_data) CL_API_SUFFIX__VERSION_1_1;
+
+/* Profiling APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetEventProfilingInfo(cl_event            event,
+                        cl_profiling_info   param_name,
+                        size_t              param_value_size,
+                        void *              param_value,
+                        size_t *            param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+/* Flush and Finish APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0;
+
+/* Enqueued Commands APIs */
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBuffer(cl_command_queue    command_queue,
+                    cl_mem              buffer,
+                    cl_bool             blocking_read,
+                    size_t              offset,
+                    size_t              size,
+                    void *              ptr,
+                    cl_uint             num_events_in_wait_list,
+                    const cl_event *    event_wait_list,
+                    cl_event *          event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadBufferRect(cl_command_queue    command_queue,
+                        cl_mem              buffer,
+                        cl_bool             blocking_read,
+                        const size_t *      buffer_offset,
+                        const size_t *      host_offset,
+                        const size_t *      region,
+                        size_t              buffer_row_pitch,
+                        size_t              buffer_slice_pitch,
+                        size_t              host_row_pitch,
+                        size_t              host_slice_pitch,
+                        void *              ptr,
+                        cl_uint             num_events_in_wait_list,
+                        const cl_event *    event_wait_list,
+                        cl_event *          event) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBuffer(cl_command_queue   command_queue,
+                     cl_mem             buffer,
+                     cl_bool            blocking_write,
+                     size_t             offset,
+                     size_t             size,
+                     const void *       ptr,
+                     cl_uint            num_events_in_wait_list,
+                     const cl_event *   event_wait_list,
+                     cl_event *         event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteBufferRect(cl_command_queue    command_queue,
+                         cl_mem              buffer,
+                         cl_bool             blocking_write,
+                         const size_t *      buffer_offset,
+                         const size_t *      host_offset,
+                         const size_t *      region,
+                         size_t              buffer_row_pitch,
+                         size_t              buffer_slice_pitch,
+                         size_t              host_row_pitch,
+                         size_t              host_slice_pitch,
+                         const void *        ptr,
+                         cl_uint             num_events_in_wait_list,
+                         const cl_event *    event_wait_list,
+                         cl_event *          event) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillBuffer(cl_command_queue   command_queue,
+                    cl_mem             buffer,
+                    const void *       pattern,
+                    size_t             pattern_size,
+                    size_t             offset,
+                    size_t             size,
+                    cl_uint            num_events_in_wait_list,
+                    const cl_event *   event_wait_list,
+                    cl_event *         event) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBuffer(cl_command_queue    command_queue,
+                    cl_mem              src_buffer,
+                    cl_mem              dst_buffer,
+                    size_t              src_offset,
+                    size_t              dst_offset,
+                    size_t              size,
+                    cl_uint             num_events_in_wait_list,
+                    const cl_event *    event_wait_list,
+                    cl_event *          event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferRect(cl_command_queue    command_queue,
+                        cl_mem              src_buffer,
+                        cl_mem              dst_buffer,
+                        const size_t *      src_origin,
+                        const size_t *      dst_origin,
+                        const size_t *      region,
+                        size_t              src_row_pitch,
+                        size_t              src_slice_pitch,
+                        size_t              dst_row_pitch,
+                        size_t              dst_slice_pitch,
+                        cl_uint             num_events_in_wait_list,
+                        const cl_event *    event_wait_list,
+                        cl_event *          event) CL_API_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReadImage(cl_command_queue     command_queue,
+                   cl_mem               image,
+                   cl_bool              blocking_read,
+                   const size_t *       origin,
+                   const size_t *       region,
+                   size_t               row_pitch,
+                   size_t               slice_pitch,
+                   void *               ptr,
+                   cl_uint              num_events_in_wait_list,
+                   const cl_event *     event_wait_list,
+                   cl_event *           event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWriteImage(cl_command_queue    command_queue,
+                    cl_mem              image,
+                    cl_bool             blocking_write,
+                    const size_t *      origin,
+                    const size_t *      region,
+                    size_t              input_row_pitch,
+                    size_t              input_slice_pitch,
+                    const void *        ptr,
+                    cl_uint             num_events_in_wait_list,
+                    const cl_event *    event_wait_list,
+                    cl_event *          event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueFillImage(cl_command_queue   command_queue,
+                   cl_mem             image,
+                   const void *       fill_color,
+                   const size_t *     origin,
+                   const size_t *     region,
+                   cl_uint            num_events_in_wait_list,
+                   const cl_event *   event_wait_list,
+                   cl_event *         event) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImage(cl_command_queue     command_queue,
+                   cl_mem               src_image,
+                   cl_mem               dst_image,
+                   const size_t *       src_origin,
+                   const size_t *       dst_origin,
+                   const size_t *       region,
+                   cl_uint              num_events_in_wait_list,
+                   const cl_event *     event_wait_list,
+                   cl_event *           event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyImageToBuffer(cl_command_queue command_queue,
+                           cl_mem           src_image,
+                           cl_mem           dst_buffer,
+                           const size_t *   src_origin,
+                           const size_t *   region,
+                           size_t           dst_offset,
+                           cl_uint          num_events_in_wait_list,
+                           const cl_event * event_wait_list,
+                           cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueCopyBufferToImage(cl_command_queue command_queue,
+                           cl_mem           src_buffer,
+                           cl_mem           dst_image,
+                           size_t           src_offset,
+                           const size_t *   dst_origin,
+                           const size_t *   region,
+                           cl_uint          num_events_in_wait_list,
+                           const cl_event * event_wait_list,
+                           cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapBuffer(cl_command_queue command_queue,
+                   cl_mem           buffer,
+                   cl_bool          blocking_map,
+                   cl_map_flags     map_flags,
+                   size_t           offset,
+                   size_t           size,
+                   cl_uint          num_events_in_wait_list,
+                   const cl_event * event_wait_list,
+                   cl_event *       event,
+                   cl_int *         errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clEnqueueMapImage(cl_command_queue  command_queue,
+                  cl_mem            image,
+                  cl_bool           blocking_map,
+                  cl_map_flags      map_flags,
+                  const size_t *    origin,
+                  const size_t *    region,
+                  size_t *          image_row_pitch,
+                  size_t *          image_slice_pitch,
+                  cl_uint           num_events_in_wait_list,
+                  const cl_event *  event_wait_list,
+                  cl_event *        event,
+                  cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueUnmapMemObject(cl_command_queue command_queue,
+                        cl_mem           memobj,
+                        void *           mapped_ptr,
+                        cl_uint          num_events_in_wait_list,
+                        const cl_event * event_wait_list,
+                        cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMigrateMemObjects(cl_command_queue       command_queue,
+                           cl_uint                num_mem_objects,
+                           const cl_mem *         mem_objects,
+                           cl_mem_migration_flags flags,
+                           cl_uint                num_events_in_wait_list,
+                           const cl_event *       event_wait_list,
+                           cl_event *             event) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNDRangeKernel(cl_command_queue command_queue,
+                       cl_kernel        kernel,
+                       cl_uint          work_dim,
+                       const size_t *   global_work_offset,
+                       const size_t *   global_work_size,
+                       const size_t *   local_work_size,
+                       cl_uint          num_events_in_wait_list,
+                       const cl_event * event_wait_list,
+                       cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueNativeKernel(cl_command_queue  command_queue,
+                      void (CL_CALLBACK * user_func)(void *),
+                      void *            args,
+                      size_t            cb_args,
+                      cl_uint           num_mem_objects,
+                      const cl_mem *    mem_list,
+                      const void **     args_mem_loc,
+                      cl_uint           num_events_in_wait_list,
+                      const cl_event *  event_wait_list,
+                      cl_event *        event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMarkerWithWaitList(cl_command_queue  command_queue,
+                            cl_uint           num_events_in_wait_list,
+                            const cl_event *  event_wait_list,
+                            cl_event *        event) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueBarrierWithWaitList(cl_command_queue  command_queue,
+                             cl_uint           num_events_in_wait_list,
+                             const cl_event *  event_wait_list,
+                             cl_event *        event) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMFree(cl_command_queue  command_queue,
+                 cl_uint           num_svm_pointers,
+                 void *            svm_pointers[],
+                 void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue,
+                                                    cl_uint          num_svm_pointers,
+                                                    void *           svm_pointers[],
+                                                    void *           user_data),
+                 void *            user_data,
+                 cl_uint           num_events_in_wait_list,
+                 const cl_event *  event_wait_list,
+                 cl_event *        event) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemcpy(cl_command_queue  command_queue,
+                   cl_bool           blocking_copy,
+                   void *            dst_ptr,
+                   const void *      src_ptr,
+                   size_t            size,
+                   cl_uint           num_events_in_wait_list,
+                   const cl_event *  event_wait_list,
+                   cl_event *        event) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemFill(cl_command_queue  command_queue,
+                    void *            svm_ptr,
+                    const void *      pattern,
+                    size_t            pattern_size,
+                    size_t            size,
+                    cl_uint           num_events_in_wait_list,
+                    const cl_event *  event_wait_list,
+                    cl_event *        event) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMap(cl_command_queue  command_queue,
+                cl_bool           blocking_map,
+                cl_map_flags      flags,
+                void *            svm_ptr,
+                size_t            size,
+                cl_uint           num_events_in_wait_list,
+                const cl_event *  event_wait_list,
+                cl_event *        event) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMUnmap(cl_command_queue  command_queue,
+                  void *            svm_ptr,
+                  cl_uint           num_events_in_wait_list,
+                  const cl_event *  event_wait_list,
+                  cl_event *        event) CL_API_SUFFIX__VERSION_2_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMigrateMem(cl_command_queue         command_queue,
+                       cl_uint                  num_svm_pointers,
+                       const void **            svm_pointers,
+                       const size_t *           sizes,
+                       cl_mem_migration_flags   flags,
+                       cl_uint                  num_events_in_wait_list,
+                       const cl_event *         event_wait_list,
+                       cl_event *               event) CL_API_SUFFIX__VERSION_2_1;
+
+/* Extension function access
+ *
+ * Returns the extension function address for the given function name,
+ * or NULL if no valid function can be found.  The client must check
+ * that the address is not NULL before using or calling the returned
+ * function address.
+ */
+extern CL_API_ENTRY void * CL_API_CALL
+clGetExtensionFunctionAddressForPlatform(cl_platform_id platform,
+                                         const char *   func_name) CL_API_SUFFIX__VERSION_1_2;
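+
+/* Illustrative usage sketch (added comment, not part of the Khronos header):
+ * the address must be NULL-checked before use.  The extension function shown
+ * (clIcdGetPlatformIDsKHR from cl_khr_icd) is only an example, and the
+ * typedef is local to this sketch.
+ *
+ *   typedef cl_int (CL_API_CALL *icdGetPlatformIDs_fn)(
+ *       cl_uint num_entries, cl_platform_id * platforms, cl_uint * num_platforms);
+ *
+ *   icdGetPlatformIDs_fn fn = (icdGetPlatformIDs_fn)
+ *       clGetExtensionFunctionAddressForPlatform(platform, "clIcdGetPlatformIDsKHR");
+ *   if (fn != NULL) {
+ *       cl_uint num_platforms = 0;
+ *       fn(0, NULL, &num_platforms);   // only call through a non-NULL pointer
+ *   }
+ */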
+
+#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+    /*
+     *  WARNING:
+     *     This API introduces mutable state into the OpenCL implementation. It has been REMOVED
+     *  to better facilitate thread safety.  The 1.0 API is not thread safe, it is not tested by the
+     *  OpenCL 1.1 conformance tests, and consequently it may not work reliably.
+     *  It is also likely to be non-performant. Use of this API is not advised. Use at your own risk.
+     *
+     *  Software developers who previously relied on this API should instead set the command queue
+     *  properties when creating the queue.
+     */
+    extern CL_API_ENTRY cl_int CL_API_CALL
+    clSetCommandQueueProperty(cl_command_queue              command_queue,
+                              cl_command_queue_properties   properties,
+                              cl_bool                       enable,
+                              cl_command_queue_properties * old_properties) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED;
+#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */
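+
+/* Illustrative sketch (added comment, not part of the Khronos header): instead
+ * of toggling properties afterwards with clSetCommandQueueProperty, request
+ * them when the queue is created, for example
+ *
+ *   cl_int err = CL_SUCCESS;
+ *   cl_command_queue queue = clCreateCommandQueue(
+ *       context, device, CL_QUEUE_PROFILING_ENABLE, &err);
+ *
+ * (context and device are assumed to already exist; on OpenCL 2.0+ use
+ * clCreateCommandQueueWithProperties instead.)
+ */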
+
+/* Deprecated OpenCL 1.1 APIs */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage2D(cl_context              context,
+                cl_mem_flags            flags,
+                const cl_image_format * image_format,
+                size_t                  image_width,
+                size_t                  image_height,
+                size_t                  image_row_pitch,
+                void *                  host_ptr,
+                cl_int *                errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateImage3D(cl_context              context,
+                cl_mem_flags            flags,
+                const cl_image_format * image_format,
+                size_t                  image_width,
+                size_t                  image_height,
+                size_t                  image_depth,
+                size_t                  image_row_pitch,
+                size_t                  image_slice_pitch,
+                void *                  host_ptr,
+                cl_int *                errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueMarker(cl_command_queue    command_queue,
+                cl_event *          event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueWaitForEvents(cl_command_queue  command_queue,
+                        cl_uint          num_events,
+                        const cl_event * event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clEnqueueBarrier(cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL
+clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL
+clGetExtensionFunctionAddress(const char * func_name) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_H */
diff --git a/ext/cudart/include/CL/cl.hpp b/ext/cudart/include/CL/cl.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..263e0ab4717ed411621f8cb3330aed385c672151
--- /dev/null
+++ b/ext/cudart/include/CL/cl.hpp
@@ -0,0 +1,12419 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2013 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ ******************************************************************************/
+
+/*! \file
+ *
+ *   \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and 
+ *       OpenCL 1.2 (rev 15)    
+ *   \author Benedict R. Gaster, Laurent Morichetti and Lee Howes
+ *   
+ *   Additions and fixes from:
+ *       Brian Cole, March 3rd 2010 and April 2012 
+ *       Matt Gruenke, April 2012.
+ *       Bruce Merry, February 2013.
+ *   
+ *   \version 1.2.5
+ *   \date June 2013
+ *
+ *   Optional extension support
+ *
+ *         cl
+ *         cl_ext_device_fission
+ *				#define USE_CL_DEVICE_FISSION
+ */
+
+/*! \mainpage
+ * \section intro Introduction
+ * For many large applications C++ is the language of choice and so it seems
+ * reasonable to define C++ bindings for OpenCL.
+ *
+ *
+ * The interface is contained within a single C++ header file \em cl.hpp and all
+ * definitions are contained within the namespace \em cl. There is no additional
+ * requirement to include \em cl.h; to use either the C++ or original C
+ * bindings it is enough to simply include \em cl.hpp.
+ *
+ * The bindings themselves are lightweight and correspond closely to the
+ * underlying C API. Using the C++ bindings introduces no additional execution
+ * overhead.
+ *
+ * For detailed documentation on the bindings see:
+ *
+ * The OpenCL C++ Wrapper API 1.2 (revision 09)
+ *  http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf
+ *
+ * \section example Example
+ *
+ * The following example shows a general use case for the C++
+ * bindings, including support for the optional exception feature and
+ * also the supplied vector and string classes. See the following sections
+ * for descriptions of these features.
+ *
+ * \code
+ * #define __CL_ENABLE_EXCEPTIONS
+ * 
+ * #if defined(__APPLE__) || defined(__MACOSX)
+ * #include <OpenCL/cl.hpp>
+ * #else
+ * #include <CL/cl.hpp>
+ * #endif
+ * #include <cstdio>
+ * #include <cstdlib>
+ * #include <iostream>
+ * 
+ *  const char * helloStr  = "__kernel void "
+ *                           "hello(void) "
+ *                           "{ "
+ *                           "  "
+ *                           "} ";
+ * 
+ *  int
+ *  main(void)
+ *  {
+ *     cl_int err = CL_SUCCESS;
+ *     try {
+ *
+ *       std::vector<cl::Platform> platforms;
+ *       cl::Platform::get(&platforms);
+ *       if (platforms.size() == 0) {
+ *           std::cout << "Platform size 0\n";
+ *           return -1;
+ *       }
+ *
+ *       cl_context_properties properties[] = 
+ *          { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0};
+ *       cl::Context context(CL_DEVICE_TYPE_CPU, properties); 
+ * 
+ *       std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+ * 
+ *       cl::Program::Sources source(1,
+ *           std::make_pair(helloStr,strlen(helloStr)));
+ *       cl::Program program_ = cl::Program(context, source);
+ *       program_.build(devices);
+ * 
+ *       cl::Kernel kernel(program_, "hello", &err);
+ * 
+ *       cl::Event event;
+ *       cl::CommandQueue queue(context, devices[0], 0, &err);
+ *       queue.enqueueNDRangeKernel(
+ *           kernel, 
+ *           cl::NullRange, 
+ *           cl::NDRange(4,4),
+ *           cl::NullRange,
+ *           NULL,
+ *           &event); 
+ * 
+ *       event.wait();
+ *     }
+ *     catch (cl::Error err) {
+ *        std::cerr 
+ *           << "ERROR: "
+ *           << err.what()
+ *           << "("
+ *           << err.err()
+ *           << ")"
+ *           << std::endl;
+ *     }
+ * 
+ *    return EXIT_SUCCESS;
+ *  }
+ * 
+ * \endcode
+ *
+ */
+#ifndef CL_HPP_
+#define CL_HPP_
+
+#ifdef _WIN32
+
+#include <windows.h>
+#include <malloc.h>
+#include <iterator>
+#include <intrin.h>
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+#include <exception>
+#endif // #if defined(__CL_ENABLE_EXCEPTIONS)
+
+#pragma push_macro("max")
+#undef max
+#if defined(USE_DX_INTEROP)
+#include <CL/cl_d3d10.h>
+#include <CL/cl_dx9_media_sharing.h>
+#endif
+#endif // _WIN32
+
+// 
+#if defined(USE_CL_DEVICE_FISSION)
+#include <CL/cl_ext.h>
+#endif
+
+#if defined(__APPLE__) || defined(__MACOSX)
+#include <OpenGL/OpenGL.h>
+#include <OpenCL/opencl.h>
+#include <libkern/OSAtomic.h>
+#else
+#include <GL/gl.h>
+#include <CL/opencl.h>
+#endif // !__APPLE__
+
+// To avoid accidentally taking ownership of core OpenCL types
+// such as cl_kernel constructors are made explicit
+// under OpenCL 1.2
+#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS explicit
+#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+#define __CL_EXPLICIT_CONSTRUCTORS 
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+// Define deprecated prefixes and suffixes to ensure compilation
+// in case they are not pre-defined
+#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+#endif // #if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED)
+#if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+#endif // #if !defined(CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED)
+#if !defined(CL_EXT_PREFIX__VERSION_2_0_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_0_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED
+#endif // #if !defined(CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED)
+#if !defined(CL_EXT_PREFIX__VERSION_2_1_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_1_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED
+#endif // #if !defined(CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED)
+#if !defined(CL_EXT_PREFIX__VERSION_2_2_DEPRECATED)
+#define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED
+#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_2_DEPRECATED)
+#if !defined(CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED)
+#define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED
+#endif // #if !defined(CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED)
+
+#if !defined(CL_CALLBACK)
+#define CL_CALLBACK
+#endif //CL_CALLBACK
+
+#include <utility>
+#include <limits>
+
+#if !defined(__NO_STD_VECTOR)
+#include <vector>
+#endif
+
+#if !defined(__NO_STD_STRING)
+#include <string>
+#endif 
+
+#if defined(linux) || defined(__APPLE__) || defined(__MACOSX)
+#include <alloca.h>
+
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#endif // linux
+
+#include <cstring>
+
+
+/*! \namespace cl
+ *
+ * \brief The OpenCL C++ bindings are defined within this namespace.
+ *
+ */
+namespace cl {
+
+class Memory;
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
+#define __INIT_CL_EXT_FCN_PTR(name) \
+    if(!pfn_##name) { \
+        pfn_##name = (PFN_##name) \
+            clGetExtensionFunctionAddress(#name); \
+        if(!pfn_##name) { \
+        } \
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+#if defined(CL_VERSION_1_2)
+#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \
+    if(!pfn_##name) { \
+        pfn_##name = (PFN_##name) \
+            clGetExtensionFunctionAddressForPlatform(platform, #name); \
+        if(!pfn_##name) { \
+        } \
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+class Program;
+class Device;
+class Context;
+class CommandQueue;
+class Memory;
+class Buffer;
+
+#if defined(__CL_ENABLE_EXCEPTIONS)
+/*! \brief Exception class 
+ * 
+ *  This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined.
+ */
+class Error : public std::exception
+{
+private:
+    cl_int err_;
+    const char * errStr_;
+public:
+    /*! \brief Create a new CL error exception for a given error code
+     *  and corresponding message.
+     * 
+     *  \param err error code value.
+     *
+     *  \param errStr a descriptive string that must remain in scope until
+     *                handling of the exception has concluded.  If set, it
+     *                will be returned by what().
+     */
+    Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr)
+    {}
+
+    ~Error() throw() {}
+
+    /*! \brief Get error string associated with exception
+     *
+     * \return A memory pointer to the error message string.
+     */
+    virtual const char * what() const throw ()
+    {
+        if (errStr_ == NULL) {
+            return "empty";
+        }
+        else {
+            return errStr_;
+        }
+    }
+
+    /*! \brief Get error code associated with exception
+     *
+     *  \return The error code.
+     */
+    cl_int err(void) const { return err_; }
+};
+
+#define __ERR_STR(x) #x
+#else
+#define __ERR_STR(x) NULL
+#endif // __CL_ENABLE_EXCEPTIONS
+
+
+namespace detail
+{
+#if defined(__CL_ENABLE_EXCEPTIONS)
+static inline cl_int errHandler (
+    cl_int err,
+    const char * errStr = NULL)
+{
+    if (err != CL_SUCCESS) {
+        throw Error(err, errStr);
+    }
+    return err;
+}
+#else
+static inline cl_int errHandler (cl_int err, const char * errStr = NULL)
+{
+    (void) errStr; // suppress unused variable warning
+    return err;
+}
+#endif // __CL_ENABLE_EXCEPTIONS
+}
+
+
+
+//! \cond DOXYGEN_DETAIL
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#define __GET_DEVICE_INFO_ERR               __ERR_STR(clGetDeviceInfo)
+#define __GET_PLATFORM_INFO_ERR             __ERR_STR(clGetPlatformInfo)
+#define __GET_DEVICE_IDS_ERR                __ERR_STR(clGetDeviceIDs)
+#define __GET_PLATFORM_IDS_ERR              __ERR_STR(clGetPlatformIDs)
+#define __GET_CONTEXT_INFO_ERR              __ERR_STR(clGetContextInfo)
+#define __GET_EVENT_INFO_ERR                __ERR_STR(clGetEventInfo)
+#define __GET_EVENT_PROFILE_INFO_ERR        __ERR_STR(clGetEventProfilingInfo)
+#define __GET_MEM_OBJECT_INFO_ERR           __ERR_STR(clGetMemObjectInfo)
+#define __GET_IMAGE_INFO_ERR                __ERR_STR(clGetImageInfo)
+#define __GET_SAMPLER_INFO_ERR              __ERR_STR(clGetSamplerInfo)
+#define __GET_KERNEL_INFO_ERR               __ERR_STR(clGetKernelInfo)
+#if defined(CL_VERSION_1_2)
+#define __GET_KERNEL_ARG_INFO_ERR               __ERR_STR(clGetKernelArgInfo)
+#endif // #if defined(CL_VERSION_1_2)
+#define __GET_KERNEL_WORK_GROUP_INFO_ERR    __ERR_STR(clGetKernelWorkGroupInfo)
+#define __GET_PROGRAM_INFO_ERR              __ERR_STR(clGetProgramInfo)
+#define __GET_PROGRAM_BUILD_INFO_ERR        __ERR_STR(clGetProgramBuildInfo)
+#define __GET_COMMAND_QUEUE_INFO_ERR        __ERR_STR(clGetCommandQueueInfo)
+
+#define __CREATE_CONTEXT_ERR                __ERR_STR(clCreateContext)
+#define __CREATE_CONTEXT_FROM_TYPE_ERR      __ERR_STR(clCreateContextFromType)
+#define __GET_SUPPORTED_IMAGE_FORMATS_ERR   __ERR_STR(clGetSupportedImageFormats)
+
+#define __CREATE_BUFFER_ERR                 __ERR_STR(clCreateBuffer)
+#define __COPY_ERR                          __ERR_STR(cl::copy)
+#define __CREATE_SUBBUFFER_ERR              __ERR_STR(clCreateSubBuffer)
+#define __CREATE_GL_BUFFER_ERR              __ERR_STR(clCreateFromGLBuffer)
+#define __CREATE_GL_RENDER_BUFFER_ERR       __ERR_STR(clCreateFromGLRenderbuffer)
+#define __GET_GL_OBJECT_INFO_ERR            __ERR_STR(clGetGLObjectInfo)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_IMAGE_ERR                  __ERR_STR(clCreateImage)
+#define __CREATE_GL_TEXTURE_ERR             __ERR_STR(clCreateFromGLTexture)
+#define __IMAGE_DIMENSION_ERR               __ERR_STR(Incorrect image dimensions)
+#endif // #if defined(CL_VERSION_1_2)
+#define __CREATE_SAMPLER_PROPERTY_ERR       __ERR_STR(clCreateSamplerWithProperties)
+#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback)
+
+#define __CREATE_USER_EVENT_ERR             __ERR_STR(clCreateUserEvent)
+#define __SET_USER_EVENT_STATUS_ERR         __ERR_STR(clSetUserEventStatus)
+#define __SET_EVENT_CALLBACK_ERR            __ERR_STR(clSetEventCallback)
+#define __WAIT_FOR_EVENTS_ERR               __ERR_STR(clWaitForEvents)
+
+#define __CREATE_KERNEL_ERR                 __ERR_STR(clCreateKernel)
+#define __SET_KERNEL_ARGS_ERR               __ERR_STR(clSetKernelArg)
+#define __CREATE_PROGRAM_WITH_SOURCE_ERR    __ERR_STR(clCreateProgramWithSource)
+#define __CREATE_PROGRAM_WITH_BINARY_ERR    __ERR_STR(clCreateProgramWithBinary)
+#if defined(CL_VERSION_1_2)
+#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR    __ERR_STR(clCreateProgramWithBuiltInKernels)
+#endif // #if defined(CL_VERSION_1_2)
+#define __BUILD_PROGRAM_ERR                 __ERR_STR(clBuildProgram)
+#if defined(CL_VERSION_1_2)
+#define __COMPILE_PROGRAM_ERR                  __ERR_STR(clCompileProgram)
+
+#endif // #if defined(CL_VERSION_1_2)
+#define __CREATE_KERNELS_IN_PROGRAM_ERR     __ERR_STR(clCreateKernelsInProgram)
+
+#define __CREATE_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clCreateCommandQueueWithProperties)
+#define __SET_COMMAND_QUEUE_PROPERTY_ERR    __ERR_STR(clSetCommandQueueProperty)
+#define __ENQUEUE_READ_BUFFER_ERR           __ERR_STR(clEnqueueReadBuffer)
+#define __ENQUEUE_READ_BUFFER_RECT_ERR      __ERR_STR(clEnqueueReadBufferRect)
+#define __ENQUEUE_WRITE_BUFFER_ERR          __ERR_STR(clEnqueueWriteBuffer)
+#define __ENQUEUE_WRITE_BUFFER_RECT_ERR     __ERR_STR(clEnqueueWriteBufferRect)
+#define __ENQEUE_COPY_BUFFER_ERR            __ERR_STR(clEnqueueCopyBuffer)
+#define __ENQEUE_COPY_BUFFER_RECT_ERR       __ERR_STR(clEnqueueCopyBufferRect)
+#define __ENQUEUE_FILL_BUFFER_ERR           __ERR_STR(clEnqueueFillBuffer)
+#define __ENQUEUE_READ_IMAGE_ERR            __ERR_STR(clEnqueueReadImage)
+#define __ENQUEUE_WRITE_IMAGE_ERR           __ERR_STR(clEnqueueWriteImage)
+#define __ENQUEUE_COPY_IMAGE_ERR            __ERR_STR(clEnqueueCopyImage)
+#define __ENQUEUE_FILL_IMAGE_ERR           __ERR_STR(clEnqueueFillImage)
+#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR  __ERR_STR(clEnqueueCopyImageToBuffer)
+#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR  __ERR_STR(clEnqueueCopyBufferToImage)
+#define __ENQUEUE_MAP_BUFFER_ERR            __ERR_STR(clEnqueueMapBuffer)
+#define __ENQUEUE_MAP_IMAGE_ERR             __ERR_STR(clEnqueueMapImage)
+#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR      __ERR_STR(clEnqueueUnmapMemObject)
+#define __ENQUEUE_NDRANGE_KERNEL_ERR        __ERR_STR(clEnqueueNDRangeKernel)
+#define __ENQUEUE_NATIVE_KERNEL             __ERR_STR(clEnqueueNativeKernel)
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR   __ERR_STR(clEnqueueMigrateMemObjects)
+#endif // #if defined(CL_VERSION_1_2)
+
+#define __ENQUEUE_ACQUIRE_GL_ERR            __ERR_STR(clEnqueueAcquireGLObjects)
+#define __ENQUEUE_RELEASE_GL_ERR            __ERR_STR(clEnqueueReleaseGLObjects)
+
+
+#define __RETAIN_ERR                        __ERR_STR(Retain Object)
+#define __RELEASE_ERR                       __ERR_STR(Release Object)
+#define __FLUSH_ERR                         __ERR_STR(clFlush)
+#define __FINISH_ERR                        __ERR_STR(clFinish)
+#define __VECTOR_CAPACITY_ERR               __ERR_STR(Vector capacity error)
+
+/**
+ * CL 1.2 version that uses device fission.
+ */
+#if defined(CL_VERSION_1_2)
+#define __CREATE_SUB_DEVICES                __ERR_STR(clCreateSubDevices)
+#else
+#define __CREATE_SUB_DEVICES                __ERR_STR(clCreateSubDevicesEXT)
+#endif // #if defined(CL_VERSION_1_2)
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
+#define __ENQUEUE_MARKER_ERR                __ERR_STR(clEnqueueMarker)
+#define __ENQUEUE_WAIT_FOR_EVENTS_ERR       __ERR_STR(clEnqueueWaitForEvents)
+#define __ENQUEUE_BARRIER_ERR               __ERR_STR(clEnqueueBarrier)
+#define __UNLOAD_COMPILER_ERR               __ERR_STR(clUnloadCompiler)
+#define __CREATE_GL_TEXTURE_2D_ERR          __ERR_STR(clCreateFromGLTexture2D)
+#define __CREATE_GL_TEXTURE_3D_ERR          __ERR_STR(clCreateFromGLTexture3D)
+#define __CREATE_IMAGE2D_ERR                __ERR_STR(clCreateImage2D)
+#define __CREATE_IMAGE3D_ERR                __ERR_STR(clCreateImage3D)
+#endif // #if defined(CL_VERSION_1_1)
+
+#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) || !defined(CL_VERSION_2_0)
+#define __CREATE_COMMAND_QUEUE_ERR          __ERR_STR(clCreateCommandQueue)
+#define __CREATE_SAMPLER_ERR                __ERR_STR(clCreateSampler)
+#define __ENQUEUE_TASK_ERR                  __ERR_STR(clEnqueueTask)
+#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) || !defined(CL_VERSION_2_0)
+
+#endif // __CL_USER_OVERRIDE_ERROR_STRINGS
+//! \endcond
+
+/**
+ * CL 1.2 marker and barrier commands
+ */
+#if defined(CL_VERSION_1_2)
+#define __ENQUEUE_MARKER_WAIT_LIST_ERR                __ERR_STR(clEnqueueMarkerWithWaitList)
+#define __ENQUEUE_BARRIER_WAIT_LIST_ERR               __ERR_STR(clEnqueueBarrierWithWaitList)
+#endif // #if defined(CL_VERSION_1_2)
+
+#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING)
+typedef std::string STRING_CLASS;
+#elif !defined(__USE_DEV_STRING) 
+
+/*! \class string
+ * \brief Simple string class that provides a limited subset of std::string
+ * functionality but avoids many of the issues that come with that class.
+ *
+ *  \note Deprecated. Please use std::string as the default, or
+ *  re-define the string class to match the std::string
+ *  interface by defining STRING_CLASS.
+ */
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+{
+private:
+    ::size_t size_;
+    char * str_;
+public:
+    //! \brief Constructs an empty string, allocating no memory.
+    string(void) : size_(0), str_(NULL)
+    {
+    }
+
+    /*! \brief Constructs a string populated from an arbitrary value of
+     *  specified size.
+     * 
+     *  An extra '\0' is added, in case none was contained in str.
+     *
+     *  \param str the initial value of the string instance.  Note that '\0'     
+     *             characters receive no special treatment.  If NULL,
+     *             the string is left empty, with a size of 0.
+     *
+     *  \param size the number of characters to copy from str.
+     */
+    string(const char * str, ::size_t size) :
+        size_(size),
+        str_(NULL)
+    {
+        if( size > 0 ) {
+            str_ = new char[size_+1];
+            if (str_ != NULL) {
+                memcpy(str_, str, size_  * sizeof(char));
+                str_[size_] = '\0';
+            }
+            else {
+                size_ = 0;
+            }
+        }
+    }
+
+    /*! \brief Constructs a string populated from a null-terminated value.
+     *
+     *  \param str the null-terminated initial value of the string instance.
+     *             If NULL, the string is left empty, with a size of 0.
+     */
+    string(const char * str) :
+        size_(0),
+        str_(NULL)
+    {
+        if( str ) {
+            size_= ::strlen(str);
+        }
+        if( size_ > 0 ) {
+            str_ = new char[size_ + 1];
+            if (str_ != NULL) {
+                memcpy(str_, str, (size_ + 1) * sizeof(char));
+            }
+        }
+    }
+
+    void resize( ::size_t n )
+    {
+        if( size_ == n ) {
+            return;
+        }
+        if (n == 0) {
+            if( str_ ) {
+                delete [] str_;
+            }
+            str_ = NULL;
+            size_ = 0;
+        } 
+        else {
+            char *newString = new char[n + 1];
+            ::size_t copySize = n;
+            if( size_ < n ) {
+                copySize = size_;
+            }
+            size_ = n;
+            
+            if(str_) {
+                memcpy(newString, str_, (copySize + 1) * sizeof(char));
+            }
+            if( copySize < size_ ) {
+                memset(newString + copySize, 0, size_ - copySize);
+            }
+            newString[size_] = '\0';
+
+            delete [] str_;
+            str_ = newString;
+        }
+    }
+
+    const char& operator[] ( ::size_t pos ) const
+    {
+        return str_[pos];
+    }
+
+    char& operator[] ( ::size_t pos )
+    {
+        return str_[pos];
+    }
+
+    /*! \brief Copies the value of another string to this one.
+     *
+     *  \param rhs the string to copy.
+     *
+     *  \returns a reference to the modified instance.
+     */
+    string& operator=(const string& rhs)
+    {
+        if (this == &rhs) {
+            return *this;
+        }
+
+        if( str_ != NULL ) {
+            delete [] str_;
+            str_ = NULL;
+            size_ = 0;
+        }
+
+        if (rhs.size_ == 0 || rhs.str_ == NULL) {
+            str_ = NULL;
+            size_ = 0;
+        } 
+        else {
+            str_ = new char[rhs.size_ + 1];
+            size_ = rhs.size_;
+            
+            if (str_ != NULL) {
+                memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char));
+            }
+            else {
+                size_ = 0;
+            }
+        }
+
+        return *this;
+    }
+
+    /*! \brief Constructs a string by copying the value of another instance.
+     *
+     *  \param rhs the string to copy.
+     */
+    string(const string& rhs) :
+        size_(0),
+        str_(NULL)
+    {
+        *this = rhs;
+    }
+
+    //! \brief Destructor - frees memory used to hold the current value.
+    ~string()
+    {
+        delete[] str_;
+        str_ = NULL;
+    }
+    
+    //! \brief Queries the length of the string, excluding any added '\0's.
+    ::size_t size(void) const   { return size_; }
+
+    //! \brief Queries the length of the string, excluding any added '\0's.
+    ::size_t length(void) const { return size(); }
+
+    /*! \brief Returns a pointer to the private copy held by this instance,
+     *  or "" if empty/unset.
+     */
+    const char * c_str(void) const { return (str_) ? str_ : "";}
+};
+typedef cl::string STRING_CLASS;
+#endif // #elif !defined(__USE_DEV_STRING) 
+
+#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+#define VECTOR_CLASS std::vector
+#elif !defined(__USE_DEV_VECTOR) 
+#define VECTOR_CLASS cl::vector 
+
+#if !defined(__MAX_DEFAULT_VECTOR_SIZE)
+#define __MAX_DEFAULT_VECTOR_SIZE 10
+#endif
+
+/*! \class vector
+ * \brief Fixed sized vector implementation that mirrors std::vector
+ * functionality and is compatible with std::vector.
+ *
+ *  \note Deprecated. Please use std::vector as default or
+ *  re-define the vector class to match the std::vector
+ *  interface by defining VECTOR_CLASS
+ *
+ *  \note Not recommended for use with custom objects, as the
+ *  current implementation will construct N elements up front.
+ *
+ *  \note
+ *  This differs from std::vector<> not just in memory allocation,
+ *  but also in terms of when members are constructed, destroyed,
+ *  and assigned instead of being copy constructed.
+ *
+ *  \param T type of element contained in the vector.
+ *
+ *  \param N maximum size of the vector.
+ */
+template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE>
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+{
+private:
+    T data_[N];
+    unsigned int size_;
+
+public:
+    //! \brief Constructs an empty vector with no memory allocated.
+    vector() :  
+        size_(static_cast<unsigned int>(0))
+    {}
+
+    //! \brief Deallocates the vector's memory and destroys all of its elements.
+    ~vector() 
+    {
+        clear();
+    }
+
+    //! \brief Returns the number of elements currently contained.
+    unsigned int size(void) const
+    {
+        return size_;
+    }
+    
+    /*! \brief Empties the vector of all elements.
+     *  \note
+     *  This does not deallocate memory but will invoke destructors
+     *  on contained elements.
+     */
+    void clear()
+    {
+        while(!empty()) {
+            pop_back();
+        }
+    }
+
+    /*! \brief Appends an element after the last valid element.
+     * Calling this on a vector that has reached capacity will throw an 
+     * exception if exceptions are enabled.
+     */
+    void push_back (const T& x)
+    { 
+        if (size() < N) {    
+            new (&data_[size_]) T(x);
+            size_++;
+        } else {
+            detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+        }
+    }
+
+    /*! \brief Removes the last valid element from the vector.
+     * Calling this on an empty vector will throw an exception
+     * if exceptions are enabled.
+     */
+    void pop_back(void)
+    {
+        if (size_ != 0) {
+            --size_;
+            data_[size_].~T();
+        } else {
+            detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR);
+        }
+    }
+  
+    /*! \brief Constructs with a value copied from another.
+     *
+     *  \param vec the vector to copy.
+     */
+    vector(const vector<T, N>& vec) : 
+        size_(vec.size_)
+    {
+        if (size_ != 0) {	
+            assign(vec.begin(), vec.end());
+        }
+    } 
+
+    /*! \brief Constructs with a specified number of initial elements.
+     *
+     *  \param size number of initial elements.
+     *
+     *  \param val value of initial elements.
+     */
+    vector(unsigned int size, const T& val = T()) :
+        size_(0)
+    {
+        for (unsigned int i = 0; i < size; i++) {
+            push_back(val);
+        }
+    }
+
+    /*! \brief Overwrites the current content with that copied from another
+     *         instance.
+     *
+     *  \param rhs vector to copy.
+     *
+     *  \returns a reference to this.
+     */
+    vector<T, N>& operator=(const vector<T, N>& rhs)
+    {
+        if (this == &rhs) {
+            return *this;
+        }
+
+        if (rhs.size_ != 0) {	
+            assign(rhs.begin(), rhs.end());
+        } else {
+            clear();
+        }
+    
+        return *this;
+    }
+
+    /*! \brief Tests equality against another instance.
+     *
+     *  \param vec the vector against which to compare.
+     */
+    bool operator==(vector<T,N> &vec)
+    {
+        if (size() != vec.size()) {
+            return false;
+        }
+
+        for( unsigned int i = 0; i < size(); ++i ) {
+            if( operator[](i) != vec[i] ) {
+                return false;
+            }
+        }
+        return true;
+    }
+  
+    //! \brief Conversion operator to T*.
+    operator T* ()             { return data_; }
+
+    //! \brief Conversion operator to const T*.
+    operator const T* () const { return data_; }
+   
+    //! \brief Tests whether this instance has any elements.
+    bool empty (void) const
+    {
+        return size_==0;
+    }
+  
+    //! \brief Returns the maximum number of elements this instance can hold.
+    unsigned int max_size (void) const
+    {
+        return N;
+    }
+
+    //! \brief Returns the maximum number of elements this instance can hold.
+    unsigned int capacity () const
+    {
+        return N;
+    }
+
+    /*! \brief Returns a reference to a given element.
+     *
+     *  \param index which element to access.
+     *
+     *  \note
+     *  The caller is responsible for ensuring index is >= 0 and < size().
+     */
+    T& operator[](int index)
+    {
+        return data_[index];
+    }
+  
+    /*! \brief Returns a const reference to a given element.
+     *
+     *  \param index which element to access.
+     *
+     *  \note
+     *  The caller is responsible for ensuring index is >= 0 and < size().
+     */
+    const T& operator[](int index) const
+    {
+        return data_[index];
+    }
+  
+    /*! \brief Assigns elements of the vector based on a source iterator range.
+     *
+     *  \param start Beginning iterator of source range
+     *  \param end End iterator of source range
+     *
+     *  \note
+     *  Will throw an exception if exceptions are enabled and the capacity is exceeded.
+     */
+    template<class I>
+    void assign(I start, I end)
+    {
+        clear();   
+        while(start != end) {
+            push_back(*start);
+            start++;
+        }
+    }
+
+    /*! \class iterator
+     * \brief Const iterator class for vectors
+     */
+    class iterator
+    {
+    private:
+        const vector<T,N> *vec_;
+        int index_;
+
+        /**
+         * Internal iterator constructor to capture reference
+         * to the vector it iterates over rather than taking 
+         * the vector by copy.
+         */
+        iterator (const vector<T,N> &vec, int index) :
+            vec_(&vec)
+        {            
+            if( !vec.empty() ) {
+                index_ = index;
+            } else {
+                index_ = -1;
+            }
+        }
+
+    public:
+        iterator(void) :
+            vec_(NULL),
+            index_(-1)
+        {
+        }
+
+        iterator(const iterator& rhs) :
+            vec_(rhs.vec_),
+            index_(rhs.index_)
+        {
+        }
+
+        ~iterator(void) {}
+
+        static iterator begin(const cl::vector<T,N> &vec)
+        {
+            iterator i(vec, 0);
+
+            return i;
+        }
+
+        static iterator end(const cl::vector<T,N> &vec)
+        {
+            iterator i(vec, vec.size());
+
+            return i;
+        }
+    
+        bool operator==(iterator i)
+        {
+            return ((vec_ == i.vec_) && 
+                    (index_ == i.index_));
+        }
+
+        bool operator!=(iterator i)
+        {
+            return (!(*this==i));
+        }
+
+        iterator& operator++()
+        {
+            ++index_;
+            return *this;
+        }
+
+        iterator operator++(int)
+        {
+            iterator retVal(*this);
+            ++index_;
+            return retVal;
+        }
+
+        iterator& operator--()
+        {
+            --index_;
+            return *this;
+        }
+
+        iterator operator--(int)
+        {
+            iterator retVal(*this);
+            --index_;
+            return retVal;
+        }
+
+        const T& operator *() const
+        {
+            return (*vec_)[index_];
+        }
+    };
+
+    iterator begin(void)
+    {
+        return iterator::begin(*this);
+    }
+
+    iterator begin(void) const
+    {
+        return iterator::begin(*this);
+    }
+
+    iterator end(void)
+    {
+        return iterator::end(*this);
+    }
+
+    iterator end(void) const
+    {
+        return iterator::end(*this);
+    }
+
+    T& front(void)
+    {
+        return data_[0];
+    }
+
+    T& back(void)
+    {
+        return data_[size_-1];
+    }
+
+    const T& front(void) const
+    {
+        return data_[0];
+    }
+
+    const T& back(void) const
+    {
+        return data_[size_-1];
+    }
+};  
+#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR)
+
+
+
+
+
+namespace detail {
+#define __DEFAULT_NOT_INITIALIZED 1 
+#define __DEFAULT_BEING_INITIALIZED 2
+#define __DEFAULT_INITIALIZED 4
+
+    /*
+     * Compare and exchange primitives are needed for handling of defaults
+    */
+    inline int compare_exchange(volatile int * dest, int exchange, int comparand)
+    {
+#ifdef _WIN32
+        return (int)(InterlockedCompareExchange(
+           (volatile long*)dest, 
+           (long)exchange, 
+           (long)comparand));
+#elif defined(__APPLE__) || defined(__MACOSX)
+        return OSAtomicOr32Orig((uint32_t)exchange, (volatile uint32_t*)dest);
+#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX)
+        return (__sync_val_compare_and_swap(
+            dest, 
+            comparand, 
+            exchange));
+#endif // !_WIN32
+    }
+
+    inline void fence() { _mm_mfence(); }
+}; // namespace detail
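+
+/* Usage sketch for the primitives above (illustrative only; `initialized` is a
+ * placeholder variable, not part of this header): together they implement a
+ * one-shot initialization guard of the kind used by Context::getDefault()
+ * further below.
+ *
+ * \code
+ * static volatile int initialized = __DEFAULT_NOT_INITIALIZED;
+ *
+ * int state = detail::compare_exchange(
+ *     &initialized, __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
+ * if (state == __DEFAULT_NOT_INITIALIZED) {
+ *     // This thread won the race: build the default object, then publish it.
+ *     detail::fence();
+ *     initialized = __DEFAULT_INITIALIZED;
+ * } else if (state == __DEFAULT_BEING_INITIALIZED) {
+ *     // Another thread is initializing: spin until it publishes.
+ *     while (initialized != __DEFAULT_INITIALIZED) { detail::fence(); }
+ * }
+ * \endcode
+ */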
+
+    
+/*! \brief class used to interface between C++ and
+ *  OpenCL C calls that require arrays of size_t values, whose
+ *  size is known statically.
+ */
+template <int N>
+class size_t
+{ 
+private:
+    ::size_t data_[N];
+
+public:
+    //! \brief Initialize size_t to all 0s
+    size_t()
+    {
+        for( int i = 0; i < N; ++i ) {
+            data_[i] = 0;
+        }
+    }
+
+    ::size_t& operator[](int index)
+    {
+        return data_[index];
+    }
+
+    const ::size_t& operator[](int index) const
+    {
+        return data_[index];
+    }
+
+    //! \brief Conversion operator to ::size_t*.
+    operator ::size_t* ()             { return data_; }
+
+    //! \brief Conversion operator to const ::size_t*.
+    operator const ::size_t* () const { return data_; }
+};
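+
+/* Usage sketch for cl::size_t<N> (illustrative; `width` and `height` are
+ * placeholders, not defined by this header):
+ *
+ * \code
+ * cl::size_t<3> origin;            // zero-initialized by the constructor
+ * cl::size_t<3> region;
+ * region[0] = width;
+ * region[1] = height;
+ * region[2] = 1;
+ * // The conversion operators let origin/region be passed wherever the C API
+ * // expects a ::size_t* array, e.g. as the origin/region of an image copy.
+ * \endcode
+ */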
+
+namespace detail {
+
+// Generic getInfoHelper. The final parameter is used to guide overload
+// resolution: the argument passed is an int, so this overload (which declares
+// the parameter as long) is a worse match than a specialization that declares
+// the parameter as int.
+template<typename Functor, typename T>
+inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long)
+{
+    return f(name, sizeof(T), param, NULL);
+}
+
+// Specialized getInfoHelper for VECTOR_CLASS params
+template <typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long)
+{
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    T* value = (T*) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    param->assign(&value[0], &value[required/sizeof(T)]);
+    return CL_SUCCESS;
+}
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template <typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int, typename T::cl_type = 0)
+{
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    typename T::cl_type * value = (typename T::cl_type *) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    ::size_t elements = required / sizeof(typename T::cl_type);
+    param->assign(&value[0], &value[elements]);
+    for (::size_t i = 0; i < elements; i++)
+    {
+        if (value[i] != NULL)
+        {
+            err = (*param)[i].retain();
+            if (err != CL_SUCCESS) {
+                return err;
+            }
+        }
+    }
+    return CL_SUCCESS;
+}
+
+// Specialized for getInfo<CL_PROGRAM_BINARIES>
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<char *>* param, int)
+{
+    cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL);
+
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    return CL_SUCCESS;
+}
+
+// Specialized GetInfoHelper for STRING_CLASS params
+template <typename Func>
+inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long)
+{
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    char* value = (char*) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    *param = value;
+    return CL_SUCCESS;
+}
+
+// Specialized GetInfoHelper for cl::size_t params
+template <typename Func, ::size_t N>
+inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long)
+{
+    ::size_t required;
+    cl_int err = f(name, 0, NULL, &required);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    ::size_t* value = (::size_t*) alloca(required);
+    err = f(name, required, value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+
+    for(int i = 0; i < N; ++i) {
+        (*param)[i] = value[i];
+    }
+
+    return CL_SUCCESS;
+}
+
+template<typename T> struct ReferenceHandler;
+
+/* Specialization for reference-counted types. This depends on the
+ * existence of Wrapper<T>::cl_type, and none of the other types having the
+ * cl_type member. Note that simply specifying the parameter as Wrapper<T>
+ * does not work, because when using a derived type (e.g. Context) the generic
+ * template will provide a better match.
+ */
+template<typename Func, typename T>
+inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0)
+{
+    typename T::cl_type value;
+    cl_int err = f(name, sizeof(value), &value, NULL);
+    if (err != CL_SUCCESS) {
+        return err;
+    }
+    *param = value;
+    if (value != NULL)
+    {
+        err = param->retain();
+        if (err != CL_SUCCESS) {
+            return err;
+        }
+    }
+    return CL_SUCCESS;
+}
+
+#define __PARAM_NAME_INFO_1_0(F) \
+    F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \
+    F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \
+    \
+    F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \
+    F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \
+    F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \
+    F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \
+    F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \
+    F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \
+    F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \
+    F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \
+    F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \
+    F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \
+    F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \
+    F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \
+    F(cl_device_info, CL_DEVICE_PLATFORM, cl_platform_id) \
+    F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \
+    F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \
+    F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \
+    \
+    F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \
+    F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS<Device>) \
+    F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS<cl_context_properties>) \
+    \
+    F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \
+    F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \
+    F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \
+    F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_int) \
+    \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \
+    F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \
+    \
+    F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \
+    F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \
+    F(cl_mem_info, CL_MEM_SIZE, ::size_t) \
+    F(cl_mem_info, CL_MEM_HOST_PTR, void*) \
+    F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \
+    F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \
+    F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \
+    \
+    F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \
+    F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \
+    F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \
+    F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \
+    F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \
+    F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \
+    F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \
+    \
+    F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \
+    F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \
+    F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_bool) \
+    F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_addressing_mode) \
+    F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_filter_mode) \
+    \
+    F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \
+    F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \
+    F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \
+    F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS<Device>) \
+    F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \
+    F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \
+    F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS<char *>) \
+    \
+    F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \
+    F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \
+    F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \
+    \
+    F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \
+    F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \
+    F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \
+    F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \
+    F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \
+    \
+    F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \
+    F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \
+    F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \
+    \
+    F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \
+    F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \
+    F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \
+    F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties)
+
+#if defined(CL_VERSION_1_1)
+#define __PARAM_NAME_INFO_1_1(F) \
+    F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\
+    F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \
+    F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \
+    F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \
+    F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \
+    F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \
+    F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \
+    \
+    F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \
+    F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \
+    \
+    F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \
+    F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \
+    \
+    F(cl_event_info, CL_EVENT_CONTEXT, cl::Context)
+#endif // CL_VERSION_1_1
+
+    
+#if defined(CL_VERSION_1_2)
+#define __PARAM_NAME_INFO_1_2(F) \
+    F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \
+    \
+    F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \
+    F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \
+    \
+    F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \
+    \
+    F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \
+    \
+    F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \
+    F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \
+    F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \
+    F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \
+    \
+    F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \
+    F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS<cl_device_partition_property>) \
+    F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS<cl_device_partition_property>)  \
+    F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \
+    F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, ::size_t) \
+    F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \
+    F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS)
+#endif // #if defined(CL_VERSION_1_2)
+
+#if defined(USE_CL_DEVICE_FISSION)
+#define __PARAM_NAME_DEVICE_FISSION(F) \
+    F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \
+    F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+    F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \
+    F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \
+    F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS<cl_device_partition_property_ext>)
+#endif // USE_CL_DEVICE_FISSION
+
+template <typename enum_type, cl_int Name>
+struct param_traits {};
+
+#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \
+struct token;                                        \
+template<>                                           \
+struct param_traits<detail:: token,param_name>       \
+{                                                    \
+    enum { value = param_name };                     \
+    typedef T param_type;                            \
+};
+
+__PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS)
+#if defined(CL_VERSION_1_1)
+__PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS)
+#endif // CL_VERSION_1_1
+#if defined(CL_VERSION_1_2)
+__PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS)
+#endif // CL_VERSION_1_2
+
+#if defined(USE_CL_DEVICE_FISSION)
+__PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS);
+#endif // USE_CL_DEVICE_FISSION
+
+#ifdef CL_PLATFORM_ICD_SUFFIX_KHR
+__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS)
+#endif
+
+#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong)
+#endif
+
+#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>)
+#endif
+#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint)
+#endif
+#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint)
+#endif
+
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_WARP_SIZE_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint)
+#endif
+#ifdef CL_DEVICE_GPU_OVERLAP_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool)
+#endif
+#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV
+__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool)
+#endif
+
+// Convenience functions
+
+template <typename Func, typename T>
+inline cl_int
+getInfo(Func f, cl_uint name, T* param)
+{
+    return getInfoHelper(f, name, param, 0);
+}
+
+template <typename Func, typename Arg0>
+struct GetInfoFunctor0
+{
+    Func f_; const Arg0& arg0_;
+    cl_int operator ()(
+        cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
+    { return f_(arg0_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename Arg1>
+struct GetInfoFunctor1
+{
+    Func f_; const Arg0& arg0_; const Arg1& arg1_;
+    cl_int operator ()(
+        cl_uint param, ::size_t size, void* value, ::size_t* size_ret)
+    { return f_(arg0_, arg1_, param, size, value, size_ret); }
+};
+
+template <typename Func, typename Arg0, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, cl_uint name, T* param)
+{
+    GetInfoFunctor0<Func, Arg0> f0 = { f, arg0 };
+    return getInfoHelper(f0, name, param, 0);
+}
+
+template <typename Func, typename Arg0, typename Arg1, typename T>
+inline cl_int
+getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param)
+{
+    GetInfoFunctor1<Func, Arg0, Arg1> f0 = { f, arg0, arg1 };
+    return getInfoHelper(f0, name, param, 0);
+}
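+
+/* Usage sketch for the getInfo() helpers above (illustrative; `device` is a
+ * placeholder cl_device_id): the extra leading arguments are bound by the
+ * functor structs, so a query entry point such as clGetDeviceInfo() can be
+ * dispatched through the same getInfoHelper() overload set.
+ *
+ * \code
+ * cl_uint vendorId = 0;
+ * cl_int err = detail::getInfo(
+ *     &::clGetDeviceInfo, device, CL_DEVICE_VENDOR_ID, &vendorId);
+ * \endcode
+ */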
+
+template<typename T>
+struct ReferenceHandler
+{ };
+
+#if defined(CL_VERSION_1_2)
+/**
+ * OpenCL 1.2 devices do have retain/release.
+ */
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+    /**
+     * Retain the device.
+     * \param device A valid device created using createSubDevices
+     * \return 
+     *   CL_SUCCESS if the function executed successfully.
+     *   CL_INVALID_DEVICE if device was not a valid subdevice
+     *   CL_OUT_OF_RESOURCES
+     *   CL_OUT_OF_HOST_MEMORY
+     */
+    static cl_int retain(cl_device_id device)
+    { return ::clRetainDevice(device); }
+    /**
+     * Release the device.
+     * \param device A valid device created using createSubDevices
+     * \return 
+     *   CL_SUCCESS if the function executed successfully.
+     *   CL_INVALID_DEVICE if device was not a valid subdevice
+     *   CL_OUT_OF_RESOURCES
+     *   CL_OUT_OF_HOST_MEMORY
+     */
+    static cl_int release(cl_device_id device)
+    { return ::clReleaseDevice(device); }
+};
+#else // #if defined(CL_VERSION_1_2)
+/**
+ * OpenCL 1.1 devices do not have retain/release.
+ */
+template <>
+struct ReferenceHandler<cl_device_id>
+{
+    // cl_device_id does not have retain().
+    static cl_int retain(cl_device_id)
+    { return CL_SUCCESS; }
+    // cl_device_id does not have release().
+    static cl_int release(cl_device_id)
+    { return CL_SUCCESS; }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+template <>
+struct ReferenceHandler<cl_platform_id>
+{
+    // cl_platform_id does not have retain().
+    static cl_int retain(cl_platform_id)
+    { return CL_SUCCESS; }
+    // cl_platform_id does not have release().
+    static cl_int release(cl_platform_id)
+    { return CL_SUCCESS; }
+};
+
+template <>
+struct ReferenceHandler<cl_context>
+{
+    static cl_int retain(cl_context context)
+    { return ::clRetainContext(context); }
+    static cl_int release(cl_context context)
+    { return ::clReleaseContext(context); }
+};
+
+template <>
+struct ReferenceHandler<cl_command_queue>
+{
+    static cl_int retain(cl_command_queue queue)
+    { return ::clRetainCommandQueue(queue); }
+    static cl_int release(cl_command_queue queue)
+    { return ::clReleaseCommandQueue(queue); }
+};
+
+template <>
+struct ReferenceHandler<cl_mem>
+{
+    static cl_int retain(cl_mem memory)
+    { return ::clRetainMemObject(memory); }
+    static cl_int release(cl_mem memory)
+    { return ::clReleaseMemObject(memory); }
+};
+
+template <>
+struct ReferenceHandler<cl_sampler>
+{
+    static cl_int retain(cl_sampler sampler)
+    { return ::clRetainSampler(sampler); }
+    static cl_int release(cl_sampler sampler)
+    { return ::clReleaseSampler(sampler); }
+};
+
+template <>
+struct ReferenceHandler<cl_program>
+{
+    static cl_int retain(cl_program program)
+    { return ::clRetainProgram(program); }
+    static cl_int release(cl_program program)
+    { return ::clReleaseProgram(program); }
+};
+
+template <>
+struct ReferenceHandler<cl_kernel>
+{
+    static cl_int retain(cl_kernel kernel)
+    { return ::clRetainKernel(kernel); }
+    static cl_int release(cl_kernel kernel)
+    { return ::clReleaseKernel(kernel); }
+};
+
+template <>
+struct ReferenceHandler<cl_event>
+{
+    static cl_int retain(cl_event event)
+    { return ::clRetainEvent(event); }
+    static cl_int release(cl_event event)
+    { return ::clReleaseEvent(event); }
+};
+
+
+// Extracts version number with major in the upper 16 bits, minor in the lower 16
+static cl_uint getVersion(const char *versionInfo)
+{
+    int highVersion = 0;
+    int lowVersion = 0;
+    int index = 7;
+    while(versionInfo[index] != '.' ) {
+        highVersion *= 10;
+        highVersion += versionInfo[index]-'0';
+        ++index;
+    }
+    ++index;
+    while(versionInfo[index] != ' ' ) {
+        lowVersion *= 10;
+        lowVersion += versionInfo[index]-'0';
+        ++index;
+    }
+    return (highVersion << 16) | lowVersion;
+}
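+
+/* Worked example (illustrative input string): for a CL_PLATFORM_VERSION value
+ * such as "OpenCL 1.2 CUDA", parsing starts at index 7 (just past "OpenCL "),
+ * so getVersion() returns (1 << 16) | 2, i.e. major 1 in the upper 16 bits and
+ * minor 2 in the lower 16 bits.
+ */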
+
+static cl_uint getPlatformVersion(cl_platform_id platform)
+{
+    ::size_t size = 0;
+    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size);
+    char *versionInfo = (char *) alloca(size);
+    clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size);
+    return getVersion(versionInfo);
+}
+
+static cl_uint getDevicePlatformVersion(cl_device_id device)
+{
+    cl_platform_id platform;
+    clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL);
+    return getPlatformVersion(platform);
+}
+
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+static cl_uint getContextPlatformVersion(cl_context context)
+{
+    // The platform cannot be queried directly, so we first have to grab a
+    // device from the context and query that device's platform
+    ::size_t size = 0;
+    clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
+    if (size == 0)
+        return 0;
+    cl_device_id *devices = (cl_device_id *) alloca(size);
+    clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL);
+    return getDevicePlatformVersion(devices[0]);
+}
+#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+
+template <typename T>
+class Wrapper
+{
+public:
+    typedef T cl_type;
+
+protected:
+    cl_type object_;
+
+public:
+    Wrapper() : object_(NULL) { }
+
+    Wrapper(const cl_type &obj) : object_(obj) { }
+
+    ~Wrapper()
+    {
+        if (object_ != NULL) { release(); }
+    }
+
+    Wrapper(const Wrapper<cl_type>& rhs)
+    {
+        object_ = rhs.object_;
+        if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+    }
+
+    Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+    {
+        if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+        object_ = rhs.object_;
+        if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+        return *this;
+    }
+
+    Wrapper<cl_type>& operator = (const cl_type &rhs)
+    {
+        if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+        object_ = rhs;
+        return *this;
+    }
+
+    cl_type operator ()() const { return object_; }
+
+    cl_type& operator ()() { return object_; }
+
+protected:
+    template<typename Func, typename U>
+    friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
+
+    cl_int retain() const
+    {
+        return ReferenceHandler<cl_type>::retain(object_);
+    }
+
+    cl_int release() const
+    {
+        return ReferenceHandler<cl_type>::release(object_);
+    }
+};
+
+template <>
+class Wrapper<cl_device_id>
+{
+public:
+    typedef cl_device_id cl_type;
+
+protected:
+    cl_type object_;
+    bool referenceCountable_;
+
+    static bool isReferenceCountable(cl_device_id device)
+    {
+        bool retVal = false;
+        if (device != NULL) {
+            int version = getDevicePlatformVersion(device);
+            if(version > ((1 << 16) + 1)) {
+                retVal = true;
+            }
+        }
+        return retVal;
+    }
+
+public:
+    Wrapper() : object_(NULL), referenceCountable_(false) 
+    { 
+    }
+    
+    Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false) 
+    {
+        referenceCountable_ = isReferenceCountable(obj); 
+    }
+
+    ~Wrapper()
+    {
+        if (object_ != NULL) { release(); }
+    }
+    
+    Wrapper(const Wrapper<cl_type>& rhs)
+    {
+        object_ = rhs.object_;
+        referenceCountable_ = isReferenceCountable(object_); 
+        if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+    }
+
+    Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs)
+    {
+        if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+        object_ = rhs.object_;
+        referenceCountable_ = rhs.referenceCountable_;
+        if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); }
+        return *this;
+    }
+
+    Wrapper<cl_type>& operator = (const cl_type &rhs)
+    {
+        if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); }
+        object_ = rhs;
+        referenceCountable_ = isReferenceCountable(object_); 
+        return *this;
+    }
+
+    cl_type operator ()() const { return object_; }
+
+    cl_type& operator ()() { return object_; }
+
+protected:
+    template<typename Func, typename U>
+    friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type);
+
+    template<typename Func, typename U>
+    friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS<U>*, int, typename U::cl_type);
+
+    cl_int retain() const
+    {
+        if( referenceCountable_ ) {
+            return ReferenceHandler<cl_type>::retain(object_);
+        }
+        else {
+            return CL_SUCCESS;
+        }
+    }
+
+    cl_int release() const
+    {
+        if( referenceCountable_ ) {
+            return ReferenceHandler<cl_type>::release(object_);
+        }
+        else {
+            return CL_SUCCESS;
+        }
+    }
+};
+
+} // namespace detail
+//! \endcond
+
+/*! \struct ImageFormat
+ *  \brief Adds constructors and member functions for cl_image_format.
+ *
+ *  \see cl_image_format
+ */
+struct ImageFormat : public cl_image_format
+{
+    //! \brief Default constructor - performs no initialization.
+    ImageFormat(){}
+
+    //! \brief Initializing constructor.
+    ImageFormat(cl_channel_order order, cl_channel_type type)
+    {
+        image_channel_order = order;
+        image_channel_data_type = type;
+    }
+
+    //! \brief Assignment operator.
+    ImageFormat& operator = (const ImageFormat& rhs)
+    {
+        if (this != &rhs) {
+            this->image_channel_data_type = rhs.image_channel_data_type;
+            this->image_channel_order     = rhs.image_channel_order;
+        }
+        return *this;
+    }
+};
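+
+/* Usage sketch (illustrative):
+ *
+ * \code
+ * cl::ImageFormat format(CL_RGBA, CL_UNORM_INT8);
+ * // format can be passed wherever a cl_image_format is expected, since
+ * // ImageFormat derives from it.
+ * \endcode
+ */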
+
+/*! \brief Class interface for cl_device_id.
+ *
+ *  \note Copies of these objects are inexpensive, since they don't 'own'
+ *        any underlying resources or data structures.
+ *
+ *  \see cl_device_id
+ */
+class Device : public detail::Wrapper<cl_device_id>
+{
+public:
+    //! \brief Default constructor - initializes to NULL.
+    Device() : detail::Wrapper<cl_type>() { }
+
+    /*! \brief Copy constructor.
+     * 
+     *  This simply copies the device ID value, which is an inexpensive operation.
+     */
+    Device(const Device& device) : detail::Wrapper<cl_type>(device) { }
+
+    /*! \brief Constructor from cl_device_id.
+     * 
+     *  This simply copies the device ID value, which is an inexpensive operation.
+     */
+    Device(const cl_device_id &device) : detail::Wrapper<cl_type>(device) { }
+
+    /*! \brief Returns the first device on the default context.
+     *
+     *  \see Context::getDefault()
+     */
+    static Device getDefault(cl_int * err = NULL);
+
+    /*! \brief Assignment operator from Device.
+     * 
+     *  This simply copies the device ID value, which is an inexpensive operation.
+     */
+    Device& operator = (const Device& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_device_id.
+     * 
+     *  This simply copies the device ID value, which is an inexpensive operation.
+     */
+    Device& operator = (const cl_device_id& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetDeviceInfo().
+    template <typename T>
+    cl_int getInfo(cl_device_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetDeviceInfo, object_, name, param),
+            __GET_DEVICE_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetDeviceInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_device_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_device_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /**
+     * CL 1.2 version
+     */
+#if defined(CL_VERSION_1_2)
+    //! \brief Wrapper for clCreateSubDevices().
+    cl_int createSubDevices(
+        const cl_device_partition_property * properties,
+        VECTOR_CLASS<Device>* devices)
+    {
+        cl_uint n = 0;
+        cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+        err = clCreateSubDevices(object_, properties, n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        devices->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+/**
+ * CL 1.1 version that uses device fission.
+ */
+#if defined(CL_VERSION_1_1)
+#if defined(USE_CL_DEVICE_FISSION)
+    cl_int createSubDevices(
+        const cl_device_partition_property_ext * properties,
+        VECTOR_CLASS<Device>* devices)
+    {
+        typedef CL_API_ENTRY cl_int 
+            ( CL_API_CALL * PFN_clCreateSubDevicesEXT)(
+                cl_device_id /*in_device*/,
+                const cl_device_partition_property_ext * /* properties */,
+                cl_uint /*num_entries*/,
+                cl_device_id * /*out_devices*/,
+                cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
+
+        static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL;
+        __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT);
+
+        cl_uint n = 0;
+        cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+        err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_SUB_DEVICES);
+        }
+
+        devices->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+#endif // #if defined(USE_CL_DEVICE_FISSION)
+#endif // #if defined(CL_VERSION_1_1)
+};
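+
+/* Usage sketch for Device::getInfo() (illustrative; assumes `device` is a
+ * valid cl::Device):
+ *
+ * \code
+ * cl_int err;
+ * STRING_CLASS name = device.getInfo<CL_DEVICE_NAME>(&err);
+ *
+ * cl_ulong globalMem;
+ * err = device.getInfo(CL_DEVICE_GLOBAL_MEM_SIZE, &globalMem);
+ * \endcode
+ */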
+
+/*! \brief Class interface for cl_platform_id.
+ *
+ *  \note Copies of these objects are inexpensive, since they don't 'own'
+ *        any underlying resources or data structures.
+ *
+ *  \see cl_platform_id
+ */
+class Platform : public detail::Wrapper<cl_platform_id>
+{
+public:
+    //! \brief Default constructor - initializes to NULL.
+    Platform() : detail::Wrapper<cl_type>()  { }
+
+    /*! \brief Copy constructor.
+     * 
+     *  This simply copies the platform ID value, which is an inexpensive operation.
+     */
+    Platform(const Platform& platform) : detail::Wrapper<cl_type>(platform) { }
+
+    /*! \brief Constructor from cl_platform_id.
+     * 
+     *  This simply copies the platform ID value, which is an inexpensive operation.
+     */
+    Platform(const cl_platform_id &platform) : detail::Wrapper<cl_type>(platform) { }
+
+    /*! \brief Assignment operator from Platform.
+     * 
+     *  This simply copies the platform ID value, which is an inexpensive operation.
+     */
+    Platform& operator = (const Platform& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_platform_id.
+     * 
+     *  This simply copies the platform ID value, which is an inexpensive operation.
+     */
+    Platform& operator = (const cl_platform_id& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetPlatformInfo().
+    cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetPlatformInfo, object_, name, param),
+            __GET_PLATFORM_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetPlatformInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_platform_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_platform_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /*! \brief Gets a list of devices for this platform.
+     * 
+     *  Wraps clGetDeviceIDs().
+     */
+    cl_int getDevices(
+        cl_device_type type,
+        VECTOR_CLASS<Device>* devices) const
+    {
+        cl_uint n = 0;
+        if( devices == NULL ) {
+            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
+        }
+        cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+        err = ::clGetDeviceIDs(object_, type, n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        devices->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+
+#if defined(USE_DX_INTEROP)
+   /*! \brief Get the list of available D3D10 devices.
+     *
+     *  \param d3d_device_source Specifies what d3d_object refers to: a D3D10
+     *  device or a DXGI adapter.
+     *
+     *  \param d3d_object The Direct3D 10 object (device or DXGI adapter) to
+     *  query OpenCL devices for.
+     *
+     *  \param d3d_device_set Specifies whether all devices or only the
+     *  preferred devices for the D3D10 object are returned.
+     *
+     *  \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device
+     *  values returned in devices can be used to identify a specific OpenCL
+     *  device. If \a devices argument is NULL, this argument is ignored.
+     *
+     *  \return One of the following values:
+     *    - CL_SUCCESS if the function is executed successfully.
+     *
+     *  The application can query specific capabilities of the OpenCL device(s)
+     *  returned by cl::getDevices. This can be used by the application to
+     *  determine which device(s) to use.
+     *
+     * \note If exceptions are enabled and a return value other than
+     * CL_SUCCESS is generated, a cl::Error exception is thrown.
+     */
+    cl_int getDevices(
+        cl_d3d10_device_source_khr d3d_device_source,
+        void *                     d3d_object,
+        cl_d3d10_device_set_khr    d3d_device_set,
+        VECTOR_CLASS<Device>* devices) const
+    {
+        typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)(
+            cl_platform_id platform, 
+            cl_d3d10_device_source_khr d3d_device_source, 
+            void * d3d_object,
+            cl_d3d10_device_set_khr d3d_device_set,
+            cl_uint num_entries,
+            cl_device_id * devices,
+            cl_uint* num_devices);
+
+        if( devices == NULL ) {
+            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR);
+        }
+
+        static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL;
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR);
+
+        cl_uint n = 0;
+        cl_int err = pfn_clGetDeviceIDsFromD3D10KHR(
+            object_, 
+            d3d_device_source, 
+            d3d_object,
+            d3d_device_set, 
+            0, 
+            NULL, 
+            &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id));
+        err = pfn_clGetDeviceIDsFromD3D10KHR(
+            object_, 
+            d3d_device_source, 
+            d3d_object,
+            d3d_device_set,
+            n, 
+            ids, 
+            NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_DEVICE_IDS_ERR);
+        }
+
+        devices->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+#endif
+
+    /*! \brief Gets a list of available platforms.
+     * 
+     *  Wraps clGetPlatformIDs().
+     */
+    static cl_int get(
+        VECTOR_CLASS<Platform>* platforms)
+    {
+        cl_uint n = 0;
+
+        if( platforms == NULL ) {
+            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
+        }
+
+        cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        cl_platform_id* ids = (cl_platform_id*) alloca(
+            n * sizeof(cl_platform_id));
+        err = ::clGetPlatformIDs(n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        platforms->assign(&ids[0], &ids[n]);
+        return CL_SUCCESS;
+    }
+
+    /*! \brief Gets the first available platform.
+     * 
+     *  Wraps clGetPlatformIDs(), returning the first result.
+     */
+    static cl_int get(
+        Platform * platform)
+    {
+        cl_uint n = 0;
+
+        if( platform == NULL ) {
+            return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR);
+        }
+
+        cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        cl_platform_id* ids = (cl_platform_id*) alloca(
+            n * sizeof(cl_platform_id));
+        err = ::clGetPlatformIDs(n, ids, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        *platform = ids[0];
+        return CL_SUCCESS;
+    }
+
+    /*! \brief Gets the first available platform, returning it by value.
+     * 
+     *  Wraps clGetPlatformIDs(), returning the first result.
+     */
+    static Platform get(
+        cl_int * errResult = NULL)
+    {
+        Platform platform;
+        cl_uint n = 0;
+        cl_int err = ::clGetPlatformIDs(0, NULL, &n);
+        if (err != CL_SUCCESS) {
+            detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+            if (errResult != NULL) {
+                *errResult = err;
+            }
+        }
+
+        cl_platform_id* ids = (cl_platform_id*) alloca(
+            n * sizeof(cl_platform_id));
+        err = ::clGetPlatformIDs(n, ids, NULL);
+
+        if (err != CL_SUCCESS) {
+            detail::errHandler(err, __GET_PLATFORM_IDS_ERR);
+        }
+
+        if (errResult != NULL) {
+            *errResult = err;
+        }
+        
+        return ids[0];
+    }
+
+    static Platform getDefault( 
+        cl_int *errResult = NULL )
+    {
+        return get(errResult);
+    }
+
+    
+#if defined(CL_VERSION_1_2)
+    //! \brief Wrapper for clUnloadCompiler().
+    cl_int
+    unloadCompiler()
+    {
+        return ::clUnloadPlatformCompiler(object_);
+    }
+#endif // #if defined(CL_VERSION_1_2)
+}; // class Platform
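+
+/* Usage sketch for Platform (illustrative; error handling abbreviated):
+ * enumerate the available platforms and then the GPU devices of the first one.
+ *
+ * \code
+ * VECTOR_CLASS<cl::Platform> platforms;
+ * if (cl::Platform::get(&platforms) == CL_SUCCESS && !platforms.empty()) {
+ *     VECTOR_CLASS<cl::Device> devices;
+ *     platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices);
+ * }
+ * \endcode
+ */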
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2))
+/**
+ * Unload the OpenCL compiler.
+ * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead.
+ */
+inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int
+UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline cl_int
+UnloadCompiler()
+{
+    return ::clUnloadCompiler();
+}
+#endif // #if defined(CL_VERSION_1_1)
+
+/*! \brief Class interface for cl_context.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_context as the original.  For details, see
+ *        clRetainContext() and clReleaseContext().
+ *
+ *  \see cl_context
+ */
+class Context 
+    : public detail::Wrapper<cl_context>
+{
+private:
+    static volatile int default_initialized_;
+    static Context default_;
+    static volatile cl_int default_error_;
+public:
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseContext() on the value held by this instance.
+     */
+    ~Context() { }
+
+    /*! \brief Constructs a context including a list of specified devices.
+     *
+     *  Wraps clCreateContext().
+     */
+    Context(
+        const VECTOR_CLASS<Device>& devices,
+        cl_context_properties* properties = NULL,
+        void (CL_CALLBACK * notifyFptr)(
+            const char *,
+            const void *,
+            ::size_t,
+            void *) = NULL,
+        void* data = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        object_ = ::clCreateContext(
+            properties, (cl_uint) numDevices,
+            deviceIDs,
+            notifyFptr, data, &error);
+
+        detail::errHandler(error, __CREATE_CONTEXT_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Context(
+        const Device& device,
+        cl_context_properties* properties = NULL,
+        void (CL_CALLBACK * notifyFptr)(
+            const char *,
+            const void *,
+            ::size_t,
+            void *) = NULL,
+        void* data = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        cl_device_id deviceID = device();
+
+        object_ = ::clCreateContext(
+            properties, 1,
+            &deviceID,
+            notifyFptr, data, &error);
+
+        detail::errHandler(error, __CREATE_CONTEXT_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*! \brief Constructs a context including all devices of a specified type.
+     *
+     *  Wraps clCreateContextFromType().
+     */
+    Context(
+        cl_device_type type,
+        cl_context_properties* properties = NULL,
+        void (CL_CALLBACK * notifyFptr)(
+            const char *,
+            const void *,
+            ::size_t,
+            void *) = NULL,
+        void* data = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+#if !defined(__APPLE__) || !defined(__MACOS)
+        cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 };	
+        if (properties == NULL) {
+            prop[1] = (cl_context_properties)Platform::get(&error)();
+            if (error != CL_SUCCESS) {
+                detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+                if (err != NULL) {
+                    *err = error;
+                    return;
+                }
+            }
+
+            properties = &prop[0];
+        }
+#endif
+        object_ = ::clCreateContextFromType(
+            properties, type, notifyFptr, data, &error);
+
+        detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*! \brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT.
+     *
+     *  \note All calls to this function return the same cl_context as the first.
+     */
+    static Context getDefault(cl_int * err = NULL) 
+    {
+        int state = detail::compare_exchange(
+            &default_initialized_, 
+            __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
+        
+        if (state & __DEFAULT_INITIALIZED) {
+            if (err != NULL) {
+                *err = default_error_;
+            }
+            return default_;
+        }
+
+        if (state & __DEFAULT_BEING_INITIALIZED) {
+              // Assume writes will propagate eventually...
+              while(default_initialized_ != __DEFAULT_INITIALIZED) {
+                  detail::fence();
+              }
+
+            if (err != NULL) {
+                *err = default_error_;
+            }
+            return default_;
+        }
+
+        cl_int error;
+        default_ = Context(
+            CL_DEVICE_TYPE_DEFAULT,
+            NULL,
+            NULL,
+            NULL,
+            &error);
+
+        detail::fence();
+
+        default_error_ = error;
+        // Assume writes will propagate eventually...
+        default_initialized_ = __DEFAULT_INITIALIZED;
+
+        detail::fence();
+
+        if (err != NULL) {
+            *err = default_error_;
+        }
+        return default_;
+
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    Context() : detail::Wrapper<cl_type>() { }
+
+    /*! \brief Copy constructor.
+     * 
+     *  This calls clRetainContext() on the parameter's cl_context.
+     */
+    Context(const Context& context) : detail::Wrapper<cl_type>(context) { }
+
+    /*! \brief Constructor from cl_context - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_context
+     *  into the new Context object.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper<cl_type>(context) { }
+
+    /*! \brief Assignment operator from Context.
+     * 
+     *  This calls clRetainContext() on the parameter and clReleaseContext() on
+     *  the previous value held by this instance.
+     */
+    Context& operator = (const Context& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_context - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseContext() on the value previously held by this instance.
+     */
+    Context& operator = (const cl_context& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetContextInfo().
+    template <typename T>
+    cl_int getInfo(cl_context_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetContextInfo, object_, name, param),
+            __GET_CONTEXT_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetContextInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_context_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_context_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /*! \brief Gets a list of supported image formats.
+     *  
+     *  Wraps clGetSupportedImageFormats().
+     */
+    cl_int getSupportedImageFormats(
+        cl_mem_flags flags,
+        cl_mem_object_type type,
+        VECTOR_CLASS<ImageFormat>* formats) const
+    {
+        cl_uint numEntries;
+        cl_int err = ::clGetSupportedImageFormats(
+           object_, 
+           flags,
+           type, 
+           0, 
+           NULL, 
+           &numEntries);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+        }
+
+        ImageFormat* value = (ImageFormat*)
+            alloca(numEntries * sizeof(ImageFormat));
+        err = ::clGetSupportedImageFormats(
+            object_, 
+            flags, 
+            type, 
+            numEntries,
+            (cl_image_format*) value, 
+            NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR);
+        }
+
+        formats->assign(&value[0], &value[numEntries]);
+        return CL_SUCCESS;
+    }
+};
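+
+/* Usage sketch for Context (illustrative; error handling omitted):
+ *
+ * \code
+ * cl_int err;
+ * cl::Context context(CL_DEVICE_TYPE_GPU, NULL, NULL, NULL, &err);
+ *
+ * VECTOR_CLASS<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+ *
+ * VECTOR_CLASS<cl::ImageFormat> formats;
+ * context.getSupportedImageFormats(
+ *     CL_MEM_READ_ONLY, CL_MEM_OBJECT_IMAGE2D, &formats);
+ * \endcode
+ */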
+
+inline Device Device::getDefault(cl_int * err)
+{
+    cl_int error;
+    Device device;
+
+    Context context = Context::getDefault(&error);
+#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+    detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+#else
+    detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR);
+#endif
+
+    if (error != CL_SUCCESS) {
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+    else {
+        device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+        if (err != NULL) {
+            *err = CL_SUCCESS;
+        }
+    }
+
+    return device;
+}
+
+
+#ifdef _WIN32
+__declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__declspec(selectany) Context Context::default_;
+__declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS;
+#else
+__attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__attribute__((weak)) Context Context::default_;
+__attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS;
+#endif
+
+/*! \brief Class interface for cl_event.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_event as the original.  For details, see
+ *        clRetainEvent() and clReleaseEvent().
+ *
+ *  \see cl_event
+ */
+class Event : public detail::Wrapper<cl_event>
+{
+public:
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseEvent() on the value held by this instance.
+     */
+    ~Event() { }
+ 
+    //! \brief Default constructor - initializes to NULL.
+    Event() : detail::Wrapper<cl_type>() { }
+
+    /*! \brief Copy constructor.
+     * 
+     *  This calls clRetainEvent() on the parameter's cl_event.
+     */
+    Event(const Event& event) : detail::Wrapper<cl_type>(event) { }
+
+    /*! \brief Constructor from cl_event - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_event
+     *  into the new Event object.
+     */
+    Event(const cl_event& event) : detail::Wrapper<cl_type>(event) { }
+
+    /*! \brief Assignment operator from Event.
+     *
+     *  This calls clRetainEvent() on the parameter and clReleaseEvent() on
+     *  the previous value held by this instance.
+     */
+    Event& operator = (const Event& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_event - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseEvent() on the value previously held by this instance.
+     */
+    Event& operator = (const cl_event& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetEventInfo().
+    template <typename T>
+    cl_int getInfo(cl_event_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetEventInfo, object_, name, param),
+            __GET_EVENT_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetEventInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_event_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_event_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    //! \brief Wrapper for clGetEventProfilingInfo().
+    template <typename T>
+    cl_int getProfilingInfo(cl_profiling_info name, T* param) const
+    {
+        return detail::errHandler(detail::getInfo(
+            &::clGetEventProfilingInfo, object_, name, param),
+            __GET_EVENT_PROFILE_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetEventProfilingInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_profiling_info, name>::param_type
+    getProfilingInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_profiling_info, name>::param_type param;
+        cl_int result = getProfilingInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    /*! \brief Blocks the calling thread until this event completes.
+     * 
+     *  Wraps clWaitForEvents().
+     */
+    cl_int wait() const
+    {
+        return detail::errHandler(
+            ::clWaitForEvents(1, &object_),
+            __WAIT_FOR_EVENTS_ERR);
+    }
+
+#if defined(CL_VERSION_1_1)
+    /*! \brief Registers a user callback function for a specific command execution status.
+     *
+     *  Wraps clSetEventCallback().
+     */
+    cl_int setCallback(
+        cl_int type,
+        void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *),		
+        void * user_data = NULL)
+    {
+        return detail::errHandler(
+            ::clSetEventCallback(
+                object_,
+                type,
+                pfn_notify,
+                user_data), 
+            __SET_EVENT_CALLBACK_ERR);
+    }
+#endif
+
+    /*! \brief Blocks the calling thread until every event specified is complete.
+     * 
+     *  Wraps clWaitForEvents().
+     */
+    static cl_int
+    waitForEvents(const VECTOR_CLASS<Event>& events)
+    {
+        return detail::errHandler(
+            ::clWaitForEvents(
+                (cl_uint) events.size(), (cl_event*)&events.front()),
+            __WAIT_FOR_EVENTS_ERR);
+    }
+};
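+
+/* Usage sketch for Event (illustrative; assumes `queue` is a valid
+ * cl::CommandQueue and `kernel` a valid cl::Kernel created elsewhere):
+ *
+ *     cl::Event ev;
+ *     queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(1024),
+ *                                cl::NullRange, NULL, &ev);
+ *     ev.wait();  // block until the enqueued command completes
+ *     cl_int status = ev.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>();
+ */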
+
+#if defined(CL_VERSION_1_1)
+/*! \brief Class interface for user events (a subset of cl_event objects).
+ * 
+ *  See Event for details about copy semantics, etc.
+ */
+class UserEvent : public Event
+{
+public:
+    /*! \brief Constructs a user event on a given context.
+     *
+     *  Wraps clCreateUserEvent().
+     */
+    UserEvent(
+        const Context& context,
+        cl_int * err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateUserEvent(
+            context(),
+            &error);
+
+        detail::errHandler(error, __CREATE_USER_EVENT_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    UserEvent() : Event() { }
+
+    //! \brief Copy constructor - performs shallow copy.
+    UserEvent(const UserEvent& event) : Event(event) { }
+
+    //! \brief Assignment Operator - performs shallow copy.
+    UserEvent& operator = (const UserEvent& rhs)
+    {
+        if (this != &rhs) {
+            Event::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Sets the execution status of a user event object.
+     *
+     *  Wraps clSetUserEventStatus().
+     */
+    cl_int setStatus(cl_int status)
+    {
+        return detail::errHandler(
+            ::clSetUserEventStatus(object_,status), 
+            __SET_USER_EVENT_STATUS_ERR);
+    }
+};
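+
+/* Usage sketch for UserEvent (illustrative; gates a queued command on
+ * host-side work; assumes valid `context`, `queue` and `kernel`, and that
+ * VECTOR_CLASS defaults to std::vector):
+ *
+ *     cl::UserEvent gate(context);
+ *     VECTOR_CLASS<cl::Event> deps(1, gate);
+ *     queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(256),
+ *                                cl::NullRange, &deps, NULL);
+ *     // ... finish preparing host-side state ...
+ *     gate.setStatus(CL_COMPLETE);  // allow the gated command to run
+ */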
+#endif
+
+/*! \brief Blocks the calling thread until every event specified is complete.
+ * 
+ *  Wraps clWaitForEvents().
+ */
+inline static cl_int
+WaitForEvents(const VECTOR_CLASS<Event>& events)
+{
+    return detail::errHandler(
+        ::clWaitForEvents(
+            (cl_uint) events.size(), (cl_event*)&events.front()),
+        __WAIT_FOR_EVENTS_ERR);
+}
+
+/*! \brief Class interface for cl_mem.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_mem as the original.  For details, see
+ *        clRetainMemObject() and clReleaseMemObject().
+ *
+ *  \see cl_mem
+ */
+class Memory : public detail::Wrapper<cl_mem>
+{
+public:
+ 
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseMemObject() on the value held by this instance.
+     */
+    ~Memory() {}
+
+    //! \brief Default constructor - initializes to NULL.
+    Memory() : detail::Wrapper<cl_type>() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     * 
+     *  This calls clRetainMemObject() on the parameter's cl_mem.
+     */
+    Memory(const Memory& memory) : detail::Wrapper<cl_type>(memory) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_mem
+     *  into the new Memory object.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper<cl_type>(memory) { }
+
+    /*! \brief Assignment operator from Memory.
+     * 
+     *  This calls clRetainMemObject() on the parameter and clReleaseMemObject()
+     *  on the previous value held by this instance.
+     */
+    Memory& operator = (const Memory& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_mem - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseMemObject() on the value previously held by this instance.
+     */
+    Memory& operator = (const cl_mem& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetMemObjectInfo().
+    template <typename T>
+    cl_int getInfo(cl_mem_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetMemObjectInfo, object_, name, param),
+            __GET_MEM_OBJECT_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetMemObjectInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_mem_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_mem_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+#if defined(CL_VERSION_1_1)
+    /*! \brief Registers a callback function to be called when the memory object
+     *         is no longer needed.
+     *
+     *  Wraps clSetMemObjectDestructorCallback().
+     *
+     *  Repeated calls to this function, for a given cl_mem value, will append
+     *  to the list of functions called (in reverse order) when memory object's
+     *  resources are freed and the memory object is deleted.
+     *
+     *  \note
+     *  The registered callbacks are associated with the underlying cl_mem
+     *  value - not the Memory class instance.
+     */
+    cl_int setDestructorCallback(
+        void (CL_CALLBACK * pfn_notify)(cl_mem, void *),		
+        void * user_data = NULL)
+    {
+        return detail::errHandler(
+            ::clSetMemObjectDestructorCallback(
+                object_,
+                pfn_notify,
+                user_data), 
+            __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR);
+    }
+#endif
+
+};
+
+// Pre-declare copy functions
+class Buffer;
+template< typename IteratorType >
+cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer );
+template< typename IteratorType >
+cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator );
+
+/*! \brief Class interface for Buffer Memory Objects.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ *
+ *  \see Memory
+ */
+class Buffer : public Memory
+{
+public:
+
+    /*! \brief Constructs a Buffer in a specified context.
+     *
+     *  Wraps clCreateBuffer().
+     *
+     *  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
+     *                  specified.  Note alignment & exclusivity requirements.
+     */
+    Buffer(
+        const Context& context,
+        cl_mem_flags flags,
+        ::size_t size,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
+
+        detail::errHandler(error, __CREATE_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*! \brief Constructs a Buffer in the default context.
+     *
+     *  Wraps clCreateBuffer().
+     *
+     *  \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was
+     *                  specified.  Note alignment & exclusivity requirements.
+     *
+     *  \see Context::getDefault()
+     */
+    Buffer(
+         cl_mem_flags flags,
+        ::size_t size,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        Context context = Context::getDefault(err);
+
+        object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error);
+
+        detail::errHandler(error, __CREATE_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /*!
+     * \brief Construct a Buffer from a host container via iterators.
+     * If useHostPtr is specified, the iterators must be random access.
+     */
+    template< typename IteratorType >
+    Buffer(
+        IteratorType startIterator,
+        IteratorType endIterator,
+        bool readOnly,
+        bool useHostPtr = false,
+        cl_int* err = NULL)
+    {
+        typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+        cl_int error;
+
+        cl_mem_flags flags = 0;
+        if( readOnly ) {
+            flags |= CL_MEM_READ_ONLY;
+        }
+        else {
+            flags |= CL_MEM_READ_WRITE;
+        }
+        if( useHostPtr ) {
+            flags |= CL_MEM_USE_HOST_PTR;
+        }
+        
+        ::size_t size = sizeof(DataType)*(endIterator - startIterator);
+
+        Context context = Context::getDefault(err);
+
+        if( useHostPtr ) {
+            object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error);
+        } else {
+            object_ = ::clCreateBuffer(context(), flags, size, 0, &error);
+        }
+
+        detail::errHandler(error, __CREATE_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+
+        if( !useHostPtr ) {
+            error = cl::copy(startIterator, endIterator, *this);
+            detail::errHandler(error, __CREATE_BUFFER_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    Buffer() : Memory() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Buffer(const Buffer& buffer) : Memory(buffer) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { }
+
+    /*! \brief Assignment from Buffer - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Buffer& operator = (const Buffer& rhs)
+    {
+        if (this != &rhs) {
+            Memory::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Buffer& operator = (const cl_mem& rhs)
+    {
+        Memory::operator=(rhs);
+        return *this;
+    }
+
+#if defined(CL_VERSION_1_1)
+    /*! \brief Creates a new buffer object from this.
+     *
+     *  Wraps clCreateSubBuffer().
+     */
+    Buffer createSubBuffer(
+        cl_mem_flags flags,
+        cl_buffer_create_type buffer_create_type,
+        const void * buffer_create_info,
+        cl_int * err = NULL)
+    {
+        Buffer result;
+        cl_int error;
+        result.object_ = ::clCreateSubBuffer(
+            object_, 
+            flags, 
+            buffer_create_type, 
+            buffer_create_info, 
+            &error);
+
+        detail::errHandler(error, __CREATE_SUBBUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+
+        return result;
+    }		
+#endif
+};
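+
+/* Usage sketch for Buffer (illustrative; assumes a valid cl::Context
+ * `context`, and a default context and command queue for the iterator form):
+ *
+ *     std::vector<float> host(1024, 1.0f);
+ *
+ *     // explicit size; contents copied at creation time
+ *     cl::Buffer a(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+ *                  host.size() * sizeof(float), &host[0]);
+ *
+ *     // iterator form; allocates on the default context and copies via cl::copy()
+ *     cl::Buffer b(host.begin(), host.end(), true);  // true => read-only
+ */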
+
+#if defined (USE_DX_INTEROP)
+/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's.
+ *
+ *  This is provided to facilitate interoperability with Direct3D.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ *
+ *  \see Memory
+ */
+class BufferD3D10 : public Buffer
+{
+public:
+    typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)(
+    cl_context context, cl_mem_flags flags, ID3D10Buffer*  buffer,
+    cl_int* errcode_ret);
+
+    /*! \brief Constructs a BufferD3D10, in a specified context, from a
+     *         given ID3D10Buffer.
+     *
+     *  Wraps clCreateFromD3D10BufferKHR().
+     */
+    BufferD3D10(
+        const Context& context,
+        cl_mem_flags flags,
+        ID3D10Buffer* bufobj,
+        cl_int * err = NULL)
+    {
+        static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL;
+
+#if defined(CL_VERSION_1_2)
+        VECTOR_CLASS<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>();
+        cl_platform_id platform = NULL;
+        for( ::size_t i = 0; i < props.size(); ++i ) {
+            if( props[i] == CL_CONTEXT_PLATFORM ) {
+                platform = (cl_platform_id) props[i+1];
+            }
+        }
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR);
+#endif
+#if defined(CL_VERSION_1_1)
+        __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR);
+#endif
+
+        cl_int error;
+        object_ = pfn_clCreateFromD3D10BufferKHR(
+            context(),
+            flags,
+            bufobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    BufferD3D10() : Buffer() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { }
+
+    /*! \brief Assignment from BufferD3D10 - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferD3D10& operator = (const BufferD3D10& rhs)
+    {
+        if (this != &rhs) {
+            Buffer::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferD3D10& operator = (const cl_mem& rhs)
+    {
+        Buffer::operator=(rhs);
+        return *this;
+    }
+};
+#endif
+
+/*! \brief Class interface for GL Buffer Memory Objects.
+ *
+ *  This is provided to facilitate interoperability with OpenGL.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class BufferGL : public Buffer
+{
+public:
+    /*! \brief Constructs a BufferGL in a specified context, from a given
+     *         GL buffer.
+     *
+     *  Wraps clCreateFromGLBuffer().
+     */
+    BufferGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLuint bufobj,
+        cl_int * err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLBuffer(
+            context(),
+            flags,
+            bufobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    BufferGL() : Buffer() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferGL(const BufferGL& buffer) : Buffer(buffer) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { }
+
+    /*! \brief Assignment from BufferGL - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferGL& operator = (const BufferGL& rhs)
+    {
+        if (this != &rhs) {
+            Buffer::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferGL& operator = (const cl_mem& rhs)
+    {
+        Buffer::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetGLObjectInfo().
+    cl_int getObjectInfo(
+        cl_gl_object_type *type,
+        GLuint * gl_object_name)
+    {
+        return detail::errHandler(
+            ::clGetGLObjectInfo(object_,type,gl_object_name),
+            __GET_GL_OBJECT_INFO_ERR);
+    }
+};
+
+/*! \brief Class interface for GL Render Buffer Memory Objects.
+ *
+ *  This is provided to facilitate interoperability with OpenGL.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class BufferRenderGL : public Buffer
+{
+public:
+    /*! \brief Constructs a BufferRenderGL in a specified context, from a given
+     *         GL Renderbuffer.
+     *
+     *  Wraps clCreateFromGLRenderbuffer().
+     */
+    BufferRenderGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLuint bufobj,
+        cl_int * err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLRenderbuffer(
+            context(),
+            flags,
+            bufobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    BufferRenderGL() : Buffer() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferRenderGL(const BufferRenderGL& buffer) : Buffer(buffer) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { }
+
+    /*! \brief Assignment from BufferRenderGL - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferRenderGL& operator = (const BufferRenderGL& rhs)
+    {
+        if (this != &rhs) {
+            Buffer::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    BufferRenderGL& operator = (const cl_mem& rhs)
+    {
+        Buffer::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetGLObjectInfo().
+    cl_int getObjectInfo(
+        cl_gl_object_type *type,
+        GLuint * gl_object_name)
+    {
+        return detail::errHandler(
+            ::clGetGLObjectInfo(object_,type,gl_object_name),
+            __GET_GL_OBJECT_INFO_ERR);
+    }
+};
+
+/*! \brief C++ base class for Image Memory objects.
+ *
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class Image : public Memory
+{
+protected:
+    //! \brief Default constructor - initializes to NULL.
+    Image() : Memory() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image(const Image& image) : Memory(image) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { }
+
+    /*! \brief Assignment from Image - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image& operator = (const Image& rhs)
+    {
+        if (this != &rhs) {
+            Memory::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image& operator = (const cl_mem& rhs)
+    {
+        Memory::operator=(rhs);
+        return *this;
+    }
+
+public:
+    //! \brief Wrapper for clGetImageInfo().
+    template <typename T>
+    cl_int getImageInfo(cl_image_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetImageInfo, object_, name, param),
+            __GET_IMAGE_INFO_ERR);
+    }
+    
+    //! \brief Wrapper for clGetImageInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_image_info, name>::param_type
+    getImageInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_image_info, name>::param_type param;
+        cl_int result = getImageInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+};
+
+#if defined(CL_VERSION_1_2)
+/*! \brief Class interface for 1D Image Memory objects.
+ *
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class Image1D : public Image
+{
+public:
+    /*! \brief Constructs a 1D Image in a specified context.
+     *
+     *  Wraps clCreateImage().
+     */
+    Image1D(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t width,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        cl_image_desc desc;
+        desc.image_type = CL_MEM_OBJECT_IMAGE1D;
+        desc.image_width = width;
+        desc.image_row_pitch = 0;
+        desc.num_mip_levels = 0;
+        desc.num_samples = 0;
+        desc.buffer = 0;
+        object_ = ::clCreateImage(
+            context(), 
+            flags, 
+            &format, 
+            &desc, 
+            host_ptr, 
+            &error);
+
+        detail::errHandler(error, __CREATE_IMAGE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    Image1D() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image1D(const Image1D& image1D) : Image(image1D) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { }
+
+    /*! \brief Assignment from Image1D - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image1D& operator = (const Image1D& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image1D& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+
+/*! \class Image1DBuffer
+ * \brief Image interface for 1D buffer images.
+ */
+class Image1DBuffer : public Image
+{
+public:
+    Image1DBuffer(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t width,
+        const Buffer &buffer,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        cl_image_desc desc;
+        desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
+        desc.image_width = width;
+        desc.image_row_pitch = 0;
+        desc.num_mip_levels = 0;
+        desc.num_samples = 0;
+        desc.buffer = buffer();
+        object_ = ::clCreateImage(
+            context(), 
+            flags, 
+            &format, 
+            &desc, 
+            NULL, 
+            &error);
+
+        detail::errHandler(error, __CREATE_IMAGE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Image1DBuffer() { }
+
+    Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { }
+
+    Image1DBuffer& operator = (const Image1DBuffer& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    Image1DBuffer& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+
+/*! \class Image1DArray
+ * \brief Image interface for arrays of 1D images.
+ */
+class Image1DArray : public Image
+{
+public:
+    Image1DArray(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t arraySize,
+        ::size_t width,
+        ::size_t rowPitch,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        cl_image_desc desc;
+        desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+        desc.image_array_size = arraySize;
+        desc.image_width = width;
+        desc.image_row_pitch = rowPitch;
+        desc.num_mip_levels = 0;
+        desc.num_samples = 0;
+        desc.buffer = 0;
+        object_ = ::clCreateImage(
+            context(), 
+            flags, 
+            &format, 
+            &desc, 
+            host_ptr, 
+            &error);
+
+        detail::errHandler(error, __CREATE_IMAGE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Image1DArray() { }
+
+    Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { }
+
+    Image1DArray& operator = (const Image1DArray& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    Image1DArray& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+
+/*! \brief Class interface for 2D Image Memory objects.
+ *
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class Image2D : public Image
+{
+public:
+    /*! \brief Constructs a 2D Image in a specified context.
+     *
+     *  Wraps clCreateImage().
+     */
+    Image2D(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t width,
+        ::size_t height,
+        ::size_t row_pitch = 0,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        bool useCreateImage;
+
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+        // Run-time decision based on the actual platform
+        {
+            cl_uint version = detail::getContextPlatformVersion(context());
+            useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
+        }
+#elif defined(CL_VERSION_1_2)
+        useCreateImage = true;
+#else
+        useCreateImage = false;
+#endif
+
+#if defined(CL_VERSION_1_2)
+        if (useCreateImage)
+        {
+            cl_image_desc desc;
+            desc.image_type = CL_MEM_OBJECT_IMAGE2D;
+            desc.image_width = width;
+            desc.image_height = height;
+            desc.image_row_pitch = row_pitch;
+            desc.num_mip_levels = 0;
+            desc.num_samples = 0;
+            desc.buffer = 0;
+            object_ = ::clCreateImage(
+                context(),
+                flags,
+                &format,
+                &desc,
+                host_ptr,
+                &error);
+
+            detail::errHandler(error, __CREATE_IMAGE_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+#endif // #if defined(CL_VERSION_1_2)
+#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+        if (!useCreateImage)
+        {
+            object_ = ::clCreateImage2D(
+                context(), flags,&format, width, height, row_pitch, host_ptr, &error);
+
+            detail::errHandler(error, __CREATE_IMAGE2D_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    Image2D() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2D(const Image2D& image2D) : Image(image2D) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { }
+
+    /*! \brief Assignment from Image2D - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2D& operator = (const Image2D& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2D& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
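+
+/* Usage sketch for Image2D (illustrative; assumes a valid cl::Context
+ * `context`, ::size_t values `width` and `height`, and that the device
+ * supports the requested image format):
+ *
+ *     cl_int err;
+ *     cl::ImageFormat fmt(CL_RGBA, CL_UNORM_INT8);
+ *     cl::Image2D img(context, CL_MEM_READ_ONLY, fmt, width, height,
+ *                     0, NULL, &err);
+ */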
+
+
+#if !defined(CL_VERSION_1_2)
+/*! \brief Class interface for GL 2D Image Memory objects.
+ *
+ *  This is provided to facilitate interoperability with OpenGL.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ *  \note Deprecated for OpenCL 1.2. Please use ImageGL instead.
+ */
+class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D
+{
+public:
+    /*! \brief Constructs an Image2DGL in a specified context, from a given
+     *         GL Texture.
+     *
+     *  Wraps clCreateFromGLTexture2D().
+     */
+    Image2DGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLenum target,
+        GLint  miplevel,
+        GLuint texobj,
+        cl_int * err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLTexture2D(
+            context(),
+            flags,
+            target,
+            miplevel,
+            texobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+
+    }
+    
+    //! \brief Default constructor - initializes to NULL.
+    Image2DGL() : Image2D() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2DGL(const Image2DGL& image) : Image2D(image) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { }
+
+    /*! \brief Assignment from Image2DGL - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2DGL& operator = (const Image2DGL& rhs)
+    {
+        if (this != &rhs) {
+            Image2D::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image2DGL& operator = (const cl_mem& rhs)
+    {
+        Image2D::operator=(rhs);
+        return *this;
+    }
+};
+#endif // #if !defined(CL_VERSION_1_2)
+
+#if defined(CL_VERSION_1_2)
+/*! \class Image2DArray
+ * \brief Image interface for arrays of 2D images.
+ */
+class Image2DArray : public Image
+{
+public:
+    Image2DArray(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t arraySize,
+        ::size_t width,
+        ::size_t height,
+        ::size_t rowPitch,
+        ::size_t slicePitch,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        cl_image_desc desc;
+        desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+        desc.image_array_size = arraySize;
+        desc.image_width = width;
+        desc.image_height = height;
+        desc.image_row_pitch = rowPitch;
+        desc.image_slice_pitch = slicePitch;
+        desc.num_mip_levels = 0;
+        desc.num_samples = 0;
+        desc.buffer = 0;
+        object_ = ::clCreateImage(
+            context(), 
+            flags, 
+            &format, 
+            &desc, 
+            host_ptr, 
+            &error);
+
+        detail::errHandler(error, __CREATE_IMAGE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Image2DArray() { }
+
+    Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { }
+
+    Image2DArray& operator = (const Image2DArray& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    Image2DArray& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+/*! \brief Class interface for 3D Image Memory objects.
+ *
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class Image3D : public Image
+{
+public:
+    /*! \brief Constructs a 3D Image in a specified context.
+     *
+     *  Wraps clCreateImage().
+     */
+    Image3D(
+        const Context& context,
+        cl_mem_flags flags,
+        ImageFormat format,
+        ::size_t width,
+        ::size_t height,
+        ::size_t depth,
+        ::size_t row_pitch = 0,
+        ::size_t slice_pitch = 0,
+        void* host_ptr = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        bool useCreateImage;
+
+#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+        // Run-time decision based on the actual platform
+        {
+            cl_uint version = detail::getContextPlatformVersion(context());
+            useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above
+        }
+#elif defined(CL_VERSION_1_2)
+        useCreateImage = true;
+#else
+        useCreateImage = false;
+#endif
+
+#if defined(CL_VERSION_1_2)
+        if (useCreateImage)
+        {
+            cl_image_desc desc;
+            desc.image_type = CL_MEM_OBJECT_IMAGE3D;
+            desc.image_width = width;
+            desc.image_height = height;
+            desc.image_depth = depth;
+            desc.image_row_pitch = row_pitch;
+            desc.image_slice_pitch = slice_pitch;
+            desc.num_mip_levels = 0;
+            desc.num_samples = 0;
+            desc.buffer = 0;
+            object_ = ::clCreateImage(
+                context(), 
+                flags, 
+                &format, 
+                &desc, 
+                host_ptr, 
+                &error);
+
+            detail::errHandler(error, __CREATE_IMAGE_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+#endif  // #if defined(CL_VERSION_1_2)
+#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+        if (!useCreateImage)
+        {
+            object_ = ::clCreateImage3D(
+                context(), flags, &format, width, height, depth, row_pitch,
+                slice_pitch, host_ptr, &error);
+
+            detail::errHandler(error, __CREATE_IMAGE3D_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    Image3D() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3D(const Image3D& image3D) : Image(image3D) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { }
+
+    /*! \brief Assignment from Image3D - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3D& operator = (const Image3D& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3D& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+
+#if !defined(CL_VERSION_1_2)
+/*! \brief Class interface for GL 3D Image Memory objects.
+ *
+ *  This is provided to facilitate interoperability with OpenGL.
+ * 
+ *  See Memory for details about copy semantics, etc.
+ * 
+ *  \see Memory
+ */
+class Image3DGL : public Image3D
+{
+public:
+    /*! \brief Constructs an Image3DGL in a specified context, from a given
+     *         GL Texture.
+     *
+     *  Wraps clCreateFromGLTexture3D().
+     */
+    Image3DGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLenum target,
+        GLint  miplevel,
+        GLuint texobj,
+        cl_int * err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLTexture3D(
+            context(),
+            flags,
+            target,
+            miplevel,
+            texobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    //! \brief Default constructor - initializes to NULL.
+    Image3DGL() : Image3D() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3DGL(const Image3DGL& image) : Image3D(image) { }
+
+    /*! \brief Constructor from cl_mem - takes ownership.
+     *
+     *  See Memory for further details.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { }
+
+    /*! \brief Assignment from Image3DGL - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3DGL& operator = (const Image3DGL& rhs)
+    {
+        if (this != &rhs) {
+            Image3D::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment from cl_mem - performs shallow copy.
+     *
+     *  See Memory for further details.
+     */
+    Image3DGL& operator = (const cl_mem& rhs)
+    {
+        Image3D::operator=(rhs);
+        return *this;
+    }
+};
+#endif // #if !defined(CL_VERSION_1_2)
+
+#if defined(CL_VERSION_1_2)
+/*! \class ImageGL
+ * \brief general image interface for GL interop.
+ * We abstract the 2D and 3D GL images into a single instance here
+ * that wraps all GL sourced images on the grounds that setup information
+ * was performed by OpenCL anyway.
+ */
+class ImageGL : public Image
+{
+public:
+    ImageGL(
+        const Context& context,
+        cl_mem_flags flags,
+        GLenum target,
+        GLint  miplevel,
+        GLuint texobj,
+        cl_int * err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateFromGLTexture(
+            context(), 
+            flags, 
+            target,
+            miplevel,
+            texobj,
+            &error);
+
+        detail::errHandler(error, __CREATE_GL_TEXTURE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    ImageGL() : Image() { }
+
+    ImageGL(const ImageGL& image) : Image(image) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { }
+
+    ImageGL& operator = (const ImageGL& rhs)
+    {
+        if (this != &rhs) {
+            Image::operator=(rhs);
+        }
+        return *this;
+    }
+
+    ImageGL& operator = (const cl_mem& rhs)
+    {
+        Image::operator=(rhs);
+        return *this;
+    }
+};
+#endif // #if defined(CL_VERSION_1_2)
+
+/*! \brief Class interface for cl_sampler.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_sampler as the original.  For details, see
+ *        clRetainSampler() and clReleaseSampler().
+ *
+ *  \see cl_sampler 
+ */
+class Sampler : public detail::Wrapper<cl_sampler>
+{
+public:
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseSampler() on the value held by this instance.
+     */
+    ~Sampler() { }
+
+    //! \brief Default constructor - initializes to NULL.
+    Sampler() { }
+
+    /*! \brief Constructs a Sampler in a specified context.
+     *
+     *  Wraps clCreateSampler().
+     */
+#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+    Sampler(
+        const Context& context,
+        cl_bool normalized_coords,
+        cl_addressing_mode addressing_mode,
+        cl_filter_mode filter_mode,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateSampler(
+            context(), 
+            normalized_coords,
+            addressing_mode,
+            filter_mode,
+            &error);
+
+        detail::errHandler(error, __CREATE_SAMPLER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+#else
+    Sampler(
+        const Context& context,
+        const cl_sampler_properties *sampler_properties,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        object_ = ::clCreateSamplerWithProperties(
+            context(), 
+            sampler_properties,
+            &error);
+
+        detail::errHandler(error, __CREATE_SAMPLER_PROPERTY_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+#endif
+
+    /*! \brief Copy constructor - performs shallow copy.
+     * 
+     *  This calls clRetainSampler() on the parameter's cl_sampler.
+     */
+    Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+
+    /*! \brief Constructor from cl_sampler - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_sampler
+     *  into the new Sampler object.
+     */
+    Sampler(const cl_sampler& sampler) : detail::Wrapper<cl_type>(sampler) { }
+
+    /*! \brief Assignment operator from Sampler.
+     * 
+     *  This calls clRetainSampler() on the parameter and clReleaseSampler()
+     *  on the previous value held by this instance.
+     */
+    Sampler& operator = (const Sampler& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_sampler - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseSampler() on the value previously held by this instance.
+     */
+    Sampler& operator = (const cl_sampler& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    //! \brief Wrapper for clGetSamplerInfo().
+    template <typename T>
+    cl_int getInfo(cl_sampler_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetSamplerInfo, object_, name, param),
+            __GET_SAMPLER_INFO_ERR);
+    }
+
+    //! \brief Wrapper for clGetSamplerInfo() that returns by value.
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_sampler_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_sampler_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+};
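+
+/* Usage sketch for Sampler (illustrative; pre-2.0 constructor shown; assumes
+ * a valid cl::Context `context`):
+ *
+ *     cl_int err;
+ *     cl::Sampler sampler(context, CL_FALSE, CL_ADDRESS_CLAMP_TO_EDGE,
+ *                         CL_FILTER_LINEAR, &err);
+ *     cl_filter_mode mode = sampler.getInfo<CL_SAMPLER_FILTER_MODE>();
+ */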
+
+class Program;
+class CommandQueue;
+class Kernel;
+
+//! \brief Class interface for specifying NDRange values.
+class NDRange
+{
+private:
+    size_t<3> sizes_;
+    cl_uint dimensions_;
+
+public:
+    //! \brief Default constructor - resulting range has zero dimensions.
+    NDRange()
+        : dimensions_(0)
+    { }
+
+    //! \brief Constructs one-dimensional range.
+    NDRange(::size_t size0)
+        : dimensions_(1)
+    {
+        sizes_[0] = size0;
+    }
+
+    //! \brief Constructs two-dimensional range.
+    NDRange(::size_t size0, ::size_t size1)
+        : dimensions_(2)
+    {
+        sizes_[0] = size0;
+        sizes_[1] = size1;
+    }
+
+    //! \brief Constructs three-dimensional range.
+    NDRange(::size_t size0, ::size_t size1, ::size_t size2)
+        : dimensions_(3)
+    {
+        sizes_[0] = size0;
+        sizes_[1] = size1;
+        sizes_[2] = size2;
+    }
+
+    /*! \brief Conversion operator to const ::size_t *.
+     *  
+     *  \returns a pointer to the size of the first dimension.
+     */
+    operator const ::size_t*() const { 
+        return (const ::size_t*) sizes_; 
+    }
+
+    //! \brief Queries the number of dimensions in the range.
+    ::size_t dimensions() const { return dimensions_; }
+};
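+
+/* Usage sketch for NDRange (illustrative; global and local work sizes for a
+ * 2D dispatch; assumes valid `queue` and `kernel`, and that the local size
+ * divides the global size):
+ *
+ *     cl::NDRange global(1024, 768);
+ *     cl::NDRange local(16, 16);
+ *     queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, local);
+ */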
+
+//! \brief A zero-dimensional range.
+static const NDRange NullRange;
+
+//! \brief Local address wrapper for use with Kernel::setArg
+struct LocalSpaceArg
+{
+    ::size_t size_;
+};
+
+namespace detail {
+
+template <typename T>
+struct KernelArgumentHandler
+{
+    static ::size_t size(const T&) { return sizeof(T); }
+    static T* ptr(T& value) { return &value; }
+};
+
+template <>
+struct KernelArgumentHandler<LocalSpaceArg>
+{
+    static ::size_t size(const LocalSpaceArg& value) { return value.size_; }
+    static void* ptr(LocalSpaceArg&) { return NULL; }
+};
+
+} 
+//! \endcond
+
+/*! __local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ * Deprecated. Replaced with Local.
+ */
+inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg
+__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+inline LocalSpaceArg
+__local(::size_t size)
+{
+    LocalSpaceArg ret = { size };
+    return ret;
+}
+
+/*! Local
+ * \brief Helper function for generating LocalSpaceArg objects.
+ */
+inline LocalSpaceArg
+Local(::size_t size)
+{
+    LocalSpaceArg ret = { size };
+    return ret;
+}
+
+//class KernelFunctor;
+
+/*! \brief Class interface for cl_kernel.
+ *
+ *  \note Copies of these objects are shallow, meaning that the copy will refer
+ *        to the same underlying cl_kernel as the original.  For details, see
+ *        clRetainKernel() and clReleaseKernel().
+ *
+ *  \see cl_kernel
+ */
+class Kernel : public detail::Wrapper<cl_kernel>
+{
+public:
+    inline Kernel(const Program& program, const char* name, cl_int* err = NULL);
+
+    /*! \brief Destructor.
+     *
+     *  This calls clReleaseKernel() on the value held by this instance.
+     */
+    ~Kernel() { }
+
+    //! \brief Default constructor - initializes to NULL.
+    Kernel() { }
+
+    /*! \brief Copy constructor - performs shallow copy.
+     * 
+     *  This calls clRetainKernel() on the parameter's cl_kernel.
+     */
+    Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+    /*! \brief Constructor from cl_kernel - takes ownership.
+     * 
+     *  This effectively transfers ownership of a refcount on the cl_kernel
+     *  into the new Kernel object.
+     */
+    __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper<cl_type>(kernel) { }
+
+    /*! \brief Assignment operator from Kernel.
+     * 
+     *  This calls clRetainKernel() on the parameter and clReleaseKernel()
+     *  on the previous value held by this instance.
+     */
+    Kernel& operator = (const Kernel& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    /*! \brief Assignment operator from cl_kernel - takes ownership.
+     *
+     *  This effectively transfers ownership of a refcount on the rhs and calls
+     *  clReleaseKernel() on the value previously held by this instance.
+     */
+    Kernel& operator = (const cl_kernel& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    template <typename T>
+    cl_int getInfo(cl_kernel_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetKernelInfo, object_, name, param),
+            __GET_KERNEL_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_kernel_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_kernel_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+#if defined(CL_VERSION_1_2)
+    template <typename T>
+    cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param),
+            __GET_KERNEL_ARG_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_kernel_arg_info, name>::param_type
+    getArgInfo(cl_uint argIndex, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_kernel_arg_info, name>::param_type param;
+        cl_int result = getArgInfo(argIndex, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+#endif // #if defined(CL_VERSION_1_2)
+
+    template <typename T>
+    cl_int getWorkGroupInfo(
+        const Device& device, cl_kernel_work_group_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetKernelWorkGroupInfo, object_, device(), name, param),
+                __GET_KERNEL_WORK_GROUP_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type
+        getWorkGroupInfo(const Device& device, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+        detail::cl_kernel_work_group_info, name>::param_type param;
+        cl_int result = getWorkGroupInfo(device, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    template <typename T>
+    cl_int setArg(cl_uint index, T value)
+    {
+        return detail::errHandler(
+            ::clSetKernelArg(
+                object_,
+                index,
+                detail::KernelArgumentHandler<T>::size(value),
+                detail::KernelArgumentHandler<T>::ptr(value)),
+            __SET_KERNEL_ARGS_ERR);
+    }
+
+    cl_int setArg(cl_uint index, ::size_t size, void* argPtr)
+    {
+        return detail::errHandler(
+            ::clSetKernelArg(object_, index, size, argPtr),
+            __SET_KERNEL_ARGS_ERR);
+    }
+};
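+
+/* Usage sketch for Kernel (illustrative; assumes `program` is a built
+ * cl::Program containing a kernel named "vec_add" that takes two buffers,
+ * a __local scratch buffer and an element count `n`):
+ *
+ *     cl_int err;
+ *     cl::Kernel k(program, "vec_add", &err);
+ *     k.setArg(0, inputBuffer);                     // cl::Buffer argument
+ *     k.setArg(1, outputBuffer);
+ *     k.setArg(2, cl::Local(256 * sizeof(float)));  // __local scratch space
+ *     k.setArg(3, (cl_uint)n);
+ */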
+
+/*! \class Program
+ * \brief Program interface that implements cl_program.
+ */
+class Program : public detail::Wrapper<cl_program>
+{
+public:
+    typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries;
+    typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources;
+
+    Program(
+        const STRING_CLASS& source,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const char * strings = source.c_str();
+        const ::size_t length  = source.size();
+
+        Context context = Context::getDefault(err);
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)1, &strings, &length, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+        if (error == CL_SUCCESS) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                "",
+                NULL,
+                NULL);
+
+            detail::errHandler(error, __BUILD_PROGRAM_ERR);
+        }
+
+        if (err != NULL) {
+            *err = error;
+        }
+    }
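+
+    /* Usage sketch for the source constructor above (illustrative; assumes a
+     * default cl::Context has been set up and `src` holds OpenCL C source):
+     *
+     *     cl_int err;
+     *     cl::Program prog(src, &err);  // creates and builds on the default context
+     *     cl::Kernel k(prog, "my_kernel", &err);
+     */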
+
+    Program(
+        const STRING_CLASS& source,
+        bool build,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const char * strings = source.c_str();
+        const ::size_t length  = source.size();
+
+        Context context = Context::getDefault(err);
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)1, &strings, &length, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+        if (error == CL_SUCCESS && build) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                "",
+                NULL,
+                NULL);
+
+            detail::errHandler(error, __BUILD_PROGRAM_ERR);
+        }
+
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program(
+        const Context& context,
+        const STRING_CLASS& source,
+        bool build = false,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const char * strings = source.c_str();
+        const ::size_t length  = source.size();
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)1, &strings, &length, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+
+        if (error == CL_SUCCESS && build) {
+
+            error = ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                "",
+                NULL,
+                NULL);
+
+            detail::errHandler(error, __BUILD_PROGRAM_ERR);
+        }
+
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    Program(
+        const Context& context,
+        const Sources& sources,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        const ::size_t n = (::size_t)sources.size();
+        ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t));
+        const char** strings = (const char**) alloca(n * sizeof(const char*));
+
+        for (::size_t i = 0; i < n; ++i) {
+            strings[i] = sources[(int)i].first;
+            lengths[i] = sources[(int)i].second;
+        }
+
+        object_ = ::clCreateProgramWithSource(
+            context(), (cl_uint)n, strings, lengths, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+
+    /**
+     * Construct a program object from a list of devices and a per-device list of binaries.
+     * \param context A valid OpenCL context in which to construct the program.
+     * \param devices A vector of OpenCL device objects for which the program will be created.
+     * \param binaries A vector of pairs of a pointer to a binary object and its length.
+     * \param binaryStatus An optional vector that on completion will be resized to
+     *   match the size of binaries and filled with values to specify if each binary
+     *   was successfully loaded.
+     *   Set to CL_SUCCESS if the binary was successfully loaded.
+     *   Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL.
+     *   Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device.
+     * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors:
+     *   CL_INVALID_CONTEXT if context is not a valid context.
+     *   CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; 
+     *     or if any entry in binaries is NULL or has length 0.
+     *   CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context.
+     *   CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
+     *   CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
+     */
+    Program(
+        const Context& context,
+        const VECTOR_CLASS<Device>& devices,
+        const Binaries& binaries,
+        VECTOR_CLASS<cl_int>* binaryStatus = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+        
+        const ::size_t numDevices = devices.size();
+        
+        // Catch size mismatch early and return
+        if(binaries.size() != numDevices) {
+            error = CL_INVALID_VALUE;
+            detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+            return;
+        }
+
+        ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t));
+        const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char*));
+
+        for (::size_t i = 0; i < numDevices; ++i) {
+            images[i] = (const unsigned char*)binaries[i].first;
+            lengths[i] = binaries[i].second;
+        }
+
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        if(binaryStatus) {
+            binaryStatus->resize(numDevices);
+        }
+        
+        object_ = ::clCreateProgramWithBinary(
+            context(), (cl_uint) devices.size(),
+            deviceIDs,
+            lengths, images, binaryStatus != NULL
+               ? &binaryStatus->front()
+               : NULL, &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
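+    // Usage sketch (illustrative): constructing a Program from a previously
+    // saved device binary. "loadBinary" and "binaryData" are placeholders for
+    // however the application obtains the binary image.
+    //
+    //   cl::Context context(CL_DEVICE_TYPE_GPU);
+    //   VECTOR_CLASS<cl::Device> devices(1, context.getInfo<CL_CONTEXT_DEVICES>()[0]);
+    //   std::vector<unsigned char> binaryData = loadBinary();      // placeholder
+    //   cl::Program::Binaries binaries(1,
+    //       std::pair<const void*, ::size_t>(&binaryData[0], binaryData.size()));
+    //   VECTOR_CLASS<cl_int> binaryStatus;
+    //   cl_int err;
+    //   cl::Program program(context, devices, binaries, &binaryStatus, &err);
+    //   // binaryStatus[0] indicates whether the binary was accepted for devices[0].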
+
+    
+#if defined(CL_VERSION_1_2)
+    /**
+     * Create a program that uses the built-in kernels available on the given devices.
+     * \param kernelNames Semicolon-separated list of built-in kernel names
+     */
+    Program(
+        const Context& context,
+        const VECTOR_CLASS<Device>& devices,
+        const STRING_CLASS& kernelNames,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+        
+        object_ = ::clCreateProgramWithBuiltInKernels(
+            context(), 
+            (cl_uint) devices.size(),
+            deviceIDs,
+            kernelNames.c_str(), 
+            &error);
+
+        detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
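+    // Usage sketch (illustrative): requesting one of the device's built-in
+    // kernels by name; "my_builtin_kernel" is a placeholder and must be listed
+    // in CL_DEVICE_BUILT_IN_KERNELS for every device passed in.
+    //
+    //   cl_int err;
+    //   cl::Program program(context, devices, "my_builtin_kernel", &err);
+    //   cl::Kernel kernel(program, "my_builtin_kernel", &err);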
+#endif // #if defined(CL_VERSION_1_2)
+
+    Program() { }
+
+    Program(const Program& program) : detail::Wrapper<cl_type>(program) { }
+
+    __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper<cl_type>(program) { }
+
+    Program& operator = (const Program& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    Program& operator = (const cl_program& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    cl_int build(
+        const VECTOR_CLASS<Device>& devices,
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        ::size_t numDevices = devices.size();
+        cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id));
+        for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) {
+            deviceIDs[deviceIndex] = (devices[deviceIndex])();
+        }
+
+        return detail::errHandler(
+            ::clBuildProgram(
+                object_,
+                (cl_uint) devices.size(),
+                deviceIDs,
+                options,
+                notifyFptr,
+                data),
+                __BUILD_PROGRAM_ERR);
+    }
+
+    cl_int build(
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        return detail::errHandler(
+            ::clBuildProgram(
+                object_,
+                0,
+                NULL,
+                options,
+                notifyFptr,
+                data),
+                __BUILD_PROGRAM_ERR);
+    }
+
+#if defined(CL_VERSION_1_2)
+    cl_int compile(
+        const char* options = NULL,
+        void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+        void* data = NULL) const
+    {
+        return detail::errHandler(
+            ::clCompileProgram(
+                object_,
+                0,
+                NULL,
+                options,
+                0,
+                NULL,
+                NULL,
+                notifyFptr,
+                data),
+                __COMPILE_PROGRAM_ERR);
+    }
+#endif
+
+    template <typename T>
+    cl_int getInfo(cl_program_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(&::clGetProgramInfo, object_, name, param),
+            __GET_PROGRAM_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_program_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_program_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    template <typename T>
+    cl_int getBuildInfo(
+        const Device& device, cl_program_build_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetProgramBuildInfo, object_, device(), name, param),
+                __GET_PROGRAM_BUILD_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_program_build_info, name>::param_type
+    getBuildInfo(const Device& device, cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_program_build_info, name>::param_type param;
+        cl_int result = getBuildInfo(device, name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    cl_int createKernels(VECTOR_CLASS<Kernel>* kernels)
+    {
+        cl_uint numKernels;
+        cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel));
+        err = ::clCreateKernelsInProgram(
+            object_, numKernels, (cl_kernel*) value, NULL);
+        if (err != CL_SUCCESS) {
+            return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR);
+        }
+
+        kernels->assign(&value[0], &value[numKernels]);
+        return CL_SUCCESS;
+    }
+};
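+// Usage sketch (illustrative): building a Program from OpenCL C source and
+// retrieving the build log on failure; "kernelSource" is a placeholder
+// std::string holding the source text.
+//
+//   cl::Context context(CL_DEVICE_TYPE_GPU);
+//   VECTOR_CLASS<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+//   cl::Program::Sources sources(1,
+//       std::make_pair(kernelSource.c_str(), kernelSource.size()));
+//   cl::Program program(context, sources);
+//   if (program.build(devices) != CL_SUCCESS) {
+//       STRING_CLASS log = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(devices[0]);
+//       // inspect 'log' for compiler diagnostics
+//   }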
+
+#if defined(CL_VERSION_1_2)
+inline Program linkProgram(
+    Program input1,
+    Program input2,
+    const char* options = NULL,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+    void* data = NULL,
+    cl_int* err = NULL) 
+{
+    cl_int err_local = CL_SUCCESS;
+
+    cl_program programs[2] = { input1(), input2() };
+
+    Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>();
+
+    cl_program prog = ::clLinkProgram(
+        ctx(),
+        0,
+        NULL,
+        options,
+        2,
+        programs,
+        notifyFptr,
+        data,
+        &err_local);
+
+    detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);
+    if (err != NULL) {
+        *err = err_local;
+    }
+
+    return Program(prog);
+}
+
+inline Program linkProgram(
+    VECTOR_CLASS<Program> inputPrograms,
+    const char* options = NULL,
+    void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL,
+    void* data = NULL,
+    cl_int* err = NULL) 
+{
+    cl_int err_local = CL_SUCCESS;
+
+    cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program));
+
+    if (programs != NULL) {
+        for (unsigned int i = 0; i < inputPrograms.size(); i++) {
+          programs[i] = inputPrograms[i]();
+        }
+    } 
+
+    cl_program prog = ::clLinkProgram(
+        Context::getDefault()(),
+        0,
+        NULL,
+        options,
+        (cl_uint)inputPrograms.size(),
+        programs,
+        notifyFptr,
+        data,
+        &err_local);
+
+    detail::errHandler(err_local,__COMPILE_PROGRAM_ERR);
+    if (err != NULL) {
+        *err = err_local;
+    }
+
+    return Program(prog);
+}
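+// Usage sketch (illustrative): separate compilation and linking with the
+// OpenCL 1.2 wrappers above; "srcA" and "srcB" are placeholder source strings.
+//
+//   cl::Program progA(context, srcA);
+//   cl::Program progB(context, srcB);
+//   progA.compile();
+//   progB.compile();
+//   cl_int err;
+//   cl::Program linked = cl::linkProgram(progA, progB, NULL, NULL, NULL, &err);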
+#endif
+
+template<>
+inline VECTOR_CLASS<char *> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const
+{
+    VECTOR_CLASS< ::size_t> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>();
+    VECTOR_CLASS<char *> binaries;
+    for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s) 
+    {
+        char *ptr = NULL;
+        if (*s != 0) 
+            ptr = new char[*s];
+        binaries.push_back(ptr);
+    }
+    
+    cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries);
+    if (err != NULL) {
+        *err = result;
+    }
+    return binaries;
+}
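+// Usage sketch (illustrative): the specialization above allocates each binary
+// with new[], so the caller owns the returned buffers and must delete[] them.
+//
+//   VECTOR_CLASS< ::size_t> sizes = program.getInfo<CL_PROGRAM_BINARY_SIZES>();
+//   VECTOR_CLASS<char*> binaries = program.getInfo<CL_PROGRAM_BINARIES>();
+//   // ... persist binaries[i] (sizes[i] bytes each) ...
+//   for (::size_t i = 0; i < binaries.size(); ++i)
+//       delete[] binaries[i];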
+
+inline Kernel::Kernel(const Program& program, const char* name, cl_int* err)
+{
+    cl_int error;
+
+    object_ = ::clCreateKernel(program(), name, &error);
+    detail::errHandler(error, __CREATE_KERNEL_ERR);
+
+    if (err != NULL) {
+        *err = error;
+    }
+
+}
+
+/*! \class CommandQueue
+ * \brief CommandQueue interface for cl_command_queue.
+ */
+class CommandQueue : public detail::Wrapper<cl_command_queue>
+{
+private:
+    static volatile int default_initialized_;
+    static CommandQueue default_;
+    static volatile cl_int default_error_;
+public:
+#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+    CommandQueue(
+        cl_command_queue_properties properties,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        Context context = Context::getDefault(&error);
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+
+        if (error != CL_SUCCESS) {
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+        else {
+            Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+            object_ = ::clCreateCommandQueue(
+                context(), device(), properties, &error);
+
+            detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+    }
+#else
+    CommandQueue(
+        const cl_queue_properties *properties,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        Context context = Context::getDefault(&error);
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR);
+
+        if (error != CL_SUCCESS) {
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+        else {
+            Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+            object_ = ::clCreateCommandQueueWithProperties(
+                context(), device(), properties, &error);
+
+            detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR);
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+    }
+#endif
+
+#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+    CommandQueue(
+        const Context& context,
+        const Device& device,
+        cl_command_queue_properties properties = 0,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        object_ = ::clCreateCommandQueue(
+            context(), device(), properties, &error);
+
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+#else
+    CommandQueue(
+        const Context& context,
+        const Device& device,
+        const cl_queue_properties *properties = NULL,
+        cl_int* err = NULL)
+    {
+        cl_int error;
+
+        object_ = ::clCreateCommandQueueWithProperties(
+            context(), device(), properties, &error);
+
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+    }
+#endif
+
+    static CommandQueue getDefault(cl_int * err = NULL) 
+    {
+        int state = detail::compare_exchange(
+            &default_initialized_, 
+            __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED);
+        
+        if (state & __DEFAULT_INITIALIZED) {
+            if (err != NULL) {
+                *err = default_error_;
+            }
+            return default_;
+        }
+
+        if (state & __DEFAULT_BEING_INITIALIZED) {
+              // Assume writes will propagate eventually...
+              while(default_initialized_ != __DEFAULT_INITIALIZED) {
+                  detail::fence();
+              }
+
+            if (err != NULL) {
+                *err = default_error_;
+            }
+            return default_;
+        }
+
+        cl_int error;
+
+        Context context = Context::getDefault(&error);
+#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+#else
+        detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR);
+#endif
+
+        if (error != CL_SUCCESS) {
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+        else {
+            Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+
+            default_ = CommandQueue(context, device, 0, &error);
+
+#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+            detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR);
+#else
+            detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR);
+#endif
+            if (err != NULL) {
+                *err = error;
+            }
+        }
+
+        detail::fence();
+
+        default_error_ = error;
+        // Assume writes will propagate eventually...
+        default_initialized_ = __DEFAULT_INITIALIZED;
+
+        detail::fence();
+
+        if (err != NULL) {
+            *err = default_error_;
+        }
+        return default_;
+
+    }
+
+    CommandQueue() { }
+
+    CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+
+    CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { }
+
+    CommandQueue& operator = (const CommandQueue& rhs)
+    {
+        if (this != &rhs) {
+            detail::Wrapper<cl_type>::operator=(rhs);
+        }
+        return *this;
+    }
+
+    CommandQueue& operator = (const cl_command_queue& rhs)
+    {
+        detail::Wrapper<cl_type>::operator=(rhs);
+        return *this;
+    }
+
+    template <typename T>
+    cl_int getInfo(cl_command_queue_info name, T* param) const
+    {
+        return detail::errHandler(
+            detail::getInfo(
+                &::clGetCommandQueueInfo, object_, name, param),
+                __GET_COMMAND_QUEUE_INFO_ERR);
+    }
+
+    template <cl_int name> typename
+    detail::param_traits<detail::cl_command_queue_info, name>::param_type
+    getInfo(cl_int* err = NULL) const
+    {
+        typename detail::param_traits<
+            detail::cl_command_queue_info, name>::param_type param;
+        cl_int result = getInfo(name, &param);
+        if (err != NULL) {
+            *err = result;
+        }
+        return param;
+    }
+
+    cl_int enqueueReadBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        ::size_t offset,
+        ::size_t size,
+        void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueReadBuffer(
+                object_, buffer(), blocking, offset, size,
+                ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_READ_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueWriteBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        ::size_t offset,
+        ::size_t size,
+        const void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueWriteBuffer(
+                object_, buffer(), blocking, offset, size,
+                ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_WRITE_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueCopyBuffer(
+        const Buffer& src,
+        const Buffer& dst,
+        ::size_t src_offset,
+        ::size_t dst_offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyBuffer(
+                object_, src(), dst(), src_offset, dst_offset, size,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQEUE_COPY_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueReadBufferRect(
+        const Buffer& buffer,
+        cl_bool blocking,
+        const size_t<3>& buffer_offset,
+        const size_t<3>& host_offset,
+        const size_t<3>& region,
+        ::size_t buffer_row_pitch,
+        ::size_t buffer_slice_pitch,
+        ::size_t host_row_pitch,
+        ::size_t host_slice_pitch,
+        void *ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueReadBufferRect(
+                object_, 
+                buffer(), 
+                blocking, 
+                (const ::size_t *)buffer_offset,
+                (const ::size_t *)host_offset,
+                (const ::size_t *)region,
+                buffer_row_pitch,
+                buffer_slice_pitch,
+                host_row_pitch,
+                host_slice_pitch,
+                ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_READ_BUFFER_RECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueWriteBufferRect(
+        const Buffer& buffer,
+        cl_bool blocking,
+        const size_t<3>& buffer_offset,
+        const size_t<3>& host_offset,
+        const size_t<3>& region,
+        ::size_t buffer_row_pitch,
+        ::size_t buffer_slice_pitch,
+        ::size_t host_row_pitch,
+        ::size_t host_slice_pitch,
+        void *ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueWriteBufferRect(
+                object_, 
+                buffer(), 
+                blocking, 
+                (const ::size_t *)buffer_offset,
+                (const ::size_t *)host_offset,
+                (const ::size_t *)region,
+                buffer_row_pitch,
+                buffer_slice_pitch,
+                host_row_pitch,
+                host_slice_pitch,
+                ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_WRITE_BUFFER_RECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueCopyBufferRect(
+        const Buffer& src,
+        const Buffer& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        ::size_t src_row_pitch,
+        ::size_t src_slice_pitch,
+        ::size_t dst_row_pitch,
+        ::size_t dst_slice_pitch,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyBufferRect(
+                object_, 
+                src(), 
+                dst(), 
+                (const ::size_t *)src_origin, 
+                (const ::size_t *)dst_origin, 
+                (const ::size_t *)region,
+                src_row_pitch,
+                src_slice_pitch,
+                dst_row_pitch,
+                dst_slice_pitch,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQEUE_COPY_BUFFER_RECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+#if defined(CL_VERSION_1_2)
+    /**
+     * Enqueue a command to fill a buffer object with a pattern
+     * of a given size. The pattern is specified as a vector.
+     * \tparam PatternType The data type of the pattern field.
+     *     The pattern type must be an accepted OpenCL data type.
+     */
+    template<typename PatternType>
+    cl_int enqueueFillBuffer(
+        const Buffer& buffer,
+        PatternType pattern,
+        ::size_t offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueFillBuffer(
+                object_, 
+                buffer(),
+                static_cast<void*>(&pattern),
+                sizeof(PatternType), 
+                offset, 
+                size,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_FILL_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
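+    // Usage sketch (illustrative): zero-filling a buffer of N floats, where
+    // "buffer" and "N" are placeholders.
+    //
+    //   queue.enqueueFillBuffer(buffer, 0.0f, 0, N * sizeof(cl_float));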
+#endif // #if defined(CL_VERSION_1_2)
+
+    cl_int enqueueReadImage(
+        const Image& image,
+        cl_bool blocking,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t row_pitch,
+        ::size_t slice_pitch,
+        void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueReadImage(
+                object_, image(), blocking, (const ::size_t *) origin,
+                (const ::size_t *) region, row_pitch, slice_pitch, ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_READ_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueWriteImage(
+        const Image& image,
+        cl_bool blocking,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t row_pitch,
+        ::size_t slice_pitch,
+        void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueWriteImage(
+                object_, image(), blocking, (const ::size_t *) origin,
+                (const ::size_t *) region, row_pitch, slice_pitch, ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_WRITE_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueCopyImage(
+        const Image& src,
+        const Image& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyImage(
+                object_, src(), dst(), (const ::size_t *) src_origin,
+                (const ::size_t *)dst_origin, (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_COPY_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+#if defined(CL_VERSION_1_2)
+    /**
+     * Enqueue a command to fill an image object with a specified color.
+     * \param fillColor is the color to use to fill the image.
+     *     This is a four component RGBA floating-point color value if
+     *     the image channel data type is not an unnormalized signed or
+     *     unsigned data type.
+     */
+    cl_int enqueueFillImage(
+        const Image& image,
+        cl_float4 fillColor,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueFillImage(
+                object_, 
+                image(),
+                static_cast<void*>(&fillColor), 
+                (const ::size_t *) origin, 
+                (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_FILL_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+     * Enqueue a command to fill an image object with a specified color.
+     * \param fillColor is the color to use to fill the image.
+     *     This is a four component RGBA signed integer color value if
+     *     the image channel data type is an unnormalized signed integer
+     *     type.
+     */
+    cl_int enqueueFillImage(
+        const Image& image,
+        cl_int4 fillColor,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueFillImage(
+                object_, 
+                image(),
+                static_cast<void*>(&fillColor), 
+                (const ::size_t *) origin, 
+                (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_FILL_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+     * Enqueue a command to fill an image object with a specified color.
+     * \param fillColor is the color to use to fill the image.
+     *     This is a four component RGBA unsigned integer color value if
+     *     the image channel data type is an unnormalized unsigned integer
+     *     type.
+     */
+    cl_int enqueueFillImage(
+        const Image& image,
+        cl_uint4 fillColor,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueFillImage(
+                object_, 
+                image(),
+                static_cast<void*>(&fillColor), 
+                (const ::size_t *) origin, 
+                (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+                __ENQUEUE_FILL_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
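+    // Usage sketch (illustrative): filling a 2D image with opaque red, where
+    // "image", "width" and "height" are placeholders.
+    //
+    //   cl_float4 red = {{1.0f, 0.0f, 0.0f, 1.0f}};
+    //   cl::size_t<3> origin;   // zero-initialized
+    //   cl::size_t<3> region;
+    //   region[0] = width; region[1] = height; region[2] = 1;
+    //   queue.enqueueFillImage(image, red, origin, region);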
+#endif // #if defined(CL_VERSION_1_2)
+
+    cl_int enqueueCopyImageToBuffer(
+        const Image& src,
+        const Buffer& dst,
+        const size_t<3>& src_origin,
+        const size_t<3>& region,
+        ::size_t dst_offset,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyImageToBuffer(
+                object_, src(), dst(), (const ::size_t *) src_origin,
+                (const ::size_t *) region, dst_offset,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    cl_int enqueueCopyBufferToImage(
+        const Buffer& src,
+        const Image& dst,
+        ::size_t src_offset,
+        const size_t<3>& dst_origin,
+        const size_t<3>& region,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueCopyBufferToImage(
+                object_, src(), dst(), src_offset,
+                (const ::size_t *) dst_origin, (const ::size_t *) region,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    void* enqueueMapBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        cl_map_flags flags,
+        ::size_t offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL,
+        cl_int* err = NULL) const
+    {
+        cl_int error;
+        void * result = ::clEnqueueMapBuffer(
+            object_, buffer(), blocking, flags, offset, size,
+            (events != NULL) ? (cl_uint) events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+            (cl_event*) event,
+            &error);
+
+        detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+        if (err != NULL) {
+            *err = error;
+        }
+        return result;
+    }
+
+    void* enqueueMapImage(
+        const Image& buffer,
+        cl_bool blocking,
+        cl_map_flags flags,
+        const size_t<3>& origin,
+        const size_t<3>& region,
+        ::size_t * row_pitch,
+        ::size_t * slice_pitch,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL,
+        cl_int* err = NULL) const
+    {
+        cl_int error;
+        void * result = ::clEnqueueMapImage(
+            object_, buffer(), blocking, flags,
+            (const ::size_t *) origin, (const ::size_t *) region,
+            row_pitch, slice_pitch,
+            (events != NULL) ? (cl_uint) events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+            (cl_event*) event,
+            &error);
+
+        detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR);
+        if (err != NULL) {
+              *err = error;
+        }
+        return result;
+    }
+
+    cl_int enqueueUnmapMemObject(
+        const Memory& memory,
+        void* mapped_ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueUnmapMemObject(
+                object_, memory(), mapped_ptr,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+#if defined(CL_VERSION_1_2)
+    /**
+     * Enqueues a marker command which waits for either a list of events to complete, 
+     * or all previously enqueued commands to complete.
+     *
+     * Enqueues a marker command which waits for either a list of events to complete,
+     * or, if the list is empty, for all commands previously enqueued in command_queue
+     * to complete before it completes. This command returns an event which can be
+     * waited on to ensure that all events in the event_wait_list, or all commands
+     * previously enqueued to command_queue before this command, have completed.
+     */
+    cl_int enqueueMarkerWithWaitList(
+        const VECTOR_CLASS<Event> *events = 0,
+        Event *event = 0)
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueMarkerWithWaitList(
+                object_,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_MARKER_WAIT_LIST_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+    /**
+     * A synchronization point that enqueues a barrier operation.
+     *
+     * Enqueues a barrier command which waits for either a list of events to complete,
+     * or, if the list is empty, for all commands previously enqueued in command_queue
+     * to complete before it completes. This command blocks further command execution:
+     * any commands enqueued after it do not execute until it completes. It returns an
+     * event which can be waited on to ensure that all events in the event_wait_list,
+     * or all commands previously enqueued to command_queue before this command, have
+     * completed.
+     */
+    cl_int enqueueBarrierWithWaitList(
+        const VECTOR_CLASS<Event> *events = 0,
+        Event *event = 0)
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueBarrierWithWaitList(
+                object_,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_BARRIER_WAIT_LIST_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
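+    // Usage sketch (illustrative): using a marker to wait for all work
+    // submitted so far without calling queue.finish().
+    //
+    //   cl::Event marker;
+    //   queue.enqueueMarkerWithWaitList(NULL, &marker);
+    //   // ... enqueue further, independent work ...
+    //   marker.wait();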
+    
+    /**
+     * Enqueues a command to indicate with which device a set of memory objects
+     * should be associated.
+     */
+    cl_int enqueueMigrateMemObjects(
+        const VECTOR_CLASS<Memory> &memObjects,
+        cl_mem_migration_flags flags,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL
+        )
+    {
+        cl_event tmp;
+        
+        cl_mem* localMemObjects = static_cast<cl_mem*>(alloca(memObjects.size() * sizeof(cl_mem)));
+        for( int i = 0; i < (int)memObjects.size(); ++i ) {
+            localMemObjects[i] = memObjects[i]();
+        }
+
+
+        cl_int err = detail::errHandler(
+            ::clEnqueueMigrateMemObjects(
+                object_, 
+                (cl_uint)memObjects.size(), 
+                static_cast<const cl_mem*>(localMemObjects),
+                flags,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
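+    // Usage sketch (illustrative): hinting that a buffer's contents should be
+    // migrated to the host; "buffer" is a placeholder cl::Buffer.
+    //
+    //   VECTOR_CLASS<cl::Memory> objs(1, buffer);
+    //   queue.enqueueMigrateMemObjects(objs, CL_MIGRATE_MEM_OBJECT_HOST);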
+#endif // #if defined(CL_VERSION_1_2)
+
+    cl_int enqueueNDRangeKernel(
+        const Kernel& kernel,
+        const NDRange& offset,
+        const NDRange& global,
+        const NDRange& local = NullRange,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueNDRangeKernel(
+                object_, kernel(), (cl_uint) global.dimensions(),
+                offset.dimensions() != 0 ? (const ::size_t*) offset : NULL,
+                (const ::size_t*) global,
+                local.dimensions() != 0 ? (const ::size_t*) local : NULL,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_NDRANGE_KERNEL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
+    cl_int enqueueTask(
+        const Kernel& kernel,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueTask(
+                object_, kernel(),
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_TASK_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+#endif
+
+    cl_int enqueueNativeKernel(
+        void (CL_CALLBACK *userFptr)(void *),
+        std::pair<void*, ::size_t> args,
+        const VECTOR_CLASS<Memory>* mem_objects = NULL,
+        const VECTOR_CLASS<const void*>* mem_locs = NULL,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL) const
+    {
+        cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) 
+            ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem))
+            : NULL;
+
+        if (mems != NULL) {
+            for (unsigned int i = 0; i < mem_objects->size(); i++) {
+                mems[i] = ((*mem_objects)[i])();
+            }
+        }
+
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            ::clEnqueueNativeKernel(
+                object_, userFptr, args.first, args.second,
+                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                mems,
+                (mem_locs != NULL) ? (const void **) &mem_locs->front() : NULL,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_NATIVE_KERNEL);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED 
+    cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueMarker(object_, (cl_event*) event),
+            __ENQUEUE_MARKER_ERR);
+    }
+
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueWaitForEvents(
+                object_,
+                (cl_uint) events.size(),
+                (const cl_event*) &events.front()),
+            __ENQUEUE_WAIT_FOR_EVENTS_ERR);
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+    cl_int enqueueAcquireGLObjects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+     {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+             ::clEnqueueAcquireGLObjects(
+                 object_,
+                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+                 (events != NULL) ? (cl_uint) events->size() : 0,
+                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                 (event != NULL) ? &tmp : NULL),
+             __ENQUEUE_ACQUIRE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+     }
+
+    cl_int enqueueReleaseGLObjects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+     {
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+             ::clEnqueueReleaseGLObjects(
+                 object_,
+                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+                 (events != NULL) ? (cl_uint) events->size() : 0,
+                 (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+                 (event != NULL) ? &tmp : NULL),
+             __ENQUEUE_RELEASE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+     }
+
+#if defined (USE_DX_INTEROP)
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem* mem_objects, cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list, cl_event* event);
+typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)(
+    cl_command_queue command_queue, cl_uint num_objects,
+    const cl_mem* mem_objects,  cl_uint num_events_in_wait_list,
+    const cl_event* event_wait_list, cl_event* event);
+
+    cl_int enqueueAcquireD3D10Objects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+    {
+        static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+#if defined(CL_VERSION_1_1)
+        __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR);
+#endif
+        
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+             pfn_clEnqueueAcquireD3D10ObjectsKHR(
+                 object_,
+                 (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                 (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+                 (events != NULL) ? (cl_uint) events->size() : 0,
+                 (events != NULL) ? (cl_event*) &events->front() : NULL,
+                 (event != NULL) ? &tmp : NULL),
+             __ENQUEUE_ACQUIRE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+     }
+
+    cl_int enqueueReleaseD3D10Objects(
+         const VECTOR_CLASS<Memory>* mem_objects = NULL,
+         const VECTOR_CLASS<Event>* events = NULL,
+         Event* event = NULL) const
+    {
+        static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL;
+#if defined(CL_VERSION_1_2)
+        cl_context context = getInfo<CL_QUEUE_CONTEXT>();
+        cl::Device device(getInfo<CL_QUEUE_DEVICE>());
+        cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>();
+        __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR);
+#endif // #if defined(CL_VERSION_1_2)
+#if defined(CL_VERSION_1_1)
+        __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR);
+#endif // #if defined(CL_VERSION_1_1)
+
+        cl_event tmp;
+        cl_int err = detail::errHandler(
+            pfn_clEnqueueReleaseD3D10ObjectsKHR(
+                object_,
+                (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0,
+                (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL,
+                (events != NULL) ? (cl_uint) events->size() : 0,
+                (events != NULL) ? (cl_event*) &events->front() : NULL,
+                (event != NULL) ? &tmp : NULL),
+            __ENQUEUE_RELEASE_GL_ERR);
+
+        if (event != NULL && err == CL_SUCCESS)
+            *event = tmp;
+
+        return err;
+    }
+#endif
+
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) 
+    CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+    {
+        return detail::errHandler(
+            ::clEnqueueBarrier(object_),
+            __ENQUEUE_BARRIER_ERR);
+    }
+#endif // #if defined(CL_VERSION_1_1)
+
+    cl_int flush() const
+    {
+        return detail::errHandler(::clFlush(object_), __FLUSH_ERR);
+    }
+
+    cl_int finish() const
+    {
+        return detail::errHandler(::clFinish(object_), __FINISH_ERR);
+    }
+};
+
+#ifdef _WIN32
+__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__declspec(selectany) CommandQueue CommandQueue::default_;
+__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#else
+__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED;
+__attribute__((weak)) CommandQueue CommandQueue::default_;
+__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS;
+#endif
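+// Usage sketch (illustrative): an explicit queue set-up and kernel dispatch
+// with the CommandQueue wrapper; "context", "device", "kernel", "input",
+// "output", "hostIn", "hostOut" and "N" are placeholders.
+//
+//   cl::CommandQueue queue(context, device);
+//   queue.enqueueWriteBuffer(input, CL_TRUE, 0, N * sizeof(cl_float), hostIn);
+//   kernel.setArg(0, input);
+//   kernel.setArg(1, output);
+//   queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(N), cl::NullRange);
+//   queue.enqueueReadBuffer(output, CL_TRUE, 0, N * sizeof(cl_float), hostOut);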
+
+inline cl_int enqueueReadBuffer(
+    const Buffer& buffer,
+    cl_bool blocking,
+    ::size_t offset,
+    ::size_t size,
+    void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline cl_int enqueueWriteBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        ::size_t offset,
+        ::size_t size,
+        const void* ptr,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event);
+}
+
+inline void* enqueueMapBuffer(
+        const Buffer& buffer,
+        cl_bool blocking,
+        cl_map_flags flags,
+        ::size_t offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL,
+        cl_int* err = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (err != NULL) {
+        *err = error;
+    }
+
+    void * result = ::clEnqueueMapBuffer(
+            queue(), buffer(), blocking, flags, offset, size,
+            (events != NULL) ? (cl_uint) events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+            (cl_event*) event,
+            &error);
+
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (err != NULL) {
+        *err = error;
+    }
+    return result;
+}
+
+inline cl_int enqueueUnmapMemObject(
+    const Memory& memory,
+    void* mapped_ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+    detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR);
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    cl_event tmp;
+    cl_int err = detail::errHandler(
+        ::clEnqueueUnmapMemObject(
+            queue(), memory(), mapped_ptr,
+            (events != NULL) ? (cl_uint) events->size() : 0,
+            (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL,
+            (event != NULL) ? &tmp : NULL),
+        __ENQUEUE_UNMAP_MEM_OBJECT_ERR);
+
+    if (event != NULL && err == CL_SUCCESS)
+        *event = tmp;
+
+    return err;
+}
+
+inline cl_int enqueueCopyBuffer(
+        const Buffer& src,
+        const Buffer& dst,
+        ::size_t src_offset,
+        ::size_t dst_offset,
+        ::size_t size,
+        const VECTOR_CLASS<Event>* events = NULL,
+        Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event);
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ */
+template< typename IteratorType >
+inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer )
+{
+    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+    cl_int error;
+    
+    ::size_t length = endIterator-startIterator;
+    ::size_t byteLength = length*sizeof(DataType);
+
+    DataType *pointer = 
+        static_cast<DataType*>(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error));
+    // if exceptions enabled, enqueueMapBuffer will throw
+    if( error != CL_SUCCESS ) {
+        return error;
+    }
+#if defined(_MSC_VER)
+    std::copy(
+        startIterator, 
+        endIterator, 
+        stdext::checked_array_iterator<DataType*>(
+            pointer, length));
+#else
+    std::copy(startIterator, endIterator, pointer);
+#endif
+    Event endEvent;
+    error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+    // if exceptions enabled, enqueueUnmapMemObject will throw
+    if( error != CL_SUCCESS ) { 
+        return error;
+    }
+    endEvent.wait();
+    return CL_SUCCESS;
+}
+
+/**
+ * Blocking copy operation between iterators and a buffer.
+ */
+template< typename IteratorType >
+inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator )
+{
+    typedef typename std::iterator_traits<IteratorType>::value_type DataType;
+    cl_int error;
+        
+    ::size_t length = endIterator-startIterator;
+    ::size_t byteLength = length*sizeof(DataType);
+
+    DataType *pointer = 
+        static_cast<DataType*>(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error));
+    // if exceptions enabled, enqueueMapBuffer will throw
+    if( error != CL_SUCCESS ) {
+        return error;
+    }
+    std::copy(pointer, pointer + length, startIterator);
+    Event endEvent;
+    error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent);
+    // if exceptions enabled, enqueueUnmapMemObject will throw
+    if( error != CL_SUCCESS ) { 
+        return error;
+    }
+    endEvent.wait();
+    return CL_SUCCESS;
+}
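+// Usage sketch (illustrative): using the blocking cl::copy helpers above with
+// a std::vector as the host container; the buffer must live in the default
+// context because these helpers use the default command queue.
+//
+//   std::vector<float> host(1024, 1.0f);
+//   cl::Context context = cl::Context::getDefault();
+//   cl::Buffer buffer(context, CL_MEM_READ_WRITE, host.size() * sizeof(float));
+//   cl::copy(host.begin(), host.end(), buffer);   // host -> device
+//   cl::copy(buffer, host.begin(), host.end());   // device -> host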
+
+#if defined(CL_VERSION_1_1)
+inline cl_int enqueueReadBufferRect(
+    const Buffer& buffer,
+    cl_bool blocking,
+    const size_t<3>& buffer_offset,
+    const size_t<3>& host_offset,
+    const size_t<3>& region,
+    ::size_t buffer_row_pitch,
+    ::size_t buffer_slice_pitch,
+    ::size_t host_row_pitch,
+    ::size_t host_slice_pitch,
+    void *ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadBufferRect(
+        buffer, 
+        blocking, 
+        buffer_offset, 
+        host_offset,
+        region,
+        buffer_row_pitch,
+        buffer_slice_pitch,
+        host_row_pitch,
+        host_slice_pitch,
+        ptr, 
+        events, 
+        event);
+}
+
+inline cl_int enqueueWriteBufferRect(
+    const Buffer& buffer,
+    cl_bool blocking,
+    const size_t<3>& buffer_offset,
+    const size_t<3>& host_offset,
+    const size_t<3>& region,
+    ::size_t buffer_row_pitch,
+    ::size_t buffer_slice_pitch,
+    ::size_t host_row_pitch,
+    ::size_t host_slice_pitch,
+    void *ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteBufferRect(
+        buffer, 
+        blocking, 
+        buffer_offset, 
+        host_offset,
+        region,
+        buffer_row_pitch,
+        buffer_slice_pitch,
+        host_row_pitch,
+        host_slice_pitch,
+        ptr, 
+        events, 
+        event);
+}
+
+inline cl_int enqueueCopyBufferRect(
+    const Buffer& src,
+    const Buffer& dst,
+    const size_t<3>& src_origin,
+    const size_t<3>& dst_origin,
+    const size_t<3>& region,
+    ::size_t src_row_pitch,
+    ::size_t src_slice_pitch,
+    ::size_t dst_row_pitch,
+    ::size_t dst_slice_pitch,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBufferRect(
+        src,
+        dst,
+        src_origin,
+        dst_origin,
+        region,
+        src_row_pitch,
+        src_slice_pitch,
+        dst_row_pitch,
+        dst_slice_pitch,
+        events, 
+        event);
+}
+#endif
+
+inline cl_int enqueueReadImage(
+    const Image& image,
+    cl_bool blocking,
+    const size_t<3>& origin,
+    const size_t<3>& region,
+    ::size_t row_pitch,
+    ::size_t slice_pitch,
+    void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL) 
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueReadImage(
+        image,
+        blocking,
+        origin,
+        region,
+        row_pitch,
+        slice_pitch,
+        ptr,
+        events, 
+        event);
+}
+
+inline cl_int enqueueWriteImage(
+    const Image& image,
+    cl_bool blocking,
+    const size_t<3>& origin,
+    const size_t<3>& region,
+    ::size_t row_pitch,
+    ::size_t slice_pitch,
+    void* ptr,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueWriteImage(
+        image,
+        blocking,
+        origin,
+        region,
+        row_pitch,
+        slice_pitch,
+        ptr,
+        events, 
+        event);
+}
+
+inline cl_int enqueueCopyImage(
+    const Image& src,
+    const Image& dst,
+    const size_t<3>& src_origin,
+    const size_t<3>& dst_origin,
+    const size_t<3>& region,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyImage(
+        src,
+        dst,
+        src_origin,
+        dst_origin,
+        region,
+        events,
+        event);
+}
+
+inline cl_int enqueueCopyImageToBuffer(
+    const Image& src,
+    const Buffer& dst,
+    const size_t<3>& src_origin,
+    const size_t<3>& region,
+    ::size_t dst_offset,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyImageToBuffer(
+        src,
+        dst,
+        src_origin,
+        region,
+        dst_offset,
+        events,
+        event);
+}
+
+inline cl_int enqueueCopyBufferToImage(
+    const Buffer& src,
+    const Image& dst,
+    ::size_t src_offset,
+    const size_t<3>& dst_origin,
+    const size_t<3>& region,
+    const VECTOR_CLASS<Event>* events = NULL,
+    Event* event = NULL)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.enqueueCopyBufferToImage(
+        src,
+        dst,
+        src_offset,
+        dst_origin,
+        region,
+        events,
+        event);
+}
+
+
+inline cl_int flush(void)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.flush();
+}
+
+inline cl_int finish(void)
+{
+    cl_int error;
+    CommandQueue queue = CommandQueue::getDefault(&error);
+
+    if (error != CL_SUCCESS) {
+        return error;
+    }
+
+    return queue.finish();
+}
+
+// Kernel Functor support
+// New interface as of September 2011
+// Requires C++11-style std::function support (the TR1 std::tr1::function is not supported)
+// Targets Visual Studio 2010 and GCC 4.2
+
+struct EnqueueArgs
+{
+    CommandQueue queue_;
+    const NDRange offset_;
+    const NDRange global_;
+    const NDRange local_;
+    VECTOR_CLASS<Event> events_;
+
+    EnqueueArgs(NDRange global) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange)
+    {
+
+    }
+
+    EnqueueArgs(NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(local)
+    {
+
+    }
+
+    EnqueueArgs(NDRange offset, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(offset), 
+      global_(global),
+      local_(local)
+    {
+
+    }
+
+    EnqueueArgs(Event e, NDRange global) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(Event e, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(offset), 
+      global_(global),
+      local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange),
+      events_(events)
+    {
+
+    }
+
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(NullRange), 
+      global_(global),
+      local_(local),
+      events_(events)
+    {
+
+    }
+
+    EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) : 
+      queue_(CommandQueue::getDefault()),
+      offset_(offset), 
+      global_(global),
+      local_(local),
+      events_(events)
+    {
+
+    }
+
+    EnqueueArgs(CommandQueue &queue, NDRange global) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange)
+    {
+
+    }
+
+    EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(local)
+    {
+
+    }
+
+    EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(offset), 
+      global_(global),
+      local_(local)
+    {
+
+    }
+
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange global) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(offset), 
+      global_(global),
+      local_(local)
+    {
+        events_.push_back(e);
+    }
+
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(NullRange),
+      events_(events)
+    {
+
+    }
+
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(NullRange), 
+      global_(global),
+      local_(local),
+      events_(events)
+    {
+
+    }
+
+    EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) : 
+      queue_(queue),
+      offset_(offset), 
+      global_(global),
+      local_(local),
+      events_(events)
+    {
+
+    }
+};
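+// Illustrative usage sketch (not part of the original header, names are
+// hypothetical): the EnqueueArgs overloads above bundle the launch
+// configuration (queue, offset, global/local ranges, wait list) that a
+// kernel functor consumes.
+//
+//   cl::EnqueueArgs simple(cl::NDRange(1024));                      // default queue, global size only
+//   cl::EnqueueArgs sized (cl::NDRange(1024), cl::NDRange(64));     // global + local work size
+//   cl::EnqueueArgs queued(myQueue, waitEvent, cl::NDRange(1024));  // explicit queue and dependency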
+
+namespace detail {
+
+class NullType {};
+
+template<int index, typename T0>
+struct SetArg
+{
+    static void set (Kernel kernel, T0 arg)
+    {
+        kernel.setArg(index, arg);
+    }
+};  
+
+template<int index>
+struct SetArg<index, NullType>
+{
+    static void set (Kernel, NullType)
+    { 
+    }
+};
+
+template <
+   typename T0,   typename T1,   typename T2,   typename T3,
+   typename T4,   typename T5,   typename T6,   typename T7,
+   typename T8,   typename T9,   typename T10,   typename T11,
+   typename T12,   typename T13,   typename T14,   typename T15,
+   typename T16,   typename T17,   typename T18,   typename T19,
+   typename T20,   typename T21,   typename T22,   typename T23,
+   typename T24,   typename T25,   typename T26,   typename T27,
+   typename T28,   typename T29,   typename T30,   typename T31
+>
+class KernelFunctorGlobal
+{
+private:
+    Kernel kernel_;
+
+public:
+   KernelFunctorGlobal(
+        Kernel kernel) :
+            kernel_(kernel)
+    {}
+
+   KernelFunctorGlobal(
+        const Program& program,
+        const STRING_CLASS name,
+        cl_int * err = NULL) :
+            kernel_(program, name.c_str(), err)
+    {}
+
+    Event operator() (
+        const EnqueueArgs& args,
+        T0 t0,
+        T1 t1 = NullType(),
+        T2 t2 = NullType(),
+        T3 t3 = NullType(),
+        T4 t4 = NullType(),
+        T5 t5 = NullType(),
+        T6 t6 = NullType(),
+        T7 t7 = NullType(),
+        T8 t8 = NullType(),
+        T9 t9 = NullType(),
+        T10 t10 = NullType(),
+        T11 t11 = NullType(),
+        T12 t12 = NullType(),
+        T13 t13 = NullType(),
+        T14 t14 = NullType(),
+        T15 t15 = NullType(),
+        T16 t16 = NullType(),
+        T17 t17 = NullType(),
+        T18 t18 = NullType(),
+        T19 t19 = NullType(),
+        T20 t20 = NullType(),
+        T21 t21 = NullType(),
+        T22 t22 = NullType(),
+        T23 t23 = NullType(),
+        T24 t24 = NullType(),
+        T25 t25 = NullType(),
+        T26 t26 = NullType(),
+        T27 t27 = NullType(),
+        T28 t28 = NullType(),
+        T29 t29 = NullType(),
+        T30 t30 = NullType(),
+        T31 t31 = NullType()
+        )
+    {
+        Event event;
+        SetArg<0, T0>::set(kernel_, t0);
+        SetArg<1, T1>::set(kernel_, t1);
+        SetArg<2, T2>::set(kernel_, t2);
+        SetArg<3, T3>::set(kernel_, t3);
+        SetArg<4, T4>::set(kernel_, t4);
+        SetArg<5, T5>::set(kernel_, t5);
+        SetArg<6, T6>::set(kernel_, t6);
+        SetArg<7, T7>::set(kernel_, t7);
+        SetArg<8, T8>::set(kernel_, t8);
+        SetArg<9, T9>::set(kernel_, t9);
+        SetArg<10, T10>::set(kernel_, t10);
+        SetArg<11, T11>::set(kernel_, t11);
+        SetArg<12, T12>::set(kernel_, t12);
+        SetArg<13, T13>::set(kernel_, t13);
+        SetArg<14, T14>::set(kernel_, t14);
+        SetArg<15, T15>::set(kernel_, t15);
+        SetArg<16, T16>::set(kernel_, t16);
+        SetArg<17, T17>::set(kernel_, t17);
+        SetArg<18, T18>::set(kernel_, t18);
+        SetArg<19, T19>::set(kernel_, t19);
+        SetArg<20, T20>::set(kernel_, t20);
+        SetArg<21, T21>::set(kernel_, t21);
+        SetArg<22, T22>::set(kernel_, t22);
+        SetArg<23, T23>::set(kernel_, t23);
+        SetArg<24, T24>::set(kernel_, t24);
+        SetArg<25, T25>::set(kernel_, t25);
+        SetArg<26, T26>::set(kernel_, t26);
+        SetArg<27, T27>::set(kernel_, t27);
+        SetArg<28, T28>::set(kernel_, t28);
+        SetArg<29, T29>::set(kernel_, t29);
+        SetArg<30, T30>::set(kernel_, t30);
+        SetArg<31, T31>::set(kernel_, t31);
+        
+        args.queue_.enqueueNDRangeKernel(
+            kernel_,
+            args.offset_,
+            args.global_,
+            args.local_,
+            &args.events_,
+            &event);
+        
+        return event;
+    }
+
+};
+
+//------------------------------------------------------------------------------------------------------
+
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27,
+	typename T28,
+	typename T29,
+	typename T30,
+	typename T31>
+struct functionImplementation_
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		T30,
+		T31> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		T30,
+		T31);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27,
+		T28 arg28,
+		T29 arg29,
+		T30 arg30,
+		T31 arg31)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27,
+			arg28,
+			arg29,
+			arg30,
+			arg31);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27,
+	typename T28,
+	typename T29,
+	typename T30>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	T27,
+	T28,
+	T29,
+	T30,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		T30,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		T30);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27,
+		T28 arg28,
+		T29 arg29,
+		T30 arg30)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27,
+			arg28,
+			arg29,
+			arg30);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27,
+	typename T28,
+	typename T29>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	T27,
+	T28,
+	T29,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		T29);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27,
+		T28 arg28,
+		T29 arg29)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27,
+			arg28,
+			arg29);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27,
+	typename T28>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	T27,
+	T28,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		T28);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27,
+		T28 arg28)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27,
+			arg28);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26,
+	typename T27>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	T27,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		T27);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26,
+		T27 arg27)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26,
+			arg27);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25,
+	typename T26>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	T26,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		T26);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25,
+		T26 arg26)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25,
+			arg26);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24,
+	typename T25>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	T25,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		T25);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24,
+		T25 arg25)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24,
+			arg25);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23,
+	typename T24>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	T24,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		T24);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23,
+		T24 arg24)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23,
+			arg24);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22,
+	typename T23>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	T23,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		T23);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22,
+		T23 arg23)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22,
+			arg23);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21,
+	typename T22>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	T22,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		T22);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21,
+		T22 arg22)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21,
+			arg22);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20,
+	typename T21>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	T21,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		T21);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20,
+		T21 arg21)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20,
+			arg21);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19,
+	typename T20>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	T20,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		T20);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19,
+		T20 arg20)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19,
+			arg20);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18,
+	typename T19>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	T19,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		T19);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18,
+		T19 arg19)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18,
+			arg19);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17,
+	typename T18>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	T18,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		T18);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17,
+		T18 arg18)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17,
+			arg18);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16,
+	typename T17>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	T17,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		T17);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16,
+		T17 arg17)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16,
+			arg17);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15,
+	typename T16>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	T16,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		T16);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15,
+		T16 arg16)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15,
+			arg16);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14,
+	typename T15>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	T15,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		T15);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14,
+		T15 arg15)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14,
+			arg15);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13,
+	typename T14>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	T14,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		T14);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13,
+		T14 arg14)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13,
+			arg14);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12,
+	typename T13>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	T13,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		T13);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12,
+		T13 arg13)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12,
+			arg13);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11,
+	typename T12>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	T12,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		T12);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11,
+		T12 arg12)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11,
+			arg12);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10,
+	typename T11>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	T11,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		T11);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10,
+		T11 arg11)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10,
+			arg11);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9,
+	typename T10>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	T10,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		T10);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9,
+		T10 arg10)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9,
+			arg10);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8,
+	typename T9>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	T9,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		T9);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8,
+		T9 arg9)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8,
+			arg9);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7,
+	typename T8>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	T8,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		T8);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7,
+		T8 arg8)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7,
+			arg8);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6,
+	typename T7>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	T7,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		T7);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6,
+		T7 arg7)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6,
+			arg7);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5,
+	typename T6>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	T6,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		T6);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5,
+		T6 arg6)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5,
+			arg6);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4,
+	typename T5>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	T5,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		T5);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4,
+		T5 arg5)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4,
+			arg5);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3,
+	typename T4>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	T4,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		T4,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3,
+		T4);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3,
+		T4 arg4)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3,
+			arg4);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2,
+	typename T3>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	T3,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		T3,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2,
+		T3);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2,
+		T3 arg3)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2,
+			arg3);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1,
+	typename T2>
+struct functionImplementation_
+<	T0,
+	T1,
+	T2,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		T2,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1,
+		T2);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1,
+		T2 arg2)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1,
+			arg2);
+	}
+
+
+};
+
+template<
+	typename T0,
+	typename T1>
+struct functionImplementation_
+<	T0,
+	T1,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		T1,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0,
+		T1);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0,
+		T1 arg1)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0,
+			arg1);
+	}
+
+
+};
+
+template<
+	typename T0>
+struct functionImplementation_
+<	T0,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType,
+	NullType>
+{
+	typedef detail::KernelFunctorGlobal<
+		T0,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType,
+		NullType> FunctorType;
+
+    FunctorType functor_;
+
+    functionImplementation_(const FunctorType &functor) :
+        functor_(functor)
+    {
+    
+        #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1))
+        // Fail variadic expansion for dev11
+        static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it.");
+        #endif
+            
+    }
+
+	//! \brief Return type of the functor
+	typedef Event result_type;
+
+	//! \brief Function signature of kernel functor with no event dependency.
+	typedef Event type_(
+		const EnqueueArgs&,
+		T0);
+
+	Event operator()(
+		const EnqueueArgs& enqueueArgs,
+		T0 arg0)
+	{
+		return functor_(
+			enqueueArgs,
+			arg0);
+	}
+
+
+};
+
+
+
+
+
+} // namespace detail
+
+//----------------------------------------------------------------------------------------------
+
+template <
+   typename T0,   typename T1 = detail::NullType,   typename T2 = detail::NullType,
+   typename T3 = detail::NullType,   typename T4 = detail::NullType,
+   typename T5 = detail::NullType,   typename T6 = detail::NullType,
+   typename T7 = detail::NullType,   typename T8 = detail::NullType,
+   typename T9 = detail::NullType,   typename T10 = detail::NullType,
+   typename T11 = detail::NullType,   typename T12 = detail::NullType,
+   typename T13 = detail::NullType,   typename T14 = detail::NullType,
+   typename T15 = detail::NullType,   typename T16 = detail::NullType,
+   typename T17 = detail::NullType,   typename T18 = detail::NullType,
+   typename T19 = detail::NullType,   typename T20 = detail::NullType,
+   typename T21 = detail::NullType,   typename T22 = detail::NullType,
+   typename T23 = detail::NullType,   typename T24 = detail::NullType,
+   typename T25 = detail::NullType,   typename T26 = detail::NullType,
+   typename T27 = detail::NullType,   typename T28 = detail::NullType,
+   typename T29 = detail::NullType,   typename T30 = detail::NullType,
+   typename T31 = detail::NullType
+>
+struct make_kernel :
+    public detail::functionImplementation_<
+               T0,   T1,   T2,   T3,
+               T4,   T5,   T6,   T7,
+               T8,   T9,   T10,   T11,
+               T12,   T13,   T14,   T15,
+               T16,   T17,   T18,   T19,
+               T20,   T21,   T22,   T23,
+               T24,   T25,   T26,   T27,
+               T28,   T29,   T30,   T31
+    >
+{
+public:
+	typedef detail::KernelFunctorGlobal<             
+		       T0,   T1,   T2,   T3,
+               T4,   T5,   T6,   T7,
+               T8,   T9,   T10,   T11,
+               T12,   T13,   T14,   T15,
+               T16,   T17,   T18,   T19,
+               T20,   T21,   T22,   T23,
+               T24,   T25,   T26,   T27,
+               T28,   T29,   T30,   T31
+    > FunctorType;
+
+    make_kernel(
+        const Program& program,
+        const STRING_CLASS name,
+        cl_int * err = NULL) :
+           detail::functionImplementation_<
+                    T0,   T1,   T2,   T3,
+                       T4,   T5,   T6,   T7,
+                       T8,   T9,   T10,   T11,
+                       T12,   T13,   T14,   T15,
+                       T16,   T17,   T18,   T19,
+                       T20,   T21,   T22,   T23,
+                       T24,   T25,   T26,   T27,
+                       T28,   T29,   T30,   T31
+           >(
+            FunctorType(program, name, err)) 
+    {}
+
+    make_kernel(
+        const Kernel kernel) :
+           detail::functionImplementation_<
+                    T0,   T1,   T2,   T3,
+                       T4,   T5,   T6,   T7,
+                       T8,   T9,   T10,   T11,
+                       T12,   T13,   T14,   T15,
+                       T16,   T17,   T18,   T19,
+                       T20,   T21,   T22,   T23,
+                       T24,   T25,   T26,   T27,
+                       T28,   T29,   T30,   T31
+           >(
+            FunctorType(kernel)) 
+    {}    
+};
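The dozens of functionImplementation_ specializations above are the pre-C++11 substitute for variadic templates: make_kernel pads its 32 type parameters with detail::NullType and dispatches to whichever specialization matches the real argument count. A minimal usage sketch, assuming a built cl::Program containing a kernel named "vadd" whose signature matches the template arguments, plus an existing cl::CommandQueue queue, cl::Buffer objects, and a size n (all of these names are illustrative, not part of the header):

// Bind the kernel once; operator() then enqueues it like a function call.
cl::make_kernel<cl::Buffer, cl::Buffer, cl::Buffer, int> vadd(program, "vadd");

cl::Event done = vadd(
    cl::EnqueueArgs(queue, cl::NDRange(n)),  // launch configuration
    d_a, d_b, d_out,                         // cl::Buffer kernel arguments
    static_cast<int>(n));                    // scalar kernel argument
done.wait();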
+
+
+//----------------------------------------------------------------------------------------------------------------------
+
+#undef __ERR_STR
+#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS)
+#undef __GET_DEVICE_INFO_ERR
+#undef __GET_PLATFORM_INFO_ERR
+#undef __GET_DEVICE_IDS_ERR
+#undef __GET_CONTEXT_INFO_ERR
+#undef __GET_EVENT_INFO_ERR
+#undef __GET_EVENT_PROFILE_INFO_ERR
+#undef __GET_MEM_OBJECT_INFO_ERR
+#undef __GET_IMAGE_INFO_ERR
+#undef __GET_SAMPLER_INFO_ERR
+#undef __GET_KERNEL_INFO_ERR
+#undef __GET_KERNEL_ARG_INFO_ERR
+#undef __GET_KERNEL_WORK_GROUP_INFO_ERR
+#undef __GET_PROGRAM_INFO_ERR
+#undef __GET_PROGRAM_BUILD_INFO_ERR
+#undef __GET_COMMAND_QUEUE_INFO_ERR
+
+#undef __CREATE_CONTEXT_ERR
+#undef __CREATE_CONTEXT_FROM_TYPE_ERR
+#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR
+
+#undef __CREATE_BUFFER_ERR
+#undef __CREATE_SUBBUFFER_ERR
+#undef __CREATE_IMAGE2D_ERR
+#undef __CREATE_IMAGE3D_ERR
+#undef __CREATE_SAMPLER_ERR
+#undef __CREATE_SAMPLER_PROPERTY_ERR
+#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR
+
+#undef __CREATE_USER_EVENT_ERR
+#undef __SET_USER_EVENT_STATUS_ERR
+#undef __SET_EVENT_CALLBACK_ERR
+#undef __SET_PRINTF_CALLBACK_ERR
+
+#undef __WAIT_FOR_EVENTS_ERR
+
+#undef __CREATE_KERNEL_ERR
+#undef __SET_KERNEL_ARGS_ERR
+#undef __CREATE_PROGRAM_WITH_SOURCE_ERR
+#undef __CREATE_PROGRAM_WITH_BINARY_ERR
+#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR
+#undef __BUILD_PROGRAM_ERR
+#undef __CREATE_KERNELS_IN_PROGRAM_ERR
+
+#undef __CREATE_COMMAND_QUEUE_ERR
+#undef __CREATE_COMMAND_QUEUE_PROPERTY_ERR
+#undef __SET_COMMAND_QUEUE_PROPERTY_ERR
+#undef __ENQUEUE_READ_BUFFER_ERR
+#undef __ENQUEUE_WRITE_BUFFER_ERR
+#undef __ENQUEUE_READ_BUFFER_RECT_ERR
+#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR
+#undef __ENQEUE_COPY_BUFFER_ERR
+#undef __ENQEUE_COPY_BUFFER_RECT_ERR
+#undef __ENQUEUE_READ_IMAGE_ERR
+#undef __ENQUEUE_WRITE_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_ERR
+#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR
+#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR
+#undef __ENQUEUE_MAP_BUFFER_ERR
+#undef __ENQUEUE_MAP_IMAGE_ERR
+#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR
+#undef __ENQUEUE_NDRANGE_KERNEL_ERR
+#undef __ENQUEUE_TASK_ERR
+#undef __ENQUEUE_NATIVE_KERNEL
+
+#undef __CL_EXPLICIT_CONSTRUCTORS
+
+#undef __UNLOAD_COMPILER_ERR
+#endif //__CL_USER_OVERRIDE_ERROR_STRINGS
+
+#undef __CL_FUNCTION_TYPE
+
+// Extensions
+/**
+ * Deprecated APIs for 1.2
+ */
+#if defined(CL_VERSION_1_1)
+#undef __INIT_CL_EXT_FCN_PTR
+#endif // #if defined(CL_VERSION_1_1)
+#undef __CREATE_SUB_DEVICES
+
+#if defined(USE_CL_DEVICE_FISSION)
+#undef __PARAM_NAME_DEVICE_FISSION
+#endif // USE_CL_DEVICE_FISSION
+
+#undef __DEFAULT_NOT_INITIALIZED 
+#undef __DEFAULT_BEING_INITIALIZED 
+#undef __DEFAULT_INITIALIZED
+
+} // namespace cl
+
+#ifdef _WIN32
+#pragma pop_macro("max")
+#endif // _WIN32
+
+#endif // CL_HPP_
diff --git a/ext/cudart/include/CL/cl_d3d10.h b/ext/cudart/include/CL/cl_d3d10.h
new file mode 100644
index 0000000000000000000000000000000000000000..89f4bfba1f46d894289a885984c309f0f99f1165
--- /dev/null
+++ b/ext/cudart/include/CL/cl_d3d10.h
@@ -0,0 +1,129 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_D3D10_H
+#define __OPENCL_CL_D3D10_H
+
+#if defined(_MSC_VER)
+#if _MSC_VER >=1500
+#pragma warning( push )
+#pragma warning( disable : 4201 )
+#endif
+#endif
+#include <d3d10.h>
+#if defined(_MSC_VER)
+#if _MSC_VER >=1500
+#pragma warning( pop )
+#endif
+#endif
+
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_d3d10_sharing                                                       */
+#define cl_khr_d3d10_sharing 1
+
+typedef cl_uint cl_d3d10_device_source_khr;
+typedef cl_uint cl_d3d10_device_set_khr;
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_INVALID_D3D10_DEVICE_KHR                  -1002
+#define CL_INVALID_D3D10_RESOURCE_KHR                -1003
+#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR       -1004
+#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR           -1005
+
+/* cl_d3d10_device_source_nv */
+#define CL_D3D10_DEVICE_KHR                          0x4010
+#define CL_D3D10_DXGI_ADAPTER_KHR                    0x4011
+
+/* cl_d3d10_device_set_nv */
+#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR           0x4012
+#define CL_ALL_DEVICES_FOR_D3D10_KHR                 0x4013
+
+/* cl_context_info */
+#define CL_CONTEXT_D3D10_DEVICE_KHR                  0x4014
+#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
+
+/* cl_mem_info */
+#define CL_MEM_D3D10_RESOURCE_KHR                    0x4015
+
+/* cl_image_info */
+#define CL_IMAGE_D3D10_SUBRESOURCE_KHR               0x4016
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR         0x4017
+#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR         0x4018
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
+    cl_platform_id             platform,
+    cl_d3d10_device_source_khr d3d_device_source,
+    void *                     d3d_object,
+    cl_d3d10_device_set_khr    d3d_device_set,
+    cl_uint                    num_entries,
+    cl_device_id *             devices,
+    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
+    cl_context     context,
+    cl_mem_flags   flags,
+    ID3D10Buffer * resource,
+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D10Texture2D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D10Texture3D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_D3D10_H */
+
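The *_fn typedefs above only describe signatures: cl_khr_d3d10_sharing entry points are not exported by the ICD loader and have to be resolved at run time, typically through clGetExtensionFunctionAddressForPlatform. A minimal sketch, assuming a cl_platform_id platform that reports the extension and an existing ID3D10Device* d3d10_device (both names are placeholders):

/* Resolve the extension entry point by name, then query the OpenCL devices
   that can share resources with the given D3D10 device. */
clGetDeviceIDsFromD3D10KHR_fn pfnGetDeviceIDsFromD3D10 =
    (clGetDeviceIDsFromD3D10KHR_fn)clGetExtensionFunctionAddressForPlatform(
        platform, "clGetDeviceIDsFromD3D10KHR");

cl_device_id device = NULL;
cl_uint num_devices = 0;
if (pfnGetDeviceIDsFromD3D10 != NULL) {
    pfnGetDeviceIDsFromD3D10(platform,
                             CL_D3D10_DEVICE_KHR,                /* d3d_object is an ID3D10Device */
                             d3d10_device,
                             CL_PREFERRED_DEVICES_FOR_D3D10_KHR,
                             1, &device, &num_devices);
}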
diff --git a/ext/cudart/include/CL/cl_d3d10_ext.h b/ext/cudart/include/CL/cl_d3d10_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..acbc1b52d984df54ee0b76acf45f46e78c6e8dbf
--- /dev/null
+++ b/ext/cudart/include/CL/cl_d3d10_ext.h
@@ -0,0 +1,122 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2009 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+#ifndef __OPENCL_CL_D3D10_EXT_H
+#define __OPENCL_CL_D3D10_EXT_H
+
+#include <d3d10.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_nv_d3d10_sharing                                                        */
+
+typedef cl_uint cl_d3d10_device_source_nv;
+typedef cl_uint cl_d3d10_device_set_nv;
+
+/******************************************************************************/
+
+// Error Codes
+#define CL_INVALID_D3D10_DEVICE_NV             -1002
+#define CL_INVALID_D3D10_RESOURCE_NV           -1003
+#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_NV  -1004
+#define CL_D3D10_RESOURCE_NOT_ACQUIRED_NV      -1005
+
+// cl_d3d10_device_source_nv
+#define CL_D3D10_DEVICE_NV                     0x4010
+#define CL_D3D10_DXGI_ADAPTER_NV               0x4011
+
+// cl_d3d10_device_set_nv
+#define CL_PREFERRED_DEVICES_FOR_D3D10_NV      0x4012
+#define CL_ALL_DEVICES_FOR_D3D10_NV            0x4013
+
+// cl_context_info
+#define CL_CONTEXT_D3D10_DEVICE_NV             0x4014
+
+// cl_mem_info
+#define CL_MEM_D3D10_RESOURCE_NV               0x4015
+
+// cl_image_info
+#define CL_IMAGE_D3D10_SUBRESOURCE_NV          0x4016
+
+// cl_command_type
+#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_NV    0x4017
+#define CL_COMMAND_RELEASE_D3D10_OBJECTS_NV    0x4018
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10NV_fn)(
+    cl_platform_id            platform,
+    cl_d3d10_device_source_nv d3d_device_source,
+    void *                    d3d_object,
+    cl_d3d10_device_set_nv    d3d_device_set,
+    cl_uint                   num_entries, 
+    cl_device_id *            devices, 
+    cl_uint *                 num_devices) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferNV_fn)(
+    cl_context     context,
+    cl_mem_flags   flags,
+    ID3D10Buffer * resource,
+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DNV_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D10Texture2D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DNV_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D10Texture3D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsNV_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsNV_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    cl_mem *         mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // __OPENCL_CL_D3D10_EXT_H
+
diff --git a/ext/cudart/include/CL/cl_d3d11.h b/ext/cudart/include/CL/cl_d3d11.h
new file mode 100644
index 0000000000000000000000000000000000000000..10023dde0e3715d0f973eed3b7e6e0a397382fd2
--- /dev/null
+++ b/ext/cudart/include/CL/cl_d3d11.h
@@ -0,0 +1,128 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_D3D11_H
+#define __OPENCL_CL_D3D11_H
+
+#if defined(_MSC_VER)
+#if _MSC_VER >=1500
+#pragma warning( push )
+#pragma warning( disable : 4201 )
+#endif
+#endif
+#include <d3d11.h>
+#if defined(_MSC_VER)
+#if _MSC_VER >=1500
+#pragma warning( pop )
+#endif
+#endif
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_khr_d3d11_sharing                                                       */
+#define cl_khr_d3d11_sharing 1
+
+typedef cl_uint cl_d3d11_device_source_khr;
+typedef cl_uint cl_d3d11_device_set_khr;
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_INVALID_D3D11_DEVICE_KHR                  -1006
+#define CL_INVALID_D3D11_RESOURCE_KHR                -1007
+#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR       -1008
+#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR           -1009
+
+/* cl_d3d11_device_source */
+#define CL_D3D11_DEVICE_KHR                          0x4019
+#define CL_D3D11_DXGI_ADAPTER_KHR                    0x401A
+
+/* cl_d3d11_device_set */
+#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR           0x401B
+#define CL_ALL_DEVICES_FOR_D3D11_KHR                 0x401C
+
+/* cl_context_info */
+#define CL_CONTEXT_D3D11_DEVICE_KHR                  0x401D
+#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
+
+/* cl_mem_info */
+#define CL_MEM_D3D11_RESOURCE_KHR                    0x401E
+
+/* cl_image_info */
+#define CL_IMAGE_D3D11_SUBRESOURCE_KHR               0x401F
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR         0x4020
+#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR         0x4021
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
+    cl_platform_id             platform,
+    cl_d3d11_device_source_khr d3d_device_source,
+    void *                     d3d_object,
+    cl_d3d11_device_set_khr    d3d_device_set,
+    cl_uint                    num_entries,
+    cl_device_id *             devices,
+    cl_uint *                  num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
+    cl_context     context,
+    cl_mem_flags   flags,
+    ID3D11Buffer * resource,
+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D11Texture2D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D11Texture3D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_D3D11_H */
+
diff --git a/ext/cudart/include/CL/cl_d3d11_ext.h b/ext/cudart/include/CL/cl_d3d11_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..99b2c5bda5049a51d6bd32225308a081d1ed696b
--- /dev/null
+++ b/ext/cudart/include/CL/cl_d3d11_ext.h
@@ -0,0 +1,122 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2009 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+#ifndef __OPENCL_CL_D3D11_EXT_H
+#define __OPENCL_CL_D3D11_EXT_H
+
+#include <d3d11.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_nv_d3d11_sharing                                                        */
+
+typedef cl_uint cl_d3d11_device_source_nv;
+typedef cl_uint cl_d3d11_device_set_nv;
+
+/******************************************************************************/
+
+// Error Codes
+#define CL_INVALID_D3D11_DEVICE_NV             -1006
+#define CL_INVALID_D3D11_RESOURCE_NV           -1007
+#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_NV  -1008
+#define CL_D3D11_RESOURCE_NOT_ACQUIRED_NV      -1009
+
+// cl_d3d11_device_source_nv
+#define CL_D3D11_DEVICE_NV                     0x4019
+#define CL_D3D11_DXGI_ADAPTER_NV               0x401A
+
+// cl_d3d11_device_set_nv
+#define CL_PREFERRED_DEVICES_FOR_D3D11_NV      0x401B
+#define CL_ALL_DEVICES_FOR_D3D11_NV            0x401C
+
+// cl_context_info
+#define CL_CONTEXT_D3D11_DEVICE_NV             0x401D
+
+// cl_mem_info
+#define CL_MEM_D3D11_RESOURCE_NV               0x401E
+
+// cl_image_info
+#define CL_IMAGE_D3D11_SUBRESOURCE_NV          0x401F
+
+// cl_command_type
+#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_NV    0x4020
+#define CL_COMMAND_RELEASE_D3D11_OBJECTS_NV    0x4021
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11NV_fn)(
+    cl_platform_id            platform,
+    cl_d3d11_device_source_nv d3d_device_source,
+    void *                    d3d_object,
+    cl_d3d11_device_set_nv    d3d_device_set,
+    cl_uint                   num_entries, 
+    cl_device_id *            devices, 
+    cl_uint *                 num_devices) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferNV_fn)(
+    cl_context     context,
+    cl_mem_flags   flags,
+    ID3D11Buffer * resource,
+    cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DNV_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D11Texture2D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DNV_fn)(
+    cl_context        context,
+    cl_mem_flags      flags,
+    ID3D11Texture3D * resource,
+    UINT              subresource,
+    cl_int *          errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsNV_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsNV_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    cl_mem *         mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // __OPENCL_CL_D3D11_EXT_H
+
diff --git a/ext/cudart/include/CL/cl_d3d9_ext.h b/ext/cudart/include/CL/cl_d3d9_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab8d6cf047d0179f78dff78264d1133fce09774a
--- /dev/null
+++ b/ext/cudart/include/CL/cl_d3d9_ext.h
@@ -0,0 +1,143 @@
+/**********************************************************************************
+ * Copyright (c) 2008-2009 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ **********************************************************************************/
+
+#ifndef __OPENCL_CL_D3D9_EXT_H
+#define __OPENCL_CL_D3D9_EXT_H
+
+#include <d3d9.h>
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************
+ * cl_nv_d3d9_sharing                                                         */
+
+typedef cl_uint cl_d3d9_device_source_nv;
+typedef cl_uint cl_d3d9_device_set_nv;
+
+/******************************************************************************/
+
+// Error Codes
+#define CL_INVALID_D3D9_DEVICE_NV              -1010
+#define CL_INVALID_D3D9_RESOURCE_NV            -1011
+#define CL_D3D9_RESOURCE_ALREADY_ACQUIRED_NV   -1012
+#define CL_D3D9_RESOURCE_NOT_ACQUIRED_NV       -1013
+
+// cl_d3d9_device_source_nv
+#define CL_D3D9_DEVICE_NV                      0x4022
+#define CL_D3D9_ADAPTER_NAME_NV                0x4023
+
+// cl_d3d9_device_set_nv
+#define CL_PREFERRED_DEVICES_FOR_D3D9_NV       0x4024
+#define CL_ALL_DEVICES_FOR_D3D9_NV             0x4025
+
+// cl_context_info
+#define CL_CONTEXT_D3D9_DEVICE_NV              0x4026
+
+// cl_mem_info
+#define CL_MEM_D3D9_RESOURCE_NV                0x4027
+
+// cl_image_info
+#define CL_IMAGE_D3D9_FACE_NV                  0x4028
+#define CL_IMAGE_D3D9_LEVEL_NV                 0x4029
+
+// cl_command_type
+#define CL_COMMAND_ACQUIRE_D3D9_OBJECTS_NV     0x402A
+#define CL_COMMAND_RELEASE_D3D9_OBJECTS_NV     0x402B
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D9NV_fn)(
+    cl_platform_id            platform,
+    cl_d3d9_device_source_nv  d3d_device_source,
+    void *                    d3d_object,
+    cl_d3d9_device_set_nv     d3d_device_set,
+    cl_uint                   num_entries, 
+    cl_device_id *            devices, 
+    cl_uint *                 num_devices) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VertexBufferNV_fn)(
+    cl_context               context,
+    cl_mem_flags             flags,
+    IDirect3DVertexBuffer9 * resource,
+    cl_int *                 errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9IndexBufferNV_fn)(
+    cl_context              context,
+    cl_mem_flags            flags,
+    IDirect3DIndexBuffer9 * resource,
+    cl_int *                errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9SurfaceNV_fn)(
+    cl_context          context,
+    cl_mem_flags        flags,
+    IDirect3DSurface9 * resource,
+    cl_int *            errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9TextureNV_fn)(
+    cl_context         context,
+    cl_mem_flags       flags,
+    IDirect3DTexture9 *resource,
+    UINT               miplevel,
+    cl_int *           errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9CubeTextureNV_fn)(
+    cl_context              context,
+    cl_mem_flags            flags,
+    IDirect3DCubeTexture9 * resource,
+    D3DCUBEMAP_FACES        facetype,
+    UINT                    miplevel,
+    cl_int *                errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VolumeTextureNV_fn)(
+    cl_context                context,
+    cl_mem_flags              flags,
+    IDirect3DVolumeTexture9 * resource,
+    UINT                      miplevel,
+    cl_int *                  errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D9ObjectsNV_fn)(
+    cl_command_queue command_queue,
+    cl_uint num_objects,
+    const cl_mem *mem_objects,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D9ObjectsNV_fn)(
+    cl_command_queue command_queue,
+    cl_uint num_objects,
+    cl_mem *mem_objects,
+    cl_uint num_events_in_wait_list,
+    const cl_event *event_wait_list,
+    cl_event *event) CL_API_SUFFIX__VERSION_1_0;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // __OPENCL_CL_D3D9_EXT_H
+
diff --git a/ext/cudart/include/CL/cl_dx9_media_sharing.h b/ext/cudart/include/CL/cl_dx9_media_sharing.h
new file mode 100644
index 0000000000000000000000000000000000000000..048937005353a8fb8c4fee624d3f36b9c894fe3e
--- /dev/null
+++ b/ext/cudart/include/CL/cl_dx9_media_sharing.h
@@ -0,0 +1,118 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
+#define __OPENCL_CL_DX9_MEDIA_SHARING_H
+
+#include <CL/cl.h>
+#include <CL/cl_platform.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/******************************************************************************/
+/* cl_khr_dx9_media_sharing                                                   */
+#define cl_khr_dx9_media_sharing 1
+
+typedef cl_uint             cl_dx9_media_adapter_type_khr;
+typedef cl_uint             cl_dx9_media_adapter_set_khr;
+    
+#if defined(_WIN32)
+#include <d3d9.h>
+typedef struct _cl_dx9_surface_info_khr
+{
+    IDirect3DSurface9 *resource;
+    HANDLE shared_handle;
+} cl_dx9_surface_info_khr;
+#endif
+
+
+/******************************************************************************/
+
+/* Error Codes */
+#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR                -1010
+#define CL_INVALID_DX9_MEDIA_SURFACE_KHR                -1011
+#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR       -1012
+#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR           -1013
+
+/* cl_media_adapter_type_khr */
+#define CL_ADAPTER_D3D9_KHR                              0x2020
+#define CL_ADAPTER_D3D9EX_KHR                            0x2021
+#define CL_ADAPTER_DXVA_KHR                              0x2022
+
+/* cl_media_adapter_set_khr */
+#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR   0x2023
+#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR         0x2024
+
+/* cl_context_info */
+#define CL_CONTEXT_ADAPTER_D3D9_KHR                      0x2025
+#define CL_CONTEXT_ADAPTER_D3D9EX_KHR                    0x2026
+#define CL_CONTEXT_ADAPTER_DXVA_KHR                      0x2027
+
+/* cl_mem_info */
+#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR                0x2028
+#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR                0x2029
+
+/* cl_image_info */
+#define CL_IMAGE_DX9_MEDIA_PLANE_KHR                     0x202A
+
+/* cl_command_type */
+#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR        0x202B
+#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR        0x202C
+
+/******************************************************************************/
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
+    cl_platform_id                   platform,
+    cl_uint                          num_media_adapters,
+    cl_dx9_media_adapter_type_khr *  media_adapter_type,
+    void *                           media_adapters,
+    cl_dx9_media_adapter_set_khr     media_adapter_set,
+    cl_uint                          num_entries,
+    cl_device_id *                   devices,
+    cl_uint *                        num_devices) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
+    cl_context                    context,
+    cl_mem_flags                  flags,
+    cl_dx9_media_adapter_type_khr adapter_type,
+    void *                        surface_info,
+    cl_uint                       plane,                                                                          
+    cl_int *                      errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event) CL_API_SUFFIX__VERSION_1_2;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
+
diff --git a/ext/cudart/include/CL/cl_egl.h b/ext/cudart/include/CL/cl_egl.h
new file mode 100644
index 0000000000000000000000000000000000000000..6fb3721f12533925ba6ca3737eb48bd864aae444
--- /dev/null
+++ b/ext/cudart/include/CL/cl_egl.h
@@ -0,0 +1,123 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_EGL_H
+#define __OPENCL_CL_EGL_H
+
+#ifdef __APPLE__
+#else
+#include <CL/cl.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
+#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR  0x202F
+#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR    0x202D
+#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR    0x202E
+
+/* Error type for clCreateFromEGLImageKHR */
+#define CL_INVALID_EGL_OBJECT_KHR             -1093
+#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR      -1092
+
+/* CLeglImageKHR is an opaque handle to an EGLImage */
+typedef void* CLeglImageKHR;
+
+/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
+typedef void* CLeglDisplayKHR;
+
+/* CLeglSyncKHR is an opaque handle to an EGLSync object */
+typedef void* CLeglSyncKHR;
+
+/* properties passed to clCreateFromEGLImageKHR */
+typedef intptr_t cl_egl_image_properties_khr;
+
+
+#define cl_khr_egl_image 1
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromEGLImageKHR(cl_context                  context,
+                        CLeglDisplayKHR             egldisplay,
+                        CLeglImageKHR               eglimage,
+                        cl_mem_flags                flags,
+                        const cl_egl_image_properties_khr * properties,
+                        cl_int *                    errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
+    cl_context                  context,
+    CLeglDisplayKHR             egldisplay,
+    CLeglImageKHR               eglimage,
+    cl_mem_flags                flags,
+    const cl_egl_image_properties_khr * properties,
+    cl_int *                    errcode_ret);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue,
+                              cl_uint          num_objects,
+                              const cl_mem *   mem_objects,
+                              cl_uint          num_events_in_wait_list,
+                              const cl_event * event_wait_list,
+                              cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event);
+
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue,
+                              cl_uint          num_objects,
+                              const cl_mem *   mem_objects,
+                              cl_uint          num_events_in_wait_list,
+                              const cl_event * event_wait_list,
+                              cl_event *       event) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
+    cl_command_queue command_queue,
+    cl_uint          num_objects,
+    const cl_mem *   mem_objects,
+    cl_uint          num_events_in_wait_list,
+    const cl_event * event_wait_list,
+    cl_event *       event);
+
+
+#define cl_khr_egl_event 1
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromEGLSyncKHR(cl_context      context,
+                            CLeglSyncKHR    sync,
+                            CLeglDisplayKHR display,
+                            cl_int *        errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
+    cl_context      context,
+    CLeglSyncKHR    sync,
+    CLeglDisplayKHR display,
+    cl_int *        errcode_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __OPENCL_CL_EGL_H */
diff --git a/ext/cudart/include/CL/cl_ext.h b/ext/cudart/include/CL/cl_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..b8b26159e64945dc54c0457c7d6acd92487bd325
--- /dev/null
+++ b/ext/cudart/include/CL/cl_ext.h
@@ -0,0 +1,1131 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+
+/* cl_ext.h contains OpenCL extensions which don't have external */
+/* (OpenGL, D3D) dependencies.                                   */
+
+#ifndef __CL_EXT_H
+#define __CL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+    #include <OpenCL/cl.h>
+    #include <AvailabilityMacros.h>
+#else
+    #include <CL/cl.h>
+#endif
+
+/* cl_khr_fp64 extension - no extension #define since it has no functions  */
+/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 1.2 */
+#define CL_DEVICE_DOUBLE_FP_CONFIG                  0x1032
+
+/* cl_khr_fp16 extension - no extension #define since it has no functions  */
+#define CL_DEVICE_HALF_FP_CONFIG                    0x1033
+
+/* Memory object destruction
+ *
+ * Apple extension for managing externally allocated buffers used with cl_mem objects created with CL_MEM_USE_HOST_PTR
+ *
+ * Registers a user callback function that will be called when the memory object is deleted and its resources
+ * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
+ * stack associated with memobj. The registered user callback functions are called in the reverse order in
+ * which they were registered. The user callback functions are called and then the memory object is deleted
+ * and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
+ * notified when the memory referenced by host_ptr, specified when the memory object is created and used as
+ * the storage bits for the memory object, can be reused or freed.
+ *
+ * The application may not call CL APIs with the cl_mem object passed to the pfn_notify callback.
+ *
+ * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ */
+#define cl_APPLE_SetMemObjectDestructor 1
+cl_int  CL_API_ENTRY clSetMemObjectDestructorAPPLE(  cl_mem memobj,
+                                        void (* pfn_notify)(cl_mem memobj, void * user_data),
+                                        void * user_data)             CL_EXT_SUFFIX__VERSION_1_0;
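
A minimal usage sketch of the destructor callback declared above, assuming a buffer created with CL_MEM_USE_HOST_PTR over a malloc'd allocation; the helper names are illustrative, not part of the header.

#include <stdlib.h>
#include <CL/cl_ext.h>   /* declares clSetMemObjectDestructorAPPLE */

/* Called (in reverse registration order) just before the cl_mem is destroyed,
 * so the externally allocated host_ptr can finally be released. */
static void free_host_ptr(cl_mem memobj, void *user_data)
{
    (void)memobj;
    free(user_data);
}

static cl_mem wrap_host_buffer(cl_context ctx, size_t size, cl_int *err)
{
    void *host_ptr = malloc(size);
    cl_mem buf = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR | CL_MEM_READ_WRITE,
                                size, host_ptr, err);
    /* Tie the lifetime of host_ptr to the cl_mem object. */
    clSetMemObjectDestructorAPPLE(buf, free_host_ptr, host_ptr);
    return buf;
}
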
+
+
+/* Context Logging Functions
+ *
+ * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
+ * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
+ * before using.
+ *
+ * clLogMessagesToSystemLog forwards all log messages to the Apple System Logger.
+ */
+#define cl_APPLE_ContextLoggingFunctions 1
+extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE(  const char * errstr,
+                                            const void * private_info,
+                                            size_t       cb,
+                                            void *       user_data)  CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
+extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE(   const char * errstr,
+                                          const void * private_info,
+                                          size_t       cb,
+                                          void *       user_data)    CL_EXT_SUFFIX__VERSION_1_0;
+
+/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
+extern void CL_API_ENTRY clLogMessagesToStderrAPPLE(   const char * errstr,
+                                          const void * private_info,
+                                          size_t       cb,
+                                          void *       user_data)    CL_EXT_SUFFIX__VERSION_1_0;
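
A short sketch of the intended use described above: plugging one of the Apple logging callbacks into clCreateContext (Apple-only; the device is assumed to come from clGetDeviceIDs).

#include <CL/cl_ext.h>

/* Route context error messages to stderr via the Apple logging extension. */
static cl_context create_logging_context(cl_device_id device)
{
    cl_int err;
    return clCreateContext(NULL, 1, &device,
                           clLogMessagesToStderrAPPLE, /* pfn_notify */
                           NULL,                       /* user_data  */
                           &err);
}
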
+
+
+/************************
+* cl_khr_icd extension *
+************************/
+#define cl_khr_icd 1
+
+/* cl_platform_info                                                        */
+#define CL_PLATFORM_ICD_SUFFIX_KHR                  0x0920
+
+/* Additional Error Codes                                                  */
+#define CL_PLATFORM_NOT_FOUND_KHR                   -1001
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clIcdGetPlatformIDsKHR(cl_uint          num_entries,
+                       cl_platform_id * platforms,
+                       cl_uint *        num_platforms);
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint          num_entries,
+                                         cl_platform_id * platforms,
+                                         cl_uint *        num_platforms);
+
+
+/*******************************
+ * cl_khr_il_program extension *
+ *******************************/
+#define cl_khr_il_program 1
+
+/* New property to clGetDeviceInfo for retrieving supported intermediate
+ * languages
+ */
+#define CL_DEVICE_IL_VERSION_KHR                    0x105B
+
+/* New property to clGetProgramInfo for retrieving the IL of a
+ * program
+ */
+#define CL_PROGRAM_IL_KHR                           0x1169
+
+extern CL_API_ENTRY cl_program CL_API_CALL
+clCreateProgramWithILKHR(cl_context   context,
+                         const void * il,
+                         size_t       length,
+                         cl_int *     errcode_ret);
+
+typedef CL_API_ENTRY cl_program
+(CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context   context,
+                                           const void * il,
+                                           size_t       length,
+                                           cl_int *     errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
+
+/* Extension: cl_khr_image2d_from_buffer
+ *
+ * This extension allows a 2D image to be created from a cl_mem buffer without
+ * a copy. The type associated with a 2D image created from a buffer in an
+ * OpenCL program is image2d_t. Both the sampler and sampler-less read_image
+ * built-in functions are supported for 2D images and 2D images created from
+ * a buffer.  Similarly, the write_image built-ins are also supported for 2D
+ * images created from a buffer.
+ *
+ * When the 2D image from buffer is created, the client must specify the
+ * width, height, image format (i.e. channel order and channel data type)
+ * and optionally the row pitch.
+ *
+ * The pitch specified must be a multiple of
+ * CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels.
+ * The base address of the buffer must be aligned to
+ * CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels.
+ */
+
+#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR              0x104A
+#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR       0x104B
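
A hedged sketch of the flow the cl_khr_image2d_from_buffer comment describes: query the pitch alignment, round the row pitch up, and create the zero-copy image view with clCreateImage. The buffer, the RGBA8 format and the sizes are assumptions.

#include <CL/cl_ext.h>

static cl_mem image2d_from_buffer(cl_context ctx, cl_device_id device,
                                  cl_mem buf, size_t width, size_t height)
{
    /* Pitch alignment is reported in pixels; the buffer's base address must
     * additionally satisfy CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR. */
    cl_uint pitch_align = 1;
    clGetDeviceInfo(device, CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR,
                    sizeof(pitch_align), &pitch_align, NULL);

    const size_t bytes_per_pixel = 4; /* CL_RGBA / CL_UNORM_INT8 */
    size_t pitch_pixels = (width + pitch_align - 1) / pitch_align * pitch_align;

    cl_image_format fmt  = { CL_RGBA, CL_UNORM_INT8 };
    cl_image_desc   desc = { 0 };
    desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
    desc.image_width     = width;
    desc.image_height    = height;
    desc.image_row_pitch = pitch_pixels * bytes_per_pixel;
    desc.buffer          = buf; /* no copy: the image aliases the buffer */

    cl_int err;
    return clCreateImage(ctx, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
}
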
+
+
+/**************************************
+ * cl_khr_initialize_memory extension *
+ **************************************/
+
+#define CL_CONTEXT_MEMORY_INITIALIZE_KHR            0x2030
+
+
+/**************************************
+ * cl_khr_terminate_context extension *
+ **************************************/
+
+#define CL_CONTEXT_TERMINATED_KHR                   -1121
+
+#define CL_DEVICE_TERMINATE_CAPABILITY_KHR          0x2031
+#define CL_CONTEXT_TERMINATE_KHR                    0x2032
+
+#define cl_khr_terminate_context 1
+extern CL_API_ENTRY cl_int CL_API_CALL
+clTerminateContextKHR(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
+
+
+/*
+ * Extension: cl_khr_spir
+ *
+ * This extension adds support to create an OpenCL program object from a
+ * Standard Portable Intermediate Representation (SPIR) instance
+ */
+
+#define CL_DEVICE_SPIR_VERSIONS                     0x40E0
+#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE         0x40E1
+
+
+/*****************************************
+ * cl_khr_create_command_queue extension *
+ *****************************************/
+#define cl_khr_create_command_queue 1
+
+typedef cl_properties cl_queue_properties_khr;
+
+extern CL_API_ENTRY cl_command_queue CL_API_CALL
+clCreateCommandQueueWithPropertiesKHR(cl_context context,
+                                      cl_device_id device,
+                                      const cl_queue_properties_khr* properties,
+                                      cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
+
+typedef CL_API_ENTRY cl_command_queue
+(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context,
+                                                        cl_device_id device,
+                                                        const cl_queue_properties_khr* properties,
+                                                        cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
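
Assuming the usual zero-terminated property-list convention (and the core CL_QUEUE_PROPERTIES / CL_QUEUE_PROFILING_ENABLE tokens from CL/cl.h), a profiling queue could be created through the KHR entry point roughly like this:

#include <CL/cl_ext.h>

static cl_command_queue make_profiling_queue(cl_context ctx, cl_device_id dev)
{
    cl_int err;
    const cl_queue_properties_khr props[] = {
        CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE,
        0 /* terminator */
    };
    return clCreateCommandQueueWithPropertiesKHR(ctx, dev, props, &err);
}
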
+
+
+/******************************************
+* cl_khr_semaphore extension *
+******************************************/
+
+typedef enum _cl_semaphore_type__enum {
+    CL_SEMAPHORE_TYPE_BINARY_KHR       = 1,
+} cl_semaphore_type;
+
+typedef cl_properties cl_semaphore_properties_khr;
+
+typedef cl_uint cl_semaphore_info_khr;
+
+typedef struct _cl_semaphore* cl_semaphore_khr;
+typedef cl_ulong cl_semaphore_payload_khr;
+
+extern CL_API_ENTRY cl_semaphore_khr CL_API_CALL
+clCreateSemaphoreWithPropertiesKHR(cl_context context,
+                                   cl_semaphore_properties_khr *sema_props,
+                                   cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueWaitSemaphoresKHR(cl_command_queue command_queue,
+                              cl_uint num_sema_objects,
+                              const cl_semaphore_khr *sema_objects,
+                              const cl_semaphore_payload_khr *sema_payload_list,
+                              cl_uint num_events_in_wait_list,
+                              const cl_event *event_wait_list,
+                              cl_event *event) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSignalSemaphoresKHR(cl_command_queue command_queue,
+                             cl_uint num_sema_objects,
+                             const cl_semaphore_khr *sema_objects,
+                             const cl_semaphore_payload_khr *sema_payload_list,
+                             cl_uint num_events_in_wait_list,
+                             const cl_event *event_wait_list,
+                             cl_event *event) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSemaphoreInfoKHR(const cl_semaphore_khr sema_object,
+                      cl_semaphore_info_khr param_name,
+                      size_t param_value_size,
+                      void *param_value,
+                      size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseSemaphoreKHR(cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainSemaphoreKHR(cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseSemaphoreObjectKHR(cl_semaphore_khr sema_object)  CL_API_SUFFIX__VERSION_1_2;
+
+#define CL_COMMAND_SEMAPHORE_WAIT_KHR               0x2042
+#define CL_COMMAND_SEMAPHORE_SIGNAL_KHR             0x2043
+#define CL_SEMAPHORE_CONTEXT_KHR                    0x2039
+#define CL_SEMAPHORE_REFERENCE_COUNT_KHR            0x203A
+#define CL_SEMAPHORE_PROPERTIES_KHR                 0x203B
+#define CL_SEMAPHORE_TYPE_KHR                       0x203D
+#define CL_PLATFORM_SEMAPHORE_TYPES_KHR             0x2036
+#define CL_SEMAPHORE_PAYLOAD_KHR                    0x203C
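
A sketch of the binary-semaphore flow declared above, assuming the key/value property-list convention (CL_SEMAPHORE_TYPE_KHR followed by its value, then a 0 terminator) and that payload lists may be passed as NULL for binary semaphores:

#include <CL/cl_ext.h>

static void signal_then_wait(cl_context ctx, cl_command_queue queue)
{
    cl_int err;
    cl_semaphore_properties_khr props[] = {
        (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_KHR,
        (cl_semaphore_properties_khr)CL_SEMAPHORE_TYPE_BINARY_KHR,
        0 /* terminator */
    };
    cl_semaphore_khr sema = clCreateSemaphoreWithPropertiesKHR(ctx, props, &err);

    /* Typically the signal and the wait would be enqueued on different queues. */
    clEnqueueSignalSemaphoresKHR(queue, 1, &sema, NULL, 0, NULL, NULL);
    clEnqueueWaitSemaphoresKHR(queue, 1, &sema, NULL, 0, NULL, NULL);

    clReleaseSemaphoreKHR(sema);
}
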
+
+/******************************************
+* cl_khr_external_semaphore extension *
+******************************************/
+
+typedef enum _cl_external_context_type_enum {
+    CL_EXTERNAL_CONTEXT_TYPE_NONE       = 0,
+    CL_EXTERNAL_CONTEXT_TYPE_CL         = 1,
+    CL_EXTERNAL_CONTEXT_TYPE_VULKAN     = 2,
+} cl_external_context_type_khr;
+
+typedef cl_uint cl_external_semaphore_handle_type_khr;
+// API-agnostic semaphore handles are defined here in this spec.
+#define CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR                      0x2055
+#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR                   0x2056
+#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR               0x2057
+
+typedef struct _cl_semaphore_desc_khr_st {
+    cl_external_semaphore_handle_type_khr type;
+    void *handle_ptr;
+} cl_semaphore_desc_khr;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetSemaphoreHandleForTypeKHR(const cl_semaphore_khr                sema_object,
+                               const cl_device_id                    device,
+                               cl_external_semaphore_handle_type_khr handle_type,
+                               size_t                                handle_size,
+                               void                                  *handle_ptr,
+                               size_t                                *handle_size_ret) CL_API_SUFFIX__VERSION_1_2;
+
+#define CL_SEMAPHORE_DESC_KHR                                   0x2460
+#define CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR                    0x203F
+
+#define CL_PLATFORM_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR           0x2037
+#define CL_PLATFORM_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR           0x2038
+
+#define CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR                         0x204D
+#define CL_DEVICE_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR                         0x204E
+
+// error codes
+#define CL_INVALID_SEMAPHORE_KHR                                -1142
+
+/******************************************
+* cl_khr_external_memory extension *
+******************************************/
+
+typedef cl_uint cl_external_context_info;
+
+typedef enum _cl_external_context_type_enum cl_external_context_type_khr;
+
+typedef cl_properties cl_mem_properties_khr;
+
+typedef cl_uint cl_external_mem_handle_type_khr;
+// API-agnostic memory handles are defined here in this spec.
+#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR            0x2060
+#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR         0x2061
+#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR     0x2062
+
+typedef struct _cl_external_mem_desc_khr_st {
+    cl_external_mem_handle_type_khr type;
+    void *handle_ptr;
+    size_t offset;
+    unsigned long long size;
+} cl_external_mem_desc_khr;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetExternalContextInfoKHR(const cl_context_properties  *properties,
+                            cl_external_context_info  param_name,   
+                            size_t  param_value_size,
+                            void  *param_value,
+                            size_t  *param_value_size_ret) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireExternalMemObjectsKHR(cl_command_queue command_queue, 
+                                      cl_uint num_mem_objects,
+                                      const cl_mem *mem_objects,
+                                      cl_uint num_events_in_wait_list,
+                                      const cl_event *event_wait_list,
+                                      cl_event *event) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseExternalMemObjectsKHR(cl_command_queue command_queue,
+                                      cl_uint num_mem_objects,
+                                      const cl_mem *mem_objects,
+                                      cl_uint num_events_in_wait_list,
+                                      const cl_event *event_wait_list,
+                                      cl_event *event) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBufferFromExternalMemoryKHR(cl_context context,
+                                    const cl_mem_properties_khr* properties,
+                                    cl_mem_flags flags,
+                                    cl_external_mem_desc_khr extMem,
+                                    cl_int *errcode_ret)  CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateImageFromExternalMemoryKHR(cl_context context,
+                                   const cl_mem_properties_khr* properties,
+                                   cl_mem_flags flags,
+                                   cl_external_mem_desc_khr extMem,
+                                   const cl_image_format *image_format,
+                                   const cl_image_desc *image_desc,
+                                   cl_int *errcode_ret)  CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_semaphore_khr CL_API_CALL
+clCreateFromExternalSemaphoreKHR(cl_context context,
+                                 cl_semaphore_properties_khr *sema_props,
+                                 cl_semaphore_desc_khr sema_desc,
+                                 cl_int *errcode_ret)  
+                                 CL_API_SUFFIX__VERSION_1_2;
+
+#define CL_INVALID_EXTERNAL_DEVICEGROUP_REFERENCE_KHR   -1122
+#define CL_INVALID_EXT_MEM_DESC_KHR                     -1123
+#define CL_INVALID_EXT_MEM_HANDLE_TYPE_KHR              -1148
+#define CL_INVALID_EXT_MEM_HANDLE_KHR                   -1149
+#define CL_INVALID_EXT_MEM_OFFSET_KHR                   -1150
+#define CL_INVALID_EXT_MEM_SIZE_KHR                     -1140
+
+#define CL_CURRENT_DEVICE_FOR_EXTERNAL_CONTEXT_KHR      0x2036
+#define CL_DEVICES_FOR_EXTERNAL_CONTEXT_KHR             0x2037
+#define CL_EXTERNAL_DEVICE_KHR                          0x2038
+#define CL_EXTERNAL_DEVICEGROUP_KHR                     0x2039
+#define CL_EXTERNAL_CONTEXT_TYPE_KHR                    0x204B
+#define CL_DEVICE_HANDLE_LIST_KHR                       0x2051
+#define CL_DEVICE_HANDLE_LIST_END_KHR                   0x0
+
+
+#define CL_COMMAND_ACQUIRE_EXTERNAL_MEM_OBJECTS_KHR     0x2047
+#define CL_COMMAND_RELEASE_EXTERNAL_MEM_OBJECTS_KHR     0x2048
+#define CL_EXTERNAL_MEM_DESC_KHR                        0x203C
+#define CL_EXTERNAL_IMAGE_INFO_KHR                      0x203D
+#define CL_PLATFORM_EXTERNAL_HANDLE_TYPES_KHR           0x203E
+#define CL_PLATFORM_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR  0x2044
+#define CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR    0x204F
+#define CL_DEVICE_EXTERNAL_MEMORY_PROPERTIES_KHR             0x2050
+
+
+/******************************************
+* cl_nv_device_attribute_query extension *
+******************************************/
+
+/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
+#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV       0x4000
+#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV       0x4001
+#define CL_DEVICE_REGISTERS_PER_BLOCK_NV            0x4002
+#define CL_DEVICE_WARP_SIZE_NV                      0x4003
+#define CL_DEVICE_GPU_OVERLAP_NV                    0x4004
+#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV            0x4005
+#define CL_DEVICE_INTEGRATED_MEMORY_NV              0x4006
+#define CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV   0x4007
+#define CL_DEVICE_PCI_BUS_ID_NV                     0x4008
+#define CL_DEVICE_PCI_SLOT_ID_NV                    0x4009
+#define CL_DEVICE_PCI_DOMAIN_ID_NV                  0x400A
+#define CL_DEVICE_MAX_LOCAL_MEMORY_PER_SM_NV        0x400B
+#define CL_DEVICE_UUID_KHR                          0x106A 
+#define CL_DRIVER_UUID_KHR                          0x106B    
+#define CL_DEVICE_LUID_VALID_KHR                    0x106C     
+#define CL_DEVICE_LUID_KHR                          0x106D     
+#define CL_DEVICE_NODE_MASK_KHR                     0x106E     
+#define CL_UUID_SIZE_KHR                            16
+#define CL_LUID_SIZE_KHR                            8
+
+/******************************************
+* cl_nv_create_buffer extension *
+******************************************/
+
+typedef cl_bitfield         cl_mem_flags_NV;
+CL_API_ENTRY cl_mem CL_API_CALL
+clCreateBufferNV(cl_context     context,
+               cl_mem_flags     flags,
+               cl_mem_flags_NV  flags_NV,
+               size_t           size,
+               void             *host_ptr,
+               cl_int           *errcode_ret);
+
+/******************************************
+* cl_kernel_attribute_nv extension *
+*******************************************/
+
+typedef enum kernel_attribute_enum {
+    CL_KERNEL_PREFERRED_LOCAL_MEMORY_SIZE_NV = 0,   /* setting preferred shared memory size */
+} cl_kernel_attribute_nv;
+
+CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelAttributeNV(cl_kernel kernel,
+                       cl_device_id device,
+                       cl_kernel_attribute_nv k_attr,
+                       size_t param_value_size,
+                       const void *param_value);
+
+CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelAttributeNV(cl_kernel kernel,
+                       cl_device_id device,
+                       cl_kernel_attribute_nv k_attr,
+                       size_t param_value_size,
+                       void *param_value,
+                       size_t *param_value_size_ret);
+
+#define CL_MEM_LOCATION_HOST_NV                     (1 << 0)
+#define CL_MEM_PINNED_NV                            (1 << 1)
+
+/*********************************
+* cl_amd_device_attribute_query *
+*********************************/
+
+#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD            0x4036
+#define CL_DEVICE_TOPOLOGY_AMD                          0x4037
+#define CL_DEVICE_BOARD_NAME_AMD                        0x4038
+#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD                0x4039
+#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD             0x4040
+#define CL_DEVICE_SIMD_WIDTH_AMD                        0x4041
+#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD            0x4042
+#define CL_DEVICE_WAVEFRONT_WIDTH_AMD                   0x4043
+#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD               0x4044
+#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD          0x4045
+#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD     0x4046
+#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD   0x4047
+#define CL_DEVICE_LOCAL_MEM_BANKS_AMD                   0x4048
+#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD            0x4049
+#define CL_DEVICE_GFXIP_MAJOR_AMD                       0x404A
+#define CL_DEVICE_GFXIP_MINOR_AMD                       0x404B
+#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD            0x404C
+#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD         0x4030
+#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD               0x4031
+#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD    0x4033
+#define CL_DEVICE_PCIE_ID_AMD                           0x4034
+
+
+/*********************************
+* cl_arm_printf extension
+*********************************/
+
+#define CL_PRINTF_CALLBACK_ARM                      0x40B0
+#define CL_PRINTF_BUFFERSIZE_ARM                    0x40B1
+
+
+/***********************************
+* cl_ext_device_fission extension
+***********************************/
+#define cl_ext_device_fission   1
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clReleaseDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clRetainDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
+
+typedef cl_ulong  cl_device_partition_property_ext;
+extern CL_API_ENTRY cl_int CL_API_CALL
+clCreateSubDevicesEXT(cl_device_id   in_device,
+                      const cl_device_partition_property_ext * properties,
+                      cl_uint        num_entries,
+                      cl_device_id * out_devices,
+                      cl_uint *      num_devices) CL_EXT_SUFFIX__VERSION_1_1;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id   in_device,
+                                         const cl_device_partition_property_ext * properties,
+                                         cl_uint        num_entries,
+                                         cl_device_id * out_devices,
+                                         cl_uint *      num_devices) CL_EXT_SUFFIX__VERSION_1_1;
+
+/* cl_device_partition_property_ext */
+#define CL_DEVICE_PARTITION_EQUALLY_EXT             0x4050
+#define CL_DEVICE_PARTITION_BY_COUNTS_EXT           0x4051
+#define CL_DEVICE_PARTITION_BY_NAMES_EXT            0x4052
+#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT  0x4053
+
+/* clDeviceGetInfo selectors */
+#define CL_DEVICE_PARENT_DEVICE_EXT                 0x4054
+#define CL_DEVICE_PARTITION_TYPES_EXT               0x4055
+#define CL_DEVICE_AFFINITY_DOMAINS_EXT              0x4056
+#define CL_DEVICE_REFERENCE_COUNT_EXT               0x4057
+#define CL_DEVICE_PARTITION_STYLE_EXT               0x4058
+
+/* error codes */
+#define CL_DEVICE_PARTITION_FAILED_EXT              -1057
+#define CL_INVALID_PARTITION_COUNT_EXT              -1058
+#define CL_INVALID_PARTITION_NAME_EXT               -1059
+
+/* CL_AFFINITY_DOMAINs */
+#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT             0x1
+#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT             0x2
+#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT             0x3
+#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT             0x4
+#define CL_AFFINITY_DOMAIN_NUMA_EXT                 0x10
+#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT     0x100
+
+/* cl_device_partition_property_ext list terminators */
+#define CL_PROPERTIES_LIST_END_EXT                  ((cl_device_partition_property_ext) 0)
+#define CL_PARTITION_BY_COUNTS_LIST_END_EXT         ((cl_device_partition_property_ext) 0)
+#define CL_PARTITION_BY_NAMES_LIST_END_EXT          ((cl_device_partition_property_ext) 0 - 1)
+
+
+/***********************************
+ * cl_ext_migrate_memobject extension definitions
+ ***********************************/
+#define cl_ext_migrate_memobject 1
+
+typedef cl_bitfield cl_mem_migration_flags_ext;
+
+#define CL_MIGRATE_MEM_OBJECT_HOST_EXT              0x1
+
+#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT           0x4040
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueMigrateMemObjectEXT(cl_command_queue command_queue,
+                             cl_uint          num_mem_objects,
+                             const cl_mem *   mem_objects,
+                             cl_mem_migration_flags_ext flags,
+                             cl_uint          num_events_in_wait_list,
+                             const cl_event * event_wait_list,
+                             cl_event *       event);
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue,
+                                               cl_uint          num_mem_objects,
+                                               const cl_mem *   mem_objects,
+                                               cl_mem_migration_flags_ext flags,
+                                               cl_uint          num_events_in_wait_list,
+                                               const cl_event * event_wait_list,
+                                               cl_event *       event);
+
+
+/*********************************
+* cl_ext_cxx_for_opencl extension
+*********************************/
+#define cl_ext_cxx_for_opencl 1
+
+#define CL_DEVICE_CXX_FOR_OPENCL_NUMERIC_VERSION_EXT 0x4230
+
+/*********************************
+* cl_qcom_ext_host_ptr extension
+*********************************/
+#define cl_qcom_ext_host_ptr 1
+
+#define CL_MEM_EXT_HOST_PTR_QCOM                  (1 << 29)
+
+#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM   0x40A0
+#define CL_DEVICE_PAGE_SIZE_QCOM                  0x40A1
+#define CL_IMAGE_ROW_ALIGNMENT_QCOM               0x40A2
+#define CL_IMAGE_SLICE_ALIGNMENT_QCOM             0x40A3
+#define CL_MEM_HOST_UNCACHED_QCOM                 0x40A4
+#define CL_MEM_HOST_WRITEBACK_QCOM                0x40A5
+#define CL_MEM_HOST_WRITETHROUGH_QCOM             0x40A6
+#define CL_MEM_HOST_WRITE_COMBINING_QCOM          0x40A7
+
+typedef cl_uint                                   cl_image_pitch_info_qcom;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetDeviceImageInfoQCOM(cl_device_id             device,
+                         size_t                   image_width,
+                         size_t                   image_height,
+                         const cl_image_format   *image_format,
+                         cl_image_pitch_info_qcom param_name,
+                         size_t                   param_value_size,
+                         void                    *param_value,
+                         size_t                  *param_value_size_ret);
+
+typedef struct _cl_mem_ext_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Legal values will be defined in layered extensions. */
+    cl_uint  allocation_type;
+
+    /* Host cache policy for this external memory allocation. */
+    cl_uint  host_cache_policy;
+
+} cl_mem_ext_host_ptr;
+
+
+/*******************************************
+* cl_qcom_ext_host_ptr_iocoherent extension
+********************************************/
+
+/* Cache policy specifying io-coherence */
+#define CL_MEM_HOST_IOCOHERENT_QCOM               0x40A9
+
+
+/*********************************
+* cl_qcom_ion_host_ptr extension
+*********************************/
+
+#define CL_MEM_ION_HOST_PTR_QCOM                  0x40A8
+
+typedef struct _cl_mem_ion_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
+    cl_mem_ext_host_ptr  ext_host_ptr;
+
+    /* ION file descriptor */
+    int                  ion_filedesc;
+
+    /* Host pointer to the ION allocated memory */
+    void*                ion_hostptr;
+
+} cl_mem_ion_host_ptr;
+
+
+/*********************************
+* cl_qcom_android_native_buffer_host_ptr extension
+*********************************/
+
+#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM                  0x40C6
+
+typedef struct _cl_mem_android_native_buffer_host_ptr
+{
+    /* Type of external memory allocation. */
+    /* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. */
+    cl_mem_ext_host_ptr  ext_host_ptr;
+
+    /* Virtual pointer to the android native buffer */
+    void*                anb_ptr;
+
+} cl_mem_android_native_buffer_host_ptr;
+
+
+/******************************************
+ * cl_img_yuv_image extension *
+ ******************************************/
+
+/* Image formats used in clCreateImage */
+#define CL_NV21_IMG                                 0x40D0
+#define CL_YV12_IMG                                 0x40D1
+
+
+/******************************************
+ * cl_img_cached_allocations extension *
+ ******************************************/
+
+/* Flag values used by clCreateBuffer */
+#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG          (1 << 26)
+#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG            (1 << 27)
+
+
+/******************************************
+ * cl_img_use_gralloc_ptr extension *
+ ******************************************/
+#define cl_img_use_gralloc_ptr 1
+
+/* Flag values used by clCreateBuffer */
+#define CL_MEM_USE_GRALLOC_PTR_IMG                  (1 << 28)
+
+/* To be used by clGetEventInfo: */
+#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG      0x40D2
+#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG      0x40D3
+
+/* Error code from clEnqueueReleaseGrallocObjectsIMG */
+#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG        0x40D4
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGrallocObjectsIMG(cl_command_queue      command_queue,
+                                  cl_uint               num_objects,
+                                  const cl_mem *        mem_objects,
+                                  cl_uint               num_events_in_wait_list,
+                                  const cl_event *      event_wait_list,
+                                  cl_event *            event) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGrallocObjectsIMG(cl_command_queue      command_queue,
+                                  cl_uint               num_objects,
+                                  const cl_mem *        mem_objects,
+                                  cl_uint               num_events_in_wait_list,
+                                  const cl_event *      event_wait_list,
+                                  cl_event *            event) CL_EXT_SUFFIX__VERSION_1_2;
+
+
+/*********************************
+* cl_khr_subgroups extension
+*********************************/
+#define cl_khr_subgroups 1
+
+#if !defined(CL_VERSION_2_1)
+/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h.
+   In hindsight, there should have been a khr suffix on this type for
+   the extension, but keeping it un-suffixed to maintain backwards
+   compatibility. */
+typedef cl_uint             cl_kernel_sub_group_info;
+#endif
+
+/* cl_kernel_sub_group_info */
+#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR    0x2033
+#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR       0x2034
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetKernelSubGroupInfoKHR(cl_kernel    in_kernel,
+                           cl_device_id in_device,
+                           cl_kernel_sub_group_info param_name,
+                           size_t       input_value_size,
+                           const void * input_value,
+                           size_t       param_value_size,
+                           void *       param_value,
+                           size_t *     param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
+
+typedef CL_API_ENTRY cl_int
+(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel    in_kernel,
+                                              cl_device_id in_device,
+                                              cl_kernel_sub_group_info param_name,
+                                              size_t       input_value_size,
+                                              const void * input_value,
+                                              size_t       param_value_size,
+                                              void *       param_value,
+                                              size_t *     param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
+
+
+/*********************************
+* cl_khr_mipmap_image extension
+*********************************/
+
+/* cl_sampler_properties */
+#define CL_SAMPLER_MIP_FILTER_MODE_KHR              0x1155
+#define CL_SAMPLER_LOD_MIN_KHR                      0x1156
+#define CL_SAMPLER_LOD_MAX_KHR                      0x1157
+
+
+/*********************************
+* cl_khr_priority_hints extension
+*********************************/
+/* This extension define is for backwards compatibility.
+   It shouldn't be required since this extension has no new functions. */
+#define cl_khr_priority_hints 1
+
+typedef cl_uint  cl_queue_priority_khr;
+
+/* cl_command_queue_properties */
+#define CL_QUEUE_PRIORITY_KHR 0x1096
+
+/* cl_queue_priority_khr */
+#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)
+#define CL_QUEUE_PRIORITY_MED_KHR (1<<1)
+#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)
+
+
+/*********************************
+* cl_khr_throttle_hints extension
+*********************************/
+/* This extension define is for backwards compatibility.
+   It shouldn't be required since this extension has no new functions. */
+#define cl_khr_throttle_hints 1
+
+typedef cl_uint  cl_queue_throttle_khr;
+
+/* cl_command_queue_properties */
+#define CL_QUEUE_THROTTLE_KHR 0x1097
+
+/* cl_queue_throttle_khr */
+#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)
+#define CL_QUEUE_THROTTLE_MED_KHR (1<<1)
+#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)
+
+
+/*********************************
+* cl_khr_subgroup_named_barrier
+*********************************/
+/* This extension define is for backwards compatibility.
+   It shouldn't be required since this extension has no new functions. */
+#define cl_khr_subgroup_named_barrier 1
+
+/* cl_device_info */
+#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR       0x2035
+
+
+/*********************************
+* cl_khr_extended_versioning
+*********************************/
+
+#define cl_khr_extended_versioning 1
+
+#define CL_VERSION_MAJOR_BITS_KHR (10)
+#define CL_VERSION_MINOR_BITS_KHR (10)
+#define CL_VERSION_PATCH_BITS_KHR (12)
+
+#define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1)
+#define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1)
+#define CL_VERSION_PATCH_MASK_KHR ((1 << CL_VERSION_PATCH_BITS_KHR) - 1)
+
+#define CL_VERSION_MAJOR_KHR(version) ((version) >> (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR))
+#define CL_VERSION_MINOR_KHR(version) (((version) >> CL_VERSION_PATCH_BITS_KHR) & CL_VERSION_MINOR_MASK_KHR)
+#define CL_VERSION_PATCH_KHR(version) ((version) & CL_VERSION_PATCH_MASK_KHR)
+
+#define CL_MAKE_VERSION_KHR(major, minor, patch) \
+    ((((major) & CL_VERSION_MAJOR_MASK_KHR) << (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) | \
+    (((minor) &  CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \
+    ((patch) & CL_VERSION_PATCH_MASK_KHR))
+
+typedef cl_uint cl_version_khr;
+
+#define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64
+
+typedef struct _cl_name_version_khr
+{
+    cl_version_khr version;
+    char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR];
+} cl_name_version_khr;
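
A tiny self-check of the packing macros above (assert is only used here for illustration):

#include <assert.h>
#include <CL/cl_ext.h>

static void check_version_packing(void)
{
    cl_version_khr v = CL_MAKE_VERSION_KHR(3, 0, 2);
    assert(CL_VERSION_MAJOR_KHR(v) == 3);
    assert(CL_VERSION_MINOR_KHR(v) == 0);
    assert(CL_VERSION_PATCH_KHR(v) == 2);
}
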
+
+/* cl_platform_info */
+#define CL_PLATFORM_NUMERIC_VERSION_KHR                  0x0906
+#define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR          0x0907
+
+/* cl_device_info */
+#define CL_DEVICE_NUMERIC_VERSION_KHR                    0x105E
+#define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR           0x105F
+#define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR            0x1060
+#define CL_DEVICE_ILS_WITH_VERSION_KHR                   0x1061
+#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR      0x1062
+
+
+/*********************************
+* cl_khr_device_uuid extension
+*********************************/
+#define cl_khr_device_uuid 1
+
+#define CL_UUID_SIZE_KHR 16
+#define CL_LUID_SIZE_KHR 8
+
+#define CL_DEVICE_UUID_KHR          0x106A
+#define CL_DRIVER_UUID_KHR          0x106B
+#define CL_DEVICE_LUID_VALID_KHR    0x106C
+#define CL_DEVICE_LUID_KHR          0x106D
+#define CL_DEVICE_NODE_MASK_KHR     0x106E
+
+/**********************************
+ * cl_khr_pci_bus_info extension *
+ **********************************/
+#define cl_khr_pci_bus_info 1
+
+#define CL_DEVICE_PCI_BUS_INFO_KHR  0x410F 
+
+typedef struct _cl_device_pci_bus_info_khr {
+    cl_uint   pci_domain;
+    cl_uint   pci_bus;
+    cl_uint   pci_device;
+    cl_uint   pci_function;
+} cl_device_pci_bus_info_khr;
+
+/**********************************
+ * cl_arm_import_memory extension *
+ **********************************/
+#define cl_arm_import_memory 1
+
+typedef intptr_t cl_import_properties_arm;
+
+/* Default and valid property names for cl_arm_import_memory */
+#define CL_IMPORT_TYPE_ARM                        0x40B2
+
+/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_HOST_ARM                   0x40B3
+
+/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_DMA_BUF_ARM                0x40B4
+
+/* Protected memory property */
+#define CL_IMPORT_TYPE_PROTECTED_ARM              0x40B5
+
+/* Android hardware buffer type value for CL_IMPORT_TYPE_ARM property */
+#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2
+
+/* Data consistency with host property */
+#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3
+
+/* Import memory size value to indicate a size for the whole buffer */
+#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX
+
+/* This extension adds a new function that allows for direct memory import into
+ * OpenCL via the clImportMemoryARM function.
+ *
+ * Memory imported through this interface will be mapped into the device's page
+ * tables directly, providing zero-copy access. It will never fall back to copy
+ * operations or aliased buffers.
+ *
+ * Types of memory supported for import are specified as additional extension
+ * strings.
+ *
+ * This extension produces cl_mem allocations which are compatible with all other
+ * users of cl_mem in the standard API.
+ *
+ * This extension maps pages with the same properties as the normal buffer creation
+ * function clCreateBuffer.
+ */
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clImportMemoryARM( cl_context context,
+                   cl_mem_flags flags,
+                   const cl_import_properties_arm *properties,
+                   void *memory,
+                   size_t size,
+                   cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
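
Following the description above, a hedged sketch of importing a host allocation (the zero-terminated key/value property list is the assumed convention; the allocation must outlive the returned cl_mem):

#include <CL/cl_ext.h>

static cl_mem import_host_memory(cl_context ctx, void *host_buf, size_t size)
{
    cl_int err;
    const cl_import_properties_arm props[] = {
        CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_HOST_ARM,
        0 /* terminator */
    };
    return clImportMemoryARM(ctx, CL_MEM_READ_WRITE, props, host_buf, size, &err);
}
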
+
+
+/******************************************
+ * cl_arm_shared_virtual_memory extension *
+ ******************************************/
+#define cl_arm_shared_virtual_memory 1
+
+/* Used by clGetDeviceInfo */
+#define CL_DEVICE_SVM_CAPABILITIES_ARM                  0x40B6
+
+/* Used by clGetMemObjectInfo */
+#define CL_MEM_USES_SVM_POINTER_ARM                     0x40B7
+
+/* Used by clSetKernelExecInfoARM: */
+#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM                0x40B8
+#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM   0x40B9
+
+/* To be used by clGetEventInfo: */
+#define CL_COMMAND_SVM_FREE_ARM                         0x40BA
+#define CL_COMMAND_SVM_MEMCPY_ARM                       0x40BB
+#define CL_COMMAND_SVM_MEMFILL_ARM                      0x40BC
+#define CL_COMMAND_SVM_MAP_ARM                          0x40BD
+#define CL_COMMAND_SVM_UNMAP_ARM                        0x40BE
+
+/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
+#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM           (1 << 0)
+#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM             (1 << 1)
+#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM             (1 << 2)
+#define CL_DEVICE_SVM_ATOMICS_ARM                       (1 << 3)
+
+/* Flag values used by clSVMAllocARM: */
+#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM                (1 << 10)
+#define CL_MEM_SVM_ATOMICS_ARM                          (1 << 11)
+
+typedef cl_bitfield cl_svm_mem_flags_arm;
+typedef cl_uint     cl_kernel_exec_info_arm;
+typedef cl_bitfield cl_device_svm_capabilities_arm;
+
+extern CL_API_ENTRY void * CL_API_CALL
+clSVMAllocARM(cl_context       context,
+              cl_svm_mem_flags_arm flags,
+              size_t           size,
+              cl_uint          alignment) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY void CL_API_CALL
+clSVMFreeARM(cl_context        context,
+             void *            svm_pointer) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMFreeARM(cl_command_queue  command_queue,
+                    cl_uint           num_svm_pointers,
+                    void *            svm_pointers[],
+                    void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue,
+                                                       cl_uint          num_svm_pointers,
+                                                       void *           svm_pointers[],
+                                                       void *           user_data),
+                    void *            user_data,
+                    cl_uint           num_events_in_wait_list,
+                    const cl_event *  event_wait_list,
+                    cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemcpyARM(cl_command_queue  command_queue,
+                      cl_bool           blocking_copy,
+                      void *            dst_ptr,
+                      const void *      src_ptr,
+                      size_t            size,
+                      cl_uint           num_events_in_wait_list,
+                      const cl_event *  event_wait_list,
+                      cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMemFillARM(cl_command_queue  command_queue,
+                       void *            svm_ptr,
+                       const void *      pattern,
+                       size_t            pattern_size,
+                       size_t            size,
+                       cl_uint           num_events_in_wait_list,
+                       const cl_event *  event_wait_list,
+                       cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMMapARM(cl_command_queue  command_queue,
+                   cl_bool           blocking_map,
+                   cl_map_flags      flags,
+                   void *            svm_ptr,
+                   size_t            size,
+                   cl_uint           num_events_in_wait_list,
+                   const cl_event *  event_wait_list,
+                   cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueSVMUnmapARM(cl_command_queue  command_queue,
+                     void *            svm_ptr,
+                     cl_uint           num_events_in_wait_list,
+                     const cl_event *  event_wait_list,
+                     cl_event *        event) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelArgSVMPointerARM(cl_kernel    kernel,
+                            cl_uint      arg_index,
+                            const void * arg_value) CL_EXT_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clSetKernelExecInfoARM(cl_kernel            kernel,
+                       cl_kernel_exec_info_arm  param_name,
+                       size_t               param_value_size,
+                       const void *         param_value) CL_EXT_SUFFIX__VERSION_1_2;
+
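+/* Illustrative usage sketch (not normative): coarse-grain SVM with the ARM
+ * entry points declared above. ctx, queue, kernel and size are placeholders.
+ *
+ *   void *ptr = clSVMAllocARM(ctx, CL_MEM_READ_WRITE, size, 0);
+ *   clSetKernelArgSVMPointerARM(kernel, 0, ptr);
+ *   // ... enqueue the kernel on queue ...
+ *   clEnqueueSVMMapARM(queue, CL_TRUE, CL_MAP_READ, ptr, size, 0, NULL, NULL);
+ *   // ... read the results on the host ...
+ *   clEnqueueSVMUnmapARM(queue, ptr, 0, NULL, NULL);
+ *   clSVMFreeARM(ctx, ptr);
+ */
+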
+/********************************
+ * cl_arm_get_core_id extension *
+ ********************************/
+
+#ifdef CL_VERSION_1_2
+
+#define cl_arm_get_core_id 1
+
+/* Device info property for bitfield of cores present */
+#define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM      0x40BF
+
+#endif  /* CL_VERSION_1_2 */
+
+/*********************************
+* cl_arm_job_slot_selection
+*********************************/
+
+#define cl_arm_job_slot_selection 1
+
+/* cl_device_info */
+#define CL_DEVICE_JOB_SLOTS_ARM                   0x41E0
+
+/* cl_command_queue_properties */
+#define CL_QUEUE_JOB_SLOT_ARM                     0x41E1
+
+/*********************************
+* cl_arm_scheduling_controls
+*********************************/
+
+#define cl_arm_scheduling_controls 1
+
+/* cl_device_info */
+#define CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM          0x41E4
+
+#define CL_DEVICE_SCHEDULING_KERNEL_BATCHING_ARM               (1 << 0)
+#define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_ARM          (1 << 1)
+#define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_MODIFIER_ARM (1 << 2)
+
+/* cl_kernel_info */
+#define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM            0x41E5
+#define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM   0x41E6
+
+/* cl_queue_properties */
+#define CL_QUEUE_KERNEL_BATCHING_ARM                            0x41E7
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CL_EXT_H */
diff --git a/ext/cudart/include/CL/cl_gl.h b/ext/cudart/include/CL/cl_gl.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e390d197fd1ef1b36b17c0bcabbabbac7a14d66
--- /dev/null
+++ b/ext/cudart/include/CL/cl_gl.h
@@ -0,0 +1,154 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_GL_H
+#define __OPENCL_CL_GL_H
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#else
+#include <CL/cl.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef cl_uint     cl_gl_object_type;
+typedef cl_uint     cl_gl_texture_info;
+typedef cl_uint     cl_gl_platform_info;
+typedef struct __GLsync *cl_GLsync;
+
+/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken           */
+#define CL_GL_OBJECT_BUFFER                     0x2000
+#define CL_GL_OBJECT_TEXTURE2D                  0x2001
+#define CL_GL_OBJECT_TEXTURE3D                  0x2002
+#define CL_GL_OBJECT_RENDERBUFFER               0x2003
+#define CL_GL_OBJECT_TEXTURE2D_ARRAY            0x200E
+#define CL_GL_OBJECT_TEXTURE1D                  0x200F
+#define CL_GL_OBJECT_TEXTURE1D_ARRAY            0x2010
+#define CL_GL_OBJECT_TEXTURE_BUFFER             0x2011
+
+/* cl_gl_texture_info           */
+#define CL_GL_TEXTURE_TARGET                    0x2004
+#define CL_GL_MIPMAP_LEVEL                      0x2005
+#define CL_GL_NUM_SAMPLES                       0x2012
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLBuffer(cl_context     context,
+                     cl_mem_flags   flags,
+                     cl_GLuint      bufobj,
+                     cl_int *       errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLTexture(cl_context      context,
+                      cl_mem_flags    flags,
+                      cl_GLenum       target,
+                      cl_GLint        miplevel,
+                      cl_GLuint       texture,
+                      cl_int *        errcode_ret) CL_API_SUFFIX__VERSION_1_2;
+
+extern CL_API_ENTRY cl_mem CL_API_CALL
+clCreateFromGLRenderbuffer(cl_context   context,
+                           cl_mem_flags flags,
+                           cl_GLuint    renderbuffer,
+                           cl_int *     errcode_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLObjectInfo(cl_mem                memobj,
+                  cl_gl_object_type *   gl_object_type,
+                  cl_GLuint *           gl_object_name) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLTextureInfo(cl_mem               memobj,
+                   cl_gl_texture_info   param_name,
+                   size_t               param_value_size,
+                   void *               param_value,
+                   size_t *             param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueAcquireGLObjects(cl_command_queue      command_queue,
+                          cl_uint               num_objects,
+                          const cl_mem *        mem_objects,
+                          cl_uint               num_events_in_wait_list,
+                          const cl_event *      event_wait_list,
+                          cl_event *            event) CL_API_SUFFIX__VERSION_1_0;
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clEnqueueReleaseGLObjects(cl_command_queue      command_queue,
+                          cl_uint               num_objects,
+                          const cl_mem *        mem_objects,
+                          cl_uint               num_events_in_wait_list,
+                          const cl_event *      event_wait_list,
+                          cl_event *            event) CL_API_SUFFIX__VERSION_1_0;
+
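+/* Illustrative usage sketch (not normative): the typical acquire / use /
+ * release pattern for a shared GL buffer object. ctx, queue and vbo are
+ * placeholders; pending GL work on the buffer should be completed (for
+ * example with glFinish) before it is acquired by OpenCL.
+ *
+ *   cl_int err;
+ *   cl_mem mem = clCreateFromGLBuffer(ctx, CL_MEM_READ_WRITE, vbo, &err);
+ *   clEnqueueAcquireGLObjects(queue, 1, &mem, 0, NULL, NULL);
+ *   // ... enqueue kernels that read or write mem ...
+ *   clEnqueueReleaseGLObjects(queue, 1, &mem, 0, NULL, NULL);
+ *   clFinish(queue);
+ */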
+
+/* Deprecated OpenCL 1.1 APIs */
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture2D(cl_context      context,
+                        cl_mem_flags    flags,
+                        cl_GLenum       target,
+                        cl_GLint        miplevel,
+                        cl_GLuint       texture,
+                        cl_int *        errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
+clCreateFromGLTexture3D(cl_context      context,
+                        cl_mem_flags    flags,
+                        cl_GLenum       target,
+                        cl_GLint        miplevel,
+                        cl_GLuint       texture,
+                        cl_int *        errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
+
+/* cl_khr_gl_sharing extension  */
+
+#define cl_khr_gl_sharing 1
+
+typedef cl_uint     cl_gl_context_info;
+
+/* Additional Error Codes  */
+#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR  -1000
+
+/* cl_gl_context_info  */
+#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR    0x2006
+#define CL_DEVICES_FOR_GL_CONTEXT_KHR           0x2007
+
+/* Additional cl_context_properties  */
+#define CL_GL_CONTEXT_KHR                       0x2008
+#define CL_EGL_DISPLAY_KHR                      0x2009
+#define CL_GLX_DISPLAY_KHR                      0x200A
+#define CL_WGL_HDC_KHR                          0x200B
+#define CL_CGL_SHAREGROUP_KHR                   0x200C
+
+extern CL_API_ENTRY cl_int CL_API_CALL
+clGetGLContextInfoKHR(const cl_context_properties * properties,
+                      cl_gl_context_info            param_name,
+                      size_t                        param_value_size,
+                      void *                        param_value,
+                      size_t *                      param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
+
+typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
+    const cl_context_properties * properties,
+    cl_gl_context_info            param_name,
+    size_t                        param_value_size,
+    void *                        param_value,
+    size_t *                      param_value_size_ret);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_CL_GL_H */
\ No newline at end of file
diff --git a/ext/cudart/include/CL/cl_gl_ext.h b/ext/cudart/include/CL/cl_gl_ext.h
new file mode 100644
index 0000000000000000000000000000000000000000..83102e3435e93eaa34c99efae3680c913ba3990e
--- /dev/null
+++ b/ext/cudart/include/CL/cl_gl_ext.h
@@ -0,0 +1,44 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+
+#ifndef __OPENCL_CL_GL_EXT_H
+#define __OPENCL_CL_GL_EXT_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+    #include <OpenCL/cl_gl.h>
+#else
+    #include <CL/cl_gl.h>
+#endif
+
+/* 
+ *  cl_khr_gl_event extension
+ */
+#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR     0x200D
+
+extern CL_API_ENTRY cl_event CL_API_CALL
+clCreateEventFromGLsyncKHR(cl_context context,
+                           cl_GLsync  sync,
+                           cl_int *   errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* __OPENCL_CL_GL_EXT_H  */
diff --git a/ext/cudart/include/CL/cl_platform.h b/ext/cudart/include/CL/cl_platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..0fba9080bd6cf87a1614c1378a1b9b151efbd21e
--- /dev/null
+++ b/ext/cudart/include/CL/cl_platform.h
@@ -0,0 +1,1414 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+
+#ifndef __CL_PLATFORM_H
+#define __CL_PLATFORM_H
+
+#ifdef __APPLE__
+    /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */
+    #include <AvailabilityMacros.h>
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_WIN32)
+    #define CL_API_ENTRY
+    #define CL_API_CALL     __stdcall
+    #define CL_CALLBACK     __stdcall
+#else
+    #define CL_API_ENTRY
+    #define CL_API_CALL
+    #define CL_CALLBACK
+#endif
+
+/*
+ * Deprecation flags refer to the last version of the header in which the
+ * feature was not deprecated.
+ *
+ * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without
+ * deprecation but is deprecated in versions later than 1.1.
+ */
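+/* Illustrative note (not normative): on the non-Apple code path below,
+ * defining the matching CL_USE_DEPRECATED_OPENCL_x_y_APIS macro before the
+ * OpenCL headers are included keeps the corresponding deprecated prototypes
+ * usable without deprecation warnings, for example:
+ *
+ *   #define CL_USE_DEPRECATED_OPENCL_1_1_APIS
+ *   #include <CL/cl.h>
+ */
+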
+#ifdef __APPLE__
+    #define CL_EXTENSION_WEAK_LINK       __attribute__((weak_import))
+    #define CL_API_SUFFIX__VERSION_1_0                  AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+    #define CL_EXT_SUFFIX__VERSION_1_0                  CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER
+    #define CL_API_SUFFIX__VERSION_1_1                  AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #define GCL_API_SUFFIX__VERSION_1_1                 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #define CL_EXT_SUFFIX__VERSION_1_1                  CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED       CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7
+    
+    #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define CL_API_SUFFIX__VERSION_1_2              AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define GCL_API_SUFFIX__VERSION_1_2             AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define CL_EXT_SUFFIX__VERSION_1_2              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER
+        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8
+    #else
+        #warning  This path should never happen outside of internal operating system development.  AvailabilityMacros do not function correctly here!
+        #define CL_API_SUFFIX__VERSION_1_2              AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        #define GCL_API_SUFFIX__VERSION_1_2             AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        #define CL_EXT_SUFFIX__VERSION_1_2              CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED   CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER
+    #endif
+ 
+ 
+ 
+#else
+    #define CL_EXTENSION_WEAK_LINK  
+    #define CL_API_SUFFIX__VERSION_1_0
+    #define CL_EXT_SUFFIX__VERSION_1_0
+    #define CL_API_SUFFIX__VERSION_1_1
+    #define CL_EXT_SUFFIX__VERSION_1_1
+    #define CL_API_SUFFIX__VERSION_1_2
+    #define CL_EXT_SUFFIX__VERSION_1_2
+    #define CL_API_SUFFIX__VERSION_2_0
+    #define CL_EXT_SUFFIX__VERSION_2_0
+    #define CL_API_SUFFIX__VERSION_2_1
+    #define CL_EXT_SUFFIX__VERSION_2_1
+    #define CL_API_SUFFIX__VERSION_2_2
+    #define CL_EXT_SUFFIX__VERSION_2_2
+    #define CL_API_SUFFIX__VERSION_3_0
+    #define CL_EXT_SUFFIX__VERSION_3_0
+    #define CL_API_SUFFIX__EXPERIMENTAL
+    #define CL_EXT_SUFFIX__EXPERIMENTAL
+    
+    #ifdef __GNUC__
+        #define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated))
+        #define CL_EXT_PREFIX_DEPRECATED
+    #elif defined(_WIN32)
+        #define CL_EXT_SUFFIX_DEPRECATED
+        #define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated)
+    #else
+        #define CL_EXT_SUFFIX_DEPRECATED
+        #define CL_EXT_PREFIX_DEPRECATED
+    #endif
+
+    #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS
+        #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED
+    #else
+        #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+    #endif
+
+    #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED
+    #else
+        #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+    #endif
+
+    #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS
+        #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED
+    #else
+        #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+    #endif
+
+    #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS
+        #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED
+    #else
+        #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+    #endif
+
+    #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS
+        #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED
+    #else
+        #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+    #endif
+
+    #ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS
+        #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED
+    #else
+        #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED
+        #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED
+    #endif
+#endif
+
+#if (defined (_WIN32) && defined(_MSC_VER))
+
+/* scalar types  */
+typedef signed   __int8         cl_char;
+typedef unsigned __int8         cl_uchar;
+typedef signed   __int16        cl_short;
+typedef unsigned __int16        cl_ushort;
+typedef signed   __int32        cl_int;
+typedef unsigned __int32        cl_uint;
+typedef signed   __int64        cl_long;
+typedef unsigned __int64        cl_ulong;
+
+typedef unsigned __int16        cl_half;
+typedef float                   cl_float;
+typedef double                  cl_double;
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT         8
+#define CL_SCHAR_MAX        127
+#define CL_SCHAR_MIN        (-127-1)
+#define CL_CHAR_MAX         CL_SCHAR_MAX
+#define CL_CHAR_MIN         CL_SCHAR_MIN
+#define CL_UCHAR_MAX        255
+#define CL_SHRT_MAX         32767
+#define CL_SHRT_MIN         (-32767-1)
+#define CL_USHRT_MAX        65535
+#define CL_INT_MAX          2147483647
+#define CL_INT_MIN          (-2147483647-1)
+#define CL_UINT_MAX         0xffffffffU
+#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG          6
+#define CL_FLT_MANT_DIG     24
+#define CL_FLT_MAX_10_EXP   +38
+#define CL_FLT_MAX_EXP      +128
+#define CL_FLT_MIN_10_EXP   -37
+#define CL_FLT_MIN_EXP      -125
+#define CL_FLT_RADIX        2
+#define CL_FLT_MAX          340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN          1.175494350822287507969e-38f
+#define CL_FLT_EPSILON      1.1920928955078125e-7f
+
+#define CL_HALF_DIG          3
+#define CL_HALF_MANT_DIG     11
+#define CL_HALF_MAX_10_EXP   +4
+#define CL_HALF_MAX_EXP      +16
+#define CL_HALF_MIN_10_EXP   -4
+#define CL_HALF_MIN_EXP      -13
+#define CL_HALF_RADIX        2
+#define CL_HALF_MAX          65504.0f
+#define CL_HALF_MIN          6.103515625e-05f
+#define CL_HALF_EPSILON      9.765625e-04f
+
+#define CL_DBL_DIG          15
+#define CL_DBL_MANT_DIG     53
+#define CL_DBL_MAX_10_EXP   +308
+#define CL_DBL_MAX_EXP      +1024
+#define CL_DBL_MIN_10_EXP   -307
+#define CL_DBL_MIN_EXP      -1021
+#define CL_DBL_RADIX        2
+#define CL_DBL_MAX          1.7976931348623158e+308
+#define CL_DBL_MIN          2.225073858507201383090e-308
+#define CL_DBL_EPSILON      2.220446049250313080847e-16
+
+#define CL_M_E              2.7182818284590452354
+#define CL_M_LOG2E          1.4426950408889634074
+#define CL_M_LOG10E         0.43429448190325182765
+#define CL_M_LN2            0.69314718055994530942
+#define CL_M_LN10           2.30258509299404568402
+#define CL_M_PI             3.14159265358979323846
+#define CL_M_PI_2           1.57079632679489661923
+#define CL_M_PI_4           0.78539816339744830962
+#define CL_M_1_PI           0.31830988618379067154
+#define CL_M_2_PI           0.63661977236758134308
+#define CL_M_2_SQRTPI       1.12837916709551257390
+#define CL_M_SQRT2          1.41421356237309504880
+#define CL_M_SQRT1_2        0.70710678118654752440
+
+#define CL_M_E_F            2.718281828f
+#define CL_M_LOG2E_F        1.442695041f
+#define CL_M_LOG10E_F       0.434294482f
+#define CL_M_LN2_F          0.693147181f
+#define CL_M_LN10_F         2.302585093f
+#define CL_M_PI_F           3.141592654f
+#define CL_M_PI_2_F         1.570796327f
+#define CL_M_PI_4_F         0.785398163f
+#define CL_M_1_PI_F         0.318309886f
+#define CL_M_2_PI_F         0.636619772f
+#define CL_M_2_SQRTPI_F     1.128379167f
+#define CL_M_SQRT2_F        1.414213562f
+#define CL_M_SQRT1_2_F      0.707106781f
+
+#define CL_NAN              (CL_INFINITY - CL_INFINITY)
+#define CL_HUGE_VALF        ((cl_float) 1e50)
+#define CL_HUGE_VAL         ((cl_double) 1e500)
+#define CL_MAXFLOAT         CL_FLT_MAX
+#define CL_INFINITY         CL_HUGE_VALF
+
+#else
+
+#include <stdint.h>
+
+/* scalar types  */
+typedef int8_t          cl_char;
+typedef uint8_t         cl_uchar;
+typedef int16_t         cl_short;
+typedef uint16_t        cl_ushort;
+typedef int32_t         cl_int;
+typedef uint32_t        cl_uint;
+typedef int64_t         cl_long;
+typedef uint64_t        cl_ulong;
+
+typedef uint16_t        cl_half;
+typedef float           cl_float;
+typedef double          cl_double;
+
+/* Macro names and corresponding values defined by OpenCL */
+#define CL_CHAR_BIT         8
+#define CL_SCHAR_MAX        127
+#define CL_SCHAR_MIN        (-127-1)
+#define CL_CHAR_MAX         CL_SCHAR_MAX
+#define CL_CHAR_MIN         CL_SCHAR_MIN
+#define CL_UCHAR_MAX        255
+#define CL_SHRT_MAX         32767
+#define CL_SHRT_MIN         (-32767-1)
+#define CL_USHRT_MAX        65535
+#define CL_INT_MAX          2147483647
+#define CL_INT_MIN          (-2147483647-1)
+#define CL_UINT_MAX         0xffffffffU
+#define CL_LONG_MAX         ((cl_long) 0x7FFFFFFFFFFFFFFFLL)
+#define CL_LONG_MIN         ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL)
+#define CL_ULONG_MAX        ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL)
+
+#define CL_FLT_DIG          6
+#define CL_FLT_MANT_DIG     24
+#define CL_FLT_MAX_10_EXP   +38
+#define CL_FLT_MAX_EXP      +128
+#define CL_FLT_MIN_10_EXP   -37
+#define CL_FLT_MIN_EXP      -125
+#define CL_FLT_RADIX        2
+#define CL_FLT_MAX          340282346638528859811704183484516925440.0f
+#define CL_FLT_MIN          1.175494350822287507969e-38f
+#define CL_FLT_EPSILON      1.1920928955078125e-7f
+
+#define CL_HALF_DIG          3
+#define CL_HALF_MANT_DIG     11
+#define CL_HALF_MAX_10_EXP   +4
+#define CL_HALF_MAX_EXP      +16
+#define CL_HALF_MIN_10_EXP   -4
+#define CL_HALF_MIN_EXP      -13
+#define CL_HALF_RADIX        2
+#define CL_HALF_MAX          65504.0f
+#define CL_HALF_MIN          6.103515625e-05f
+#define CL_HALF_EPSILON      9.765625e-04f
+
+#define CL_DBL_DIG          15
+#define CL_DBL_MANT_DIG     53
+#define CL_DBL_MAX_10_EXP   +308
+#define CL_DBL_MAX_EXP      +1024
+#define CL_DBL_MIN_10_EXP   -307
+#define CL_DBL_MIN_EXP      -1021
+#define CL_DBL_RADIX        2
+#define CL_DBL_MAX          179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0
+#define CL_DBL_MIN          2.225073858507201383090e-308
+#define CL_DBL_EPSILON      2.220446049250313080847e-16
+
+#define CL_M_E              2.7182818284590452354
+#define CL_M_LOG2E          1.4426950408889634074
+#define CL_M_LOG10E         0.43429448190325182765
+#define CL_M_LN2            0.69314718055994530942
+#define CL_M_LN10           2.30258509299404568402
+#define CL_M_PI             3.14159265358979323846
+#define CL_M_PI_2           1.57079632679489661923
+#define CL_M_PI_4           0.78539816339744830962
+#define CL_M_1_PI           0.31830988618379067154
+#define CL_M_2_PI           0.63661977236758134308
+#define CL_M_2_SQRTPI       1.12837916709551257390
+#define CL_M_SQRT2          1.41421356237309504880
+#define CL_M_SQRT1_2        0.70710678118654752440
+
+#define CL_M_E_F            2.718281828f
+#define CL_M_LOG2E_F        1.442695041f
+#define CL_M_LOG10E_F       0.434294482f
+#define CL_M_LN2_F          0.693147181f
+#define CL_M_LN10_F         2.302585093f
+#define CL_M_PI_F           3.141592654f
+#define CL_M_PI_2_F         1.570796327f
+#define CL_M_PI_4_F         0.785398163f
+#define CL_M_1_PI_F         0.318309886f
+#define CL_M_2_PI_F         0.636619772f
+#define CL_M_2_SQRTPI_F     1.128379167f
+#define CL_M_SQRT2_F        1.414213562f
+#define CL_M_SQRT1_2_F      0.707106781f
+
+#if defined( __GNUC__ )
+   #define CL_HUGE_VALF     __builtin_huge_valf()
+   #define CL_HUGE_VAL      __builtin_huge_val()
+   #define CL_NAN           __builtin_nanf( "" )
+#else
+   #define CL_HUGE_VALF     ((cl_float) 1e50)
+   #define CL_HUGE_VAL      ((cl_double) 1e500)
+   float nanf( const char * );
+   #define CL_NAN           nanf( "" )
+#endif
+#define CL_MAXFLOAT         CL_FLT_MAX
+#define CL_INFINITY         CL_HUGE_VALF
+
+#endif
+
+#include <stddef.h>
+
+/* Mirror types to GL types. Mirror types allow us to avoid deciding which headers to load based on whether we are using GL or GLES here. */
+typedef unsigned int cl_GLuint;
+typedef int          cl_GLint;
+typedef unsigned int cl_GLenum;
+
+/*
+ * Vector types
+ *
+ *  Note:   OpenCL requires that all types be naturally aligned.
+ *          This means that vector types must be naturally aligned.
+ *          For example, a vector of four floats must be aligned to
+ *          a 16 byte boundary (calculated as 4 * the natural 4-byte
+ *          alignment of the float).  The alignment qualifiers here
+ *          will only function properly if your compiler supports them
+ *          and if you don't actively work to defeat them.  For example,
+ *          in order for a cl_float4 to be 16 byte aligned in a struct,
+ *          the start of the struct must itself be 16-byte aligned.
+ *
+ *          Maintaining proper alignment is the user's responsibility.
+ */
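+
+/* Illustrative sketch (an assumption of this note, not upstream API): a
+ * struct with a cl_float4 member only behaves as described when each
+ * instance starts on a 16-byte boundary; when such a struct is allocated
+ * dynamically, an aligned allocator such as C11 aligned_alloc is one way to
+ * keep that guarantee.
+ *
+ *   struct my_params { cl_float4 color; cl_int flags; };
+ *   struct my_params *p = aligned_alloc(16, sizeof(struct my_params));
+ */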
+
+/* Define basic vector types */
+#if defined( __VEC__ )
+  #if !defined(__clang__)
+     #include <altivec.h>   /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */
+  #endif
+   typedef __vector unsigned char     __cl_uchar16;
+   typedef __vector signed char       __cl_char16;
+   typedef __vector unsigned short    __cl_ushort8;
+   typedef __vector signed short      __cl_short8;
+   typedef __vector unsigned int      __cl_uint4;
+   typedef __vector signed int        __cl_int4;
+   typedef __vector float             __cl_float4;
+   #define  __CL_UCHAR16__  1
+   #define  __CL_CHAR16__   1
+   #define  __CL_USHORT8__  1
+   #define  __CL_SHORT8__   1
+   #define  __CL_UINT4__    1
+   #define  __CL_INT4__     1
+   #define  __CL_FLOAT4__   1
+#endif
+
+#if defined( __SSE__ )
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <xmmintrin.h>
+    #endif
+    #if defined( __GNUC__ )
+        typedef float __cl_float4   __attribute__((vector_size(16)));
+    #else
+        typedef __m128 __cl_float4;
+    #endif
+    #define __CL_FLOAT4__   1
+#endif
+
+#if defined( __SSE2__ )
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <emmintrin.h>
+    #endif
+    #if defined( __GNUC__ )
+        typedef cl_uchar    __cl_uchar16    __attribute__((vector_size(16)));
+        typedef cl_char     __cl_char16     __attribute__((vector_size(16)));
+        typedef cl_ushort   __cl_ushort8    __attribute__((vector_size(16)));
+        typedef cl_short    __cl_short8     __attribute__((vector_size(16)));
+        typedef cl_uint     __cl_uint4      __attribute__((vector_size(16)));
+        typedef cl_int      __cl_int4       __attribute__((vector_size(16)));
+        typedef cl_ulong    __cl_ulong2     __attribute__((vector_size(16)));
+        typedef cl_long     __cl_long2      __attribute__((vector_size(16)));
+        typedef cl_double   __cl_double2    __attribute__((vector_size(16)));
+    #else
+        typedef __m128i __cl_uchar16;
+        typedef __m128i __cl_char16;
+        typedef __m128i __cl_ushort8;
+        typedef __m128i __cl_short8;
+        typedef __m128i __cl_uint4;
+        typedef __m128i __cl_int4;
+        typedef __m128i __cl_ulong2;
+        typedef __m128i __cl_long2;
+        typedef __m128d __cl_double2;
+    #endif
+    #define __CL_UCHAR16__  1
+    #define __CL_CHAR16__   1
+    #define __CL_USHORT8__  1
+    #define __CL_SHORT8__   1
+    #define __CL_INT4__     1
+    #define __CL_UINT4__    1
+    #define __CL_ULONG2__   1
+    #define __CL_LONG2__    1
+    #define __CL_DOUBLE2__  1
+#endif
+
+#if defined( __MMX__ )
+    #include <mmintrin.h>
+    #if defined( __GNUC__ )
+        typedef cl_uchar    __cl_uchar8     __attribute__((vector_size(8)));
+        typedef cl_char     __cl_char8      __attribute__((vector_size(8)));
+        typedef cl_ushort   __cl_ushort4    __attribute__((vector_size(8)));
+        typedef cl_short    __cl_short4     __attribute__((vector_size(8)));
+        typedef cl_uint     __cl_uint2      __attribute__((vector_size(8)));
+        typedef cl_int      __cl_int2       __attribute__((vector_size(8)));
+        typedef cl_ulong    __cl_ulong1     __attribute__((vector_size(8)));
+        typedef cl_long     __cl_long1      __attribute__((vector_size(8)));
+        typedef cl_float    __cl_float2     __attribute__((vector_size(8)));
+    #else
+        typedef __m64       __cl_uchar8;
+        typedef __m64       __cl_char8;
+        typedef __m64       __cl_ushort4;
+        typedef __m64       __cl_short4;
+        typedef __m64       __cl_uint2;
+        typedef __m64       __cl_int2;
+        typedef __m64       __cl_ulong1;
+        typedef __m64       __cl_long1;
+        typedef __m64       __cl_float2;
+    #endif
+    #define __CL_UCHAR8__   1
+    #define __CL_CHAR8__    1
+    #define __CL_USHORT4__  1
+    #define __CL_SHORT4__   1
+    #define __CL_INT2__     1
+    #define __CL_UINT2__    1
+    #define __CL_ULONG1__   1
+    #define __CL_LONG1__    1
+    #define __CL_FLOAT2__   1
+#endif
+
+#if defined( __AVX__ )
+    #if defined( __MINGW64__ )
+        #include <intrin.h>
+    #else
+        #include <immintrin.h>
+    #endif
+    #if defined( __GNUC__ )
+        typedef cl_float    __cl_float8     __attribute__((vector_size(32)));
+        typedef cl_double   __cl_double4    __attribute__((vector_size(32)));
+    #else
+        typedef __m256      __cl_float8;
+        typedef __m256d     __cl_double4;
+    #endif
+    #define __CL_FLOAT8__   1
+    #define __CL_DOUBLE4__  1
+#endif
+
+/* Define capabilities for anonymous struct members. */
+#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__
+#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ )
+#define  __CL_HAS_ANON_STRUCT__ 1
+#define  __CL_ANON_STRUCT__ __extension__
+#elif defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__)
+    #if _MSC_VER >= 1500
+   /* Microsoft Developer Studio 2008 supports anonymous structs, but
+    * complains by default. */
+    #define  __CL_HAS_ANON_STRUCT__ 1
+    #define  __CL_ANON_STRUCT__
+   /* Disable warning C4201: nonstandard extension used : nameless
+    * struct/union */
+    #pragma warning( push )
+    #pragma warning( disable : 4201 )
+    #endif
+#else
+#define  __CL_HAS_ANON_STRUCT__ 0
+#define  __CL_ANON_STRUCT__
+#endif
+
+/* Define alignment keys */
+#if defined( __GNUC__ )
+    #define CL_ALIGNED(_x)          __attribute__ ((aligned(_x)))
+#elif defined( _WIN32) && (_MSC_VER)
+    /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements     */
+    /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx                                                 */
+    /* #include <crtdefs.h>                                                                                             */
+    /* #define CL_ALIGNED(_x)          _CRT_ALIGN(_x)                                                                   */
+    #define CL_ALIGNED(_x)
+#else
+   #warning  Need to implement some method to align data here
+   #define  CL_ALIGNED(_x)
+#endif
+
+/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */
+#if __CL_HAS_ANON_STRUCT__
+    /* .xyzw and .s0123...{f|F} are supported */
+    #define CL_HAS_NAMED_VECTOR_FIELDS 1
+    /* .hi and .lo are supported */
+    #define CL_HAS_HI_LO_VECTOR_FIELDS 1
+#endif
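+
+/* Illustrative sketch (not normative): when the macros above are defined,
+ * the same element of a cl_float4 (defined further below) can be reached
+ * through the array, the named fields, or the hi/lo halves.
+ *
+ *   cl_float4 v;
+ *   v.s[0] = 1.0f;        // always available
+ * #if defined(CL_HAS_NAMED_VECTOR_FIELDS)
+ *   v.x  = 2.0f;          // same storage as v.s[0]
+ *   v.s0 = 3.0f;
+ * #endif
+ * #if defined(CL_HAS_HI_LO_VECTOR_FIELDS)
+ *   v.lo = v.hi;          // cl_float2 halves of the vector
+ * #endif
+ */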
+
+/* Define cl_vector types */
+
+/* ---- cl_charn ---- */
+typedef union
+{
+    cl_char  CL_ALIGNED(2) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_char  lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+    __cl_char2     v2;
+#endif
+}cl_char2;
+
+typedef union
+{
+    cl_char  CL_ALIGNED(4) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+    __cl_char2     v2[2];
+#endif
+#if defined( __CL_CHAR4__)
+    __cl_char4     v4;
+#endif
+}cl_char4;
+
+/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. */
+typedef  cl_char4  cl_char3;
+
+typedef union
+{
+    cl_char   CL_ALIGNED(8) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+    __cl_char2     v2[4];
+#endif
+#if defined( __CL_CHAR4__)
+    __cl_char4     v4[2];
+#endif
+#if defined( __CL_CHAR8__ )
+    __cl_char8     v8;
+#endif
+}cl_char8;
+
+typedef union
+{
+    cl_char  CL_ALIGNED(16) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_char  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_char  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; };
+#endif
+#if defined( __CL_CHAR2__)
+    __cl_char2     v2[8];
+#endif
+#if defined( __CL_CHAR4__)
+    __cl_char4     v4[4];
+#endif
+#if defined( __CL_CHAR8__ )
+    __cl_char8     v8[2];
+#endif
+#if defined( __CL_CHAR16__ )
+    __cl_char16    v16;
+#endif
+}cl_char16;
+
+
+/* ---- cl_ucharn ---- */
+typedef union
+{
+    cl_uchar  CL_ALIGNED(2) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+    __cl_uchar2     v2;
+#endif
+}cl_uchar2;
+
+typedef union
+{
+    cl_uchar  CL_ALIGNED(4) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+    __cl_uchar2     v2[2];
+#endif
+#if defined( __CL_UCHAR4__)
+    __cl_uchar4     v4;
+#endif
+}cl_uchar4;
+
+/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. */
+typedef  cl_uchar4  cl_uchar3;
+
+typedef union
+{
+    cl_uchar   CL_ALIGNED(8) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+    __cl_uchar2     v2[4];
+#endif
+#if defined( __CL_UCHAR4__)
+    __cl_uchar4     v4[2];
+#endif
+#if defined( __CL_UCHAR8__ )
+    __cl_uchar8     v8;
+#endif
+}cl_uchar8;
+
+typedef union
+{
+    cl_uchar  CL_ALIGNED(16) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uchar  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; };
+#endif
+#if defined( __CL_UCHAR2__)
+    __cl_uchar2     v2[8];
+#endif
+#if defined( __CL_UCHAR4__)
+    __cl_uchar4     v4[4];
+#endif
+#if defined( __CL_UCHAR8__ )
+    __cl_uchar8     v8[2];
+#endif
+#if defined( __CL_UCHAR16__ )
+    __cl_uchar16    v16;
+#endif
+}cl_uchar16;
+
+
+/* ---- cl_shortn ---- */
+typedef union
+{
+    cl_short  CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_short  lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+    __cl_short2     v2;
+#endif
+}cl_short2;
+
+typedef union
+{
+    cl_short  CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+    __cl_short2     v2[2];
+#endif
+#if defined( __CL_SHORT4__)
+    __cl_short4     v4;
+#endif
+}cl_short4;
+
+/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. */
+typedef  cl_short4  cl_short3;
+
+typedef union
+{
+    cl_short   CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+    __cl_short2     v2[4];
+#endif
+#if defined( __CL_SHORT4__)
+    __cl_short4     v4[2];
+#endif
+#if defined( __CL_SHORT8__ )
+    __cl_short8     v8;
+#endif
+}cl_short8;
+
+typedef union
+{
+    cl_short  CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_short  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_short  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; };
+#endif
+#if defined( __CL_SHORT2__)
+    __cl_short2     v2[8];
+#endif
+#if defined( __CL_SHORT4__)
+    __cl_short4     v4[4];
+#endif
+#if defined( __CL_SHORT8__ )
+    __cl_short8     v8[2];
+#endif
+#if defined( __CL_SHORT16__ )
+    __cl_short16    v16;
+#endif
+}cl_short16;
+
+
+/* ---- cl_ushortn ---- */
+typedef union
+{
+    cl_ushort  CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+    __cl_ushort2     v2;
+#endif
+}cl_ushort2;
+
+typedef union
+{
+    cl_ushort  CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+    __cl_ushort2     v2[2];
+#endif
+#if defined( __CL_USHORT4__)
+    __cl_ushort4     v4;
+#endif
+}cl_ushort4;
+
+/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. */
+typedef  cl_ushort4  cl_ushort3;
+
+typedef union
+{
+    cl_ushort   CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+    __cl_ushort2     v2[4];
+#endif
+#if defined( __CL_USHORT4__)
+    __cl_ushort4     v4[2];
+#endif
+#if defined( __CL_USHORT8__ )
+    __cl_ushort8     v8;
+#endif
+}cl_ushort8;
+
+typedef union
+{
+    cl_ushort  CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ushort  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; };
+#endif
+#if defined( __CL_USHORT2__)
+    __cl_ushort2     v2[8];
+#endif
+#if defined( __CL_USHORT4__)
+    __cl_ushort4     v4[4];
+#endif
+#if defined( __CL_USHORT8__ )
+    __cl_ushort8     v8[2];
+#endif
+#if defined( __CL_USHORT16__ )
+    __cl_ushort16    v16;
+#endif
+}cl_ushort16;
+
+
+/* ---- cl_halfn ---- */
+typedef union
+{
+    cl_half  CL_ALIGNED(4) s[2];
+#if __CL_HAS_ANON_STRUCT__
+    __CL_ANON_STRUCT__ struct{ cl_half  x, y; };
+    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1; };
+    __CL_ANON_STRUCT__ struct{ cl_half  lo, hi; };
+#endif
+#if defined( __CL_HALF2__)
+    __cl_half2     v2;
+#endif
+}cl_half2;
+
+typedef union
+{
+    cl_half  CL_ALIGNED(8) s[4];
+#if __CL_HAS_ANON_STRUCT__
+    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w; };
+    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3; };
+    __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; };
+#endif
+#if defined( __CL_HALF2__)
+    __cl_half2     v2[2];
+#endif
+#if defined( __CL_HALF4__)
+    __cl_half4     v4;
+#endif
+}cl_half4;
+
+/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. */
+typedef  cl_half4  cl_half3;
+
+typedef union
+{
+    cl_half   CL_ALIGNED(16) s[8];
+#if __CL_HAS_ANON_STRUCT__
+    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w; };
+    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3, s4, s5, s6, s7; };
+    __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; };
+#endif
+#if defined( __CL_HALF2__)
+    __cl_half2     v2[4];
+#endif
+#if defined( __CL_HALF4__)
+    __cl_half4     v4[2];
+#endif
+#if defined( __CL_HALF8__ )
+    __cl_half8     v8;
+#endif
+}cl_half8;
+
+typedef union
+{
+    cl_half  CL_ALIGNED(32) s[16];
+#if __CL_HAS_ANON_STRUCT__
+    __CL_ANON_STRUCT__ struct{ cl_half  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+    __CL_ANON_STRUCT__ struct{ cl_half  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+    __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; };
+#endif
+#if defined( __CL_HALF2__)
+    __cl_half2     v2[8];
+#endif
+#if defined( __CL_HALF4__)
+    __cl_half4     v4[4];
+#endif
+#if defined( __CL_HALF8__ )
+    __cl_half8     v8[2];
+#endif
+#if defined( __CL_HALF16__ )
+    __cl_half16    v16;
+#endif
+}cl_half16;
+
+/* ---- cl_intn ---- */
+typedef union
+{
+    cl_int  CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_int  lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+    __cl_int2     v2;
+#endif
+}cl_int2;
+
+typedef union
+{
+    cl_int  CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+    __cl_int2     v2[2];
+#endif
+#if defined( __CL_INT4__)
+    __cl_int4     v4;
+#endif
+}cl_int4;
+
+/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. */
+typedef  cl_int4  cl_int3;
+
+typedef union
+{
+    cl_int   CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+    __cl_int2     v2[4];
+#endif
+#if defined( __CL_INT4__)
+    __cl_int4     v4[2];
+#endif
+#if defined( __CL_INT8__ )
+    __cl_int8     v8;
+#endif
+}cl_int8;
+
+typedef union
+{
+    cl_int  CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_int  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_int  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; };
+#endif
+#if defined( __CL_INT2__)
+    __cl_int2     v2[8];
+#endif
+#if defined( __CL_INT4__)
+    __cl_int4     v4[4];
+#endif
+#if defined( __CL_INT8__ )
+    __cl_int8     v8[2];
+#endif
+#if defined( __CL_INT16__ )
+    __cl_int16    v16;
+#endif
+}cl_int16;
+
+
+/* ---- cl_uintn ---- */
+typedef union
+{
+    cl_uint  CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+    __cl_uint2     v2;
+#endif
+}cl_uint2;
+
+typedef union
+{
+    cl_uint  CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+    __cl_uint2     v2[2];
+#endif
+#if defined( __CL_UINT4__)
+    __cl_uint4     v4;
+#endif
+}cl_uint4;
+
+/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. */
+typedef  cl_uint4  cl_uint3;
+
+typedef union
+{
+    cl_uint   CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+    __cl_uint2     v2[4];
+#endif
+#if defined( __CL_UINT4__)
+    __cl_uint4     v4[2];
+#endif
+#if defined( __CL_UINT8__ )
+    __cl_uint8     v8;
+#endif
+}cl_uint8;
+
+typedef union
+{
+    cl_uint  CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_uint  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_uint  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; };
+#endif
+#if defined( __CL_UINT2__)
+    __cl_uint2     v2[8];
+#endif
+#if defined( __CL_UINT4__)
+    __cl_uint4     v4[4];
+#endif
+#if defined( __CL_UINT8__ )
+    __cl_uint8     v8[2];
+#endif
+#if defined( __CL_UINT16__ )
+    __cl_uint16    v16;
+#endif
+}cl_uint16;
+
+/* ---- cl_longn ---- */
+typedef union
+{
+    cl_long  CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_long  lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+    __cl_long2     v2;
+#endif
+}cl_long2;
+
+typedef union
+{
+    cl_long  CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+    __cl_long2     v2[2];
+#endif
+#if defined( __CL_LONG4__)
+    __cl_long4     v4;
+#endif
+}cl_long4;
+
+/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. */
+typedef  cl_long4  cl_long3;
+
+typedef union
+{
+    cl_long   CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+    __cl_long2     v2[4];
+#endif
+#if defined( __CL_LONG4__)
+    __cl_long4     v4[2];
+#endif
+#if defined( __CL_LONG8__ )
+    __cl_long8     v8;
+#endif
+}cl_long8;
+
+typedef union
+{
+    cl_long  CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_long  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_long  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; };
+#endif
+#if defined( __CL_LONG2__)
+    __cl_long2     v2[8];
+#endif
+#if defined( __CL_LONG4__)
+    __cl_long4     v4[4];
+#endif
+#if defined( __CL_LONG8__ )
+    __cl_long8     v8[2];
+#endif
+#if defined( __CL_LONG16__ )
+    __cl_long16    v16;
+#endif
+}cl_long16;
+
+
+/* ---- cl_ulongn ---- */
+typedef union
+{
+    cl_ulong  CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+    __cl_ulong2     v2;
+#endif
+}cl_ulong2;
+
+typedef union
+{
+    cl_ulong  CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+    __cl_ulong2     v2[2];
+#endif
+#if defined( __CL_ULONG4__)
+    __cl_ulong4     v4;
+#endif
+}cl_ulong4;
+
+/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. */
+typedef  cl_ulong4  cl_ulong3;
+
+typedef union
+{
+    cl_ulong   CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+    __cl_ulong2     v2[4];
+#endif
+#if defined( __CL_ULONG4__)
+    __cl_ulong4     v4[2];
+#endif
+#if defined( __CL_ULONG8__ )
+    __cl_ulong8     v8;
+#endif
+}cl_ulong8;
+
+typedef union
+{
+    cl_ulong  CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_ulong  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; };
+#endif
+#if defined( __CL_ULONG2__)
+    __cl_ulong2     v2[8];
+#endif
+#if defined( __CL_ULONG4__)
+    __cl_ulong4     v4[4];
+#endif
+#if defined( __CL_ULONG8__ )
+    __cl_ulong8     v8[2];
+#endif
+#if defined( __CL_ULONG16__ )
+    __cl_ulong16    v16;
+#endif
+}cl_ulong16;
+
+
+/* --- cl_floatn ---- */
+
+typedef union
+{
+    cl_float  CL_ALIGNED(8) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_float  lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+    __cl_float2     v2;
+#endif
+}cl_float2;
+
+typedef union
+{
+    cl_float  CL_ALIGNED(16) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_float2  lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+    __cl_float2     v2[2];
+#endif
+#if defined( __CL_FLOAT4__)
+    __cl_float4     v4;
+#endif
+}cl_float4;
+
+/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. */
+typedef  cl_float4  cl_float3;
+
+typedef union
+{
+    cl_float   CL_ALIGNED(32) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float   x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_float   s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_float4  lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+    __cl_float2     v2[4];
+#endif
+#if defined( __CL_FLOAT4__)
+    __cl_float4     v4[2];
+#endif
+#if defined( __CL_FLOAT8__ )
+    __cl_float8     v8;
+#endif
+}cl_float8;
+
+typedef union
+{
+    cl_float  CL_ALIGNED(64) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_float  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_float  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; };
+#endif
+#if defined( __CL_FLOAT2__)
+    __cl_float2     v2[8];
+#endif
+#if defined( __CL_FLOAT4__)
+    __cl_float4     v4[4];
+#endif
+#if defined( __CL_FLOAT8__ )
+    __cl_float8     v8[2];
+#endif
+#if defined( __CL_FLOAT16__ )
+    __cl_float16    v16;
+#endif
+}cl_float16;
+
+/* --- cl_doublen ---- */
+
+typedef union
+{
+    cl_double  CL_ALIGNED(16) s[2];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y; };
+   __CL_ANON_STRUCT__ struct{ cl_double s0, s1; };
+   __CL_ANON_STRUCT__ struct{ cl_double lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+    __cl_double2     v2;
+#endif
+}cl_double2;
+
+typedef union
+{
+    cl_double  CL_ALIGNED(32) s[4];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3; };
+   __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+    __cl_double2     v2[2];
+#endif
+#if defined( __CL_DOUBLE4__)
+    __cl_double4     v4;
+#endif
+}cl_double4;
+
+/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */
+typedef  cl_double4  cl_double3;
+
+typedef union
+{
+    cl_double   CL_ALIGNED(64) s[8];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7; };
+   __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+    __cl_double2     v2[4];
+#endif
+#if defined( __CL_DOUBLE4__)
+    __cl_double4     v4[2];
+#endif
+#if defined( __CL_DOUBLE8__ )
+    __cl_double8     v8;
+#endif
+}cl_double8;
+
+typedef union
+{
+    cl_double  CL_ALIGNED(128) s[16];
+#if __CL_HAS_ANON_STRUCT__
+   __CL_ANON_STRUCT__ struct{ cl_double  x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; };
+   __CL_ANON_STRUCT__ struct{ cl_double  s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; };
+   __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; };
+#endif
+#if defined( __CL_DOUBLE2__)
+    __cl_double2     v2[8];
+#endif
+#if defined( __CL_DOUBLE4__)
+    __cl_double4     v4[4];
+#endif
+#if defined( __CL_DOUBLE8__ )
+    __cl_double8     v8[2];
+#endif
+#if defined( __CL_DOUBLE16__ )
+    __cl_double16    v16;
+#endif
+}cl_double16;
+
+/* Macro to facilitate debugging
+ * Usage:
+ *   Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source.
+ *   The first line ends with:   CL_PROGRAM_STRING_DEBUG_INFO \"
+ *   Each line thereafter of OpenCL C source must end with: \n\
+ *   The last line ends in ";
+ *
+ *   Example:
+ *
+ *   const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\
+ *   kernel void foo( int a, float * b )             \n\
+ *   {                                               \n\
+ *      // my comment                                \n\
+ *      *b[ get_global_id(0)] = a;                   \n\
+ *   }                                               \n\
+ *   ";
+ *
+ * This should correctly set up the line, (column) and file information for your source
+ * string so you can do source level debugging.
+ */
+#define  __CL_STRINGIFY( _x )               # _x
+#define  _CL_STRINGIFY( _x )                __CL_STRINGIFY( _x )
+#define  CL_PROGRAM_STRING_DEBUG_INFO       "#line "  _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n"
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef __CL_HAS_ANON_STRUCT__
+#undef __CL_ANON_STRUCT__
+#if defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__)
+    #if _MSC_VER >=1500
+    #pragma warning( pop )
+    #endif
+#endif
+
+#endif  /* __CL_PLATFORM_H  */
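Reviewer note: the cl_floatn/cl_doublen unions above deliberately expose one block of storage through several views (the s[] array, the optional .x/.y/.z/.w, .s0..sF and .lo/.hi anonymous structs, and the native __cl_floatN vectors when available). A minimal host-side sketch of the equivalent access paths, with hypothetical variable names:

    #include <CL/cl_platform.h>
    #include <cassert>

    int main() {
        cl_float4 v;                          // 16-byte-aligned storage for four floats
        v.s[0] = 1.0f;                        // the array view is always available
    #if __CL_HAS_ANON_STRUCT__
        assert(v.x == 1.0f && v.s0 == 1.0f);  // .x/.y/.z/.w and .s0..s3 alias s[]
        v.lo.y = 2.0f;                        // .lo/.hi are the two cl_float2 halves
        assert(v.s[1] == 2.0f);
    #endif
        return 0;
    }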
diff --git a/ext/cudart/include/CL/opencl.h b/ext/cudart/include/CL/opencl.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd7fa6be57b8df26003c414041683340bcb95404
--- /dev/null
+++ b/ext/cudart/include/CL/opencl.h
@@ -0,0 +1,40 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/
+
+#ifndef __OPENCL_H
+#define __OPENCL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __APPLE__
+#include <OpenCL/cl.h>
+#include <OpenCL/cl_gl.h>
+#include <OpenCL/cl_gl_ext.h>
+#include <OpenCL/cl_ext.h>
+#else
+#include <CL/cl.h>
+#include <CL/cl_gl.h>
+#include <CL/cl_gl_ext.h>
+#include <CL/cl_ext.h>
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  /* __OPENCL_H   */
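As the includes show, opencl.h is only an umbrella header; applications include it (or CL/cl.h directly) and then call the regular entry points. A tiny hedged sketch of the customary first call, assuming an OpenCL ICD loader is linked:

    #include <CL/opencl.h>
    #include <cstdio>

    int main() {
        cl_uint count = 0;
        // Query the number of platforms first; allocating and fetching them is not shown.
        if (clGetPlatformIDs(0, nullptr, &count) == CL_SUCCESS)
            std::printf("OpenCL platforms available: %u\n", count);
        return 0;
    }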
diff --git a/ext/cudart/include/builtin_types.h b/ext/cudart/include/builtin_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..5247c40807f0dd36a886513ab1bff5d2977364db
--- /dev/null
+++ b/ext/cudart/include/builtin_types.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "device_types.h"
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "driver_types.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "surface_types.h"
+#include "texture_types.h"
+#include "vector_types.h"
diff --git a/ext/cudart/include/channel_descriptor.h b/ext/cudart/include/channel_descriptor.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d61d2974a4f2b527d04baf586c73c2af86358f4
--- /dev/null
+++ b/ext/cudart/include/channel_descriptor.h
@@ -0,0 +1,595 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CHANNEL_DESCRIPTOR_H__)
+#define __CHANNEL_DESCRIPTOR_H__
+
+#if defined(__cplusplus)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * \addtogroup CUDART_HIGHLEVEL
+ *
+ * @{
+ */
+
+/**
+ * \brief \hl Returns a channel descriptor using the specified format
+ *
+ * Returns a channel descriptor with format \p f and number of bits of each
+ * component \p x, \p y, \p z, and \p w.  The ::cudaChannelFormatDesc is
+ * defined as:
+ * \code
+  struct cudaChannelFormatDesc {
+    int x, y, z, w;
+    enum cudaChannelFormatKind f;
+  };
+ * \endcode
+ *
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat,
+ * ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2,
+ * ::cudaChannelFormatKindSignedNormalized8X4,
+ * ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2,
+ * ::cudaChannelFormatKindUnsignedNormalized8X4,
+ * ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2,
+ * ::cudaChannelFormatKindSignedNormalized16X4,
+ * ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2,
+ * ::cudaChannelFormatKindUnsignedNormalized16X4
+ * or ::cudaChannelFormatKindNV12.
+ *
+ * The format is specified by the template specialization.
+ *
+ * The template function specializes for the following scalar types:
+ * char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float.
+ * The template function specializes for the following vector types:
+ * char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}.
+ * The template function specializes for following cudaChannelFormatKind enum values:
+ * ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, and ::cudaChannelFormatKindNV12.
+ *
+ * Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone
+ *
+ * \return
+ * Channel descriptor with format \p f
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)",
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)",
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)"
+ */
+template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
+{
+  return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void)
+{
+  int e = (int)sizeof(char) * 8;
+
+#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__)
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void)
+{
+  int e = (int)sizeof(signed char) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned char>(void)
+{
+  int e = (int)sizeof(unsigned char) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void)
+{
+  int e = (int)sizeof(signed char) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void)
+{
+  int e = (int)sizeof(unsigned char) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void)
+{
+  int e = (int)sizeof(signed char) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void)
+{
+  int e = (int)sizeof(unsigned char) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void)
+{
+  int e = (int)sizeof(signed char) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void)
+{
+  int e = (int)sizeof(unsigned char) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void)
+{
+  int e = (int)sizeof(short) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void)
+{
+  int e = (int)sizeof(short) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void)
+{
+  int e = (int)sizeof(short) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void)
+{
+  int e = (int)sizeof(short) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void)
+{
+  int e = (int)sizeof(unsigned short) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void)
+{
+  int e = (int)sizeof(int) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void)
+{
+  int e = (int)sizeof(unsigned int) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void)
+{
+  int e = (int)sizeof(int) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void)
+{
+  int e = (int)sizeof(unsigned int) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void)
+{
+  int e = (int)sizeof(int) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void)
+{
+  int e = (int)sizeof(unsigned int) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void)
+{
+  int e = (int)sizeof(int) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void)
+{
+  int e = (int)sizeof(unsigned int) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
+}
+
+#if !defined(__LP64__)
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void)
+{
+  int e = (int)sizeof(long) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void)
+{
+  int e = (int)sizeof(unsigned long) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void)
+{
+  int e = (int)sizeof(long) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void)
+{
+  int e = (int)sizeof(unsigned long) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void)
+{
+  int e = (int)sizeof(long) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void)
+{
+  int e = (int)sizeof(unsigned long) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void)
+{
+  int e = (int)sizeof(long) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void)
+{
+  int e = (int)sizeof(unsigned long) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned);
+}
+
+#endif /* !__LP64__ */
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void)
+{
+  int e = (int)sizeof(float) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void)
+{
+  int e = (int)sizeof(float) * 8;
+
+  return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void)
+{
+  int e = (int)sizeof(float) * 8;
+
+  return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void)
+{
+  int e = (int)sizeof(float) * 8;
+
+  return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
+}
+
+static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void)
+{
+    int e = (int)sizeof(char) * 8;
+
+    return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12);
+}
+
+template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void)
+{
+    return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone);
+}
+
+/* Signed 8-bit normalized integer formats */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void)
+{
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4);
+}
+
+/* Unsigned 8-bit normalized integer formats */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void)
+{
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4);
+}
+
+/* Signed 16-bit normalized integer formats */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void)
+{
+    return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4);
+}
+
+/* Unsigned 16-bit normalized integer formats */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void)
+{
+    return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2);
+}
+
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4);
+}
+
+/* NV12 format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12);
+}
+
+/* BC1 format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1);
+}
+
+/* BC1sRGB format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB);
+}
+
+/* BC2 format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2);
+}
+
+/* BC2sRGB format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB);
+}
+
+/* BC3 format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3);
+}
+
+/* BC3sRGB format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB);
+}
+
+/* BC4 unsigned format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void)
+{
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4);
+}
+
+/* BC4 signed format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void)
+{
+    return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4);
+}
+
+/* BC5 unsigned format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5);
+}
+
+/* BC5 signed format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5);
+}
+
+/* BC6H unsigned format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H);
+}
+
+/* BC6H signed format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void)
+{
+    return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H);
+}
+
+/* BC7 format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7);
+}
+
+/* BC7sRGB format */
+template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void)
+{
+    return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB);
+}
+
+#endif /* __cplusplus */
+
+/** @} */
+/** @} */ /* END CUDART_TEXTURE_HL */
+
+#endif /* !__CHANNEL_DESCRIPTOR_H__ */
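The specializations above only build a cudaChannelFormatDesc; a hedged sketch of how such a descriptor is typically consumed by the runtime (cudaMallocArray/cudaFreeArray come from cuda_runtime_api.h, not from this header, and the sizes are illustrative):

    #include <cuda_runtime.h>   // pulls in channel_descriptor.h for C++ translation units

    int main() {
        // Four 32-bit float channels: x = y = z = w = 32, kind = cudaChannelFormatKindFloat.
        cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>();

        // Back a 256x256 CUDA array with that texel format.
        cudaArray_t array = nullptr;
        cudaError_t err = cudaMallocArray(&array, &desc, 256, 256);
        if (err == cudaSuccess)
            cudaFreeArray(array);
        return err == cudaSuccess ? 0 : 1;
    }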
diff --git a/ext/cudart/include/common_functions.h b/ext/cudart/include/common_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f8ea3d242640f2196b789c7da6c05d2ed1bed3e
--- /dev/null
+++ b/ext/cudart/include/common_functions.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("common_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "common_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
+#endif
+
+#include "crt/common_functions.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
+#endif
diff --git a/ext/cudart/include/cooperative_groups.h b/ext/cudart/include/cooperative_groups.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8823fb79bc78e513861f9e7d3e671bf0effc6b7
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups.h
@@ -0,0 +1,1828 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _COOPERATIVE_GROUPS_H_
+#define _COOPERATIVE_GROUPS_H_
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#include "cooperative_groups/details/info.h"
+#include "cooperative_groups/details/driver_abi.h"
+#include "cooperative_groups/details/helpers.h"
+
+#if defined(_CG_HAS_STL_ATOMICS)
+#include <cuda/atomic>
+#define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope;
+#else
+#define _CG_THREAD_SCOPE(scope)
+#endif
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+    _CG_CONST_DECL unsigned int coalesced_group_id = 1;
+    _CG_CONST_DECL unsigned int multi_grid_group_id = 2;
+    _CG_CONST_DECL unsigned int grid_group_id = 3;
+    _CG_CONST_DECL unsigned int thread_block_id = 4;
+    _CG_CONST_DECL unsigned int multi_tile_group_id = 5;
+    _CG_CONST_DECL unsigned int cluster_group_id = 6;
+}
+
+/**
+ * class thread_group;
+ *
+ * Generic thread group type, into which all groups are convertible.
+ * It acts as a container for all storage necessary for the derived groups,
+ * and will dispatch the API calls to the correct derived group. This means
+ * that all derived groups must implement the same interface as thread_group.
+ */
+class thread_group
+{
+protected:
+    struct group_data {
+        unsigned int _unused : 1;
+        unsigned int type : 7, : 0;
+    };
+
+    struct gg_data  {
+        details::grid_workspace *gridWs;
+    };
+
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    struct mg_data  {
+        unsigned long long _unused : 1;
+        unsigned long long type    : 7;
+        unsigned long long handle  : 56;
+        const details::multi_grid::multi_grid_functions *functions;
+    };
+#endif
+
+    struct tg_data {
+        unsigned int is_tiled : 1;
+        unsigned int type : 7;
+        unsigned int size : 24;
+        // packed to 4b
+        unsigned int metaGroupSize : 16;
+        unsigned int metaGroupRank : 16;
+        // packed to 8b
+        unsigned int mask;
+        // packed to 12b
+        unsigned int _res;
+    };
+
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
+    friend class thread_block;
+
+    union __align__(8) {
+        group_data  group;
+        tg_data     coalesced;
+        gg_data     grid;
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+        mg_data     multi_grid;
+#endif
+    } _data;
+
+    _CG_QUALIFIER thread_group operator=(const thread_group& src);
+
+    _CG_QUALIFIER thread_group(unsigned int type) {
+        _data.group.type = type;
+        _data.group._unused = false;
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    static_assert(sizeof(tg_data) <= 16, "Failed size check");
+    static_assert(sizeof(gg_data) <= 16, "Failed size check");
+#  ifdef _CG_ABI_EXPERIMENTAL
+    static_assert(sizeof(mg_data) <= 16, "Failed size check");
+#  endif
+#endif
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
+
+    _CG_QUALIFIER unsigned long long size() const;
+    _CG_QUALIFIER unsigned long long num_threads() const;
+    _CG_QUALIFIER unsigned long long thread_rank() const;
+    _CG_QUALIFIER void sync() const;
+    _CG_QUALIFIER unsigned int get_type() const {
+        return _data.group.type;
+    }
+
+};
+
+template <unsigned int TyId>
+struct thread_group_base : public thread_group {
+    _CG_QUALIFIER thread_group_base() : thread_group(TyId) {}
+    _CG_STATIC_CONST_DECL unsigned int id = TyId;
+};
+
+#if defined(_CG_HAS_MULTI_GRID_GROUP)
+
+/**
+ * class multi_grid_group;
+ *
+ * Threads within this group are guaranteed to be co-resident on the
 * Threads within this group are guaranteed to be co-resident on the
+ * same system, on multiple devices within the same launched kernels.
+ * To use this group, the kernel must have been launched with
+ * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
+ * and the device must support it (queryable device attribute).
+ *
+ * Constructed via this_multi_grid();
+ */
+
+
+# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+class multi_grid_group;
+
+// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
+template <typename = void>
+__device__ _CG_DEPRECATED multi_grid_group this_multi_grid();
+
+class multi_grid_group : public thread_group_base<details::multi_grid_group_id>
+{
+private:
+    template <typename = void>
+    _CG_QUALIFIER multi_grid_group() {
+        _data.multi_grid.functions = details::multi_grid::load_grid_intrinsics();
+        _data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle();
+    }
+
+    friend multi_grid_group this_multi_grid<void>();
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
+
+    _CG_QUALIFIER bool is_valid() const {
+        return (_data.multi_grid.handle != 0);
+    }
+
+    _CG_QUALIFIER void sync() const {
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        _data.multi_grid.functions->sync(_data.multi_grid.handle);
+    }
+
+    _CG_QUALIFIER unsigned long long num_threads() const {
+        _CG_ASSERT(is_valid());
+        return _data.multi_grid.functions->size(_data.multi_grid.handle);
+    }
+
+    _CG_QUALIFIER unsigned long long size() const {
+        return num_threads();
+    }
+
+    _CG_QUALIFIER unsigned long long thread_rank() const {
+        _CG_ASSERT(is_valid());
+        return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
+    }
+
+    _CG_QUALIFIER unsigned int grid_rank() const {
+        _CG_ASSERT(is_valid());
+        return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
+    }
+
+    _CG_QUALIFIER unsigned int num_grids() const {
+        _CG_ASSERT(is_valid());
+        return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
+    }
+};
+# else
+class multi_grid_group
+{
+private:
+    unsigned long long _handle;
+    unsigned int _size;
+    unsigned int _rank;
+
+    friend _CG_QUALIFIER multi_grid_group this_multi_grid();
+
+    _CG_QUALIFIER multi_grid_group() {
+        _handle = details::multi_grid::get_intrinsic_handle();
+        _size = details::multi_grid::size(_handle);
+        _rank = details::multi_grid::thread_rank(_handle);
+    }
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
+
+    _CG_QUALIFIER _CG_DEPRECATED bool is_valid() const {
+        return (_handle != 0);
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED void sync() const {
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        details::multi_grid::sync(_handle);
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const {
+        _CG_ASSERT(is_valid());
+        return _size;
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const {
+        return num_threads();
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const {
+        _CG_ASSERT(is_valid());
+        return _rank;
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const {
+        _CG_ASSERT(is_valid());
+        return (details::multi_grid::grid_rank(_handle));
+    }
+
+    _CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const {
+        _CG_ASSERT(is_valid());
+        return (details::multi_grid::num_grids(_handle));
+    }
+};
+# endif
+
+/**
+ * multi_grid_group this_multi_grid()
+ *
+ * Constructs a multi_grid_group
+ */
+# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+template <typename>
+__device__
+#else
+_CG_QUALIFIER
+# endif
+_CG_DEPRECATED
+multi_grid_group this_multi_grid()
+{
+    return multi_grid_group();
+}
+#endif
+
+/**
+ * class grid_group;
+ *
+ * Threads within this group are guaranteed to be co-resident on the
+ * same device within the same launched kernel. To use this group, the kernel
+ * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
+ * and the device must support it (queryable device attribute).
+ *
+ * Constructed via this_grid();
+ */
+class grid_group : public thread_group_base<details::grid_group_id>
+{
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
+    friend _CG_QUALIFIER grid_group this_grid();
+
+private:
+    _CG_QUALIFIER grid_group(details::grid_workspace *gridWs) {
+        _data.grid.gridWs = gridWs;
+    }
+
+ public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
+
+    _CG_QUALIFIER bool is_valid() const {
+        return (_data.grid.gridWs != NULL);
+    }
+
+    _CG_QUALIFIER void sync() const {
+        if (!is_valid()) {
+            _CG_ABORT();
+        }
+        details::grid::sync(&_data.grid.gridWs->barrier);
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long size() {
+        return details::grid::size();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long thread_rank() {
+        return details::grid::thread_rank();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 group_dim() {
+        return details::grid::grid_dim();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long num_threads() {
+        return details::grid::num_threads();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_blocks() {
+        return details::grid::dim_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long num_blocks() {
+        return details::grid::num_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 block_index() {
+        return details::grid::block_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long block_rank() {
+        return details::grid::block_rank();
+    }
+
+# if defined(_CG_HAS_CLUSTER_GROUP)
+    _CG_STATIC_QUALIFIER dim3 dim_clusters() {
+        return details::grid::dim_clusters();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
+        return details::grid::num_clusters();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 cluster_index() {
+        return details::grid::cluster_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
+        return details::grid::cluster_rank();
+    }
+# endif
+};
+
+_CG_QUALIFIER grid_group this_grid() {
+    // Load a workspace from the driver
+    grid_group gg(details::get_grid_workspace());
+#ifdef _CG_DEBUG
+    // *all* threads must be available to synchronize
+    gg.sync();
+#endif // _CG_DEBUG
+    return gg;
+}
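For review context, a hedged sketch of how grid_group is meant to be used; the kernel below is illustrative and not part of this header, and, as the comment above notes, grid.sync() is only valid when the kernel is launched cooperatively:

    #include <cooperative_groups.h>
    namespace cg = cooperative_groups;

    __global__ void two_phase(float* data, size_t n) {
        cg::grid_group grid = cg::this_grid();
        // Phase 1: grid-stride update over the whole buffer.
        for (unsigned long long i = grid.thread_rank(); i < n; i += grid.num_threads())
            data[i] *= 2.0f;
        grid.sync();   // requires cudaLaunchCooperativeKernel (or the driver equivalent)
        // Phase 2: every thread may now read any element written in phase 1.
    }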
+
+#if defined(_CG_HAS_CLUSTER_GROUP)
+/**
+ * class cluster_group
+ *
+ * Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
+ * divided along all dimensions to form groups of blocks, each group of which is
+ * a block cluster. Clustered grids are subject to various restrictions and
+ * limitations. Primarily, a cluster consists of at most 8 blocks by default
+ * (although the user is allowed to opt-in to non-standard sizes,) and clustered
+ * grids are subject to additional occupancy limitations due to per-cluster
+ * hardware resource consumption. In exchange, a block cluster is guaranteed to
+ * be a cooperative group, with access to all cooperative group capabilities, as
+ * well as cluster specific capabilities and accelerations. A cluster_group
+ * represents a block cluster.
+ *
+ * Constructed via this_cluster_group();
+ */
+class cluster_group : public thread_group_base<details::cluster_group_id>
+{
+    // Friends
+    friend _CG_QUALIFIER cluster_group this_cluster();
+
+    // Disable constructor
+    _CG_QUALIFIER cluster_group()
+    {
+    }
+
+ public:
+    //_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)
+
+    // Functionality exposed by the group
+    _CG_STATIC_QUALIFIER void sync()
+    {
+        return details::cluster::sync();
+    }
+
+    _CG_STATIC_QUALIFIER void barrier_arrive()
+    {
+        return details::cluster::barrier_arrive();
+    }
+
+    _CG_STATIC_QUALIFIER void barrier_wait()
+    {
+        return details::cluster::barrier_wait();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
+    {
+        return details::cluster::query_shared_rank(addr);
+    }
+
+    template <typename T>
+    _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
+    {
+        return details::cluster::map_shared_rank(addr, rank);
+    }
+
+    _CG_STATIC_QUALIFIER dim3 block_index()
+    {
+        return details::cluster::block_index();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int block_rank()
+    {
+        return details::cluster::block_rank();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int thread_rank()
+    {
+        return details::cluster::thread_rank();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_blocks()
+    {
+        return details::cluster::dim_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int num_blocks()
+    {
+        return details::cluster::num_blocks();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_threads()
+    {
+        return details::cluster::dim_threads();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int num_threads()
+    {
+        return details::cluster::num_threads();
+    }
+
+    // Legacy aliases
+    _CG_STATIC_QUALIFIER unsigned int size()
+    {
+        return num_threads();
+    }
+};
+
+/*
+ * cluster_group this_cluster()
+ *
+ * Constructs a cluster_group
+ */
+_CG_QUALIFIER cluster_group this_cluster()
+{
+    cluster_group cg;
+#ifdef _CG_DEBUG
+    cg.sync();
+#endif
+    return cg;
+}
+#endif
+
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+namespace details {
+
+    _CG_CONSTEXPR_QUALIFIER unsigned int scratch_sync_memory_size(unsigned int max_block_size) {
+        // One barrier per possible size of the group rounded up to multiple of 4.
+        return 8 * sizeof(details::barrier_t);
+    }
+
+    _CG_CONSTEXPR_QUALIFIER unsigned int scratch_collectives_memory_size(unsigned int communication_size, unsigned int max_block_size) {
+        // One slot of collectives memory per warp.
+        return max_block_size / 32 * communication_size;
+    }
+
+    _CG_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int communication_size, unsigned int max_block_size) {
+        return scratch_sync_memory_size(max_block_size) + scratch_collectives_memory_size(communication_size, max_block_size);
+    }
+
+    _CG_CONSTEXPR_QUALIFIER size_t scratch_alignment(unsigned int communication_size) {
+        // Parenthesize the power-of-two test explicitly: '==' binds tighter than '&'.
+        return (((communication_size & (communication_size - 1)) == 0) && communication_size > 8) ?
+            communication_size : 8;
+    }
+
+    _CG_CONST_DECL unsigned int default_tile_communication_size = 8;
+    _CG_CONST_DECL unsigned int default_max_block_size = 1024;
+
+    struct multi_warp_scratch {
+        char memory[1];
+    };
+}
+
+class thread_block;
+namespace experimental {
+    template <unsigned int TileCommunicationSize = details::default_tile_communication_size,
+              unsigned int MaxBlockSize = details::default_max_block_size>
+    struct __align__(details::scratch_alignment(TileCommunicationSize)) block_tile_memory {
+    private:
+        char scratch[details::scratch_size_needed(TileCommunicationSize, MaxBlockSize)];
+
+    public:
+        _CG_QUALIFIER void* get_memory() {
+            return static_cast<void*>(scratch);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int get_size() {
+            return details::scratch_size_needed(TileCommunicationSize, MaxBlockSize);
+        }
+    };
+
+    template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
+    _CG_QUALIFIER thread_block this_thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch);
+}
+#endif
+
+/**
+ * class thread_block
+ *
+ * Every GPU kernel is executed by a grid of thread blocks, and threads within
+ * each block are guaranteed to reside on the same streaming multiprocessor.
+ * A thread_block represents a thread block whose dimensions are not known until runtime.
+ *
+ * Constructed via this_thread_block();
+ */
+class thread_block : public thread_group_base<details::thread_block_id>
+{
+    // Friends
+    friend _CG_QUALIFIER thread_block this_thread_block();
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
+
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
+    friend _CG_QUALIFIER thread_block experimental::this_thread_block(
+            experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch);
+
+    const unsigned short communication_size;
+    const unsigned short max_block_size;
+    details::multi_warp_scratch* const tile_memory;
+
+    template <unsigned int Size>
+    friend class __static_size_multi_warp_tile_base;
+
+    template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
+    _CG_QUALIFIER thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch) :
+        tile_memory(reinterpret_cast<details::multi_warp_scratch*>(&scratch)),
+        communication_size(TileCommunicationSize), max_block_size(MaxBlockSize) {
+        if (thread_rank() < details::scratch_sync_memory_size(MaxBlockSize) / sizeof(details::barrier_t)) {
+            details::barrier_t* barriers = reinterpret_cast<details::barrier_t*>(&tile_memory->memory);
+            barriers[thread_rank()] = 0;
+        }
+        sync();
+    }
+#endif
+
+    // Disable constructor
+    _CG_QUALIFIER thread_block()
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    : tile_memory(NULL), communication_size(0), max_block_size(0)
+#endif
+    { }
+
+    // Internal Use
+    _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
+        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
+
+        // Invalid, immediately fail
+        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
+            details::abort();
+            return (thread_block());
+        }
+
+        unsigned int mask;
+        unsigned int base_offset = thread_rank() & (~(tilesz - 1));
+        unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
+
+        mask = (unsigned int)(-1) >> (32 - masklength);
+        mask <<= (details::laneid() & ~(tilesz - 1));
+        thread_group tile = thread_group(details::coalesced_group_id);
+        tile._data.coalesced.mask = mask;
+        tile._data.coalesced.size = __popc(mask);
+        tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz;
+        tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
+        tile._data.coalesced.is_tiled = true;
+        return (tile);
+    }
+
+ public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
+
+    _CG_STATIC_QUALIFIER void sync() {
+        details::cta::sync();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int size() {
+        return details::cta::size();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int thread_rank() {
+        return details::cta::thread_rank();
+    }
+
+    // Additional functionality exposed by the group
+    _CG_STATIC_QUALIFIER dim3 group_index() {
+        return details::cta::group_index();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 thread_index() {
+        return details::cta::thread_index();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 group_dim() {
+        return details::cta::block_dim();
+    }
+
+    _CG_STATIC_QUALIFIER dim3 dim_threads() {
+        return details::cta::dim_threads();
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int num_threads() {
+        return details::cta::num_threads();
+    }
+
+};
+
+/**
+ * thread_block this_thread_block()
+ *
+ * Constructs a thread_block group
+ */
+_CG_QUALIFIER thread_block this_thread_block()
+{
+    return (thread_block());
+}
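+
+/**
+ * Example (a minimal illustrative sketch; the kernel name and body below are placeholders,
+ * not part of this header):
+ *
+ *   __global__ void example_kernel() {
+ *       cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
+ *       block.sync();                                   // every thread of the block participates
+ *       unsigned int rank = block.thread_rank();        // in [0, block.num_threads())
+ *   }
+ */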
+
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+namespace experimental {
+    template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize>
+    _CG_QUALIFIER thread_block this_thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch) {
+        return (thread_block(scratch));
+    }
+}
+#endif
+
+/**
+ * class coalesced_group
+ *
+ * A group representing the current set of converged threads in a warp.
+ * The size of the group is not guaranteed and it may return a group of
+ * only one thread (itself).
+ *
+ * This group exposes warp-synchronous builtins.
+ * Constructed via coalesced_threads();
+ */
+class coalesced_group : public thread_group_base<details::coalesced_group_id>
+{
+private:
+    friend _CG_QUALIFIER coalesced_group coalesced_threads();
+    friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
+    friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
+    friend class details::_coalesced_group_data_access;
+
+    _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
+        unsigned int member_pack = 0;
+        unsigned int member_rank = 0;
+        for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
+            unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
+            if (lane_bit) {
+                if (laneMask & lane_bit)
+                    member_pack |= 1 << member_rank;
+                member_rank++;
+            }
+        }
+        return (member_pack);
+    }
+
+    // Internal Use
+    _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
+        const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
+
+        // Invalid, immediately fail
+        if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
+            details::abort();
+            return (coalesced_group(0));
+        }
+        if (size() <= tilesz) {
+            return (*this);
+        }
+
+        if ((_data.coalesced.is_tiled == true) && pow2_tilesz) {
+            unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
+            unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
+            unsigned int mask = (unsigned int)(-1) >> (32 - masklength);
+
+            mask <<= (details::laneid() & ~(tilesz - 1));
+            coalesced_group coalesced_tile = coalesced_group(mask);
+            coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz;
+            coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
+            coalesced_tile._data.coalesced.is_tiled = true;
+            return (coalesced_tile);
+        }
+        else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) {
+            unsigned int mask = 0;
+            unsigned int member_rank = 0;
+            int seen_lanes = (thread_rank() / tilesz) * tilesz;
+            for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
+                unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
+                if (lane_bit) {
+                    if (seen_lanes <= 0 && member_rank < tilesz) {
+                        mask |= lane_bit;
+                        member_rank++;
+                    }
+                    seen_lanes--;
+                }
+            }
+            coalesced_group coalesced_tile = coalesced_group(mask);
+            // Override parent with the size of this group
+            coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz;
+            coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
+            return coalesced_tile;
+        }
+        else {
+            // None in _CG_VERSION 1000
+            details::abort();
+        }
+
+        return (coalesced_group(0));
+    }
+
+ protected:
+    _CG_QUALIFIER coalesced_group(unsigned int mask) {
+        _data.coalesced.mask = mask;
+        _data.coalesced.size = __popc(mask);
+        _data.coalesced.metaGroupRank = 0;
+        _data.coalesced.metaGroupSize = 1;
+        _data.coalesced.is_tiled = false;
+    }
+
+    _CG_QUALIFIER unsigned int get_mask() const {
+        return (_data.coalesced.mask);
+    }
+
+ public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
+
+    _CG_QUALIFIER unsigned int num_threads() const {
+        return _data.coalesced.size;
+    }
+
+    _CG_QUALIFIER unsigned int size() const {
+        return num_threads();
+    }
+
+    _CG_QUALIFIER unsigned int thread_rank() const {
+        return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
+    }
+
+    // Rank of this group in the upper level of the hierarchy
+    _CG_QUALIFIER unsigned int meta_group_rank() const {
+        return _data.coalesced.metaGroupRank;
+    }
+
+    // Total number of partitions created from the parent group when this group was created
+    _CG_QUALIFIER unsigned int meta_group_size() const {
+        return _data.coalesced.metaGroupSize;
+    }
+
+    _CG_QUALIFIER void sync() const {
+        __syncwarp(_data.coalesced.mask);
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
+        unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 :
+            (size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1));
+
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
+        if (size() == 32) {
+            return details::tile::shuffle_dispatch<TyElem>::shfl_down(
+                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
+        }
+
+        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
+
+        if (lane >= 32)
+            lane = details::laneid();
+
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const {
+        if (size() == 32) {
+            return details::tile::shuffle_dispatch<TyElem>::shfl_up(
+                _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
+        }
+
+        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
+        if (lane >= 32)
+            lane = details::laneid();
+
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
+    }
+#else
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :
+            (size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));
+        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        if (size() == 32) {
+            return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
+        }
+        unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
+        if (lane >= 32) lane = details::laneid();
+        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        if (size() == 32) {
+            return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
+        }
+        unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
+        if (lane >= 32) lane = details::laneid();
+        return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
+    }
+#endif
+
+    _CG_QUALIFIER int any(int predicate) const {
+        return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
+    }
+    _CG_QUALIFIER int all(int predicate) const {
+        return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
+    }
+    _CG_QUALIFIER unsigned int ballot(int predicate) const {
+        if (size() == 32) {
+            return (__ballot_sync(0xFFFFFFFF, predicate));
+        }
+        unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
+        return (_packLanes(lane_ballot));
+    }
+
+#ifdef _CG_HAS_MATCH_COLLECTIVE
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        if (size() == 32) {
+            return (__match_any_sync(0xFFFFFFFF, val));
+        }
+        unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);
+        return (_packLanes(lane_match));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        if (size() == 32) {
+            return (__match_all_sync(0xFFFFFFFF, val, &pred));
+        }
+        unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);
+        return (_packLanes(lane_match));
+    }
+
+#endif /* !_CG_HAS_MATCH_COLLECTIVE */
+
+};
+
+_CG_QUALIFIER coalesced_group coalesced_threads()
+{
+    return (coalesced_group(__activemask()));
+}
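+
+/**
+ * Example (a minimal illustrative sketch of warp-aggregated atomics; function and variable
+ * names are placeholders):
+ *
+ *   __device__ int filtered_append(int* counter, bool keep) {
+ *       int slot = -1;
+ *       if (keep) {
+ *           cooperative_groups::coalesced_group active = cooperative_groups::coalesced_threads();
+ *           if (active.thread_rank() == 0)                       // one atomic per converged group
+ *               slot = atomicAdd(counter, (int)active.num_threads());
+ *           slot = active.shfl(slot, 0) + active.thread_rank();  // distribute the base offset
+ *       }
+ *       return slot;
+ *   }
+ */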
+
+namespace details {
+    template <unsigned int Size> struct verify_thread_block_tile_size;
+    template <> struct verify_thread_block_tile_size<32> { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<8>  { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<4>  { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<2>  { typedef void OK; };
+    template <> struct verify_thread_block_tile_size<1>  { typedef void OK; };
+
+#ifdef _CG_CPP11_FEATURES
+    template <unsigned int Size>
+    using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>;
+
+    template <unsigned int Size>
+    using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>;
+    template <unsigned int Size>
+    using _is_multi_warp =
+    _CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>;
+
+    template <unsigned int Size>
+    using _is_valid_single_warp_tile =
+        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>;
+    template <unsigned int Size>
+    using _is_valid_multi_warp_tile =
+        _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>;
+#else
+    template <unsigned int Size>
+    struct _is_multi_warp {
+        static const bool value = false;
+    };
+#endif
+}
+
+template <unsigned int Size>
+class __static_size_tile_base
+{
+protected:
+    _CG_STATIC_CONST_DECL unsigned int numThreads = Size;
+
+public:
+    _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
+
+    // Rank of thread within tile
+    _CG_STATIC_QUALIFIER unsigned int thread_rank() {
+        return (details::cta::thread_rank() & (numThreads - 1));
+    }
+
+    // Number of threads within tile
+    _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
+        return numThreads;
+    }
+
+    _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() {
+        return num_threads();
+    }
+};
+
+template <unsigned int Size>
+class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>
+{
+    friend class details::_coalesced_group_data_access;
+    typedef details::tile::tile_helpers<Size> th;
+
+#ifdef _CG_CPP11_FEATURES
+    static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
+#else
+    typedef typename details::verify_thread_block_tile_size<Size>::OK valid;
+#endif
+    using __static_size_tile_base<Size>::numThreads;
+    _CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;
+
+ protected:
+    _CG_STATIC_QUALIFIER unsigned int build_mask() {
+        unsigned int mask = fullMask;
+        if (numThreads != 32) {
+            // Lane ID in [0,31] identifying the calling thread within its warp
+            unsigned int laneId = details::laneid();
+            // shift mask according to the partition it belongs to
+            mask = th::tileMask << (laneId & ~(th::laneMask));
+        }
+        return (mask);
+    }
+
+public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
+
+    _CG_STATIC_QUALIFIER void sync() {
+        __syncwarp(build_mask());
+    }
+
+#ifdef _CG_CPP11_FEATURES
+    // PTX supported collectives
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
+        return details::tile::shuffle_dispatch<TyElem>::shfl(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
+        return details::tile::shuffle_dispatch<TyElem>::shfl_down(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {
+        return details::tile::shuffle_dispatch<TyElem>::shfl_up(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
+    }
+
+    template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
+    _CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {
+        return details::tile::shuffle_dispatch<TyElem>::shfl_xor(
+            _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
+    }
+#else
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return (__shfl_sync(build_mask(), var, srcRank, numThreads));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return (__shfl_down_sync(build_mask(), var, delta, numThreads));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return (__shfl_up_sync(build_mask(), var, delta, numThreads));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
+    }
+#endif //_CG_CPP11_FEATURES
+
+    _CG_QUALIFIER int any(int predicate) const {
+        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
+        return (lane_ballot != 0);
+    }
+    _CG_QUALIFIER int all(int predicate) const {
+        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
+        return (lane_ballot == build_mask());
+    }
+    _CG_QUALIFIER unsigned int ballot(int predicate) const {
+        unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
+        return (lane_ballot >> (details::laneid() & (~(th::laneMask))));
+    }
+
+#ifdef _CG_HAS_MATCH_COLLECTIVE
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        unsigned int lane_match = __match_any_sync(build_mask(), val);
+        return (lane_match >> (details::laneid() & (~(th::laneMask))));
+    }
+
+    template <typename TyIntegral>
+    _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
+        details::assert_if_not_arithmetic<TyIntegral>();
+        unsigned int lane_match = __match_all_sync(build_mask(), val, &pred);
+        return (lane_match >> (details::laneid() & (~(th::laneMask))));
+    }
+#endif
+
+};
+
+template <unsigned int Size, typename ParentT>
+class __static_parent_thread_block_tile_base
+{
+public:
+    // Rank of this group in the upper level of the hierarchy
+    _CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
+        return ParentT::thread_rank() / Size;
+    }
+
+    // Total number of partitions created from the parent group when this group was created
+    _CG_STATIC_QUALIFIER unsigned int meta_group_size() {
+        return (ParentT::size() + Size - 1) / Size;
+    }
+};
+
+/**
+ * class thread_block_tile<unsigned int Size, ParentT = void>
+ *
+ * Statically-sized group type, representing one tile of a thread block.
+ * The only specializations currently supported are those with native
+ * hardware support (1/2/4/8/16/32)
+ *
+ * This group exposes warp-synchronous builtins.
+ * Can only be constructed via tiled_partition<Size>(ParentT&)
+ */
+
+template <unsigned int Size, typename ParentT = void>
+class __single_warp_thread_block_tile :
+    public __static_size_thread_block_tile_base<Size>,
+    public __static_parent_thread_block_tile_base<Size, ParentT>
+{
+    typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
+    friend class details::_coalesced_group_data_access;
+
+protected:
+    _CG_QUALIFIER __single_warp_thread_block_tile() { };
+    _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { };
+
+    _CG_STATIC_QUALIFIER unsigned int get_mask() {
+        return __static_size_thread_block_tile_base<Size>::build_mask();
+    }
+};
+
+template <unsigned int Size>
+class __single_warp_thread_block_tile<Size, void> :
+    public __static_size_thread_block_tile_base<Size>,
+    public thread_group_base<details::coalesced_group_id>
+{
+    _CG_STATIC_CONST_DECL unsigned int numThreads = Size;
+
+    template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
+    friend class details::_coalesced_group_data_access;
+
+    typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;
+
+protected:
+    _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank, unsigned int meta_group_size) {
+        _data.coalesced.mask = staticSizeBaseT::build_mask();
+        _data.coalesced.size = numThreads;
+        _data.coalesced.metaGroupRank = meta_group_rank;
+        _data.coalesced.metaGroupSize = meta_group_size;
+        _data.coalesced.is_tiled = true;
+    }
+
+    _CG_QUALIFIER unsigned int get_mask() const {
+        return (_data.coalesced.mask);
+    }
+
+public:
+    using staticSizeBaseT::sync;
+    using staticSizeBaseT::size;
+    using staticSizeBaseT::num_threads;
+    using staticSizeBaseT::thread_rank;
+
+    _CG_QUALIFIER unsigned int meta_group_rank() const {
+        return _data.coalesced.metaGroupRank;
+    }
+
+    _CG_QUALIFIER unsigned int meta_group_size() const {
+        return _data.coalesced.metaGroupSize;
+    }
+};
+
+/**
+ * Outer level API calls
+ * void sync(GroupT) - see <group_type>.sync()
+ * void thread_rank(GroupT) - see <group_type>.thread_rank()
+ * void group_size(GroupT) - see <group_type>.size()
+ */
+template <class GroupT>
+_CG_QUALIFIER void sync(GroupT const &g)
+{
+    g.sync();
+}
+
+// TODO: Use a static dispatch to determine appropriate return type
+// C++03 is stuck with unsigned long long for now
+#ifdef _CG_CPP11_FEATURES
+template <class GroupT>
+_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) {
+    return g.thread_rank();
+}
+
+
+template <class GroupT>
+_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) {
+    return g.num_threads();
+}
+#else
+template <class GroupT>
+_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) {
+    return static_cast<unsigned long long>(g.thread_rank());
+}
+
+
+template <class GroupT>
+_CG_QUALIFIER unsigned long long group_size(GroupT const &g) {
+    return static_cast<unsigned long long>(g.num_threads());
+}
+#endif
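+
+// Example (illustrative): the free functions mirror the member functions, so generic device code
+// can operate on any group type, e.g.
+//
+//   template <class GroupT>
+//   __device__ unsigned long long linear_rank(const GroupT& g) {
+//       return cooperative_groups::thread_rank(g);
+//   }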
+
+
+/**
+ * tiled_partition
+ *
+ * The tiled_partition(parent, tilesz) method is a collective operation that
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
+ *
+ * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
+ * be created where threads having identical k = (thread_rank(parent)/tilesz)
+ * will be members of the same subgroup.
+ *
+ * The implementation may cause the calling thread to wait until all the members
+ * of the parent group have invoked the operation before resuming execution.
+ *
+ * Functionality is limited to power-of-two sized subgroup instances of at most
+ * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be
+ * used with tiled_partition() in _CG_VERSION 1000.
+ */
+_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
+{
+    if (parent.get_type() == details::coalesced_group_id) {
+        const coalesced_group *_cg = static_cast<const coalesced_group*>(&parent);
+        return _cg->_get_tiled_threads(tilesz);
+    }
+    else {
+        const thread_block *_tb = static_cast<const thread_block*>(&parent);
+        return _tb->_get_tiled_threads(tilesz);
+    }
+}
+
+// Thread block type overload: returns a basic thread_group for now (may be specialized later)
+_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
+{
+    return (parent._get_tiled_threads(tilesz));
+}
+
+// Coalesced group type overload: retains its ability to stay coalesced
+_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
+{
+    return (parent._get_tiled_threads(tilesz));
+}
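+
+/**
+ * Example (a minimal illustrative sketch of the runtime-sized overloads):
+ *
+ *   __global__ void example_kernel() {
+ *       cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
+ *       // Runtime tile size: the result is a generic thread_group of 16 threads each.
+ *       cooperative_groups::thread_group tile16 = cooperative_groups::tiled_partition(block, 16);
+ *       // A tile of the currently converged threads keeps its coalesced_group type.
+ *       cooperative_groups::coalesced_group active8 =
+ *           cooperative_groups::tiled_partition(cooperative_groups::coalesced_threads(), 8);
+ *       tile16.sync();
+ *   }
+ */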
+
+namespace details {
+    template <unsigned int Size, typename ParentT>
+    class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {};
+
+    template <unsigned int Size, typename ParentT>
+    _CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() {
+        return internal_thread_block_tile<Size, ParentT>();
+    }
+
+    template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
+    _CG_QUALIFIER TyVal multi_warp_collectives_helper(
+            const GroupT& group,
+            WarpLambda warp_lambda,
+            InterWarpLambda inter_warp_lambda) {
+        return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);
+    }
+
+    template <typename T, typename GroupT>
+    _CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) {
+        return group.template get_scratch_location<T>(warp_id);
+    }
+
+    template <typename GroupT>
+    _CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) {
+        return group.get_sync_location();
+    }
+
+}
+/**
+ * tiled_partition<tilesz>
+ *
+ * The tiled_partition<tilesz>(parent) method is a collective operation that
+ * partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
+ *
+ * A total of (size(parent)/tilesz) subgroups will be created,
+ * therefore the parent group size must be evenly divisible by tilesz.
+ * The allowed parent groups are thread_block and thread_block_tile<size>.
+ *
+ * The implementation may cause the calling thread to wait until all the members
+ * of the parent group have invoked the operation before resuming execution.
+ *
+ * Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
+ * The size(parent) must be greater than the template Size parameter,
+ * otherwise the results are undefined.
+ */
+
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+template <unsigned int Size>
+class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>
+{
+    static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be one of 64/128/256/512");
+
+    template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
+    friend TyVal details::multi_warp_collectives_helper(
+            const GroupT& group,
+            WarpLambda warp_lambda,
+            InterWarpLambda inter_warp_lambda);
+    template <typename T, typename GroupT>
+    friend T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
+    template <typename GroupT>
+    friend details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
+    template <unsigned int OtherSize>
+    friend class __static_size_multi_warp_tile_base;
+    using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+    using ThisType = __static_size_multi_warp_tile_base<Size>;
+    _CG_STATIC_CONST_DECL int numWarps = Size / 32;
+    const unsigned short communication_size;
+    const unsigned short max_block_size;
+
+protected:
+    details::multi_warp_scratch* const tile_memory;
+
+    template <typename GroupT>
+    _CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) :
+            tile_memory(g.tile_memory), communication_size(g.communication_size), max_block_size(g.max_block_size) {}
+
+
+private:
+    _CG_QUALIFIER details::barrier_t* get_sync_location() const {
+        // Different group sizes use different barriers; all groups of a given size share one barrier.
+        unsigned int sync_id = details::log2(Size / 64);
+        return &(reinterpret_cast<details::barrier_t*>(tile_memory->memory)[sync_id]);
+    }
+
+    template <typename T>
+    _CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const {
+        unsigned int sync_mem_size = details::scratch_sync_memory_size(max_block_size);
+        unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;
+        return reinterpret_cast<T*>(&tile_memory->memory[sync_mem_size + scratch_id * communication_size]);
+    }
+
+    template <typename T>
+    _CG_QUALIFIER T* get_scratch_location() const {
+        unsigned int sync_mem_size = details::scratch_sync_memory_size(max_block_size);
+        unsigned int scratch_id = details::cta::thread_rank() / 32;
+        return reinterpret_cast<T*>(&tile_memory->memory[sync_mem_size + scratch_id * communication_size]);
+    }
+
+    template <typename TyVal>
+    _CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const {
+        unsigned int src_warp = src / 32;
+        auto warp = details::tiled_partition_internal<32, ThisType>();
+        details::barrier_t* sync_location = get_sync_location();
+
+        // Get the warp slot of the source thread's warp.
+        TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);
+
+        if (warp.meta_group_rank() == src_warp) {
+            // Put shuffled value into my warp slot and let my warp arrive at the barrier.
+            if (thread_rank() == src) {
+                *warp_scratch_location = val;
+            }
+            details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
+            TyVal result = *warp_scratch_location;
+            details::sync_warps_wait(sync_location, details::cta::thread_rank());
+            return result;
+        }
+        else {
+            // Wait for the source warp to arrive on the barrier.
+            details::sync_warps_wait_for_warps<details::wait_for_specific_warp>(
+                    (details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp),
+                    sync_location, details::cta::thread_rank(),
+                    numWarps);
+            TyVal result = *warp_scratch_location;
+            details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
+            return result;
+        }
+    }
+
+    template <typename TyVal>
+    _CG_QUALIFIER TyVal shfl_iterative_impl(TyVal val, unsigned int src) const {
+        auto warp = details::tiled_partition_internal<32, ThisType>();
+
+        details::copy_channel<numWarps> broadcast_channel{
+            get_scratch_location<char>(0),
+            get_sync_location(),
+            (size_t) communication_size * numWarps};
+
+        if (warp.meta_group_rank() == src / 32) {
+            val = warp.shfl(val, src);
+            broadcast_channel.template send_value<
+                TyVal, 32, decltype(broadcast_channel)::send_many_to_many>(
+                    val, warp.thread_rank(), details::cta::thread_rank() / 32);
+        }
+        else {
+            broadcast_channel.template receive_value<TyVal>(val, warp.thread_rank() == 0);
+        }
+        sync();
+        return val;
+    }
+
+    template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
+    _CG_QUALIFIER TyVal collectives_scheme_impl(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
+        auto warp = details::tiled_partition_internal<32, ThisType>();
+        details::barrier_t* sync_location = get_sync_location();
+        TyVal* warp_scratch_location = get_scratch_location<TyVal>();
+
+        warp_lambda(warp, warp_scratch_location);
+
+        if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) {
+            auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
+            if (subwarp.meta_group_rank() == 0) {
+                TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());
+                inter_warp_lambda(subwarp, thread_scratch_location);
+            }
+            warp.sync();
+            details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
+        }
+        TyVal result = *warp_scratch_location;
+        warp.sync();  // Warp sync added here; if all collectives already sync before writing to the
+                      // reduce location (they currently do), this sync could be removed.
+        return result;
+    }
+
+    template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
+    _CG_QUALIFIER TyVal collectives_scheme_iterative_impl(
+            const WarpLambda& warp_lambda,
+            const InterWarpLambda& inter_warp_lambda) const {
+        auto warp = details::tiled_partition_internal<32, ThisType>();
+        details::barrier_t* sync_location = get_sync_location();
+        details::copy_channel<numWarps> final_result_channel{
+            get_scratch_location<char>(0),
+            sync_location,
+            (size_t) communication_size * numWarps};
+
+        TyVal warp_result;
+        warp_lambda(warp, &warp_result);
+
+        if (warp.meta_group_rank() == 0) {
+            auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
+            details::copy_channel<numWarps> partial_results_channel{
+                get_scratch_location<char>(subwarp.thread_rank()),
+                sync_location,
+                (size_t) communication_size};
+
+            // Thread 0 in the subwarp is set inactive so that it does not overwrite warp 0's warp_result.
+            partial_results_channel.template receive_value<TyVal>(
+                    warp_result,
+                    warp.thread_rank() == 0,
+                    subwarp.thread_rank() != 0 && subwarp.meta_group_rank() == 0);
+            if (subwarp.meta_group_rank() == 0) {
+                inter_warp_lambda(subwarp, &warp_result);
+            }
+            warp_result = warp.shfl(warp_result, 0);
+            final_result_channel.template send_value<TyVal, 32, decltype(final_result_channel)::send_many_to_many>(
+                    warp_result,
+                    warp.thread_rank(),
+                    details::cta::thread_rank() / 32);
+        }
+        else {
+            details::copy_channel<numWarps> partial_results_channel{get_scratch_location<char>(), sync_location, (size_t) communication_size};
+            partial_results_channel.template send_value<TyVal, 32, decltype(partial_results_channel)::send_many_to_one>(
+                    warp_result,
+                    warp.thread_rank(),
+                    (details::cta::thread_rank() - thread_rank()) / 32);
+            final_result_channel.template receive_value<TyVal>(warp_result, warp.thread_rank() == 0);
+        }
+        sync();
+        return warp_result;
+    }
+
+    template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
+    _CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
+        if (sizeof(TyVal) > communication_size) {
+            return collectives_scheme_iterative_impl<TyVal, WarpLambda, InterWarpLambda>(warp_lambda, inter_warp_lambda);
+        }
+        else {
+            return collectives_scheme_impl<TyVal, WarpLambda, InterWarpLambda>(warp_lambda, inter_warp_lambda);
+        }
+    }
+
+public:
+    _CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;
+
+    using __static_size_tile_base<Size>::thread_rank;
+
+    template <typename TyVal>
+    _CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const {
+        if (sizeof(TyVal) > communication_size) {
+            return shfl_iterative_impl(val, src);
+        }
+        else {
+            return shfl_impl(val, src);
+        }
+    }
+
+    _CG_QUALIFIER void sync() const {
+        details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);
+    }
+
+    _CG_QUALIFIER int any(int predicate) const {
+        auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
+                *warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);
+        };
+        auto inter_warp_lambda =
+            [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
+                *thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
+        };
+        return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
+    }
+
+    _CG_QUALIFIER int all(int predicate) const {
+        auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
+                *warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);
+        };
+        auto inter_warp_lambda =
+            [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
+                *thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
+        };
+        return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
+    }
+};
+
+
+template <unsigned int Size, typename ParentT = void>
+class __multi_warp_thread_block_tile :
+    public __static_size_multi_warp_tile_base<Size>,
+    public __static_parent_thread_block_tile_base<Size, ParentT>
+{
+    typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
+    typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
+protected:
+    _CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) :
+        __static_size_multi_warp_tile_base<Size>(g) {}
+};
+
+template <unsigned int Size>
+class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>
+{
+    const unsigned int metaGroupRank;
+    const unsigned int metaGroupSize;
+
+protected:
+    template <unsigned int OtherSize, typename ParentT>
+    _CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) :
+        __static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}
+
+public:
+    _CG_QUALIFIER unsigned int meta_group_rank() const {
+        return metaGroupRank;
+    }
+
+    _CG_QUALIFIER unsigned int meta_group_size() const {
+        return metaGroupSize;
+    }
+};
+#endif
+
+template <unsigned int Size, typename ParentT = void>
+class thread_block_tile;
+
+namespace details {
+    template <unsigned int Size, typename ParentT, bool IsMultiWarp>
+    class thread_block_tile_impl;
+
+    template <unsigned int Size, typename ParentT>
+    class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>
+    {
+    protected:
+        template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>
+        _CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) :
+            __single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}
+
+        _CG_QUALIFIER thread_block_tile_impl(const thread_block& g) :
+            __single_warp_thread_block_tile<Size, ParentT>() {}
+    };
+
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    template <unsigned int Size, typename ParentT>
+    class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>
+    {
+    protected:
+        template <typename GroupT>
+        _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :
+            __multi_warp_thread_block_tile<Size, ParentT>(g) {}
+    };
+#else
+    template <unsigned int Size, typename ParentT>
+    class thread_block_tile_impl<Size, ParentT, true>
+    {
+    protected:
+        template <typename GroupT>
+        _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
+    };
+#endif
+}
+
+template <unsigned int Size, typename ParentT>
+class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>
+{
+    friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();
+
+protected:
+    _CG_QUALIFIER thread_block_tile(const ParentT& g) :
+        details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}
+
+public:
+    _CG_QUALIFIER operator thread_block_tile<Size, void>() const {
+        return thread_block_tile<Size, void>(*this);
+    }
+};
+
+template <unsigned int Size>
+class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>
+{
+    template <unsigned int, typename ParentT>
+    friend class thread_block_tile;
+
+protected:
+    template <unsigned int OtherSize, typename OtherParentT>
+    _CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) :
+        details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
+
+public:
+    template <typename ParentT>
+    _CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) :
+        details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
+};
+
+namespace details {
+    template <unsigned int Size, typename ParentT>
+    struct tiled_partition_impl;
+
+    template <unsigned int Size>
+    struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {
+        _CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
+            thread_block_tile<Size, thread_block>(g) {}
+    };
+
+    // ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
+    template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
+    struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
+        public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
+#ifdef _CG_CPP11_FEATURES
+        static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size");
+#endif
+        _CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
+            thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
+    };
+
+}
+
+namespace experimental {
+    template <unsigned int Size, typename ParentT>
+    _CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
+    {
+#if defined(_CG_CPP11_FEATURES) && !defined(_CG_ABI_EXPERIMENTAL)
+        static_assert(details::_is_single_warp<Size>::value, "_CG_ABI_EXPERIMENTAL needs to be defined"
+                " before cooperative_groups header is included to enable experimental features");
+#endif
+        return details::tiled_partition_impl<Size, ParentT>(g);
+    }
+
+}
+
+template <unsigned int Size, typename ParentT>
+_CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
+{
+#ifdef _CG_CPP11_FEATURES
+    static_assert(details::_is_single_warp<Size>::value, "Tiled partition with Size > 32 is supported only by"
+            " cooperative_groups::experimental::tiled_partition available with experimental features enabled");
+#endif
+    return details::tiled_partition_impl<Size, ParentT>(g);
+}
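+
+/**
+ * Example (a minimal illustrative sketch; a tree reduction over a statically sized tile):
+ *
+ *   __device__ int tile_sum(int val) {
+ *       cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
+ *       cooperative_groups::thread_block_tile<32> warp =
+ *           cooperative_groups::tiled_partition<32>(block);
+ *       for (unsigned int offset = warp.num_threads() / 2; offset > 0; offset /= 2)
+ *           val += warp.shfl_down(val, offset);
+ *       return val;   // the full sum is valid in lane 0 of each tile
+ *   }
+ */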
+
+/**
+ * thread_group this_thread()
+ *
+ * Constructs a generic thread_group containing only the calling thread
+ */
+_CG_QUALIFIER thread_block_tile<1, void> this_thread()
+{
+    // Make thread_block_tile<1, thread_block> the parent of the returned group, so it will have its
+    // meta group rank and size set to 0 and 1 respectively.
+    return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block());
+}
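+
+// Example (illustrative): a single-thread group, useful with APIs that accept any group type, e.g.
+//   auto self = cooperative_groups::this_thread();
+//   // self.num_threads() == 1 and self.thread_rank() == 0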
+
+/**
+ * <group_type>.sync()
+ *
+ * Executes a barrier across the group
+ *
+ * Implements both a compiler fence and an architectural fence to prevent
+ * memory reordering around the barrier.
+ */
+_CG_QUALIFIER void thread_group::sync() const
+{
+    switch (_data.group.type) {
+    case details::coalesced_group_id:
+        cooperative_groups::sync(*static_cast<const coalesced_group*>(this));
+        break;
+    case details::thread_block_id:
+        cooperative_groups::sync(*static_cast<const thread_block*>(this));
+        break;
+    case details::grid_group_id:
+        cooperative_groups::sync(*static_cast<const grid_group*>(this));
+        break;
+#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    case details::multi_grid_group_id:
+        cooperative_groups::sync(*static_cast<const multi_grid_group*>(this));
+        break;
+#endif
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    case details::cluster_group_id:
+        cooperative_groups::sync(*static_cast<const cluster_group*>(this));
+        break;
+#endif
+    default:
+        break;
+    }
+}
+
+/**
+ * <group_type>.size()
+ *
+ * Returns the total number of threads in the group.
+ */
+_CG_QUALIFIER unsigned long long thread_group::size() const
+{
+    unsigned long long size = 0;
+    switch (_data.group.type) {
+    case details::coalesced_group_id:
+        size = cooperative_groups::group_size(*static_cast<const coalesced_group*>(this));
+        break;
+    case details::thread_block_id:
+        size = cooperative_groups::group_size(*static_cast<const thread_block*>(this));
+        break;
+    case details::grid_group_id:
+        size = cooperative_groups::group_size(*static_cast<const grid_group*>(this));
+        break;
+#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    case details::multi_grid_group_id:
+        size = cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this));
+        break;
+#endif
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    case details::cluster_group_id:
+        size = cooperative_groups::group_size(*static_cast<const cluster_group*>(this));
+        break;
+#endif
+    default:
+        break;
+    }
+    return size;
+}
+
+/**
+ * <group_type>.thread_rank()
+ *
+ * Returns the linearized rank of the calling thread along the interval [0, size()).
+ */
+_CG_QUALIFIER unsigned long long thread_group::thread_rank() const
+{
+    unsigned long long rank = 0;
+    switch (_data.group.type) {
+    case details::coalesced_group_id:
+        rank = cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this));
+        break;
+    case details::thread_block_id:
+        rank = cooperative_groups::thread_rank(*static_cast<const thread_block*>(this));
+        break;
+    case details::grid_group_id:
+        rank = cooperative_groups::thread_rank(*static_cast<const grid_group*>(this));
+        break;
+#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    case details::multi_grid_group_id:
+        rank = cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this));
+        break;
+#endif
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    case details::cluster_group_id:
+        rank = cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this));
+        break;
+#endif
+    default:
+        break;
+    }
+    return rank;
+}
+
+_CG_END_NAMESPACE
+
+#include <cooperative_groups/details/partitioning.h>
+
+# endif /* ! (__cplusplus, __CUDACC__) */
+
+#endif /* !_COOPERATIVE_GROUPS_H_ */
diff --git a/ext/cudart/include/cooperative_groups/details/async.h b/ext/cudart/include/cooperative_groups/details/async.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b7dcb2433f2cb7d1ef61290995ac871a901b1e8
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/details/async.h
@@ -0,0 +1,452 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * The source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * The Licensed Deliverables contained herein are PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CG_ASYNC_H
+#define _CG_ASYNC_H
+
+#include "helpers.h"
+#include "info.h"
+
+#include <cuda_pipeline.h>
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+// Groups supported by memcpy_async
+template <class TyGroup>
+struct _async_copy_group_supported : public _CG_STL_NAMESPACE::false_type {};
+
+template <unsigned int Sz, typename TyPar>
+struct _async_copy_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>>
+    : public _CG_STL_NAMESPACE::true_type {};
+template <>
+struct _async_copy_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
+template <>
+struct _async_copy_group_supported<cooperative_groups::thread_block> : public _CG_STL_NAMESPACE::true_type {};
+
+template <class TyGroup>
+using async_copy_group_supported = _async_copy_group_supported<details::remove_qual<TyGroup>>;
+
+// Groups that require optimization
+template <class TyGroup>
+struct _async_copy_optimize_tile : public _CG_STL_NAMESPACE::false_type {};
+
+template <typename TyPar>
+struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<1, TyPar>>
+    : public _CG_STL_NAMESPACE::false_type {};
+
+template <unsigned int Sz, typename TyPar>
+struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<Sz, TyPar>>
+    : public _CG_STL_NAMESPACE::true_type {};
+
+template <class TyGroup>
+using async_copy_optimize_tile = _async_copy_optimize_tile<details::remove_qual<TyGroup>>;
+
+// SFINAE helpers for tile optimizations
+template <class TyGroup>
+using enable_tile_optimization =
+    typename _CG_STL_NAMESPACE::enable_if<async_copy_optimize_tile<TyGroup>::value, void *>::type;
+
+template <class TyGroup>
+using disable_tile_optimization =
+    typename _CG_STL_NAMESPACE::enable_if<!async_copy_optimize_tile<TyGroup>::value, void *>::type;
+
+// Segment for punning to aligned types
+template <unsigned int N>
+struct _Segment {
+    int _seg[N];
+};
+
+// Trivial layout guaranteed-aligned copy-async compatible segments
+template <unsigned int N>
+struct Segment;
+template <>
+struct __align__(4) Segment<1> : public _Segment<1>{};
+template <>
+struct __align__(8) Segment<2> : public _Segment<2>{};
+template <>
+struct __align__(16) Segment<4> : public _Segment<4>{};
+
+// Interleaved element by element copies from source to dest
+template <typename TyGroup, typename TyElem>
+_CG_STATIC_QUALIFIER void inline_copy(TyGroup &group, TyElem *__restrict__ dst, const TyElem *__restrict__ src,
+                                      size_t count) {
+    const unsigned int rank = group.thread_rank();
+    const unsigned int stride = group.size();
+
+    for (size_t idx = rank; idx < count; idx += stride) {
+        dst[idx] = src[idx];
+    }
+}
+
+template <typename TyGroup, typename TyElem, enable_tile_optimization<TyGroup> = nullptr>
+_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
+                                                 const TyElem *__restrict__ src, size_t count) {
+    static_assert(async_copy_group_supported<TyGroup>::value,
+                  "Async copy is only supported for groups that represent private shared memory");
+
+    if (count == 0) {
+        return;
+    }
+
+    const bool dstIsNotShared = !__isShared(dst);
+    const bool srcIsNotGlobal = !__isGlobal(src);
+
+    if (dstIsNotShared || srcIsNotGlobal) {
+        inline_copy(group, dst, src, count);
+        return;
+    }
+
+    const unsigned int stride = group.size();
+    const unsigned int rank = group.thread_rank();
+    // Efficient copies require warps to operate on the same amount of work at each step.
+    // Remainders are handled in a separate stage to prevent branching.
+    const unsigned int subWarpMask = (stride - 1);
+    const unsigned int subwarpCopies = (subWarpMask & (unsigned int)count);
+    const unsigned int maxSubwarpRank = min(rank, subwarpCopies - 1);
+
+    const size_t warpCopies = (count & (~subWarpMask));
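+    // Illustrative numbers: with stride = 32 and count = 70, warpCopies = 64 elements go through the
+    // uniform loop below, subwarpCopies = 6 remain, and maxSubwarpRank clamps lanes 6..31 onto index 5,
+    // so those lanes re-issue the last remainder copy instead of diverging or reading out of bounds.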
+
+    for (size_t idx = 0; idx < warpCopies; idx += stride) {
+        size_t _srcIdx = rank + idx;
+        size_t _dstIdx = rank + idx;
+        __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
+    }
+
+    if (subwarpCopies) {
+        size_t _srcIdx = warpCopies + maxSubwarpRank;
+        size_t _dstIdx = warpCopies + maxSubwarpRank;
+        __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
+    }
+}
+
+template <typename TyGroup, typename TyElem, disable_tile_optimization<TyGroup> = nullptr>
+_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
+                                                 const TyElem *__restrict__ src, size_t count) {
+    static_assert(async_copy_group_supported<TyGroup>::value,
+                  "Async copy is only supported for groups that represent private shared memory");
+
+    const bool dstIsNotShared = !__isShared(dst);
+    const bool srcIsNotGlobal = !__isGlobal(src);
+
+    if (dstIsNotShared || srcIsNotGlobal) {
+        inline_copy(group, dst, src, count);
+        return;
+    }
+
+    unsigned int stride = group.size();
+    unsigned int rank = group.thread_rank();
+
+    for (size_t idx = rank; idx < count; idx += stride) {
+        size_t _srcIdx = idx;
+        size_t _dstIdx = idx;
+        __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
+    }
+}
+
+// Determine the best possible alignment given an input and initial conditions.
+// Attempts to generate as little code as possible; most likely it should only be used with 1- and 2-byte alignments.
+template <unsigned int MinAlignment, unsigned int MaxAlignment>
+_CG_STATIC_QUALIFIER uint32_t find_best_alignment(void *__restrict__ dst, const void *__restrict__ src) {
+    // Narrowing conversion intentional
+    uint32_t base1 = (uint32_t) reinterpret_cast<uintptr_t>(src);
+    uint32_t base2 = (uint32_t) reinterpret_cast<uintptr_t>(dst);
+
+    uint32_t diff = ((base1) ^ (base2)) & (MaxAlignment - 1);
+
+    // Walk the range of possible alignments from MaxAlignment down to MinAlignment, halving each step,
+    // and choose the best alignment available for this src/dst pair.
+    uint32_t out = MaxAlignment;
+#pragma unroll
+    for (uint32_t alignment = (MaxAlignment >> 1); alignment >= MinAlignment; alignment >>= 1) {
+        if (alignment & diff)
+            out = alignment;
+    }
+
+    return out;
+}
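+
+// Worked example (illustrative): with MinAlignment = 1, MaxAlignment = 16, src % 16 == 4 and
+// dst % 16 == 8, diff = (4 ^ 8) & 15 = 12; alignments 8 and 4 both see a mismatched bit, so the
+// descending loop ends with out = 4, the largest alignment at which the two pointers' offsets agree.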
+
+// Copy count bytes as TyType-sized segments: align the head with an inline copy, stream the aligned
+// body with accelerated_async_copy, then finish the unaligned tail inline.
+template <typename TyType, typename TyGroup>
+_CG_STATIC_QUALIFIER void copy_like(const TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                    size_t count) {
+    const char *src = reinterpret_cast<const char *>(_src);
+    char *dst = reinterpret_cast<char *>(_dst);
+
+    constexpr uint32_t targetAlignment = (uint32_t)alignof(TyType);
+
+    uint32_t base = (uint32_t) reinterpret_cast<uintptr_t>(src);
+    uint32_t alignOffset = ((~base) + 1) & (targetAlignment - 1);
+
+    inline_copy(group, dst, src, alignOffset);
+    count -= alignOffset;
+    src += alignOffset;
+    dst += alignOffset;
+
+    // Copy using the best available alignment; async_copy expects a count of elements, not bytes.
+    size_t asyncCount = count / sizeof(TyType);
+    accelerated_async_copy(group, reinterpret_cast<TyType *>(dst), reinterpret_cast<const TyType *>(src), asyncCount);
+    asyncCount *= sizeof(TyType);
+
+    count -= asyncCount;
+    src += asyncCount;
+    dst += asyncCount;
+    inline_copy(group, dst, src, count);
+}
+
+// We must determine alignment and manually align src/dst ourselves
+template <size_t AlignHint>
+struct _memcpy_async_align_dispatch {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ dst, const void *__restrict__ src, size_t count) {
+        uint32_t alignment = find_best_alignment<AlignHint, 16>(dst, src);
+
+        // Avoid copying the extra bytes if desired copy count is smaller
+        alignment = count < alignment ? AlignHint : alignment;
+
+        switch (alignment) {
+        default:
+        case 1:
+            inline_copy(group, reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src), count);
+            break;
+        case 2:
+            inline_copy(group, reinterpret_cast<short *>(dst), reinterpret_cast<const short *>(src), count >> 1);
+            break;
+        case 4:
+            copy_like<Segment<1>>(group, dst, src, count);
+            break;
+        case 8:
+            copy_like<Segment<2>>(group, dst, src, count);
+            break;
+        case 16:
+            copy_like<Segment<4>>(group, dst, src, count);
+            break;
+        }
+    }
+};
+
+// Specialization for 4 byte alignments
+template <>
+struct _memcpy_async_align_dispatch<4> {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                   size_t count) {
+        const Segment<1> *src = reinterpret_cast<const Segment<1> *>(_src);
+        Segment<1> *dst = reinterpret_cast<Segment<1> *>(_dst);
+
+        // Dispatch straight to aligned LDGSTS calls
+        accelerated_async_copy(group, dst, src, count / sizeof(*dst));
+    }
+};
+
+// Specialization for 8 byte alignments
+template <>
+struct _memcpy_async_align_dispatch<8> {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                   size_t count) {
+        const Segment<2> *src = reinterpret_cast<const Segment<2> *>(_src);
+        Segment<2> *dst = reinterpret_cast<Segment<2> *>(_dst);
+
+        // Dispatch straight to aligned LDGSTS calls
+        accelerated_async_copy(group, dst, src, count / sizeof(*dst));
+    }
+};
+
+// Alignments over 16 are truncated to 16 and bypass alignment
+// This is the highest performing memcpy available
+template <>
+struct _memcpy_async_align_dispatch<16> {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                   size_t count) {
+        const Segment<4> *src = reinterpret_cast<const Segment<4> *>(_src);
+        Segment<4> *dst = reinterpret_cast<Segment<4> *>(_dst);
+
+        // Dispatch straight to aligned LDGSTS calls
+        accelerated_async_copy(group, dst, src, count / sizeof(*dst));
+    }
+};
+
+// byte-wide API
+template <size_t Alignment, class TyGroup>
+_CG_STATIC_QUALIFIER void _memcpy_async_dispatch_to_aligned_copy(const TyGroup &group, void *__restrict__ _dst,
+                                                                 const void *__restrict__ _src, size_t count) {
+    static_assert(!(Alignment & (Alignment - 1)), "Known static alignment dispatch must be a power of 2");
+    details::_memcpy_async_align_dispatch<Alignment>::copy(group, _dst, _src, count);
+}
+
+// Internal dispatch APIs
+// These deduce the alignments and sizes necessary to invoke the underlying copy engine
+template <typename Ty>
+using is_void = _CG_STL_NAMESPACE::is_same<Ty, void>;
+
+template <typename Ty>
+using enable_if_not_void = typename _CG_STL_NAMESPACE::enable_if<!is_void<Ty>::value, void *>::type;
+
+template <typename Ty>
+using enable_if_void = typename _CG_STL_NAMESPACE::enable_if<is_void<Ty>::value, void *>::type;
+
+template <typename Ty>
+using enable_if_integral =
+    typename _CG_STL_NAMESPACE::enable_if<_CG_STL_NAMESPACE::is_integral<Ty>::value, void *>::type;
+
+// byte-wide API using aligned_size_t
+template <class TyGroup, template <size_t> typename Alignment, size_t Hint>
+_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, void *__restrict__ _dst,
+                                              const void *__restrict__ _src, const Alignment<Hint> &count) {
+    constexpr size_t _align = (Hint > 16) ? 16 : Hint;
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, (size_t)count);
+}
+
+// byte-wide API using the element type for alignment
+template <class TyGroup, typename TyElem, typename TySize, size_t Hint = alignof(TyElem),
+          enable_if_not_void<TyElem> = nullptr, enable_if_integral<TySize> = nullptr>
+_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
+                                              const TyElem *__restrict__ _src, const TySize& count) {
+    constexpr size_t _align = (Hint > 16) ? 16 : Hint;
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, count);
+}
+
+// byte-wide API with full alignment deduction required
+template <class TyGroup, typename TyElem, typename TySize, enable_if_void<TyElem> = nullptr,
+          enable_if_integral<TySize> = nullptr>
+_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
+                                              const TyElem *__restrict__ _src, const TySize& count) {
+    details::_memcpy_async_dispatch_to_aligned_copy<1>(group, _dst, _src, count);
+}
+
+// 1d-datum API
+template <class TyGroup, typename TyElem, size_t Hint = alignof(TyElem)>
+_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const size_t dstCount,
+                                              const TyElem *__restrict__ src, const size_t srcCount) {
+    constexpr unsigned int _align = Hint;
+    const size_t totalCount = min(dstCount, srcCount) * sizeof(TyElem);
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
+}
+
+// 1d-datum API using aligned_size_t
+template <class TyGroup, typename TyElem, template <size_t> typename Alignment, size_t Hint>
+_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const Alignment<Hint> &dstCount,
+                                              const TyElem *__restrict__ src, const Alignment<Hint> &srcCount) {
+    constexpr unsigned int _align = Hint;
+    const size_t totalCount = min((size_t)dstCount, (size_t)srcCount) * sizeof(TyElem);
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
+}
+
+} // namespace details
+
+/*
+ * Group submits a batch of async-copies covering a contiguous 1D array
+ * and commits that batch so it can later be waited on for completion.
+ */
+template <class TyGroup, typename TyElem, typename TySizeT>
+_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ _dst, const TyElem *__restrict__ _src,
+                                       const TySizeT &count) {
+    details::_memcpy_async_bytes(group, _dst, _src, count);
+    __pipeline_commit();
+}
+
+/*
+ * Group submits a batch of async-copies covering a contiguous 1D array
+ * and commits that batch so it can later be waited on for completion.
+ * Object counts are in datum-sized chunks, not bytes.
+ */
+template <class TyGroup, class TyElem, typename DstLayout, typename SrcLayout>
+_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ dst, const DstLayout &dstLayout,
+                                       const TyElem *__restrict__ src, const SrcLayout &srcLayout) {
+    details::_memcpy_async_datum(group, dst, dstLayout, src, srcLayout);
+    __pipeline_commit();
+}
+
+/* Group wait for prior Nth stage of memcpy_async to complete. */
+template <unsigned int Stage, class TyGroup>
+_CG_STATIC_QUALIFIER void wait_prior(const TyGroup &group) {
+    __pipeline_wait_prior(Stage);
+    group.sync();
+}
+
+/* Group wait all previously submitted memcpy_async to complete. */
+template <class TyGroup>
+_CG_STATIC_QUALIFIER void wait(const TyGroup &group) {
+    __pipeline_wait_prior(0);
+    group.sync();
+}
+
+/***************** CG APIs including pipeline are deprecated *****************/
+
+/* Group submits a batch of async-copies covering a contiguous 1D array
+   to a pipeline and commits the batch. */
+template <class TyGroup, class TyElem>
+_CG_DEPRECATED _CG_STATIC_QUALIFIER void memcpy_async(TyGroup &group, TyElem *dst, size_t dstCount, const TyElem *src, size_t srcCount,
+                                       nvcuda::experimental::pipeline &pipe) {
+    details::_memcpy_async_datum(group, dst, dstCount, src, srcCount);
+    pipe.commit();
+}
+
+/* Group wait for prior Nth stage of memcpy_async to complete. */
+template <unsigned int Stage, class TyGroup>
+_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait_prior(TyGroup &group, nvcuda::experimental::pipeline &pipe) {
+    pipe.wait_prior<Stage>();
+    group.sync();
+}
+
+/* Group wait for stage-S of memcpy_async to complete. */
+template <class TyGroup>
+_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait(TyGroup &group, nvcuda::experimental::pipeline &pipe, size_t stage) {
+    pipe.wait(stage);
+    group.sync();
+}
+_CG_END_NAMESPACE
+
+#endif // _CG_ASYNC_H
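For context on the API this header defines, here is a minimal usage sketch (not part of the diff) of the group-wide memcpy_async/wait pair declared above, assuming the usual public include <cooperative_groups/memcpy_async.h>. The kernel name and buffer sizes are illustrative only; the byte count passed to memcpy_async matches the byte-wide dispatch path above.

#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>

namespace cg = cooperative_groups;

// Illustrative kernel: stage n ints from global memory into dynamic shared
// memory with the block-wide async copy, then wait for the batch to land.
__global__ void stage_and_use(const int *global_in, int *global_out, size_t n) {
    extern __shared__ int smem[];
    cg::thread_block block = cg::this_thread_block();

    // Submit and commit one batch; the count is in bytes.
    cg::memcpy_async(block, smem, global_in, n * sizeof(int));

    // Wait for all previously submitted batches and sync the group.
    cg::wait(block);

    if (block.thread_rank() < n)
        global_out[block.thread_rank()] = smem[block.thread_rank()] * 2;
}

// Host-side launch sketch: stage_and_use<<<1, 128, n * sizeof(int)>>>(in, out, n);
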
diff --git a/ext/cudart/include/cooperative_groups/details/coalesced_reduce.h b/ext/cudart/include/cooperative_groups/details/coalesced_reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3722fb5c22809027cee66ab05758e477e8ef2bf
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/details/coalesced_reduce.h
@@ -0,0 +1,108 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_COALESCED_REDUCE_H_
+#define _CG_COALESCED_REDUCE_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "cooperative_groups.h"
+#include "partitioning.h"
+#include "coalesced_scan.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+template <typename TyVal, typename TyOp>
+_CG_QUALIFIER auto coalesced_reduce_to_one(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+    if (group.size() == 32) {
+        auto out = val;
+        for (int offset = group.size() >> 1; offset > 0; offset >>= 1) {
+            out = op(out, group.shfl_up(out, offset));
+        }
+        return out;
+    }
+    else {
+        auto scan_result =
+            inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+        return scan_result;
+    }
+}
+
+template <typename TyVal, typename TyOp>
+_CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+    auto out = coalesced_reduce_to_one(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    if (group.size() == 32) {
+        return group.shfl(out, 31);
+    }
+    else {
+        unsigned int group_mask = _coalesced_group_data_access::get_mask(group);
+        unsigned int last_thread_id = 31 - __clz(group_mask);
+        return details::tile::shuffle_dispatch<TyVal>::shfl(
+            _CG_STL_NAMESPACE::forward<TyVal>(out), group_mask, last_thread_id, 32);
+    }
+}
+
+template <typename TyVal, typename TyOp, unsigned int TySize, typename ParentT>
+_CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group, 
+                                    TyVal&& val,
+                                    TyOp&& op) -> decltype(op(val, val)) {
+    auto out = val;
+    for (int mask = TySize >> 1; mask > 0; mask >>= 1) {
+        out = op(out, group.shfl_xor(out, mask));
+    }
+
+    return out;
+}
+
+} // details
+
+_CG_END_NAMESPACE
+
+#endif // _CG_COALESCED_REDUCE_H_
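As a point of reference (not part of the header), the tile overload of details::coalesced_reduce above is the classic XOR-butterfly reduction. A hedged standalone sketch of the same pattern follows; in application code one would normally go through the public reduce API rather than these details, and the kernel and array names below are illustrative only.

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// Each 32-thread tile sums its inputs with the same shfl_xor butterfly that
// details::coalesced_reduce uses for __single_warp_thread_block_tile.
__global__ void tile_sum(const float *in, float *out) {
    cg::thread_block block = cg::this_thread_block();
    auto tile = cg::tiled_partition<32>(block);

    float v = in[block.thread_rank()];
    for (int mask = tile.size() / 2; mask > 0; mask >>= 1)
        v += tile.shfl_xor(v, mask);   // after the loop, every lane holds the tile-wide sum

    if (tile.thread_rank() == 0)
        out[tile.meta_group_rank()] = v;   // one result per tile
}
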
diff --git a/ext/cudart/include/cooperative_groups/details/coalesced_scan.h b/ext/cudart/include/cooperative_groups/details/coalesced_scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..383f4bde059dd8daad7d1c56e99152ea7ee28a08
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/details/coalesced_scan.h
@@ -0,0 +1,174 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_COALESCED_SCAN_H_
+#define _CG_COALESCED_SCAN_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "cooperative_groups.h"
+#include "partitioning.h"
+#include "functional.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+template <typename TyGroup, typename TyVal, typename TyOp>
+_CG_QUALIFIER auto inclusive_scan_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+    auto out = val;
+    for (int mask = 1; mask < group.size(); mask <<= 1) {
+        auto tmp = group.shfl_up(out, mask);
+        if (mask <= group.thread_rank()) {
+            out = op(out, tmp);
+        }
+    }
+
+    return out;
+}
+
+template <typename TyGroup, typename TyVal, typename TyOp>
+_CG_QUALIFIER auto inclusive_scan_non_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+    const unsigned int groupSize = group.size();
+    auto out = val;
+
+    const unsigned int mask = details::_coalesced_group_data_access::get_mask(group);
+    unsigned int lanemask = details::lanemask32_lt() & mask;
+    unsigned int srcLane = details::laneid();
+
+    const unsigned int base = __ffs(mask)-1; /* lane with rank == 0 */
+    const unsigned int rank = __popc(lanemask);
+
+    for (unsigned int i = 1, j = 1; i < groupSize; i <<= 1) {
+        if (i <= rank) {
+            srcLane -= j;
+            j = i; /* maximum possible lane */
+
+            unsigned int begLane = base + rank - i; /* minimum possible lane */
+
+            /*  Next source lane is in the range [ begLane .. srcLane ]
+                *  If begLane < srcLane then do a binary search.
+                */
+            while (begLane < srcLane) {
+                const unsigned int halfLane = (begLane + srcLane) >> 1;
+                const unsigned int halfMask = lanemask >> halfLane;
+                const unsigned int d = __popc(halfMask);
+                if (d < i) {
+                    srcLane = halfLane - 1; /* halfLane too large */
+                }
+                else if ((i < d) || !(halfMask & 0x01)) {
+                    begLane = halfLane + 1; /* halfLane too small */
+                }
+                else {
+                    begLane = srcLane = halfLane; /* happen to hit */
+                }
+            }
+        }
+
+        auto tmp = details::tile::shuffle_dispatch<TyVal>::shfl(out, mask, srcLane, 32);
+        if (i <= rank) {
+            out = op(out, tmp);
+        }
+    }
+    return out;
+}
+
+template <unsigned int TySize, typename ParentT, typename TyVal, typename TyOp>
+_CG_QUALIFIER auto coalesced_inclusive_scan(const __single_warp_thread_block_tile<TySize, ParentT>& group,
+                                            TyVal&& val,
+                                            TyOp&& op) -> decltype(op(val, val)) {
+    return inclusive_scan_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+}
+
+template <typename TyVal, typename TyOp>
+_CG_QUALIFIER auto coalesced_inclusive_scan(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+    if (group.size() == 32) {
+        return inclusive_scan_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+    else {
+        return inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+}
+
+template <bool IntegralOptimized>
+struct scan_choose_convertion;
+
+template<>
+struct scan_choose_convertion<true> {
+    template <typename TyGroup, typename TyRes, typename TyVal>
+    _CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) {
+        return result - val;
+    }
+};
+
+template<>
+struct scan_choose_convertion<false> {
+    template <typename TyGroup, typename TyRes, typename TyVal>
+    _CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) {
+        auto ret = group.shfl_up(result, 1);
+        if (group.thread_rank() == 0) {
+            return {};
+        }
+        else {
+            return ret;
+        }
+    }
+};
+
+template <typename TyGroup, typename TyRes, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    using conversion = scan_choose_convertion<_CG_STL_NAMESPACE::is_same<remove_qual<TyFn>, cooperative_groups::plus<remove_qual<TyVal>>>::value
+                                 && _CG_STL_NAMESPACE::is_integral<remove_qual<TyVal>>::value>;
+    return conversion::convert_inclusive_to_exclusive(group, result, _CG_STL_NAMESPACE::forward<TyVal>(val));
+}
+
+} // details
+
+_CG_END_NAMESPACE
+
+#endif // _CG_COALESCED_SCAN_H_
\ No newline at end of file
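For orientation (not part of the header), inclusive_scan_contiguous above is a Hillis-Steele shfl_up scan, and the integral plus<> branch of convert_inclusive_to_exclusive reduces to subtracting a thread's own input. A hedged standalone sketch with illustrative names:

#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// One warp-sized tile computes an inclusive prefix sum the same way
// details::inclusive_scan_contiguous does, then derives the exclusive scan
// via the integral plus<> shortcut taken by scan_choose_convertion<true>.
__global__ void tile_prefix_sum(const int *in, int *inclusive, int *exclusive) {
    auto tile = cg::tiled_partition<32>(cg::this_thread_block());

    int v = in[tile.thread_rank()];
    int sum = v;
    for (int offset = 1; offset < tile.size(); offset <<= 1) {
        int up = tile.shfl_up(sum, offset);
        if (tile.thread_rank() >= offset)
            sum += up;
    }

    inclusive[tile.thread_rank()] = sum;
    exclusive[tile.thread_rank()] = sum - v;   // exclusive = inclusive - own value
}
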
diff --git a/ext/cudart/include/cooperative_groups/details/driver_abi.h b/ext/cudart/include/cooperative_groups/details/driver_abi.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c866fcf740beb709a106057d28e8a2a1ac37924
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/details/driver_abi.h
@@ -0,0 +1,99 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_DRIVER_API_H
+#define _CG_DRIVER_API_H
+
+#include "info.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+    template <unsigned int RegId>
+    _CG_QUALIFIER unsigned int load_env_reg() {
+        // Abort by default
+        _CG_ABORT();
+        return 0;
+    }
+
+    template <unsigned int HiReg, unsigned int LoReg>
+    _CG_QUALIFIER unsigned long long load_env_reg64() {
+        unsigned long long registerLo = load_env_reg<LoReg>();
+        unsigned long long registerHi = load_env_reg<HiReg>();
+
+        return (registerHi << 32) | registerLo;
+    }
+
+// inline PTX for accessing registers requires an immediate for the special reg
+# define LOAD_ENVREG(NUMBER) \
+    template <> _CG_QUALIFIER unsigned int load_env_reg<NUMBER>() { \
+        unsigned int r; \
+        asm ("mov.u32 %0, %%envreg" #NUMBER ";" : "=r"(r)); \
+        return r; \
+    }
+
+    // Instantiate loaders for registers used
+    LOAD_ENVREG(0);
+    LOAD_ENVREG(1);
+    LOAD_ENVREG(2);
+# undef LOAD_ENVREG
+
+    struct grid_workspace {
+        unsigned int wsSize;
+        unsigned int barrier;
+    };
+
+    _CG_QUALIFIER grid_workspace* get_grid_workspace() {
+        unsigned long long gridWsAbiAddress = load_env_reg64<1, 2>();
+        // Interpret the address from envreg 1 and 2 as the driver's grid workspace
+        return (reinterpret_cast<grid_workspace*>(gridWsAbiAddress));
+    }
+}
+_CG_END_NAMESPACE
+
+#endif // _CG_DRIVER_API_H
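A note for readers unfamiliar with the register trick above (not part of the header): LOAD_ENVREG instantiates one loader per environment register because the inline PTX needs the register number as an immediate, and load_env_reg64 widens two 32-bit halves into the 64-bit workspace address. The same two ideas, shown with the architecturally documented %laneid register and a plain helper, purely as an illustration:

// Reading a special register with inline PTX, same shape as LOAD_ENVREG but
// using %laneid, whose meaning (lane index within the warp) is documented.
__device__ unsigned int read_laneid() {
    unsigned int r;
    asm("mov.u32 %0, %%laneid;" : "=r"(r));
    return r;
}

// How load_env_reg64 assembles a 64-bit value from high and low 32-bit registers.
__device__ unsigned long long combine_hi_lo(unsigned int hi, unsigned int lo) {
    return (static_cast<unsigned long long>(hi) << 32) | lo;
}
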
diff --git a/ext/cudart/include/cooperative_groups/details/functional.h b/ext/cudart/include/cooperative_groups/details/functional.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca17d08ad8407af2ea8967e53a8bdc4b59079c77
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/details/functional.h
@@ -0,0 +1,207 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_FUNCTIONAL_H
+#define _CG_FUNCTIONAL_H
+
+#include "info.h"
+#include "helpers.h"
+
+#ifdef _CG_CPP11_FEATURES
+#ifdef _CG_USE_CUDA_STL
+# include <cuda/std/functional>
+#endif
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+#ifdef _CG_USE_CUDA_STL
+    using cuda::std::plus;
+    using cuda::std::bit_and;
+    using cuda::std::bit_xor;
+    using cuda::std::bit_or;
+#else
+    template <typename Ty> struct plus {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 + arg2;}};
+    template <typename Ty> struct bit_and {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 & arg2;}};
+    template <typename Ty> struct bit_xor {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 ^ arg2;}};
+    template <typename Ty> struct bit_or {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 | arg2;}};
+#endif // _CG_USE_CUDA_STL
+} // details
+
+template <typename Ty>
+struct plus : public details::plus<Ty> {};
+
+template <typename Ty>
+struct less {
+    __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
+        return (arg2 < arg1) ? arg2 : arg1;
+    }
+};
+
+template <typename Ty>
+struct greater {
+    __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {
+        return (arg1 < arg2) ? arg2 : arg1;
+    }
+};
+
+template <typename Ty>
+struct bit_and : public details::bit_and<Ty> {};
+
+template <typename Ty>
+struct bit_xor : public details::bit_xor<Ty> {};
+
+template <typename Ty>
+struct bit_or : public details::bit_or<Ty> {};
+
+#if defined(_CG_HAS_STL_ATOMICS) && defined(_CG_ABI_EXPERIMENTAL)
+namespace details {
+    template <class Ty>
+    using _atomic_is_type_supported = _CG_STL_NAMESPACE::integral_constant<bool,
+            _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) == 4 || sizeof(Ty) == 8)>;
+
+    template <typename TyOp> struct _atomic_op_supported                                : public _CG_STL_NAMESPACE::false_type {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::plus<Ty>>    : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::less<Ty>>    : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::greater<Ty>> : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_and<Ty>> : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_or<Ty>>  : public _atomic_is_type_supported<Ty> {};
+    template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_xor<Ty>> : public _atomic_is_type_supported<Ty> {};
+
+    template<typename TyAtomic, typename TyVal, typename TyOp>
+    _CG_QUALIFIER remove_qual<TyVal> atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) {
+        remove_qual<TyVal> old = atomic;
+        while(!atomic.compare_exchange_weak(old, op(old, val)));
+        return old;
+    }
+
+    template<typename TyOp>
+    struct op_picker;
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::plus<TyVal>> {
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_add(val);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::less<TyVal>> {
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_min(val);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::greater<TyVal>> {
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_max(val);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::bit_and<TyVal>> {
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_and(val);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::bit_xor<TyVal>> {
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_xor(val);
+        }
+    };
+
+    template<typename TyVal>
+    struct op_picker<cooperative_groups::bit_or<TyVal>> {
+        template<typename TyAtomic>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) {
+            return atomic.fetch_or(val);
+        }
+    };
+    
+    template<bool atomic_supported>
+    struct atomic_update_dispatch {};
+
+    template<>
+    struct atomic_update_dispatch<false> {
+        template<typename TyAtomic, typename TyVal, typename TyOp>
+        _CG_STATIC_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
+            return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+        }
+    };
+
+    template<>
+    struct atomic_update_dispatch<true> {
+        template<typename TyAtomic, typename TyVal, typename TyOp>
+        _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) {
+            using dispatch = op_picker<details::remove_qual<TyOp>>;
+
+            return dispatch::atomic_update(atomic, val);
+        }
+    };
+
+    template<typename TyAtomic, typename TyVal, typename TyOp>
+    _CG_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) {
+        using dispatch = atomic_update_dispatch<_atomic_op_supported<details::remove_qual<TyOp>>::value>;
+
+        return dispatch::atomic_update(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+}
+#endif
+
+_CG_END_NAMESPACE
+
+#endif
+#endif //_CG_FUNCTIONAL_H
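A side note on the atomic dispatch above (not part of the header): when an operator has no native fetch_* counterpart, details::atomic_cas_fallback retries a compare-exchange until the merged value sticks. The same idea expressed with the plain CUDA atomicCAS intrinsic, for a float maximum that has no hardware atomic, as a hedged illustration:

// CAS loop in the spirit of details::atomic_cas_fallback: read, merge with the
// operator, and retry until no other thread intervened. Returns the previous
// value, mirroring what a fetch_max would hand back.
__device__ float atomic_max_float(float *addr, float val) {
    int *addr_as_int = reinterpret_cast<int *>(addr);
    int old = *addr_as_int;
    int assumed;
    do {
        assumed = old;
        float merged = fmaxf(__int_as_float(assumed), val);           // op(old, val)
        old = atomicCAS(addr_as_int, assumed, __float_as_int(merged)); // publish if unchanged
    } while (assumed != old);
    return __int_as_float(old);
}
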
diff --git a/ext/cudart/include/cooperative_groups/details/helpers.h b/ext/cudart/include/cooperative_groups/details/helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e4f98b4eb59c68ed61c929b7b7982f9985b12f7
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/details/helpers.h
@@ -0,0 +1,707 @@
+ /* Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_HELPERS_H_
+# define _COOPERATIVE_GROUPS_HELPERS_H_
+
+#include "info.h"
+#include "sync.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+#ifdef _CG_CPP11_FEATURES
+    template <typename Ty> struct _is_float_or_half          : public _CG_STL_NAMESPACE::is_floating_point<Ty> {};
+# ifdef _CG_HAS_FP16_COLLECTIVE
+    template <>            struct _is_float_or_half<__half>  : public _CG_STL_NAMESPACE::true_type {};
+    template <>            struct _is_float_or_half<__half2> : public _CG_STL_NAMESPACE::true_type {};
+# endif
+    template <typename Ty>
+    using  is_float_or_half = _is_float_or_half<typename _CG_STL_NAMESPACE::remove_cv<Ty>::type>;
+
+    // Non-STL utility templates 
+    template <typename Ty>
+    using remove_qual = typename _CG_STL_NAMESPACE::remove_cv<typename _CG_STL_NAMESPACE::remove_reference<Ty>::type>::type;
+
+    template <typename TyLhs, typename TyRhs>
+    using is_op_type_same = _CG_STL_NAMESPACE::is_same<remove_qual<TyLhs>, remove_qual<TyRhs>
+    >;
+#endif
+
+    template <typename TyTrunc>
+    _CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) {
+        return ((TyTrunc)index.z * nIndex.y * nIndex.x) +
+               ((TyTrunc)index.y * nIndex.x) +
+                (TyTrunc)index.x;
+    }
+
+    namespace cta {
+
+        _CG_STATIC_QUALIFIER void sync()
+        {
+            __barrier_sync(0);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int num_threads()
+        {
+            return static_cast<unsigned int>(blockDim.x * blockDim.y * blockDim.z);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int thread_rank()
+        {
+            return vec3_to_linear<unsigned int>(threadIdx, blockDim);
+        }
+
+        _CG_STATIC_QUALIFIER dim3 group_index()
+        {
+            return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
+        }
+
+        _CG_STATIC_QUALIFIER dim3 thread_index()
+        {
+            return dim3(threadIdx.x, threadIdx.y, threadIdx.z);
+        }
+
+        _CG_STATIC_QUALIFIER dim3 dim_threads()
+        {
+            return dim3(blockDim.x, blockDim.y, blockDim.z);
+        }
+
+        // Legacy aliases
+        _CG_STATIC_QUALIFIER unsigned int size()
+        {
+            return num_threads();
+        }
+
+        _CG_STATIC_QUALIFIER dim3 block_dim()
+        {
+            return dim_threads();
+        }
+
+    };
+
+    class _coalesced_group_data_access {
+    public:
+        // Retrieve mask of coalesced groups
+        template <typename TyGroup>
+        _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) {
+            return group.get_mask();
+        }
+
+        // Retrieve mask of tiles
+        template <template <typename, typename> typename TyGroup, typename Sz, typename Parent>
+        _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup<Sz, Parent> &group) {
+            return group.build_maks();
+        }
+
+        template <typename TyGroup>
+        _CG_STATIC_QUALIFIER TyGroup construct_from_mask(unsigned int mask) {
+            return TyGroup(mask);
+        }
+
+        template <typename TyGroup>
+        _CG_STATIC_QUALIFIER void modify_meta_group(TyGroup &group, unsigned int mgRank, unsigned int mgSize) {
+            group._data.coalesced.metaGroupRank = mgRank;
+            group._data.coalesced.metaGroupSize = mgSize;
+        }
+    };
+
+    namespace tile {
+        template <unsigned int TileCount, unsigned int TileMask, unsigned int LaneMask, unsigned int ShiftCount>
+        struct _tile_helpers{
+            _CG_STATIC_CONST_DECL unsigned int tileCount = TileCount;
+            _CG_STATIC_CONST_DECL unsigned int tileMask = TileMask;
+            _CG_STATIC_CONST_DECL unsigned int laneMask = LaneMask;
+            _CG_STATIC_CONST_DECL unsigned int shiftCount = ShiftCount;
+        };
+
+        template <unsigned int> struct tile_helpers;
+        template <> struct tile_helpers<32> : public _tile_helpers<1,  0xFFFFFFFF, 0x1F, 5> {};
+        template <> struct tile_helpers<16> : public _tile_helpers<2,  0x0000FFFF, 0x0F, 4> {};
+        template <> struct tile_helpers<8>  : public _tile_helpers<4,  0x000000FF, 0x07, 3> {};
+        template <> struct tile_helpers<4>  : public _tile_helpers<8,  0x0000000F, 0x03, 2> {};
+        template <> struct tile_helpers<2>  : public _tile_helpers<16, 0x00000003, 0x01, 1> {};
+        template <> struct tile_helpers<1>  : public _tile_helpers<32, 0x00000001, 0x00, 0> {};
+
+#ifdef _CG_CPP11_FEATURES
+        namespace shfl {
+            /***********************************************************************************
+             * Recursively Sliced Shuffle
+             *  Purpose:
+             *      Slices an input type a number of times into integral types so that shuffles
+             *      are well defined
+             *  Expectations:
+             *      This object *should not* be used from a reinterpret_cast pointer unless
+             *      some alignment guarantees can be met. Use a memcpy to guarantee that loads
+             *      from the integral types stored within are aligned and correct.
+             **********************************************************************************/
+            template <unsigned int count, bool intSized = (count <= sizeof(int))>
+            struct recursive_sliced_shuffle_helper;
+
+            template <unsigned int count>
+            struct recursive_sliced_shuffle_helper<count, true> {
+                int val;
+
+                template <typename TyFn>
+                _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
+                    val = shfl(val);
+                }
+            };
+
+            template <unsigned int count>
+            struct recursive_sliced_shuffle_helper<count, false> {
+                int val;
+                recursive_sliced_shuffle_helper<count - sizeof(int)> next;
+
+                template <typename TyFn>
+                _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) {
+                    val = shfl(val);
+                    next.invoke_shuffle(shfl);
+                }
+            };
+        }
+
+        struct _memory_shuffle {
+            template <typename TyElem, typename TyShflFn>
+            _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
+                static_assert(sizeof(TyElem) > 0, "in memory shuffle is not yet implemented");
+                return TyElem{};
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
+                auto shfl = [=](int val) -> int {
+                    return 0;
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                auto shfl = [=](int val) -> int {
+                    return 0;
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                auto shfl = [=](int val) -> int {
+                    return 0;
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
+                auto shfl = [=](int val) -> int {
+                    return 0;
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+        };
+
+        /***********************************************************************************
+         * Intrinsic Device Function Shuffle
+         *  Purpose:
+         *      Uses a shuffle helper that has characteristics best suited for moving
+         *      elements between threads
+         *  Expectations:
+         *      Object given will be forced into an l-value type so that it can be used
+         *      with a helper structure that reinterprets the data into intrinsic compatible
+         *      types
+         *  Notes:
+         *      !! TyRet is required so that objects are returned by value and not as
+         *      dangling references depending on the value category of the passed object
+         **********************************************************************************/
+        struct _intrinsic_compat_shuffle {
+            template <unsigned int count>
+            using shfl_helper = shfl::recursive_sliced_shuffle_helper<count>;
+
+            template <typename TyElem, typename TyShflFn>
+            _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) {
+                static_assert(__is_trivially_copyable(TyElem), "Type is not compatible with device shuffle");
+                shfl_helper<sizeof(TyElem)> helper;
+                memcpy(&helper, &elem, sizeof(TyElem));
+                helper.invoke_shuffle(fn);
+                memcpy(&elem, &helper, sizeof(TyElem));
+                return elem;
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
+                auto shfl = [=](int val) -> int {
+                    return __shfl_sync(gMask, val, srcRank, threads);
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                auto shfl = [=](int val) -> int {
+                    return __shfl_down_sync(gMask, val, delta, threads);
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                auto shfl = [=](int val) -> int {
+                    return __shfl_up_sync(gMask, val, delta, threads);
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+
+            template <typename TyElem, typename TyRet = remove_qual<TyElem>>
+            _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
+                auto shfl = [=](int val) -> int {
+                    return __shfl_xor_sync(gMask, val, lMask, threads);
+                };
+
+                return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl);
+            }
+        };
+
+        struct _native_shuffle {
+            template <typename TyElem>
+            _CG_STATIC_QUALIFIER TyElem shfl(
+                    TyElem elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) {
+                return static_cast<TyElem>(__shfl_sync(gMask, elem, srcRank, threads));
+            }
+
+            template <typename TyElem>
+            _CG_STATIC_QUALIFIER TyElem shfl_down(
+                    TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                return static_cast<TyElem>(__shfl_down_sync(gMask, elem, delta, threads));
+            }
+
+            template <typename TyElem>
+            _CG_STATIC_QUALIFIER TyElem shfl_up(
+                    TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) {
+                return static_cast<TyElem>(__shfl_up_sync(gMask, elem, delta, threads));
+            }
+
+            template <typename TyElem>
+            _CG_STATIC_QUALIFIER TyElem shfl_xor(
+                    TyElem elem, unsigned int gMask, unsigned int lMask, unsigned int threads) {
+                return static_cast<TyElem>(__shfl_xor_sync(gMask, elem, lMask, threads));
+            }
+        };
+
+        // Almost all arithmetic types are supported by native shuffle
+        // Vector types are the exception
+        template <typename TyElem>
+        using use_native_shuffle = _CG_STL_NAMESPACE::integral_constant<
+            bool,
+            _CG_STL_NAMESPACE::is_integral<
+                remove_qual<TyElem>>::value ||
+            details::is_float_or_half<
+                remove_qual<TyElem>>::value
+        >;
+
+        constexpr unsigned long long _MemoryShuffleCutoff = 32;
+
+        template <typename TyElem,
+                  bool IsNative = use_native_shuffle<TyElem>::value,
+                  bool InMem = (sizeof(TyElem) > _MemoryShuffleCutoff)>
+        struct shuffle_dispatch;
+
+        template <typename TyElem>
+        struct shuffle_dispatch<TyElem, true, false> :  public _native_shuffle {};
+
+        template <typename TyElem>
+        struct shuffle_dispatch<TyElem, false, false> : public _intrinsic_compat_shuffle {};
+
+        template <typename TyElem>
+        struct shuffle_dispatch<TyElem, false, true> :  public _memory_shuffle {};
+
+#endif //_CG_CPP11_FEATURES
+    };
+
+    namespace multi_grid {
+        struct multi_grid_functions;
+    };
+
+    namespace grid {
+        _CG_STATIC_QUALIFIER void sync(unsigned int *bar) {
+            unsigned int expected = gridDim.x * gridDim.y * gridDim.z;
+
+            details::sync_grids(expected, bar);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long num_blocks()
+        {
+            // grid.y * grid.z -> [max(65535) * max(65535)] fits within 4b, promote after multiplication
+            // grid.x * (grid.y * grid.z) -> [max(2^31-1) * max(65535 * 65535)]  exceeds 4b, promote before multiplication
+            return (unsigned long long)gridDim.x * (gridDim.y * gridDim.z);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long num_threads()
+        {
+            return num_blocks() * cta::num_threads();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long block_rank()
+        {
+            return vec3_to_linear<unsigned long long>(blockIdx, gridDim);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long thread_rank()
+        {
+            return block_rank() * cta::num_threads() + cta::thread_rank();
+        }
+
+        _CG_STATIC_QUALIFIER dim3 dim_blocks()
+        {
+            return dim3(gridDim.x, gridDim.y, gridDim.z);
+        }
+
+        _CG_STATIC_QUALIFIER dim3 block_index()
+        {
+            return dim3(blockIdx.x, blockIdx.y, blockIdx.z);
+        }
+
+#if defined(_CG_HAS_CLUSTER_GROUP)
+        _CG_STATIC_QUALIFIER dim3 dim_clusters() {
+            return __clusterGridDimInClusters();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long num_clusters() {
+            const dim3 dimClusters = dim_clusters();
+            return dimClusters.x * dimClusters.y * dimClusters.z;
+        }
+
+        _CG_STATIC_QUALIFIER dim3 cluster_index() {
+            return __clusterIdx();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
+            return vec3_to_linear<unsigned long long>(cluster_index(), dim_clusters());
+        }
+#endif
+
+        // Legacy aliases
+        _CG_STATIC_QUALIFIER unsigned long long size()
+        {
+            return num_threads();
+        }
+
+        _CG_STATIC_QUALIFIER dim3 grid_dim()
+        {
+            return dim_blocks();
+        }
+    };
+
+
+#if defined(_CG_HAS_MULTI_GRID_GROUP)
+
+    namespace multi_grid {
+        _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle()
+        {
+            return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid));
+        }
+
+        _CG_STATIC_QUALIFIER void sync(const unsigned long long handle)
+        {
+            cudaError_t err = cudaCGSynchronize(handle, 0);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle)
+        {
+            unsigned int numThreads = 0;
+            cudaCGGetSize(&numThreads, NULL, handle);
+            return numThreads;
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle)
+        {
+            unsigned int threadRank = 0;
+            cudaCGGetRank(&threadRank, NULL, handle);
+            return threadRank;
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle)
+        {
+            unsigned int gridRank = 0;
+            cudaCGGetRank(NULL, &gridRank, handle);
+            return gridRank;
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle)
+        {
+            unsigned int numGrids = 0;
+            cudaCGGetSize(NULL, &numGrids, handle);
+            return numGrids;
+        }
+
+# ifdef _CG_CPP11_FEATURES
+        struct multi_grid_functions {
+            decltype(multi_grid::get_intrinsic_handle) *get_intrinsic_handle;
+            decltype(multi_grid::sync) *sync;
+            decltype(multi_grid::size) *size;
+            decltype(multi_grid::thread_rank) *thread_rank;
+            decltype(multi_grid::grid_rank) *grid_rank;
+            decltype(multi_grid::num_grids) *num_grids;
+        };
+
+        template <typename = void>
+        _CG_STATIC_QUALIFIER const multi_grid_functions* load_grid_intrinsics() {
+            __constant__ static const multi_grid_functions mgf {
+                &multi_grid::get_intrinsic_handle,
+                &multi_grid::sync,
+                &multi_grid::size,
+                &multi_grid::thread_rank,
+                &multi_grid::grid_rank,
+                &multi_grid::num_grids
+            };
+
+            return &mgf;
+        }
+# endif
+    };
+#endif
+
+#if defined(_CG_HAS_CLUSTER_GROUP)
+    namespace cluster {
+
+        _CG_STATIC_QUALIFIER bool isReal()
+        {
+            return __clusterDimIsSpecified();
+        }
+
+        _CG_STATIC_QUALIFIER void barrier_arrive()
+        {
+            __cluster_barrier_arrive();
+        }
+
+        _CG_STATIC_QUALIFIER void barrier_wait()
+        {
+            __cluster_barrier_wait();
+        }
+
+        _CG_STATIC_QUALIFIER void sync()
+        {
+            barrier_arrive();
+            barrier_wait();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
+        {
+            return __cluster_query_shared_rank(addr);
+        }
+
+        template <typename T>
+        _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
+        {
+            return static_cast<T*>(__cluster_map_shared_rank(addr, rank));
+        }
+
+        _CG_STATIC_QUALIFIER dim3 block_index()
+        {
+            return __clusterRelativeBlockIdx();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int block_rank()
+        {
+            return __clusterRelativeBlockRank();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int thread_rank()
+        {
+            return block_rank() * cta::num_threads() + cta::thread_rank();
+        }
+
+        _CG_STATIC_QUALIFIER dim3 dim_blocks()
+        {
+            return __clusterDim();
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int num_blocks()
+        {
+            return __clusterSizeInBlocks();
+        }
+
+        _CG_STATIC_QUALIFIER dim3 dim_threads()
+        {
+            const dim3 dimBlocks = dim_blocks();
+            const unsigned int x = dimBlocks.x * blockDim.x;
+            const unsigned int y = dimBlocks.y * blockDim.y;
+            const unsigned int z = dimBlocks.z * blockDim.z;
+            return dim3(x, y, z);
+        }
+
+        _CG_STATIC_QUALIFIER unsigned int num_threads()
+        {
+            return num_blocks() * cta::num_threads();
+        }
+
+    };
+#endif
+
+    _CG_STATIC_QUALIFIER unsigned int laneid()
+    {
+        unsigned int laneid;
+        asm ("mov.u32 %0, %%laneid;" : "=r"(laneid));
+        return laneid;
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int lanemask32_eq()
+    {
+        unsigned int lanemask32_eq;
+        asm ("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask32_eq));
+        return (lanemask32_eq);
+    }
+
+    _CG_STATIC_QUALIFIER unsigned int lanemask32_lt()
+    {
+        unsigned int lanemask32_lt;
+        asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt));
+        return (lanemask32_lt);
+    }
+
+    _CG_STATIC_QUALIFIER void abort()
+    {
+        _CG_ABORT();
+    }
+
+    template <typename Ty>
+    _CG_QUALIFIER void assert_if_not_arithmetic() {
+#ifdef _CG_CPP11_FEATURES
+        static_assert(
+            _CG_STL_NAMESPACE::is_integral<Ty>::value ||
+            details::is_float_or_half<Ty>::value,
+            "Error: Ty is neither integer or float"
+        );
+#endif
+    }
+
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    template <unsigned int numWarps>
+    struct copy_channel {
+        char* channel_ptr;
+        barrier_t* sync_location;
+        size_t channel_size;
+
+        // One warp sends to all other warps; it has to wait for all of the other warps.
+        struct send_many_to_many {
+            _CG_STATIC_CONST_DECL wait_for_warps_kind wait_kind = wait_for_all_other_warps;
+            _CG_STATIC_QUALIFIER void post_iter_release(unsigned int thread_idx, barrier_t* sync_location) {
+                __syncwarp(0xFFFFFFFF);
+                details::sync_warps_release(sync_location, thread_idx == 0, cta::thread_rank(), numWarps);
+            }
+        };
+
+        // One warp receives while all other warps send to it; the senders have to wait for that one warp.
+        struct send_many_to_one {
+            _CG_STATIC_CONST_DECL wait_for_warps_kind wait_kind = wait_for_specific_warp;
+            _CG_STATIC_QUALIFIER void post_iter_release(unsigned int thread_idx, barrier_t* sync_location) {
+                // Wait for all warps to finish and let the last warp release all threads.
+                if (details::sync_warps_last_releases(sync_location, cta::thread_rank(), numWarps)) {
+                    details::sync_warps_release(sync_location, thread_idx == 0, cta::thread_rank(), numWarps);
+                }
+            }
+        };
+
+        template <unsigned int ThreadCnt, size_t ValSize, typename SendDetails>
+        _CG_QUALIFIER void _send_value_internal(char* val_ptr, unsigned int thread_idx, unsigned int warp_id) {
+            size_t thread_offset = thread_idx * sizeof(int);
+
+            for (size_t i = 0; i < ValSize; i += channel_size) {
+                size_t bytes_left = ValSize - i;
+                size_t copy_chunk = min(bytes_left, channel_size);
+
+                details::sync_warps_wait_for_warps<SendDetails::wait_kind>(warp_id, sync_location, cta::thread_rank(), numWarps);
+                #pragma unroll 1
+                for (size_t j = thread_offset; j < copy_chunk ; j += sizeof(int) * ThreadCnt) {
+                    size_t my_bytes_left = copy_chunk - j;
+                    memcpy(channel_ptr + j, val_ptr + i + j, min(my_bytes_left, sizeof(int)));
+                }
+                SendDetails::post_iter_release(thread_idx, sync_location);
+            }
+        }
+
+
+        template <typename TyVal, unsigned int ThreadCnt, typename SendDetails>
+        _CG_QUALIFIER void send_value(TyVal& val, unsigned int thread_idx, unsigned int warp_id) {
+            _send_value_internal<ThreadCnt, sizeof(TyVal), SendDetails>(reinterpret_cast<char*>(&val), thread_idx, warp_id);
+        }
+
+        template <size_t ValSize>
+        _CG_QUALIFIER void _receive_value_internal(char* val_ptr, bool warp_master, bool active) {
+            for (size_t i = 0; i < ValSize; i += channel_size) {
+                size_t bytes_left = ValSize - i;
+                details::sync_warps_wait_for_release(sync_location, warp_master, cta::thread_rank(), numWarps);
+                if (active) {
+                    memcpy(val_ptr + i, channel_ptr, min(bytes_left, channel_size));
+                }
+            }
+        }
+
+        template <typename TyVal>
+        _CG_QUALIFIER void receive_value(TyVal& val, bool warp_master, bool active = true) {
+            _receive_value_internal<sizeof(TyVal)>(reinterpret_cast<char*>(&val), warp_master, active);
+        }
+    };
+
+    _CG_STATIC_QUALIFIER constexpr unsigned int log2(unsigned int x) {
+        return x == 1 ? 0 : 1 + log2(x / 2);
+    }
+#endif //_CG_CPP11_FEATURES
+
+}; // !Namespace internal
+
+_CG_END_NAMESPACE
+
+#endif /* !_COOPERATIVE_GROUPS_HELPERS_H_ */
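The helpers above back the public grid_group object. A minimal usage sketch (kernel name and data layout are illustrative; grid.sync() additionally requires a cooperative launch via cudaLaunchCooperativeKernel):

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void two_phase_kernel(int* data, unsigned int n) {
    cg::grid_group grid = cg::this_grid();
    unsigned long long i = grid.thread_rank();

    if (i < n) data[i] += 1;   // phase 1
    grid.sync();               // backed by the details::grid helpers above
    if (i < n) data[i] *= 2;   // phase 2, sees all phase-1 writes
}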
diff --git a/ext/cudart/include/cooperative_groups/details/info.h b/ext/cudart/include/cooperative_groups/details/info.h
new file mode 100644
index 0000000000000000000000000000000000000000..4afc475dc59f2d638bc99756fa33db799523f518
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/details/info.h
@@ -0,0 +1,323 @@
+ /* Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+
+
+#ifndef _CG_INFO_H_
+#define _CG_INFO_H_
+/*
+** Define: _CG_VERSION
+*/
+#define _CG_VERSION 1000
+
+/*
+** Define: _CG_ABI_VERSION
+*/
+#ifndef _CG_ABI_VERSION
+# define _CG_ABI_VERSION 1
+#endif
+
+/*
+** Define: _CG_ABI_EXPERIMENTAL
+** Desc: If defined, enables all features (ABI-breaking or experimental)
+*/
+#if defined(_CG_ABI_EXPERIMENTAL)
+#endif
+
+#define _CG_CONCAT_INNER(x, y) x ## y
+#define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y)
+#define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION)
+
+#define _CG_BEGIN_NAMESPACE \
+    namespace cooperative_groups { namespace _CG_NAMESPACE {
+#define _CG_END_NAMESPACE \
+    }; using namespace _CG_NAMESPACE; };
+
+#if (defined(__cplusplus) && (__cplusplus >= 201103L)) || (defined(_MSC_VER) && (_MSC_VER >= 1900))
+# define _CG_CPP11_FEATURES
+#endif
+
+#if !defined(_CG_QUALIFIER)
+# define _CG_QUALIFIER __forceinline__ __device__
+#endif
+#if !defined(_CG_STATIC_QUALIFIER)
+# define _CG_STATIC_QUALIFIER static __forceinline__ __device__
+#endif
+#if !defined(_CG_CONSTEXPR_QUALIFIER)
+# if defined(_CG_CPP11_FEATURES)
+#  define _CG_CONSTEXPR_QUALIFIER constexpr __forceinline__ __device__
+# else
+#  define _CG_CONSTEXPR_QUALIFIER _CG_QUALIFIER
+# endif
+#endif
+#if !defined(_CG_STATIC_CONSTEXPR_QUALIFIER)
+# if defined(_CG_CPP11_FEATURES)
+#  define _CG_STATIC_CONSTEXPR_QUALIFIER static constexpr __forceinline__ __device__
+# else
+#  define _CG_STATIC_CONSTEXPR_QUALIFIER _CG_STATIC_QUALIFIER
+# endif
+#endif
+
+#if defined(_MSC_VER)
+# define _CG_DEPRECATED __declspec(deprecated)
+#else
+# define _CG_DEPRECATED __attribute__((deprecated))
+#endif
+
+#if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
+# define _CG_HAS_GRID_GROUP
+#endif
+#if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
+# define _CG_HAS_MULTI_GRID_GROUP
+#endif
+#if (__CUDA_ARCH__ >= 700) || !defined(__CUDA_ARCH__)
+# define _CG_HAS_MATCH_COLLECTIVE
+#endif
+#if ((__CUDA_ARCH__ >= 900) || !defined(__CUDA_ARCH__)) && (defined(__NVCC__) || defined(__CUDACC_RTC__) || defined(_CG_CLUSTER_INTRINSICS_AVAILABLE))
+# define _CG_HAS_CLUSTER_GROUP
+#endif
+// Has __half and __half2
+// Only usable if you include the cuda_fp16.h extension, and
+// _before_ including cooperative_groups.h
+#ifdef __CUDA_FP16_TYPES_EXIST__
+# define _CG_HAS_FP16_COLLECTIVE
+#endif
+
+#if (__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__) && (defined(__NVCC__) || defined(__CUDACC_RTC__))
+# define _CG_HAS_OP_REDUX
+#endif
+
+// Include libcu++ where supported.
+#if defined(_CG_CPP11_FEATURES) && !defined(__QNX__) && !defined(__ibmxl__) && \
+    (defined(__NVCC__) || defined(__CUDACC_RTC__)) && \
+    (defined(__x86_64__) || defined(__aarch64__) || defined(__ppc64__)|| defined(_M_X64) || defined(_M_ARM64)) && \
+    (defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__))
+# define _CG_USE_CUDA_STL
+#else
+# define _CG_USE_OWN_TRAITS
+#endif
+
+#if defined(_CG_USE_CUDA_STL) && (!defined(__CUDA_ARCH__) || \
+    ((!defined(_MSC_VER) && __CUDA_ARCH__ >= 600) || (defined(_MSC_VER) && __CUDA_ARCH__ >= 700)))
+# define _CG_HAS_STL_ATOMICS
+#endif
+
+#ifdef _CG_CPP11_FEATURES
+// Use cuda::std:: for type_traits
+# if defined(_CG_USE_CUDA_STL)
+#  define _CG_STL_NAMESPACE cuda::std
+#  include <cuda/std/type_traits>
+// Use CG's implementation of type traits
+# else
+#  define _CG_STL_NAMESPACE cooperative_groups::details::templates
+# endif
+#endif
+
+#ifdef _CG_CPP11_FEATURES
+# define _CG_STATIC_CONST_DECL static constexpr
+# define _CG_CONST_DECL constexpr
+#else
+# define _CG_STATIC_CONST_DECL static const
+# define _CG_CONST_DECL const
+#endif
+
+#if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
+# define _CG_ASM_PTR_CONSTRAINT "r"
+#else
+#  define _CG_ASM_PTR_CONSTRAINT "l"
+#endif
+
+/*
+** Define: CG_DEBUG
+** What: Enables various runtime safety checks
+*/
+#if defined(__CUDACC_DEBUG__) && defined(CG_DEBUG) && !defined(NDEBUG)
+# define _CG_DEBUG
+#endif
+
+#if defined(_CG_DEBUG)
+# include <assert.h>
+# define _CG_ASSERT(x) assert((x));
+# define _CG_ABORT() assert(0);
+#else
+# define _CG_ASSERT(x)
+# define _CG_ABORT() __trap();
+#endif
+
+#if defined(_CG_CPP11_FEATURES) && !defined(_CG_USE_CUDA_STL)
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+namespace templates {
+
+/**
+ * Integral constants
+ **/
+template <typename Ty, Ty Val>
+struct integral_constant {
+    static constexpr Ty value = Val;
+    typedef Ty type;
+
+    _CG_QUALIFIER constexpr operator type() const noexcept { return value; }
+    _CG_QUALIFIER constexpr type operator()() const noexcept { return value; }
+};
+
+typedef integral_constant<bool, true>  true_type;
+typedef integral_constant<bool, false> false_type;
+
+/**
+ * CV Qualifiers
+ **/
+template <class Ty> struct is_lvalue_reference       : public details::templates::false_type {};
+template <class Ty> struct is_lvalue_reference<Ty&>  : public details::templates::true_type {};
+
+template <class Ty> struct remove_reference       {typedef Ty type;};
+template <class Ty> struct remove_reference<Ty&>  {typedef Ty type;};
+template <class Ty> struct remove_reference<Ty&&> {typedef Ty type;};
+
+template <class Ty>
+using remove_reference_t = typename details::templates::remove_reference<Ty>::type;
+
+template <class Ty> struct remove_const           {typedef Ty type;};
+template <class Ty> struct remove_const<const Ty> {typedef Ty type;};
+
+template <class Ty> struct remove_volatile              {typedef Ty type;};
+template <class Ty> struct remove_volatile<volatile Ty> {typedef Ty type;};
+
+template <class Ty> struct remove_cv {typedef typename details::templates::remove_volatile<typename details::templates::remove_const<Ty>::type>::type type;};
+
+template <class Ty>
+using remove_cv_t = typename details::templates::remove_cv<Ty>::type;
+
+template <class Ty>
+_CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &t) noexcept {
+    return static_cast<Ty&&>(t);
+}
+
+template <class Ty>
+_CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &&t) noexcept {
+    static_assert(!details::templates::is_lvalue_reference<Ty>::value, "Forwarding an rvalue as an lvalue is not allowed.");
+    return static_cast<Ty&&>(t);
+}
+
+/**
+ * is_integral
+ **/
+template <class Ty> struct _is_integral                     : public details::templates::false_type {};
+template <>         struct _is_integral<bool>               : public details::templates::true_type {};
+template <>         struct _is_integral<char>               : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned char>      : public details::templates::true_type {};
+template <>         struct _is_integral<short>              : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned short>     : public details::templates::true_type {};
+template <>         struct _is_integral<int>                : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned int>       : public details::templates::true_type {};
+template <>         struct _is_integral<long>               : public details::templates::true_type {};
+template <>         struct _is_integral<long long>          : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned long>      : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned long long> : public details::templates::true_type {};
+//Vector type support?
+
+template <typename Ty>
+struct is_integral : public details::templates::_is_integral<typename details::templates::remove_cv<Ty>::type> {};
+
+/**
+ * is_floating_point
+ **/
+template <class Ty> struct _is_floating_point              : public details::templates::false_type {};
+template <>         struct _is_floating_point<float>       : public details::templates::true_type {};
+template <>         struct _is_floating_point<double>      : public details::templates::true_type {};
+template <>         struct _is_floating_point<long double> : public details::templates::true_type {};
+# ifdef __CUDA_FP16_TYPES_EXIST__
+template <>         struct _is_floating_point<__half>      : public details::templates::true_type {};
+template <>         struct _is_floating_point<__half2>     : public details::templates::true_type {};
+# endif
+//Vector type support?
+
+template <typename Ty>
+struct is_floating_point : public details::templates::_is_floating_point<typename details::templates::remove_cv<Ty>::type> {};
+
+template <class T>
+struct is_arithmetic : details::templates::integral_constant<
+    bool,
+    details::templates::is_integral<T>::value ||
+    details::templates::is_floating_point<T>::value> {};
+
+template <typename Ty, bool = details::templates::is_arithmetic<Ty>::value>
+struct _is_unsigned : details::templates::integral_constant<bool, Ty(0) < Ty(-1)> {};
+
+template <typename Ty>
+struct _is_unsigned<Ty,false> : details::templates::false_type {};
+
+template <typename Ty>
+struct is_unsigned : _is_unsigned<typename details::templates::remove_cv<Ty>::type> {};
+
+/**
+ * programmatic type traits
+ **/
+template<bool B, class Ty = void>
+struct enable_if {};
+
+template<class Ty>
+struct enable_if<true, Ty> { typedef Ty type; };
+
+template<bool Cond, typename Ty = void>
+using enable_if_t = typename details::templates::enable_if<Cond, Ty>::type;
+
+template<class Ty1, class Ty2>
+struct is_same : details::templates::false_type {};
+
+template<class Ty>
+struct is_same<Ty, Ty> : details::templates::true_type {};
+
+} // templates
+} // details
+_CG_END_NAMESPACE
+
+#endif // _CG_CPP11_FEATURES
+
+#endif // _CG_INFO_H_
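For reference, with the default _CG_ABI_VERSION of 1 the namespace macros defined above expand as sketched below, so every public cooperative_groups symbol actually lives in an ABI-versioned inner namespace:

// Expansion of _CG_BEGIN_NAMESPACE ... _CG_END_NAMESPACE when _CG_ABI_VERSION == 1
namespace cooperative_groups { namespace __v1 {
    /* declarations placed between the two macros */
}; using namespace __v1; };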
diff --git a/ext/cudart/include/cooperative_groups/details/partitioning.h b/ext/cudart/include/cooperative_groups/details/partitioning.h
new file mode 100644
index 0000000000000000000000000000000000000000..c38418657d149e9527f9a01ce5a9f18e0f2bec61
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/details/partitioning.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CG_PARTITIONING_H
+#define _CG_PARTITIONING_H
+
+#include "info.h"
+#include "helpers.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER coalesced_group _binary_partition(const TyGroup &tile, bool pred) {
+        const unsigned int fullMask = ~0u;
+
+        unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
+        unsigned int predMask = pred ? 0 : fullMask;
+        unsigned int setMask = __ballot_sync(thisMask, pred);
+
+        if (setMask == thisMask || setMask == 0) {
+            coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(thisMask);
+            _coalesced_group_data_access::modify_meta_group(subTile, 0, 1);
+            return subTile;
+        }
+        else {
+            unsigned int subMask = thisMask & (setMask ^ predMask);
+            coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
+            _coalesced_group_data_access::modify_meta_group(subTile, pred, 2);
+            return subTile;
+        }
+    }
+
+#ifdef _CG_HAS_MATCH_COLLECTIVE
+    template <typename TyGroup, typename TyPredicate>
+    _CG_STATIC_QUALIFIER coalesced_group _labeled_partition(const TyGroup &tile, TyPredicate pred) {
+        unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
+        unsigned int thisBias = __ffs(thisMask) - 1; // Subtract 1 to index properly from [1-32]
+        unsigned int subMask = __match_any_sync(thisMask, pred);
+
+        coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
+
+        int leaderLaneId = subTile.shfl(details::laneid(), 0);
+
+        bool isLeader = !subTile.thread_rank();
+        unsigned int leaderMask = __ballot_sync(thisMask, isLeader);
+        unsigned int tileRank = __fns(leaderMask, leaderLaneId, 0) - thisBias;
+
+        _coalesced_group_data_access::modify_meta_group(subTile, tileRank, __popc(leaderMask));
+
+        return subTile;
+    }
+#endif
+}; // namespace details
+
+_CG_STATIC_QUALIFIER coalesced_group binary_partition(const coalesced_group &tile, bool pred) {
+    return details::_binary_partition(tile, pred);
+}
+
+template <unsigned int Size, typename ParentT>
+_CG_STATIC_QUALIFIER coalesced_group binary_partition(const thread_block_tile<Size, ParentT> &tile, bool pred) {
+#ifdef _CG_CPP11_FEATURES
+    static_assert(Size <= 32, "Binary partition is available only for tiles of size smaller or equal to 32");
+#endif
+    return details::_binary_partition(tile, pred);
+}
+
+
+#if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
+template <typename TyPredicate>
+_CG_STATIC_QUALIFIER coalesced_group labeled_partition(const coalesced_group &tile, TyPredicate pred) {
+    static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value, "labeled_partition predicate must be an integral type");
+    return details::_labeled_partition(tile, pred);
+}
+
+template <typename TyPredicate, unsigned int Size, typename ParentT>
+_CG_STATIC_QUALIFIER coalesced_group labeled_partition(const thread_block_tile<Size, ParentT> &tile, TyPredicate pred) {
+    static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value, "labeled_partition predicate must be an integral type");
+    static_assert(Size <= 32, "Labeled partition is available only for tiles of size smaller or equal to 32");
+    return details::_labeled_partition(tile, pred);
+}
+#endif
+
+_CG_END_NAMESPACE
+
+#endif // _CG_PARTITIONING_H
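A minimal sketch of how the partitioning entry points above are used from device code (kernel name and buffers are illustrative; labeled_partition requires sm_70 or newer):

#include <cooperative_groups.h>
namespace cg = cooperative_groups;

__global__ void partition_demo(const int* keys, int* out) {
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);

    int key = keys[block.thread_rank()];

    // Split the warp into two coalesced groups by a predicate.
    cg::coalesced_group side = cg::binary_partition(warp, key > 0);

    // Group lanes that hold the same key value.
    cg::coalesced_group same_key = cg::labeled_partition(warp, key);

    out[block.thread_rank()] = side.size() * 100 + same_key.thread_rank();
}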
diff --git a/ext/cudart/include/cooperative_groups/details/reduce.h b/ext/cudart/include/cooperative_groups/details/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..240296133e1a7ef4bcf3249eb965ee101c6020de
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/details/reduce.h
@@ -0,0 +1,430 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_REDUCE_H_
+#define _CG_REDUCE_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "coalesced_reduce.h"
+#include "functional.h"
+#include "cooperative_groups.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+    template <class Ty>
+    using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant<
+            bool,
+            _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) <= 4)>;
+
+    template <class Ty>
+    using redux_is_add_supported = _redux_is_add_supported<Ty>;
+
+    // A specialization for 64-bit logical operations is possible,
+    // but for now only 32-bit bitwise ops are accelerated
+    template <class Ty>
+    using redux_is_logical_supported = redux_is_add_supported<Ty>;
+
+    // Base operator support case
+    template <class TyOp, class Ty> struct _redux_op_supported                 : public _CG_STL_NAMESPACE::false_type {};
+#ifdef _CG_HAS_OP_REDUX
+    template <class Ty> struct _redux_op_supported<cooperative_groups::plus<Ty>,    Ty> : public redux_is_add_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::less<Ty>,    Ty> : public redux_is_add_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::greater<Ty>, Ty> : public redux_is_add_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::bit_and<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::bit_or<Ty>,  Ty> : public redux_is_logical_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::bit_xor<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
+#endif
+
+    template <class Ty, template <class> class TyOp>
+    using redux_op_supported = _redux_op_supported<
+            typename details::remove_qual<TyOp<Ty>>,
+            Ty>;
+
+    // Groups smaller than 16 actually have worse performance characteristics when used with redux;
+    // tiles of size 16 and 32 perform the same or better and have better code generation profiles
+    template <class TyGroup> struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {};
+
+    template <unsigned int Sz, typename TyPar>
+    struct _redux_group_optimized<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
+                                                                                            bool,
+                                                                                            (Sz >= 16)> {};
+    template <unsigned int Sz, typename TyPar>
+    struct _redux_group_optimized<internal_thread_block_tile<Sz, TyPar>>            : public _CG_STL_NAMESPACE::integral_constant<
+                                                                                            bool,
+                                                                                            (Sz >= 16)> {};
+    template <>
+    struct _redux_group_optimized<cooperative_groups::coalesced_group>              : public _CG_STL_NAMESPACE::true_type  {};
+
+    template <typename TyGroup>
+    using redux_group_optimized = _redux_group_optimized<details::remove_qual<TyGroup>>;
+
+    template <template <class> class TyOp>
+    _CG_STATIC_QUALIFIER int pick_redux(int mask, int val);
+    template <template <class> class TyOp>
+    _CG_STATIC_QUALIFIER unsigned int pick_redux(int mask, unsigned int val);
+
+#ifdef _CG_HAS_OP_REDUX
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::plus>(int mask, int val) {
+        return __reduce_add_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::less>(int mask, int val) {
+        return __reduce_min_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::greater>(int mask, int val) {
+        return __reduce_max_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_and>(int mask, int val) {
+        return __reduce_and_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_xor>(int mask, int val) {
+        return __reduce_xor_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_or>(int mask, int val) {
+        return __reduce_or_sync(mask, val);
+    }
+
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::plus>(int mask, unsigned int val) {
+        return __reduce_add_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::less>(int mask, unsigned int val) {
+        return __reduce_min_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::greater>(int mask, unsigned int val) {
+        return __reduce_max_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_and>(int mask, unsigned int val) {
+        return __reduce_and_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_xor>(int mask, unsigned int val) {
+        return __reduce_xor_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_or>(int mask, unsigned int val) {
+        return __reduce_or_sync(mask, val);
+    }
+#endif
+
+
+    template <typename TyVal, bool = _CG_STL_NAMESPACE::is_unsigned<TyVal>::value>
+    struct _accelerated_op;
+
+    // Signed type redux intrinsic dispatch
+    template <typename TyVal>
+    struct _accelerated_op<TyVal, false> {
+        template <template <class> class TyOp>
+        _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
+            return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<int>(val)));
+        }
+    };
+
+    // Unsigned type redux intrinsic dispatch
+    template <typename TyVal>
+    struct _accelerated_op<TyVal, true> {
+        template <template <class> class TyOp>
+        _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
+            return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<unsigned int>(val)));
+        }
+    };
+
+    template <typename TyVal>
+    using accelerated_op = _accelerated_op<TyVal>;
+
+
+    template <typename TyVal, typename TyFnInput, typename TyGroup>
+    class _redux_dispatch {
+        template <class Ty, template <class> class TyOp>
+        using _redux_is_usable = _CG_STL_NAMESPACE::integral_constant<bool,
+            redux_op_supported<Ty, TyOp>::value &&
+            redux_group_optimized<TyGroup>::value>;
+
+        template <class Ty, template <class> class TyOp>
+        using redux_is_usable = typename _CG_STL_NAMESPACE::enable_if<_redux_is_usable<Ty, TyOp>::value, void>::type*;
+
+        template <class Ty, template <class> class TyOp>
+        using redux_is_not_usable = typename _CG_STL_NAMESPACE::enable_if<!_redux_is_usable<Ty, TyOp>::value, void>::type*;
+
+    public:
+        // Dispatch to redux if the combination of op and args are supported
+        template<
+            template <class> class TyOp,
+            redux_is_usable<TyFnInput, TyOp> = nullptr>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
+            // Retrieve the mask for the group and dispatch to redux
+            return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
+        }
+
+        template<
+            template <class> class TyOp,
+            redux_is_usable<TyFnInput, TyOp> = nullptr>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
+            // Retrieve the mask for the group and dispatch to redux
+            return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
+        }
+
+        // Fallback shuffle sync reduction
+        template <
+            template <class> class TyOp,
+            redux_is_not_usable<TyFnInput, TyOp> = nullptr>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
+            // Dispatch to the fallback shuffle-sync accelerated reduction
+            return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+        }
+
+    };
+
+    // Group support for reduce.
+    template <class TyGroup> struct _reduce_group_supported : public _CG_STL_NAMESPACE::false_type {};
+
+    template <unsigned int Sz, typename TyPar>
+    struct _reduce_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
+    template <unsigned int Sz, typename TyPar>
+    struct _reduce_group_supported<internal_thread_block_tile<Sz, TyPar>>            : public _CG_STL_NAMESPACE::true_type {};
+    template <>
+    struct _reduce_group_supported<cooperative_groups::coalesced_group>              : public _CG_STL_NAMESPACE::true_type {};
+
+    template <typename TyGroup>
+    using reduce_group_supported = _reduce_group_supported<details::remove_qual<TyGroup>>;
+
+    template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
+    _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
+        static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
+
+        using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
+        return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+    }
+
+    template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
+    _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
+        static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
+
+        using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
+        return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+    }
+
+
+    template <typename TyVal, typename TyOp, typename TyGroup>
+    _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+        return details::coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+
+    template <unsigned int GroupId>
+    struct tile_reduce_dispatch;
+
+    template <>
+    struct tile_reduce_dispatch<details::coalesced_group_id> {
+        template <typename TyGroup, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            return details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+        }
+    };
+
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    template <>
+    struct tile_reduce_dispatch<details::multi_tile_group_id> {
+        template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto reduce(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+            using TyRet = details::remove_qual<TyVal>;
+            const unsigned int num_warps = Size / 32;
+
+            auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
+                    *warp_scratch_location =
+                        details::reduce(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+            };
+            auto inter_warp_lambda =
+                [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
+                    *thread_scratch_location =
+                        details::reduce(subwarp, *thread_scratch_location, _CG_STL_NAMESPACE::forward<TyFn>(op));
+            };
+            return details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
+        }
+    };
+
+    enum class AsyncReduceType { store, update };
+
+    template <AsyncReduceType TyAsyncReduce>
+    struct async_reduce_result_handler;
+
+    template<>
+    struct async_reduce_result_handler<AsyncReduceType::store> {
+        template<typename TyDst, typename TyVal, typename TyOp>
+        _CG_STATIC_QUALIFIER void handleResult(TyDst *dst, TyVal& result, TyOp&& op) {
+            *dst = result;
+        }
+    };
+
+#if defined(_CG_HAS_STL_ATOMICS)
+    template<>
+    struct async_reduce_result_handler<AsyncReduceType::update> {
+        template<typename TyDst, typename TyVal, typename TyOp>
+        _CG_STATIC_QUALIFIER void handleResult(TyDst& dst, TyVal& result, TyOp&& op) {
+            atomic_update(dst, result, _CG_STL_NAMESPACE::forward<TyOp>(op));
+        }
+    };
+#endif
+
+    template <unsigned int GroupId, AsyncReduceType TyAsyncReduce>
+    struct tile_async_reduce_dispatch;
+
+    template <AsyncReduceType TyAsyncReduce>
+    struct tile_async_reduce_dispatch<details::coalesced_group_id, TyAsyncReduce> {
+        template <unsigned int TySize, typename ParentT, typename TyDst, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER void reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyVal&& val, TyFn&& op) {
+            // Do regular, in group reduction
+            auto result = details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+
+            // One thread stores/updates the destination
+            if (group.thread_rank() == 0) {
+                async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op));
+            }
+        }
+
+        template <typename TyDst, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER void reduce(const coalesced_group& group, TyDst& dst, TyVal&& val, TyFn&& op) {
+            // Do in group reduction to the last thread
+            auto result = details::coalesced_reduce_to_one(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+
+            // One thread stores/updates the destination
+            if (group.thread_rank() == group.size() - 1) {
+                async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op));
+            }
+        }
+    };
+
+    template <AsyncReduceType TyAsyncReduce>
+    struct tile_async_reduce_dispatch<details::multi_tile_group_id, TyAsyncReduce> {
+        template <unsigned int TySize, typename ParentT, typename TyDst, typename TyInputVal, typename TyFn>
+        _CG_STATIC_QUALIFIER void reduce(const thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyInputVal&& val, TyFn&& op) {
+            using TyVal = remove_qual<TyInputVal>;
+            const unsigned int num_warps = TySize / 32;
+            details::barrier_t* sync_location = multi_warp_sync_location_getter(group);
+            auto warp_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, group.thread_rank() / 32);
+
+            // Do in warp reduce
+            auto warp = details::tiled_partition_internal<32, thread_block_tile<TySize, ParentT>>();
+            *warp_scratch_location = details::reduce(warp, _CG_STL_NAMESPACE::forward<TyInputVal>(val), op);
+
+            // A tile of size num_warps from the last warp to arrive does the final reduction step
+            if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), num_warps)) {
+                auto subwarp = details::tiled_partition_internal<num_warps, decltype(warp)>();
+                if (subwarp.meta_group_rank() == 0) {
+                    auto thread_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, subwarp.thread_rank());
+                    auto thread_val = *thread_scratch_location;
+                    // Release the other warps; we have already read their contributions.
+                    subwarp.sync();
+                    details::sync_warps_release(sync_location, subwarp.thread_rank() == 0, details::cta::thread_rank(), num_warps);
+                    TyVal result = details::reduce(subwarp, thread_val, op);
+                    // One thread stores the result or updates the atomic
+                    if (subwarp.thread_rank() == 0) {
+                        async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op));
+                    }
+                }
+                warp.sync();
+            }
+        }
+    };
+#endif
+
+    template <typename TyGroup, typename TyInputVal, typename TyRetVal>
+    _CG_QUALIFIER void check_reduce_params() {
+        static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
+        static_assert(details::reduce_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
+    };
+
+    template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
+    _CG_QUALIFIER void check_async_reduce_params() {
+        check_reduce_params<TyGroup, TyInputVal, TyRetVal>();
+        static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
+    }
+} // details
+
+template <typename TyGroup, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_reduce_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>();
+
+    using dispatch = details::tile_reduce_dispatch<TyGroup::_group_id>;
+    return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+namespace experimental {
+
+    #if defined(_CG_HAS_STL_ATOMICS)
+    template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+    void _CG_QUALIFIER reduce_update_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
+        details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+
+        using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::update>;
+        dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+    }
+
+    template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+    void _CG_QUALIFIER reduce_update_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
+        details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+
+        using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::update>;
+        dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+    }
+    #endif
+
+    template<typename TyGroup, typename TyVal, typename TyInputVal, typename TyFn>
+    void _CG_QUALIFIER reduce_store_async(const TyGroup& group, TyVal* dst, TyInputVal&& val, TyFn&& op) {
+        details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+
+        using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::store>;
+        dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+    }
+
+}
+#endif
+
+_CG_END_NAMESPACE
+
+#endif // _CG_REDUCE_H_
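A minimal sketch of the public reduce API implemented above (assumes a 1-D block whose size is a multiple of 32; buffer layout is illustrative). With an integral type and cg::plus on sm_80+, the redux.sync specializations above are used; otherwise the shuffle-based coalesced_reduce fallback is taken:

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

__global__ void warp_sums(const int* in, int* out) {
    cg::thread_block block = cg::this_thread_block();
    cg::thread_block_tile<32> tile = cg::tiled_partition<32>(block);

    int v = in[blockIdx.x * blockDim.x + threadIdx.x];
    int sum = cg::reduce(tile, v, cg::plus<int>());   // tile-wide sum

    if (tile.thread_rank() == 0)
        out[blockIdx.x * tile.meta_group_size() + tile.meta_group_rank()] = sum;
}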
diff --git a/ext/cudart/include/cooperative_groups/details/scan.h b/ext/cudart/include/cooperative_groups/details/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..badc50de13a276784ba61397ad13e1dc87cec269
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/details/scan.h
@@ -0,0 +1,324 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_SCAN_H_
+#define _CG_SCAN_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "functional.h"
+#include "coalesced_scan.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+    // Group support for scan.
+    template <class TyGroup> struct _scan_group_supported : public _CG_STL_NAMESPACE::false_type {};
+
+    template <unsigned int Sz, typename TyPar>
+    struct _scan_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
+    template <unsigned int Sz, typename TyPar>
+    struct _scan_group_supported<internal_thread_block_tile<Sz, TyPar>>            : public _CG_STL_NAMESPACE::true_type {};
+    template <>
+    struct _scan_group_supported<cooperative_groups::coalesced_group>              : public _CG_STL_NAMESPACE::true_type {};
+
+    template <typename TyGroup>
+    using scan_group_supported = _scan_group_supported<details::remove_qual<TyGroup>>;
+
+    template <bool IsIntegralPlus>
+    struct integral_optimized_scan;
+
+    enum class ScanType { exclusive, inclusive };
+
+    template <unsigned int GroupId,  ScanType TyScan>
+    struct scan_dispatch;
+
+    template <ScanType TyScan>
+    struct scan_dispatch<details::coalesced_group_id, TyScan> {
+        template <typename TyGroup, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            auto scan_result = coalesced_inclusive_scan(group, val, op);
+            if (TyScan == ScanType::exclusive) {
+                scan_result = convert_inclusive_to_exclusive(group,
+                                                             scan_result,
+                                                             _CG_STL_NAMESPACE::forward<TyVal>(val),
+                                                             _CG_STL_NAMESPACE::forward<TyFn>(op));
+            }
+            return scan_result;
+        }
+    };
+
+#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
+    template <ScanType TyScan>
+    struct scan_dispatch<details::multi_tile_group_id, TyScan> {
+        template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+            using TyRet = details::remove_qual<TyVal>;
+            const unsigned int num_warps = Size / 32;
+            // In warp scan result, calculated in warp_lambda
+            TyRet warp_scan;
+
+            // In warp scan, put sum in the warp_scratch_location
+            auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
+                warp_scan = 
+                    details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+                if (warp.thread_rank() + 1 == warp.size()) {
+                    *warp_scratch_location = warp_scan;
+                }
+                if (TyScan == ScanType::exclusive) {
+                    warp_scan = warp.shfl_up(warp_scan, 1);
+                }
+            };
+
+            // Tile of size num_warps performs the final scan part (exclusive scan of warp sums); the other threads
+            // add it to their in-warp scan results
+            auto inter_warp_lambda =
+                [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
+                    auto thread_val = *thread_scratch_location;
+                    auto result = coalesced_inclusive_scan(subwarp, thread_val, op);
+                    *thread_scratch_location = convert_inclusive_to_exclusive(subwarp, result, thread_val, op);
+            };
+
+            TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
+            if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
+                return previous_warps_sum;
+            }
+            if (warpType::meta_group_rank() == 0) {
+                return warp_scan;
+            }
+            else {
+                return op(warp_scan, previous_warps_sum);
+            }
+        }
+    };
+
+#if defined(_CG_HAS_STL_ATOMICS)
+    template <unsigned int GroupId,  ScanType TyScan>
+    struct scan_update_dispatch;
+
+    template <ScanType TyScan>
+    struct scan_update_dispatch<details::coalesced_group_id, TyScan> {
+        template <typename TyGroup, typename TyAtomic, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            details::remove_qual<TyVal> old;
+
+            // Do regular in group scan
+            auto scan_result = details::coalesced_inclusive_scan(group, val, op);
+
+            // Last thread updates the atomic and distributes its old value to other threads
+            if (group.thread_rank() == group.size() - 1) {
+                old = atomic_update(dst, scan_result, _CG_STL_NAMESPACE::forward<TyFn>(op));
+            }
+            old = group.shfl(old, group.size() - 1);
+            if (TyScan == ScanType::exclusive) {
+                scan_result = convert_inclusive_to_exclusive(group, scan_result, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+            }
+            scan_result = op(old, scan_result);
+            return scan_result;
+        }
+    };
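+    // Illustrative note (not part of the original header), assuming integer addition: for a
+    // 4-thread coalesced group where every thread contributes val = 1 and the atomic dst
+    // initially holds 10, the inclusive results per rank are 11, 12, 13, 14 and dst is left
+    // holding 14; the exclusive variant combines the old value with only the preceding ranks'
+    // contributions, giving 10, 11, 12, 13.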
+
+    template <ScanType TyScan>
+    struct scan_update_dispatch<details::multi_tile_group_id, TyScan> {
+        template <unsigned int Size, typename ParentT, typename TyAtomic, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+            using TyRet = details::remove_qual<TyVal>;
+            const unsigned int num_warps = Size / 32;
+            // In-warp scan result, calculated in warp_lambda
+            TyRet warp_scan;
+
+            // In-warp scan; the warp's sum is put in warp_scratch_location
+            auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
+                warp_scan = 
+                    details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+                if (warp.thread_rank() + 1 == warp.size()) {
+                    *warp_scratch_location = warp_scan;
+                }
+                if (TyScan == ScanType::exclusive) {
+                    warp_scan = warp.shfl_up(warp_scan, 1);
+                }
+            };
+
+            // A tile of size num_warps performs the final scan step (an exclusive scan of the warp sums);
+            // the other threads then combine it with their in-warp scan results.
+            auto inter_warp_lambda =
+                [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
+                    auto thread_val = *thread_scratch_location;
+                    auto scan_result = details::coalesced_inclusive_scan(subwarp, thread_val, op);
+                    TyRet offset;
+                    // Single thread does the atomic update with sum of all contributions and reads the old value.
+                    if (subwarp.thread_rank() == subwarp.size() - 1) {
+                        offset = details::atomic_update(dst, scan_result, op);
+                    }
+                    offset = subwarp.shfl(offset, subwarp.size() - 1);
+                    scan_result = convert_inclusive_to_exclusive(subwarp, scan_result, thread_val, op);
+                    // Add offset read from the atomic to the scanned warp sum.
+                    // Skip the first thread, since it got a default-constructed value from the conversion;
+                    // it should just return the offset received from the thread that did the atomic update.
+                    if (subwarp.thread_rank() != 0) {
+                        offset = op(scan_result, offset);
+                    }
+                    *thread_scratch_location = offset;
+            };
+
+            TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
+            if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
+                return previous_warps_sum;
+            }
+            return op(warp_scan, previous_warps_sum);
+        }
+    };
+#endif
+#endif
+
+    template <typename TyGroup, typename TyInputVal, typename TyRetVal>
+    _CG_QUALIFIER void check_scan_params() {
+        static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
+        static_assert(details::scan_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
+    }
+
+#if defined(_CG_HAS_STL_ATOMICS)
+    template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
+    _CG_QUALIFIER void check_scan_update_params() {
+        check_scan_params<TyGroup, TyInputVal, TyRetVal>();
+        static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
+    }
+#endif
+
+} // details
+
+template <typename TyGroup, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
+
+    using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
+    return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+
+template <typename TyGroup, typename TyVal>
+_CG_QUALIFIER details::remove_qual<TyVal> inclusive_scan(const TyGroup& group, TyVal&& val) {
+    return inclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
+}
+
+template <typename TyGroup, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
+
+    using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
+    return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+
+template <typename TyGroup, typename TyVal>
+_CG_QUALIFIER details::remove_qual<TyVal> exclusive_scan(const TyGroup& group, TyVal&& val) {
+    return exclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
+}
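+
+// Minimal usage sketch (illustrative, not part of the original header), assuming a kernel that
+// includes <cooperative_groups/scan.h> and is compiled with C++11 or newer:
+//
+//     namespace cg = cooperative_groups;
+//     __global__ void prefix_sums(int* out) {
+//         auto block = cg::this_thread_block();
+//         auto tile  = cg::tiled_partition<32>(block);
+//         int v = 1;
+//         int inc = cg::inclusive_scan(tile, v);                  // rank r gets r + 1
+//         int exc = cg::exclusive_scan(tile, v, cg::plus<int>()); // rank r > 0 gets r
+//         out[block.thread_rank()] = inc + exc;
+//     }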
+
+#if defined(_CG_HAS_STL_ATOMICS) && defined(_CG_ABI_EXPERIMENTAL)
+
+namespace experimental {
+    template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+    _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+        details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+
+        using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
+        return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+    }
+
+    template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+    _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco> & dst, TyInputVal&& val) {
+        return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
+    }
+
+    template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+    _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+        details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+
+        using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
+        return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+    }
+
+    template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+    _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val) {
+        return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
+    }
+
+    template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+    _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+        details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+
+        using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
+        return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+    }
+
+    template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+    _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco> & dst, TyInputVal&& val) {
+        return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
+    }
+
+    template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+    _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+        details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+
+        using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
+        return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+    }
+
+    template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+    _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val) {
+        return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
+    }
+}
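+
+// Illustrative note (not part of the original header): the *_scan_update variants above combine a
+// group-wide scan with a single atomic update of dst, so exclusive_scan_update can be used, for
+// example, to reserve per-thread slots in a shared output buffer. They are only available when
+// libcu++ atomics (<cuda/atomic>) and the experimental ABI (_CG_ABI_EXPERIMENTAL) are enabled.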
+
+#endif
+
+_CG_END_NAMESPACE
+
+#endif // _CG_SCAN_H_
diff --git a/ext/cudart/include/cooperative_groups/details/sync.h b/ext/cudart/include/cooperative_groups/details/sync.h
new file mode 100644
index 0000000000000000000000000000000000000000..af144a68a877a00f662e2f6c36417f284e47d337
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/details/sync.h
@@ -0,0 +1,276 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_GRID_H
+#define _CG_GRID_H
+
+#include "info.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details
+{
+
+typedef unsigned int barrier_t;
+
+_CG_STATIC_QUALIFIER bool bar_has_flipped(unsigned int old_arrive, unsigned int current_arrive) {
+    return (((old_arrive ^ current_arrive) & 0x80000000) != 0);
+}
+
+_CG_STATIC_QUALIFIER void bar_flush(volatile unsigned int *addr) {
+#if __CUDA_ARCH__ < 700
+    __threadfence();
+#else
+    unsigned int val;
+    asm volatile("ld.acquire.gpu.u32 %0,[%1];" : "=r"(val) : _CG_ASM_PTR_CONSTRAINT((unsigned int*)addr) : "memory");
+    // Avoids compiler warnings from unused variable val
+    (void)(val = val);
+#endif
+}
+
+_CG_STATIC_QUALIFIER unsigned int atomic_add(volatile unsigned int *addr, unsigned int val) {
+    unsigned int old;
+#if __CUDA_ARCH__ < 700
+    old = atomicAdd((unsigned int*)addr, val);
+#else
+    asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;" : "=r"(old) : _CG_ASM_PTR_CONSTRAINT((unsigned int*)addr), "r"(val) : "memory");
+#endif
+    return old;
+}
+
+_CG_STATIC_QUALIFIER void sync_grids(unsigned int expected, volatile barrier_t *arrived) {
+    bool cta_master = (threadIdx.x + threadIdx.y + threadIdx.z == 0);
+    bool gpu_master = (blockIdx.x + blockIdx.y + blockIdx.z == 0);
+
+    __syncthreads();
+
+    if (cta_master) {
+        unsigned int nb = 1;
+        if (gpu_master) {
+            nb = 0x80000000 - (expected - 1);
+        }
+
+        __threadfence();
+
+        unsigned int oldArrive;
+        oldArrive = atomic_add(arrived, nb);
+
+        while (!bar_has_flipped(oldArrive, *arrived));
+
+        //flush barrier upon leaving
+        bar_flush((unsigned int*)arrived);
+    }
+
+    __syncthreads();
+}
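+
+// Illustrative note (not part of the original header): with expected == 3 blocks, the GPU master
+// block adds 0x80000000 - 2 and each of the other two blocks adds 1, so the total added is exactly
+// 0x80000000; bit 31 of *arrived therefore flips once all expected blocks have arrived, which is
+// the condition bar_has_flipped() polls for.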
+
+/* - Multi warp groups synchronization routines - */
+
+// Get the synchronization bit mask of my thread_block_tile of size num_warps. Thread ranks 0..31 are assigned the first bit,
+// thread ranks 32..63 the second, etc.
+// Bit masks are unique for each group; groups of the same size have the same number of bits set, but at different positions.
+_CG_STATIC_QUALIFIER unsigned int get_group_mask(unsigned int thread_rank, unsigned int num_warps) {
+    return num_warps == 32 ? ~0 : ((1 << num_warps) - 1) << (num_warps * (thread_rank / (num_warps * 32)));
+}
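+
+// Illustrative example (not part of the original header): for num_warps == 4 and thread_rank == 130
+// (warp 4, the first warp of the second 4-warp group), the mask is 0xF << 4 == 0xF0, and that
+// warp's own bit (1 << 4 == 0x10) lies inside it.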
+
+// Default blocking sync.
+_CG_STATIC_QUALIFIER void sync_warps(volatile barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
+    unsigned int warp_id = thread_rank / 32;
+    bool warp_master = (thread_rank % 32 == 0);
+    unsigned int warp_bit = 1 << warp_id;
+    unsigned int group_mask = get_group_mask(thread_rank, num_warps);
+
+    __syncwarp(0xFFFFFFFF);
+
+    if (warp_master) {
+        unsigned int old = atomicOr((unsigned int *)arrived, warp_bit);
+        if (((old | warp_bit) & group_mask) == group_mask) {
+            atomicAnd((unsigned int *)arrived, ~group_mask);
+        }
+        else {
+            while(*arrived & warp_bit);
+        }
+    }
+
+    __syncwarp(0xFFFFFFFF);
+}
+
+// Blocking sync, except that the last arriving warp, which releases the other warps, returns to do other work first.
+// A warp returning true from this function needs to call sync_warps_release afterwards.
+_CG_STATIC_QUALIFIER bool sync_warps_last_releases(volatile barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
+    unsigned int warp_id = thread_rank / 32;
+    bool warp_master = (thread_rank % 32 == 0);
+    unsigned int warp_bit = 1 << warp_id;
+    unsigned int group_mask = get_group_mask(thread_rank, num_warps);
+
+    __syncwarp(0xFFFFFFFF);
+
+    unsigned int old = 0;
+    if (warp_master) {
+        old = atomicOr((unsigned int *)arrived, warp_bit);
+    }
+    old = __shfl_sync(0xFFFFFFFF, old, 0);
+    if (((old | warp_bit) & group_mask) == group_mask) {
+        return true;
+    }
+    while(*arrived & warp_bit);
+
+    return false;
+}
+
+// Release my group from the barrier.
+_CG_STATIC_QUALIFIER void sync_warps_release(volatile barrier_t *arrived, bool is_master, unsigned int thread_rank, unsigned int num_warps) {
+    unsigned int group_mask = get_group_mask(thread_rank, num_warps);
+    if (is_master) {
+        atomicAnd((unsigned int *)arrived, ~group_mask);
+    }
+}
+
+// Arrive at my group barrier, but don't block or release the barrier, even if every warp arrives.
+// sync_warps_release needs to be called by some warp after this one to reset the barrier.
+_CG_STATIC_QUALIFIER void sync_warps_arrive(volatile barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
+    unsigned int warp_id = thread_rank / 32;
+    bool warp_master = (thread_rank % 32 == 0);
+    unsigned int warp_bit = 1 << warp_id;
+    unsigned int group_mask = get_group_mask(thread_rank, num_warps);
+
+    __syncwarp(0xFFFFFFFF);
+
+    if (warp_master) {
+        atomicOr((unsigned int *)arrived, warp_bit);
+    }
+    __syncwarp(0xFFFFFFFF);
+}
+
+// Arrive at my group barrier, but don't block. Last arriving warp immediately releases the barrier.
+_CG_STATIC_QUALIFIER void sync_warps_arrive_release(volatile barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
+    unsigned int warp_id = thread_rank / 32;
+    bool warp_master = (thread_rank % 32 == 0);
+    unsigned int warp_bit = 1 << warp_id;
+    unsigned int group_mask = get_group_mask(thread_rank, num_warps);
+
+    __syncwarp(0xFFFFFFFF);
+
+    if (warp_master) {
+        unsigned int old = atomicOr((unsigned int *)arrived, warp_bit);
+        if (((old | warp_bit) & group_mask) == group_mask) {
+            atomicAnd((unsigned int *)arrived, ~group_mask);
+        }
+    }
+    __syncwarp(0xFFFFFFFF);
+}
+
+// Wait for my warp to be released from the barrier. Warp must have arrived first.
+_CG_STATIC_QUALIFIER void sync_warps_wait(volatile barrier_t *arrived, unsigned int thread_rank) {
+    unsigned int warp_id = thread_rank / 32;
+    unsigned int warp_bit = 1 << warp_id;
+
+    while(*arrived & warp_bit);
+    __syncwarp(0xFFFFFFFF);
+}
+
+// Arrive at my group barrier and block. Barrier is not released, even if every warp arrives.
+// sync_warps_release needs to be called by some warp after this one.
+_CG_STATIC_QUALIFIER void sync_warps_wait_for_release(
+        volatile barrier_t *arrived,
+        bool is_master,
+        unsigned int thread_rank,
+        unsigned int num_warps) {
+
+    unsigned int warp_id = thread_rank / 32;
+    unsigned int warp_bit = 1 << warp_id;
+
+    __syncwarp(0xFFFFFFFF);
+
+    if (is_master) {
+        atomicOr((unsigned int *)arrived, warp_bit);
+        while(*arrived & warp_bit);
+    }
+
+    __syncwarp(0xFFFFFFFF);
+}
+
+enum wait_for_warps_kind {
+    wait_for_all_other_warps,
+    wait_for_specific_warp
+};
+
+// Wait for a combination of warps, specified by the Kind parameter, to arrive at the group barrier.
+// This function does not itself arrive at the barrier.
+template <wait_for_warps_kind Kind>
+_CG_QUALIFIER void sync_warps_wait_for_warps(
+        unsigned int wait_warp_id, volatile barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps);
+
+template <>
+_CG_QUALIFIER void sync_warps_wait_for_warps<wait_for_all_other_warps>(
+        unsigned int wait_warp_id,
+        volatile barrier_t *arrived,
+        unsigned int thread_rank,
+        unsigned int num_warps) {
+
+    unsigned int wait_mask = get_group_mask(thread_rank, num_warps);
+    wait_mask &= ~(1 << wait_warp_id);
+    while((*arrived & wait_mask) != wait_mask);
+}
+
+template <>
+_CG_QUALIFIER void sync_warps_wait_for_warps<wait_for_specific_warp>(
+        unsigned int wait_warp_id,
+        volatile barrier_t *arrived,
+        unsigned int thread_rank,
+        unsigned int num_warps) {
+
+    unsigned int wait_mask = 1 << wait_warp_id;
+    while((*arrived & wait_mask) != wait_mask);
+}
+
+} // details
+
+_CG_END_NAMESPACE
+
+#endif // _CG_GRID_H
diff --git a/ext/cudart/include/cooperative_groups/memcpy_async.h b/ext/cudart/include/cooperative_groups/memcpy_async.h
new file mode 100644
index 0000000000000000000000000000000000000000..50b907d9a1fe45cdc411891a20d8fd035118e5be
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/memcpy_async.h
@@ -0,0 +1,62 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_MEMCPY_ASYNC
+#define _COOPERATIVE_GROUPS_MEMCPY_ASYNC
+
+#include "../cooperative_groups.h"
+#include "details/info.h"
+
+#ifdef _CG_CPP11_FEATURES
+# include "details/async.h"
+#else
+# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+         -std=c++11 compiler option.
+#endif
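+
+// Minimal usage sketch (illustrative, not part of the original header):
+//
+//     namespace cg = cooperative_groups;
+//     __global__ void stage(const float* gmem, float* out, unsigned n) {
+//         extern __shared__ float smem[];
+//         auto block = cg::this_thread_block();
+//         cg::memcpy_async(block, smem, gmem, sizeof(float) * n); // cooperative asynchronous copy
+//         cg::wait(block);                                        // block until the copy has landed
+//         if (block.thread_rank() < n) out[block.thread_rank()] = smem[block.thread_rank()];
+//     }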
+
+#endif // _COOPERATIVE_GROUPS_MEMCPY_ASYNC
diff --git a/ext/cudart/include/cooperative_groups/reduce.h b/ext/cudart/include/cooperative_groups/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..3c87d780db0b437f1ae06e0ef8d60137233795c0
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/reduce.h
@@ -0,0 +1,63 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_REDUCE_H
+#define _COOPERATIVE_GROUPS_REDUCE_H
+
+#include "../cooperative_groups.h"
+#include "details/info.h"
+
+#ifdef _CG_CPP11_FEATURES
+# include "details/reduce.h"
+#else
+# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+         -std=c++11 compiler option.
+#endif
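+
+// Minimal usage sketch (illustrative, not part of the original header):
+//
+//     namespace cg = cooperative_groups;
+//     __global__ void sum32(const int* in, int* out) {
+//         auto tile  = cg::tiled_partition<32>(cg::this_thread_block());
+//         int  total = cg::reduce(tile, in[tile.thread_rank()], cg::plus<int>());
+//         if (tile.thread_rank() == 0) *out = total; // reduce() returns the total to every lane
+//     }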
+
+
+#endif //_COOPERATIVE_GROUPS_REDUCE_H
diff --git a/ext/cudart/include/cooperative_groups/scan.h b/ext/cudart/include/cooperative_groups/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..9bc27078028318ada00cbcccd052e0d6cc930cfe
--- /dev/null
+++ b/ext/cudart/include/cooperative_groups/scan.h
@@ -0,0 +1,63 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_SCAN_H
+#define _COOPERATIVE_GROUPS_SCAN_H
+
+#include "../cooperative_groups.h"
+#include "details/info.h"
+
+#ifdef _CG_CPP11_FEATURES
+# include "details/scan.h"
+#else
+# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+         -std=c++11 compiler option.
+#endif
+
+
+#endif //_COOPERATIVE_GROUPS_SCAN_H
diff --git a/ext/cudart/include/cuComplex.h b/ext/cudart/include/cuComplex.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b167111b0b387a5279da6749d946560e1c42c1b
--- /dev/null
+++ b/ext/cudart/include/cuComplex.h
@@ -0,0 +1,348 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(CU_COMPLEX_H_)
+#define CU_COMPLEX_H_
+
+#if !defined(__CUDACC_RTC__)
+#if defined(__GNUC__)
+#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+#endif
+#endif
+
+/* When including a C header file in C++ code, extern "C" is required.
+ * But the standard QNX headers already have the extern guards in them when compiling C++ code,
+ * and extern "C" cannot be nested.
+ * Hence keep the header outside of the extern "C" block.
+ */
+
+#if !defined(__CUDACC__)
+#include <math.h>       /* import fabsf, sqrt */
+#endif /* !defined(__CUDACC__) */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+#include "vector_types.h"
+
+typedef float2 cuFloatComplex;
+
+__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x) 
+{ 
+    return x.x; 
+}
+
+__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x) 
+{ 
+    return x.y; 
+}
+
+__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex 
+                                                             (float r, float i)
+{
+    cuFloatComplex res;
+    res.x = r;
+    res.y = i;
+    return res;
+}
+
+__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
+{
+    return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
+}
+__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y), 
+                                cuCimagf(x) + cuCimagf(y));
+}
+
+__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+        return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y), 
+                                    cuCimagf(x) - cuCimagf(y));
+}
+
+/* This implementation could suffer from intermediate overflow even though
+ * the final result would be in range. However, various implementations do
+ * not guard against this (presumably to avoid losing performance), so we 
+ * don't do it either to stay competitive.
+ */
+__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    cuFloatComplex prod;
+    prod = make_cuFloatComplex  ((cuCrealf(x) * cuCrealf(y)) - 
+                                 (cuCimagf(x) * cuCimagf(y)),
+                                 (cuCrealf(x) * cuCimagf(y)) + 
+                                 (cuCimagf(x) * cuCrealf(y)));
+    return prod;
+}
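+
+/* Illustrative example (not part of the original header):
+ * cuCmulf(make_cuFloatComplex(1.0f, 2.0f), make_cuFloatComplex(3.0f, 4.0f))
+ * yields (1*3 - 2*4) + (1*4 + 2*3)i = -5 + 10i.
+ */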
+
+/* This implementation guards against intermediate underflow and overflow
+ * by scaling. Such guarded implementations are usually the default for
+ * complex library implementations, with some also offering an unguarded,
+ * faster version.
+ */
+__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    cuFloatComplex quot;
+    float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
+    float oos = 1.0f / s;
+    float ars = cuCrealf(x) * oos;
+    float ais = cuCimagf(x) * oos;
+    float brs = cuCrealf(y) * oos;
+    float bis = cuCimagf(y) * oos;
+    s = (brs * brs) + (bis * bis);
+    oos = 1.0f / s;
+    quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
+                                ((ais * brs) - (ars * bis)) * oos);
+    return quot;
+}
+
+/* 
+ * We would like to call hypotf(), but it's not available on all platforms.
+ * This discrete implementation guards against intermediate underflow and 
+ * overflow by scaling. Otherwise we would lose half the exponent range. 
+ * There are various ways of doing guarded computation. For now we chose the
+ * simplest and fastest solution; however, this may suffer from inaccuracies
+ * if sqrt and division are not IEEE compliant.
+ */
+__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
+{
+    float a = cuCrealf(x);
+    float b = cuCimagf(x);
+    float v, w, t;
+    a = fabsf(a);
+    b = fabsf(b);
+    if (a > b) {
+        v = a;
+        w = b; 
+    } else {
+        v = b;
+        w = a;
+    }
+    t = w / v;
+    t = 1.0f + t * t;
+    t = v * sqrtf(t);
+    if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
+        t = v + w;
+    }
+    return t;
+}
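+
+/* Illustrative note (not part of the original header): the scaling above evaluates
+ * |3e30f + 4e30f*i| as 4e30f * sqrtf(1.0f + 0.75f*0.75f) = 5e30f, whereas the naive
+ * sqrtf(a*a + b*b) would overflow the float range in the intermediate squares.
+ */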
+
+/* Double precision */
+typedef double2 cuDoubleComplex;
+
+__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x) 
+{ 
+    return x.x; 
+}
+
+__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x) 
+{ 
+    return x.y; 
+}
+
+__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex 
+                                                           (double r, double i)
+{
+    cuDoubleComplex res;
+    res.x = r;
+    res.y = i;
+    return res;
+}
+
+__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
+{
+    return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
+}
+
+__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    return make_cuDoubleComplex (cuCreal(x) + cuCreal(y), 
+                                 cuCimag(x) + cuCimag(y));
+}
+
+__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    return make_cuDoubleComplex (cuCreal(x) - cuCreal(y), 
+                                 cuCimag(x) - cuCimag(y));
+}
+
+/* This implementation could suffer from intermediate overflow even though
+ * the final result would be in range. However, various implementations do
+ * not guard against this (presumably to avoid losing performance), so we 
+ * don't do it either to stay competitive.
+ */
+__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    cuDoubleComplex prod;
+    prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) - 
+                                 (cuCimag(x) * cuCimag(y)),
+                                 (cuCreal(x) * cuCimag(y)) + 
+                                 (cuCimag(x) * cuCreal(y)));
+    return prod;
+}
+
+/* This implementation guards against intermediate underflow and overflow
+ * by scaling. Such guarded implementations are usually the default for
+ * complex library implementations, with some also offering an unguarded,
+ * faster version.
+ */
+__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    cuDoubleComplex quot;
+    double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
+    double oos = 1.0 / s;
+    double ars = cuCreal(x) * oos;
+    double ais = cuCimag(x) * oos;
+    double brs = cuCreal(y) * oos;
+    double bis = cuCimag(y) * oos;
+    s = (brs * brs) + (bis * bis);
+    oos = 1.0 / s;
+    quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
+                                 ((ais * brs) - (ars * bis)) * oos);
+    return quot;
+}
+
+/* This implementation guards against intermediate underflow and overflow
+ * by scaling. Otherwise we would lose half the exponent range. There are
+ * various ways of doing guarded computation. For now we chose the simplest
+ * and fastest solution; however, this may suffer from inaccuracies if sqrt
+ * and division are not IEEE compliant.
+ */
+__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
+{
+    double a = cuCreal(x);
+    double b = cuCimag(x);
+    double v, w, t;
+    a = fabs(a);
+    b = fabs(b);
+    if (a > b) {
+        v = a;
+        w = b; 
+    } else {
+        v = b;
+        w = a;
+    }
+    t = w / v;
+    t = 1.0 + t * t;
+    t = v * sqrt(t);
+    if ((v == 0.0) || 
+        (v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
+        t = v + w;
+    }
+    return t;
+}
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+/* aliases */
+typedef cuFloatComplex cuComplex;
+__host__ __device__ static __inline__ cuComplex make_cuComplex (float x, 
+                                                                float y) 
+{ 
+    return make_cuFloatComplex (x, y); 
+}
+
+/* float-to-double promotion */
+__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
+                                                      (cuFloatComplex c)
+{
+    return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
+}
+
+__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
+(cuDoubleComplex c)
+{
+	return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
+}
+
+
+__host__ __device__ static __inline__  cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
+{
+    float real_res;
+    float imag_res;
+    
+    real_res = (cuCrealf(x) *  cuCrealf(y)) + cuCrealf(d);
+    imag_res = (cuCrealf(x) *  cuCimagf(y)) + cuCimagf(d);
+            
+    real_res = -(cuCimagf(x) * cuCimagf(y))  + real_res;  
+    imag_res =  (cuCimagf(x) *  cuCrealf(y)) + imag_res;          
+     
+    return make_cuComplex(real_res, imag_res);
+}
+
+__host__ __device__ static __inline__  cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
+{
+    double real_res;
+    double imag_res;
+    
+    real_res = (cuCreal(x) *  cuCreal(y)) + cuCreal(d);
+    imag_res = (cuCreal(x) *  cuCimag(y)) + cuCimag(d);
+            
+    real_res = -(cuCimag(x) * cuCimag(y))  + real_res;  
+    imag_res =  (cuCimag(x) *  cuCreal(y)) + imag_res;     
+     
+    return make_cuDoubleComplex(real_res, imag_res);
+}
+
+#endif /* !defined(CU_COMPLEX_H_) */
diff --git a/ext/cudart/include/cuda.h b/ext/cudart/include/cuda.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d82e525f49cad8776af65024f6bdf140dd91833
--- /dev/null
+++ b/ext/cudart/include/cuda.h
@@ -0,0 +1,20295 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef __cuda_cuda_h__
+#define __cuda_cuda_h__
+
+
+
+#include <stdlib.h>
+#ifdef _MSC_VER
+typedef unsigned __int32 cuuint32_t;
+typedef unsigned __int64 cuuint64_t;
+#else
+#include <stdint.h>
+typedef uint32_t cuuint32_t;
+typedef uint64_t cuuint64_t;
+#endif
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+
+#if defined(CUDA_FORCE_API_VERSION)
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define __CUDA_API_PER_THREAD_DEFAULT_STREAM
+    #define __CUDA_API_PTDS(api) api ## _ptds
+    #define __CUDA_API_PTSZ(api) api ## _ptsz
+#else
+    #define __CUDA_API_PTDS(api) api
+    #define __CUDA_API_PTSZ(api) api
+#endif
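+
+/* Illustrative note (not part of the original header): with CUDA_API_PER_THREAD_DEFAULT_STREAM
+ * defined, the aliases below resolve to the *_ptds / *_ptsz entry points (e.g. cuMemcpyHtoD
+ * expands to cuMemcpyHtoD_v2_ptds); otherwise they resolve to the plain versioned symbols
+ * (e.g. cuMemcpyHtoD_v2).
+ */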
+
+#define cuDeviceTotalMem                    cuDeviceTotalMem_v2
+#define cuCtxCreate                         cuCtxCreate_v2
+#define cuCtxCreate_v3                      cuCtxCreate_v3
+#define cuModuleGetGlobal                   cuModuleGetGlobal_v2
+#define cuMemGetInfo                        cuMemGetInfo_v2
+#define cuMemAlloc                          cuMemAlloc_v2
+#define cuMemAllocPitch                     cuMemAllocPitch_v2
+#define cuMemFree                           cuMemFree_v2
+#define cuMemGetAddressRange                cuMemGetAddressRange_v2
+#define cuMemAllocHost                      cuMemAllocHost_v2
+#define cuMemHostGetDevicePointer           cuMemHostGetDevicePointer_v2
+#define cuMemcpyHtoD                        __CUDA_API_PTDS(cuMemcpyHtoD_v2)
+#define cuMemcpyDtoH                        __CUDA_API_PTDS(cuMemcpyDtoH_v2)
+#define cuMemcpyDtoD                        __CUDA_API_PTDS(cuMemcpyDtoD_v2)
+#define cuMemcpyDtoA                        __CUDA_API_PTDS(cuMemcpyDtoA_v2)
+#define cuMemcpyAtoD                        __CUDA_API_PTDS(cuMemcpyAtoD_v2)
+#define cuMemcpyHtoA                        __CUDA_API_PTDS(cuMemcpyHtoA_v2)
+#define cuMemcpyAtoH                        __CUDA_API_PTDS(cuMemcpyAtoH_v2)
+#define cuMemcpyAtoA                        __CUDA_API_PTDS(cuMemcpyAtoA_v2)
+#define cuMemcpyHtoAAsync                   __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2)
+#define cuMemcpyAtoHAsync                   __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2)
+#define cuMemcpy2D                          __CUDA_API_PTDS(cuMemcpy2D_v2)
+#define cuMemcpy2DUnaligned                 __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2)
+#define cuMemcpy3D                          __CUDA_API_PTDS(cuMemcpy3D_v2)
+#define cuMemcpyHtoDAsync                   __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2)
+#define cuMemcpyDtoHAsync                   __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2)
+#define cuMemcpyDtoDAsync                   __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2)
+#define cuMemcpy2DAsync                     __CUDA_API_PTSZ(cuMemcpy2DAsync_v2)
+#define cuMemcpy3DAsync                     __CUDA_API_PTSZ(cuMemcpy3DAsync_v2)
+#define cuMemsetD8                          __CUDA_API_PTDS(cuMemsetD8_v2)
+#define cuMemsetD16                         __CUDA_API_PTDS(cuMemsetD16_v2)
+#define cuMemsetD32                         __CUDA_API_PTDS(cuMemsetD32_v2)
+#define cuMemsetD2D8                        __CUDA_API_PTDS(cuMemsetD2D8_v2)
+#define cuMemsetD2D16                       __CUDA_API_PTDS(cuMemsetD2D16_v2)
+#define cuMemsetD2D32                       __CUDA_API_PTDS(cuMemsetD2D32_v2)
+#define cuArrayCreate                       cuArrayCreate_v2
+#define cuArrayGetDescriptor                cuArrayGetDescriptor_v2
+#define cuArray3DCreate                     cuArray3DCreate_v2
+#define cuArray3DGetDescriptor              cuArray3DGetDescriptor_v2
+#define cuTexRefSetAddress                  cuTexRefSetAddress_v2
+#define cuTexRefGetAddress                  cuTexRefGetAddress_v2
+#define cuGraphicsResourceGetMappedPointer  cuGraphicsResourceGetMappedPointer_v2
+#define cuCtxDestroy                        cuCtxDestroy_v2
+#define cuCtxPopCurrent                     cuCtxPopCurrent_v2
+#define cuCtxPushCurrent                    cuCtxPushCurrent_v2
+#define cuStreamDestroy                     cuStreamDestroy_v2
+#define cuEventDestroy                      cuEventDestroy_v2
+#define cuTexRefSetAddress2D                cuTexRefSetAddress2D_v3
+#define cuLinkCreate                        cuLinkCreate_v2
+#define cuLinkAddData                       cuLinkAddData_v2
+#define cuLinkAddFile                       cuLinkAddFile_v2
+#define cuMemHostRegister                   cuMemHostRegister_v2
+#define cuGraphicsResourceSetMapFlags       cuGraphicsResourceSetMapFlags_v2
+#define cuStreamBeginCapture                __CUDA_API_PTSZ(cuStreamBeginCapture_v2)
+#define cuDevicePrimaryCtxRelease           cuDevicePrimaryCtxRelease_v2
+#define cuDevicePrimaryCtxReset             cuDevicePrimaryCtxReset_v2
+#define cuDevicePrimaryCtxSetFlags          cuDevicePrimaryCtxSetFlags_v2
+#define cuDeviceGetUuid_v2                  cuDeviceGetUuid_v2
+#define cuIpcOpenMemHandle                  cuIpcOpenMemHandle_v2
+#define cuGraphInstantiate                  cuGraphInstantiate_v2
+
+#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define cuMemcpy                            __CUDA_API_PTDS(cuMemcpy)
+    #define cuMemcpyAsync                       __CUDA_API_PTSZ(cuMemcpyAsync)
+    #define cuMemcpyPeer                        __CUDA_API_PTDS(cuMemcpyPeer)
+    #define cuMemcpyPeerAsync                   __CUDA_API_PTSZ(cuMemcpyPeerAsync)
+    #define cuMemcpy3DPeer                      __CUDA_API_PTDS(cuMemcpy3DPeer)
+    #define cuMemcpy3DPeerAsync                 __CUDA_API_PTSZ(cuMemcpy3DPeerAsync)
+    #define cuMemPrefetchAsync                  __CUDA_API_PTSZ(cuMemPrefetchAsync)
+
+    #define cuMemsetD8Async                     __CUDA_API_PTSZ(cuMemsetD8Async)
+    #define cuMemsetD16Async                    __CUDA_API_PTSZ(cuMemsetD16Async)
+    #define cuMemsetD32Async                    __CUDA_API_PTSZ(cuMemsetD32Async)
+    #define cuMemsetD2D8Async                   __CUDA_API_PTSZ(cuMemsetD2D8Async)
+    #define cuMemsetD2D16Async                  __CUDA_API_PTSZ(cuMemsetD2D16Async)
+    #define cuMemsetD2D32Async                  __CUDA_API_PTSZ(cuMemsetD2D32Async)
+
+    #define cuStreamGetPriority                 __CUDA_API_PTSZ(cuStreamGetPriority)
+    #define cuStreamGetFlags                    __CUDA_API_PTSZ(cuStreamGetFlags)
+    #define cuStreamGetCtx                      __CUDA_API_PTSZ(cuStreamGetCtx)
+    #define cuStreamWaitEvent                   __CUDA_API_PTSZ(cuStreamWaitEvent)
+    #define cuStreamEndCapture                  __CUDA_API_PTSZ(cuStreamEndCapture)
+    #define cuStreamIsCapturing                 __CUDA_API_PTSZ(cuStreamIsCapturing)
+    #define cuStreamGetCaptureInfo              __CUDA_API_PTSZ(cuStreamGetCaptureInfo)
+    #define cuStreamGetCaptureInfo_v2           __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2)
+    #define cuStreamUpdateCaptureDependencies   __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies)
+    #define cuStreamAddCallback                 __CUDA_API_PTSZ(cuStreamAddCallback)
+    #define cuStreamAttachMemAsync              __CUDA_API_PTSZ(cuStreamAttachMemAsync)
+    #define cuStreamQuery                       __CUDA_API_PTSZ(cuStreamQuery)
+    #define cuStreamSynchronize                 __CUDA_API_PTSZ(cuStreamSynchronize)
+    #define cuEventRecord                       __CUDA_API_PTSZ(cuEventRecord)
+    #define cuEventRecordWithFlags              __CUDA_API_PTSZ(cuEventRecordWithFlags)
+    #define cuLaunchKernel                      __CUDA_API_PTSZ(cuLaunchKernel)
+    #define cuLaunchKernelEx                    __CUDA_API_PTSZ(cuLaunchKernelEx)
+    #define cuLaunchHostFunc                    __CUDA_API_PTSZ(cuLaunchHostFunc)
+    #define cuGraphicsMapResources              __CUDA_API_PTSZ(cuGraphicsMapResources)
+    #define cuGraphicsUnmapResources            __CUDA_API_PTSZ(cuGraphicsUnmapResources)
+
+    #define cuStreamWriteValue32                __CUDA_API_PTSZ(cuStreamWriteValue32)
+    #define cuStreamWaitValue32                 __CUDA_API_PTSZ(cuStreamWaitValue32)
+    #define cuStreamWriteValue64                __CUDA_API_PTSZ(cuStreamWriteValue64)
+    #define cuStreamWaitValue64                 __CUDA_API_PTSZ(cuStreamWaitValue64)
+    #define cuStreamBatchMemOp                  __CUDA_API_PTSZ(cuStreamBatchMemOp)
+    #define cuStreamWriteValue32_v2             __CUDA_API_PTSZ(cuStreamWriteValue32_v2)
+    #define cuStreamWaitValue32_v2              __CUDA_API_PTSZ(cuStreamWaitValue32_v2)
+    #define cuStreamWriteValue64_v2             __CUDA_API_PTSZ(cuStreamWriteValue64_v2)
+    #define cuStreamWaitValue64_v2              __CUDA_API_PTSZ(cuStreamWaitValue64_v2)
+    #define cuStreamBatchMemOp_v2               __CUDA_API_PTSZ(cuStreamBatchMemOp_v2)
+
+    #define cuLaunchCooperativeKernel           __CUDA_API_PTSZ(cuLaunchCooperativeKernel)
+
+    #define cuSignalExternalSemaphoresAsync     __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync)
+    #define cuWaitExternalSemaphoresAsync       __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync)
+
+    #define cuGraphUpload                       __CUDA_API_PTSZ(cuGraphUpload)
+    #define cuGraphLaunch                       __CUDA_API_PTSZ(cuGraphLaunch)
+    #define cuStreamCopyAttributes              __CUDA_API_PTSZ(cuStreamCopyAttributes)
+    #define cuStreamGetAttribute                __CUDA_API_PTSZ(cuStreamGetAttribute)
+    #define cuStreamSetAttribute                __CUDA_API_PTSZ(cuStreamSetAttribute)
+    #define cuMemMapArrayAsync                  __CUDA_API_PTSZ(cuMemMapArrayAsync)
+
+    #define cuMemFreeAsync                      __CUDA_API_PTSZ(cuMemFreeAsync)
+    #define cuMemAllocAsync                     __CUDA_API_PTSZ(cuMemAllocAsync)
+    #define cuMemAllocFromPoolAsync             __CUDA_API_PTSZ(cuMemAllocFromPoolAsync)
+#endif
+
+/**
+ * \file cuda.h
+ * \brief Header file for the CUDA Toolkit application programming interface.
+ *
+ * \file cudaGL.h
+ * \brief Header file for the OpenGL interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * \file cudaD3D9.h
+ * \brief Header file for the Direct3D 9 interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ */
+
+/**
+ * \defgroup CUDA_TYPES Data types used by CUDA driver
+ * @{
+ */
+
+/**
+ * CUDA API version number
+ */
+#define CUDA_VERSION 11080
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * CUDA device pointer
+ * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
+ */
+#if defined(_WIN64) || defined(__LP64__)
+typedef unsigned long long CUdeviceptr_v2;
+#else
+typedef unsigned int CUdeviceptr_v2;
+#endif
+typedef CUdeviceptr_v2 CUdeviceptr;                          /**< CUDA device pointer */
+
+typedef int CUdevice_v1;                                     /**< CUDA device */
+typedef CUdevice_v1 CUdevice;                                /**< CUDA device */
+typedef struct CUctx_st *CUcontext;                          /**< CUDA context */
+typedef struct CUmod_st *CUmodule;                           /**< CUDA module */
+typedef struct CUfunc_st *CUfunction;                        /**< CUDA function */
+typedef struct CUarray_st *CUarray;                          /**< CUDA array */
+typedef struct CUmipmappedArray_st *CUmipmappedArray;        /**< CUDA mipmapped array */
+typedef struct CUtexref_st *CUtexref;                        /**< CUDA texture reference */
+typedef struct CUsurfref_st *CUsurfref;                      /**< CUDA surface reference */
+typedef struct CUevent_st *CUevent;                          /**< CUDA event */
+typedef struct CUstream_st *CUstream;                        /**< CUDA stream */
+typedef struct CUgraphicsResource_st *CUgraphicsResource;    /**< CUDA graphics interop resource */
+typedef unsigned long long CUtexObject_v1;                   /**< An opaque value that represents a CUDA texture object */
+typedef CUtexObject_v1 CUtexObject;                          /**< An opaque value that represents a CUDA texture object */
+typedef unsigned long long CUsurfObject_v1;                  /**< An opaque value that represents a CUDA surface object */
+typedef CUsurfObject_v1 CUsurfObject;                        /**< An opaque value that represents a CUDA surface object */ 
+typedef struct CUextMemory_st *CUexternalMemory;             /**< CUDA external memory */
+typedef struct CUextSemaphore_st *CUexternalSemaphore;       /**< CUDA external semaphore */
+typedef struct CUgraph_st *CUgraph;                          /**< CUDA graph */
+typedef struct CUgraphNode_st *CUgraphNode;                  /**< CUDA graph node */
+typedef struct CUgraphExec_st *CUgraphExec;                  /**< CUDA executable graph */
+typedef struct CUmemPoolHandle_st *CUmemoryPool;             /**< CUDA memory pool */
+typedef struct CUuserObject_st *CUuserObject;                /**< CUDA user object for graphs */
+
+#ifndef CU_UUID_HAS_BEEN_DEFINED
+#define CU_UUID_HAS_BEEN_DEFINED
+typedef struct CUuuid_st {                                /**< CUDA definition of UUID */
+    char bytes[16];
+} CUuuid;
+#endif
+
+/**
+ * CUDA IPC handle size
+ */
+#define CU_IPC_HANDLE_SIZE 64
+
+/**
+ * CUDA IPC event handle
+ */
+typedef struct CUipcEventHandle_st {
+    char reserved[CU_IPC_HANDLE_SIZE];
+} CUipcEventHandle_v1;
+typedef CUipcEventHandle_v1 CUipcEventHandle;
+
+/**
+ * CUDA IPC mem handle
+ */
+typedef struct CUipcMemHandle_st {
+    char reserved[CU_IPC_HANDLE_SIZE];
+} CUipcMemHandle_v1;
+typedef CUipcMemHandle_v1 CUipcMemHandle;
+
+/**
+ * CUDA Ipc Mem Flags
+ */
+typedef enum CUipcMem_flags_enum {
+    CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between remote devices as needed */
+} CUipcMem_flags;
+
+
+/**
+ * CUDA Mem Attach Flags
+ */
+typedef enum CUmemAttach_flags_enum {
+    CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */
+    CU_MEM_ATTACH_HOST   = 0x2, /**< Memory cannot be accessed by any stream on any device */
+    CU_MEM_ATTACH_SINGLE = 0x4  /**< Memory can only be accessed by a single stream on the associated device */
+} CUmemAttach_flags;
+
+/**
+ * Context creation flags
+ */
+typedef enum CUctx_flags_enum {
+    CU_CTX_SCHED_AUTO          = 0x00, /**< Automatic scheduling */
+    CU_CTX_SCHED_SPIN          = 0x01, /**< Set spin as default scheduling */
+    CU_CTX_SCHED_YIELD         = 0x02, /**< Set yield as default scheduling */
+    CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+    CU_CTX_BLOCKING_SYNC       = 0x04, /**< Set blocking synchronization as default scheduling
+                                         *  \deprecated This flag was deprecated as of CUDA 4.0
+                                         *  and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */
+    CU_CTX_SCHED_MASK          = 0x07,
+    CU_CTX_MAP_HOST            = 0x08, /**< \deprecated This flag was deprecated as of CUDA 11.0 
+                                         *  and it no longer has any effect. All contexts 
+                                         *  as of CUDA 3.2 behave as though the flag is enabled. */
+    CU_CTX_LMEM_RESIZE_TO_MAX  = 0x10, /**< Keep local memory allocation after launch */
+    CU_CTX_FLAGS_MASK          = 0x1f
+} CUctx_flags;
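+
+/*
+ * A minimal usage sketch (illustrative, not normative): creating a context
+ * that uses blocking synchronization as its default scheduling policy.
+ * `dev` is assumed to be a ::CUdevice obtained via ::cuDeviceGet after
+ * ::cuInit.
+ *
+ * \code
+ *   CUcontext ctx;
+ *   CUresult rc = cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev);
+ *   if (rc == CUDA_SUCCESS) {
+ *       // ... use the context ...
+ *       cuCtxDestroy(ctx);
+ *   }
+ * \endcode
+ */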
+
+/**
+ * Event sched flags
+ */
+typedef enum CUevent_sched_flags_enum {
+    CU_EVENT_SCHED_AUTO = 0x00, /**< Automatic scheduling */
+    CU_EVENT_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
+    CU_EVENT_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
+    CU_EVENT_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+} CUevent_sched_flags;
+
+/**
+ * NVCL event scheduling flags
+ */
+typedef enum cl_event_flags_enum {
+    NVCL_EVENT_SCHED_AUTO = 0x00, /**< Automatic scheduling */
+    NVCL_EVENT_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
+    NVCL_EVENT_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
+    NVCL_EVENT_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+} cl_event_flags;
+
+/**
+ * NVCL context scheduling flags
+ */
+typedef enum cl_context_flags_enum {
+    NVCL_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */
+    NVCL_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */
+    NVCL_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */
+    NVCL_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */
+} cl_context_flags;
+
+
+/**
+ * Stream creation flags
+ */
+typedef enum CUstream_flags_enum {
+    CU_STREAM_DEFAULT             = 0x0, /**< Default stream flag */
+    CU_STREAM_NON_BLOCKING        = 0x1  /**< Stream does not synchronize with stream 0 (the NULL stream) */
+} CUstream_flags;
+
+/**
+ * Legacy stream handle
+ *
+ * Stream handle that can be passed as a CUstream to use an implicit stream
+ * with legacy synchronization behavior.
+ *
+ * See details of the \link_sync_behavior
+ */
+#define CU_STREAM_LEGACY     ((CUstream)0x1)
+
+/**
+ * Per-thread stream handle
+ *
+ * Stream handle that can be passed as a CUstream to use an implicit stream
+ * with per-thread synchronization behavior.
+ *
+ * See details of the \link_sync_behavior
+ */
+#define CU_STREAM_PER_THREAD ((CUstream)0x2)
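+
+/*
+ * A minimal usage sketch: either special handle above can be passed wherever
+ * a ::CUstream is expected. `dst`, `src` and `bytes` are assumed to describe
+ * an existing device-to-device copy.
+ *
+ * \code
+ *   // Issue the copy on the per-thread default stream rather than the
+ *   // legacy (globally synchronizing) NULL stream.
+ *   CUresult rc = cuMemcpyAsync(dst, src, bytes, CU_STREAM_PER_THREAD);
+ * \endcode
+ */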
+
+/**
+ * Event creation flags
+ */
+typedef enum CUevent_flags_enum {
+    CU_EVENT_DEFAULT        = 0x0, /**< Default event flag */
+    CU_EVENT_BLOCKING_SYNC  = 0x1, /**< Event uses blocking synchronization */
+    CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */
+    CU_EVENT_INTERPROCESS   = 0x4  /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */
+} CUevent_flags;
+
+/**
+ * Event record flags
+ */
+typedef enum CUevent_record_flags_enum {
+    CU_EVENT_RECORD_DEFAULT  = 0x0, /**< Default event record flag */
+    CU_EVENT_RECORD_EXTERNAL = 0x1  /**< When using stream capture, create an event record node
+                                      *  instead of the default behavior.  This flag is invalid
+                                      *  when used outside of capture. */
+} CUevent_record_flags;
+
+/**
+ * Event wait flags
+ */
+typedef enum CUevent_wait_flags_enum {
+    CU_EVENT_WAIT_DEFAULT  = 0x0, /**< Default event wait flag */
+    CU_EVENT_WAIT_EXTERNAL = 0x1  /**< When using stream capture, create an event wait node
+                                    *  instead of the default behavior.  This flag is invalid
+                                    *  when used outside of capture.*/
+} CUevent_wait_flags;
+
+/**
+ * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64
+ */
+typedef enum CUstreamWaitValue_flags_enum {
+    CU_STREAM_WAIT_VALUE_GEQ   = 0x0,   /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit
+                                             values). Note this is a cyclic comparison which ignores wraparound.
+                                             (Default behavior.) */
+    CU_STREAM_WAIT_VALUE_EQ    = 0x1,   /**< Wait until *addr == value. */
+    CU_STREAM_WAIT_VALUE_AND   = 0x2,   /**< Wait until (*addr & value) != 0. */
+    CU_STREAM_WAIT_VALUE_NOR   = 0x3,   /**< Wait until ~(*addr | value) != 0. Support for this operation can be
+                                             queried with ::cuDeviceGetAttribute() and
+                                             ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/
+    CU_STREAM_WAIT_VALUE_FLUSH = 1<<30  /**< Follow the wait operation with a flush of outstanding remote writes. This
+                                             means that, if a remote write operation is guaranteed to have reached the
+                                             device before the wait can be satisfied, that write is guaranteed to be
+                                             visible to downstream device work. The device is permitted to reorder
+                                             remote writes internally. For example, this flag would be required if
+                                             two remote writes arrive in a defined order, the wait is satisfied by the
+                                             second write, and downstream work needs to observe the first write.
+                                             Support for this operation is restricted to selected platforms and can be
+                                             queried with ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES.*/
+} CUstreamWaitValue_flags;
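+
+/*
+ * A minimal usage sketch: blocking later work in a stream until a
+ * device-visible 32-bit value satisfies a condition. `stream`, `addr` and
+ * `flagAddr` are assumed to exist, and the device is assumed to report
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS.
+ *
+ * \code
+ *   // Wait until (int32_t)(*addr - 42) >= 0, then publish a completion flag.
+ *   cuStreamWaitValue32(stream, addr, 42, CU_STREAM_WAIT_VALUE_GEQ);
+ *   cuStreamWriteValue32(stream, flagAddr, 1, CU_STREAM_WRITE_VALUE_DEFAULT);
+ * \endcode
+ */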
+
+/**
+ * Flags for ::cuStreamWriteValue32
+ */
+typedef enum CUstreamWriteValue_flags_enum {
+    CU_STREAM_WRITE_VALUE_DEFAULT           = 0x0, /**< Default behavior */
+    CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1  /**< Permits the write to be reordered with writes which were issued
+                                                        before it, as a performance optimization. Normally,
+                                                        ::cuStreamWriteValue32 will provide a memory fence before the
+                                                        write, which has similar semantics to
+                                                        __threadfence_system() but is scoped to the stream
+                                                        rather than a CUDA thread.
+                                                        This flag is not supported in the v2 API. */
+} CUstreamWriteValue_flags;
+
+/**
+ * Operations for ::cuStreamBatchMemOp
+ */
+typedef enum CUstreamBatchMemOpType_enum {
+    CU_STREAM_MEM_OP_WAIT_VALUE_32  = 1,     /**< Represents a ::cuStreamWaitValue32 operation */
+    CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2,     /**< Represents a ::cuStreamWriteValue32 operation */
+    CU_STREAM_MEM_OP_WAIT_VALUE_64  = 4,     /**< Represents a ::cuStreamWaitValue64 operation */
+    CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5,     /**< Represents a ::cuStreamWriteValue64 operation */
+    CU_STREAM_MEM_OP_BARRIER = 6,            /**< Insert a memory barrier of the specified type */ 
+    CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a
+                                                  standalone operation. */
+} CUstreamBatchMemOpType;
+
+/**
+ * Flags for ::cuStreamMemoryBarrier
+ */
+typedef enum CUstreamMemoryBarrier_flags_enum {
+    CU_STREAM_MEMORY_BARRIER_TYPE_SYS = 0x0, /**< System-wide memory barrier. */
+    CU_STREAM_MEMORY_BARRIER_TYPE_GPU = 0x1 /**< Limit memory barrier scope to the GPU. */
+} CUstreamMemoryBarrier_flags;
+
+/**
+ * Per-operation parameters for ::cuStreamBatchMemOp
+ */
+typedef union CUstreamBatchMemOpParams_union {
+    CUstreamBatchMemOpType operation;
+    struct CUstreamMemOpWaitValueParams_st {
+        CUstreamBatchMemOpType operation;
+        CUdeviceptr address;
+        union {
+            cuuint32_t value;
+            cuuint64_t value64;
+        };
+        unsigned int flags;
+        CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */
+    } waitValue;
+    struct CUstreamMemOpWriteValueParams_st {
+        CUstreamBatchMemOpType operation;
+        CUdeviceptr address;
+        union {
+            cuuint32_t value;
+            cuuint64_t value64;
+        };
+        unsigned int flags;
+        CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */
+    } writeValue;
+    struct CUstreamMemOpFlushRemoteWritesParams_st {
+        CUstreamBatchMemOpType operation;
+        unsigned int flags;
+    } flushRemoteWrites;
+    struct CUstreamMemOpMemoryBarrierParams_st { /**< Only supported in the _v2 API */
+        CUstreamBatchMemOpType operation;
+        unsigned int flags;
+    } memoryBarrier;
+    cuuint64_t pad[6];
+} CUstreamBatchMemOpParams_v1;
+typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams;
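+
+/*
+ * A minimal usage sketch: batching a wait and a write into one
+ * ::cuStreamBatchMemOp call. `stream`, `waitAddr` and `flagAddr` are assumed
+ * to exist; memset comes from <string.h>.
+ *
+ * \code
+ *   CUstreamBatchMemOpParams ops[2];
+ *   memset(ops, 0, sizeof(ops));
+ *   ops[0].waitValue.operation  = CU_STREAM_MEM_OP_WAIT_VALUE_32;
+ *   ops[0].waitValue.address    = waitAddr;
+ *   ops[0].waitValue.value      = 42;
+ *   ops[0].waitValue.flags      = CU_STREAM_WAIT_VALUE_GEQ;
+ *   ops[1].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
+ *   ops[1].writeValue.address   = flagAddr;
+ *   ops[1].writeValue.value     = 1;
+ *   ops[1].writeValue.flags     = CU_STREAM_WRITE_VALUE_DEFAULT;
+ *   cuStreamBatchMemOp(stream, 2, ops, 0);
+ * \endcode
+ */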
+
+typedef struct CUDA_BATCH_MEM_OP_NODE_PARAMS_st {
+    CUcontext ctx;
+    unsigned int count;
+    CUstreamBatchMemOpParams *paramArray;
+    unsigned int flags;
+} CUDA_BATCH_MEM_OP_NODE_PARAMS;
+
+/**
+ * Occupancy calculator flag
+ */
+typedef enum CUoccupancy_flags_enum {
+    CU_OCCUPANCY_DEFAULT                  = 0x0, /**< Default behavior */
+    CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1  /**< Assume global caching is enabled and cannot be automatically turned off */
+} CUoccupancy_flags;
+
+/**
+ * Flags for ::cuStreamUpdateCaptureDependencies
+ */
+typedef enum CUstreamUpdateCaptureDependencies_flags_enum {
+    CU_STREAM_ADD_CAPTURE_DEPENDENCIES = 0x0, /**< Add new nodes to the dependency set */
+    CU_STREAM_SET_CAPTURE_DEPENDENCIES = 0x1  /**< Replace the dependency set with the new nodes */
+} CUstreamUpdateCaptureDependencies_flags;
+
+/**
+ * Array formats
+ */
+typedef enum CUarray_format_enum {
+    CU_AD_FORMAT_UNSIGNED_INT8  = 0x01, /**< Unsigned 8-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */
+    CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */
+    CU_AD_FORMAT_SIGNED_INT8    = 0x08, /**< Signed 8-bit integers */
+    CU_AD_FORMAT_SIGNED_INT16   = 0x09, /**< Signed 16-bit integers */
+    CU_AD_FORMAT_SIGNED_INT32   = 0x0a, /**< Signed 32-bit integers */
+    CU_AD_FORMAT_HALF           = 0x10, /**< 16-bit floating point */
+    CU_AD_FORMAT_FLOAT          = 0x20, /**< 32-bit floating point */
+    CU_AD_FORMAT_NV12           = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */
+    CU_AD_FORMAT_UNORM_INT8X1   = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT8X2   = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT8X4   = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X1  = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X2  = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_UNORM_INT16X4  = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X1   = 0xc6, /**< 1 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X2   = 0xc7, /**< 2 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT8X4   = 0xc8, /**< 4 channel signed 8-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X1  = 0xc9, /**< 1 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X2  = 0xca, /**< 2 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_SNORM_INT16X4  = 0xcb, /**< 4 channel signed 16-bit normalized integer */
+    CU_AD_FORMAT_BC1_UNORM      = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
+    CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/
+    CU_AD_FORMAT_BC2_UNORM      = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
+    CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/
+    CU_AD_FORMAT_BC3_UNORM      = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
+    CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/
+    CU_AD_FORMAT_BC4_UNORM      = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
+    CU_AD_FORMAT_BC4_SNORM      = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */
+    CU_AD_FORMAT_BC5_UNORM      = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
+    CU_AD_FORMAT_BC5_SNORM      = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */
+    CU_AD_FORMAT_BC6H_UF16      = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
+    CU_AD_FORMAT_BC6H_SF16      = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */
+    CU_AD_FORMAT_BC7_UNORM      = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
+    CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e  /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
+} CUarray_format;
+
+/**
+ * Texture reference addressing modes
+ */
+typedef enum CUaddress_mode_enum {
+    CU_TR_ADDRESS_MODE_WRAP   = 0, /**< Wrapping address mode */
+    CU_TR_ADDRESS_MODE_CLAMP  = 1, /**< Clamp to edge address mode */
+    CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */
+    CU_TR_ADDRESS_MODE_BORDER = 3  /**< Border address mode */
+} CUaddress_mode;
+
+/**
+ * Texture reference filtering modes
+ */
+typedef enum CUfilter_mode_enum {
+    CU_TR_FILTER_MODE_POINT  = 0, /**< Point filter mode */
+    CU_TR_FILTER_MODE_LINEAR = 1  /**< Linear filter mode */
+} CUfilter_mode;
+
+/**
+ * Device properties
+ */
+typedef enum CUdevice_attribute_enum {
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1,                          /**< Maximum number of threads per block */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2,                                /**< Maximum block dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3,                                /**< Maximum block dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4,                                /**< Maximum block dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5,                                 /**< Maximum grid dimension X */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6,                                 /**< Maximum grid dimension Y */
+    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7,                                 /**< Maximum grid dimension Z */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8,                    /**< Maximum shared memory available per block in bytes */
+    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8,                        /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9,                          /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10,                                     /**< Warp size in threads */
+    CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11,                                     /**< Maximum pitch in bytes allowed by memory copies */
+    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12,                       /**< Maximum number of 32-bit registers available per block */
+    CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12,                           /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */
+    CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13,                                    /**< Typical clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14,                             /**< Alignment requirement for textures */
+    CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15,                                   /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */
+    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16,                          /**< Number of multiprocessors on device */
+    CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17,                           /**< Specifies whether there is a run time limit on kernels */
+    CU_DEVICE_ATTRIBUTE_INTEGRATED = 18,                                    /**< Device is integrated with host memory */
+    CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19,                           /**< Device can map host memory into CUDA address space */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20,                                  /**< Compute mode (See ::CUcomputemode for details) */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21,                       /**< Maximum 1D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22,                       /**< Maximum 2D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23,                      /**< Maximum 2D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24,                       /**< Maximum 3D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25,                      /**< Maximum 3D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26,                       /**< Maximum 3D texture depth */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27,               /**< Maximum 2D layered texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28,              /**< Maximum 2D layered texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29,              /**< Maximum layers in a 2D layered texture */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27,                 /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28,                /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29,             /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */
+    CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30,                             /**< Alignment requirement for surfaces */
+    CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31,                            /**< Device can possibly execute multiple kernels concurrently */
+    CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32,                                   /**< Device has ECC support enabled */
+    CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33,                                    /**< PCI bus ID of the device */
+    CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34,                                 /**< PCI device ID of the device */
+    CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35,                                    /**< Device is using TCC driver model */
+    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36,                             /**< Peak memory clock frequency in kilohertz */
+    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37,                       /**< Global memory bus width in bits */
+    CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38,                                 /**< Size of L2 cache in bytes */
+    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39,                /**< Maximum resident threads per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40,                            /**< Number of asynchronous engines */
+    CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41,                            /**< Device shares a unified address space with the host */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42,               /**< Maximum 1D layered texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43,              /**< Maximum layers in a 1D layered texture */
+    CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44,                              /**< Deprecated, do not use. */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45,                /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46,               /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47,             /**< Alternate maximum 3D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48,            /**< Alternate maximum 3D texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49,             /**< Alternate maximum 3D texture depth */
+    CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50,                                 /**< PCI domain ID of the device */
+    CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51,                       /**< Pitch alignment requirement for textures */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52,                  /**< Maximum cubemap texture width/height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53,          /**< Maximum cubemap layered texture width/height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54,         /**< Maximum layers in a cubemap layered texture */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55,                       /**< Maximum 1D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56,                       /**< Maximum 2D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57,                      /**< Maximum 2D surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58,                       /**< Maximum 3D surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59,                      /**< Maximum 3D surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60,                       /**< Maximum 3D surface depth */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61,               /**< Maximum 1D layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62,              /**< Maximum layers in a 1D layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63,               /**< Maximum 2D layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64,              /**< Maximum 2D layered surface height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65,              /**< Maximum layers in a 2D layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66,                  /**< Maximum cubemap surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67,          /**< Maximum cubemap layered surface width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68,         /**< Maximum layers in a cubemap layered surface */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69,                /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70,                /**< Maximum 2D linear texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71,               /**< Maximum 2D linear texture height */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72,                /**< Maximum 2D linear texture pitch in bytes */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73,             /**< Maximum mipmapped 2D texture width */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74,            /**< Maximum mipmapped 2D texture height */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75,                      /**< Major compute capability version number */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76,                      /**< Minor compute capability version number */
+    CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77,             /**< Maximum mipmapped 1D texture width */
+    CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78,                   /**< Device supports stream priorities */
+    CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79,                     /**< Device supports caching globals in L1 */
+    CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80,                      /**< Device supports caching locals in L1 */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81,          /**< Maximum shared memory available per multiprocessor in bytes */
+    CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82,              /**< Maximum number of 32-bit registers available per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83,                                /**< Device can allocate managed memory on this system */
+    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84,                               /**< Device is on a multi-GPU board */
+    CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85,                      /**< Unique id for a group of devices on the same multi-GPU board */
+    CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86,                  /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/
+    CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87,         /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
+    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88,                        /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
+    CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89,                     /**< Device can coherently access managed memory concurrently with the CPU */
+    CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90,                  /**< Device supports compute preemption. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91,       /**< Device can access host registered memory at the same virtual address as the CPU */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92,                        /**< ::cuStreamBatchMemOp and related APIs are supported. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93,                 /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94,                 /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. */
+    CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95,                            /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */
+    CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96,               /**< Deprecated, ::cuLaunchCooperativeKernelMultiDevice is deprecated. */
+    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97,             /**< Maximum optin shared memory per block */
+    CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98,                       /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */
+    CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99,                       /**< Device supports host memory registration via ::cudaHostRegister. */
+    CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */
+    CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101,          /**< The host can directly access managed memory on the device without migration. */
+    CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102,         /**< Deprecated, use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED */
+    CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102,         /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103,  /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104,           /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105,       /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106,                /**< Maximum number of blocks per multiprocessor */
+    CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107,                /**< Device supports compression of memory */
+    CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108,                 /**< Maximum L2 persisting lines capacity setting in bytes. */
+    CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109,                /**< Maximum value of CUaccessPolicyWindow::num_bytes. */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110,      /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */
+    CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111,             /**< Shared memory reserved by CUDA driver per block in bytes */
+    CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112,                  /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
+    CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113,            /**< Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */
+    CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114,         /**< External timeline semaphore interop is supported on the device */
+    CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115,                       /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116,                    /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117,         /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */
+    CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118,              /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. */
+    CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119,               /**< Handle types supported with mempool based IPC */
+    CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 120,                               /**< Indicates device supports cluster launch */
+    CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121,        /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2 = 122,             /**< 64-bit operations are supported in ::cuStreamBatchMemOp_v2 and related v2 MemOp APIs. */
+    CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2 = 123,             /**< ::CU_STREAM_WAIT_VALUE_NOR is supported by v2 MemOp APIs. */
+    CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124,                            /**< Device supports buffer sharing with dma_buf mechanism. */ 
+    CU_DEVICE_ATTRIBUTE_MAX
+} CUdevice_attribute;
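+
+/*
+ * A minimal usage sketch: querying attributes of device 0 with
+ * ::cuDeviceGetAttribute (after ::cuInit has succeeded).
+ *
+ * \code
+ *   CUdevice dev;
+ *   int major = 0, minor = 0;
+ *   cuDeviceGet(&dev, 0);
+ *   cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
+ *   cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
+ * \endcode
+ */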
+
+/**
+ * Legacy device properties
+ */
+typedef struct CUdevprop_st {
+    int maxThreadsPerBlock;     /**< Maximum number of threads per block */
+    int maxThreadsDim[3];       /**< Maximum size of each dimension of a block */
+    int maxGridSize[3];         /**< Maximum size of each dimension of a grid */
+    int sharedMemPerBlock;      /**< Shared memory available per block in bytes */
+    int totalConstantMemory;    /**< Constant memory available on device in bytes */
+    int SIMDWidth;              /**< Warp size in threads */
+    int memPitch;               /**< Maximum pitch in bytes allowed by memory copies */
+    int regsPerBlock;           /**< 32-bit registers available per block */
+    int clockRate;              /**< Clock frequency in kilohertz */
+    int textureAlign;           /**< Alignment requirement for textures */
+} CUdevprop_v1;
+typedef CUdevprop_v1 CUdevprop;
+
+/**
+ * Pointer information
+ */
+typedef enum CUpointer_attribute_enum {
+    CU_POINTER_ATTRIBUTE_CONTEXT = 1,                     /**< The ::CUcontext on which a pointer was allocated or registered */
+    CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2,                 /**< The ::CUmemorytype describing the physical location of a pointer */
+    CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3,              /**< The address at which a pointer's memory may be accessed on the device */
+    CU_POINTER_ATTRIBUTE_HOST_POINTER = 4,                /**< The address at which a pointer's memory may be accessed on the host */
+    CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5,                  /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */
+    CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6,                 /**< Synchronize every synchronous memory operation initiated on this region */
+    CU_POINTER_ATTRIBUTE_BUFFER_ID = 7,                   /**< A process-wide unique ID for an allocated memory region*/
+    CU_POINTER_ATTRIBUTE_IS_MANAGED = 8,                  /**< Indicates if the pointer points to managed memory */
+    CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9,              /**< A device ordinal of a device on which a pointer was allocated or registered */
+    CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, /**< 1 if this pointer maps to an allocation that is suitable for ::cudaIpcGetMemHandle, 0 otherwise **/
+    CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11,           /**< Starting address for this requested pointer */
+    CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12,                 /**< Size of the address range for this requested pointer */
+    CU_POINTER_ATTRIBUTE_MAPPED = 13,                     /**< 1 if this pointer is in a valid address range that is mapped to a backing allocation, 0 otherwise **/
+    CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14,       /**< Bitmask of allowed ::CUmemAllocationHandleType for this allocation **/
+    CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15, /**< 1 if the memory this pointer is referencing can be used with the GPUDirect RDMA API **/
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16,               /**< Returns the access flags the device associated with the current context has on the corresponding memory referenced by the pointer given */
+    CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17,             /**< Returns the mempool handle for the allocation if it was allocated from a mempool. Otherwise returns NULL. **/
+    CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18,               /**< Size of the actual underlying mapping that the pointer belongs to **/
+    CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19,          /**< The start address of the mapping that the pointer belongs to **/
+    CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20             /**< A process-wide unique id corresponding to the physical allocation the pointer belongs to **/
+} CUpointer_attribute;
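+
+/*
+ * A minimal usage sketch: classifying a pointer with ::cuPointerGetAttribute.
+ * `ptr` is assumed to be a valid ::CUdeviceptr.
+ *
+ * \code
+ *   CUmemorytype type;
+ *   if (cuPointerGetAttribute(&type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr) == CUDA_SUCCESS
+ *       && type == CU_MEMORYTYPE_DEVICE) {
+ *       // `ptr` refers to device memory.
+ *   }
+ * \endcode
+ */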
+
+/**
+ * Function properties
+ */
+typedef enum CUfunction_attribute_enum {
+    /**
+     * The maximum number of threads per block, beyond which a launch of the
+     * function would fail. This number depends on both the function and the
+     * device on which the function is currently loaded.
+     */
+    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0,
+
+    /**
+     * The size in bytes of statically-allocated shared memory required by
+     * this function. This does not include dynamically-allocated shared
+     * memory requested by the user at runtime.
+     */
+    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1,
+
+    /**
+     * The size in bytes of user-allocated constant memory required by this
+     * function.
+     */
+    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2,
+
+    /**
+     * The size in bytes of local memory used by each thread of this function.
+     */
+    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3,
+
+    /**
+     * The number of registers used by each thread of this function.
+     */
+    CU_FUNC_ATTRIBUTE_NUM_REGS = 4,
+
+    /**
+     * The PTX virtual architecture version for which the function was
+     * compiled. This value is the major PTX version * 10 + the minor PTX
+     * version, so a PTX version 1.3 function would return the value 13.
+     * Note that this may return the undefined value of 0 for cubins
+     * compiled prior to CUDA 3.0.
+     */
+    CU_FUNC_ATTRIBUTE_PTX_VERSION = 5,
+
+    /**
+     * The binary architecture version for which the function was compiled.
+     * This value is the major binary version * 10 + the minor binary version,
+     * so a binary version 1.3 function would return the value 13. Note that
+     * this will return a value of 10 for legacy cubins that do not have a
+     * properly-encoded binary architecture version.
+     */
+    CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6,
+
+    /**
+     * The attribute to indicate whether the function has been compiled with
+     * the user-specified option "-Xptxas --dlcm=ca" set.
+     */
+    CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7,
+
+    /**
+     * The maximum size in bytes of dynamically-allocated shared memory that can be used by
+     * this function. If the user-specified dynamic shared memory size is larger than this
+     * value, the launch will fail.
+     * See ::cuFuncSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8,
+
+    /**
+     * On devices where the L1 cache and shared memory use the same hardware resources, 
+     * this sets the shared memory carveout preference, in percent of the total shared memory.
+     * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
+     * This is only a hint, and the driver can choose a different ratio if required to execute the function.
+     * See ::cuFuncSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9,
+
+    /**
+     * If this attribute is set, the kernel must launch with a valid cluster
+     * size specified.
+     * See ::cuFuncSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET = 10,
+
+    /**
+     * The required cluster width in blocks. The values must either all be 0 or
+     * all be positive. The validity of the cluster dimensions is otherwise
+     * checked at launch time.
+     *
+     * If the value is set during compile time, it cannot be set at runtime.
+     * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+     * See ::cuFuncSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH = 11,
+
+    /**
+     * The required cluster height in blocks. The values must either all be 0 or
+     * all be positive. The validity of the cluster dimensions is otherwise
+     * checked at launch time.
+     *
+     * If the value is set during compile time, it cannot be set at runtime.
+     * Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
+     * See ::cuFuncSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT = 12,
+
+    /**
+     * The required cluster depth in blocks. The values must either all be 0 or
+     * all be positive. The validity of the cluster dimensions is otherwise
+     * checked at launch time.
+     *
+     * If the value is set during compile time, it cannot be set at runtime.
+     * Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED.
+     * See ::cuFuncSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13,
+
+    /**
+     * Whether the function can be launched with non-portable cluster size. 1 is
+     * allowed, 0 is disallowed. A non-portable cluster size may only function
+     * on the specific SKUs the program is tested on. The launch might fail if
+     * the program is run on a different hardware platform.
+     *
+     * CUDA API provides cudaOccupancyMaxActiveClusters to assist with checking
+     * whether the desired size can be launched on the current device.
+     *
+     * Portable Cluster Size
+     *
+     * A portable cluster size is guaranteed to be functional on all compute
+     * capabilities higher than the target compute capability. The portable
+     * cluster size for sm_90 is 8 blocks per cluster. This value may increase
+     * for future compute capabilities.
+     *
+     * A given hardware unit may support cluster sizes larger than the portable
+     * size, but such sizes are not guaranteed to be portable.
+     * See ::cuFuncSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14,
+
+    /**
+     * The block scheduling policy of a function. The value type is
+     * CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy.
+     * See ::cuFuncSetAttribute
+     */
+    CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15,
+
+    CU_FUNC_ATTRIBUTE_MAX
+} CUfunction_attribute;
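+
+/*
+ * A minimal usage sketch: reading and then raising a kernel's dynamic shared
+ * memory limit. `func` is assumed to be a ::CUfunction obtained via
+ * ::cuModuleGetFunction.
+ *
+ * \code
+ *   int maxDynSmem = 0;
+ *   cuFuncGetAttribute(&maxDynSmem, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, func);
+ *   // Opt in to 64 KiB of dynamic shared memory (subject to the device limit).
+ *   cuFuncSetAttribute(func, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, 64 * 1024);
+ * \endcode
+ */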
+
+/**
+ * Function cache configurations
+ */
+typedef enum CUfunc_cache_enum {
+    CU_FUNC_CACHE_PREFER_NONE    = 0x00, /**< no preference for shared memory or L1 (default) */
+    CU_FUNC_CACHE_PREFER_SHARED  = 0x01, /**< prefer larger shared memory and smaller L1 cache */
+    CU_FUNC_CACHE_PREFER_L1      = 0x02, /**< prefer larger L1 cache and smaller shared memory */
+    CU_FUNC_CACHE_PREFER_EQUAL   = 0x03  /**< prefer equal sized L1 cache and shared memory */
+} CUfunc_cache;
+
+/**
+ * Shared memory configurations
+ */
+typedef enum CUsharedconfig_enum {
+    CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE    = 0x00, /**< set default shared memory bank size */
+    CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE  = 0x01, /**< set shared memory bank width to four bytes */
+    CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02  /**< set shared memory bank width to eight bytes */
+} CUsharedconfig;
+
+/**
+ * Shared memory carveout configurations. These may be passed to ::cuFuncSetAttribute
+ */
+typedef enum CUshared_carveout_enum {
+    CU_SHAREDMEM_CARVEOUT_DEFAULT       = -1,  /**< No preference for shared memory or L1 (default) */
+    CU_SHAREDMEM_CARVEOUT_MAX_SHARED    = 100, /**< Prefer maximum available shared memory, minimum L1 cache */
+    CU_SHAREDMEM_CARVEOUT_MAX_L1        = 0    /**< Prefer maximum available L1 cache, minimum shared memory */
+} CUshared_carveout;
+
+/**
+ * Memory types
+ */
+typedef enum CUmemorytype_enum {
+    CU_MEMORYTYPE_HOST    = 0x01,    /**< Host memory */
+    CU_MEMORYTYPE_DEVICE  = 0x02,    /**< Device memory */
+    CU_MEMORYTYPE_ARRAY   = 0x03,    /**< Array memory */
+    CU_MEMORYTYPE_UNIFIED = 0x04     /**< Unified device or host memory */
+} CUmemorytype;
+
+/**
+ * Compute Modes
+ */
+typedef enum CUcomputemode_enum {
+    CU_COMPUTEMODE_DEFAULT           = 0, /**< Default compute mode (Multiple contexts allowed per device) */
+    CU_COMPUTEMODE_PROHIBITED        = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */
+    CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3  /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */
+} CUcomputemode;
+
+/**
+ * Memory advise values
+ */
+typedef enum CUmem_advise_enum {
+    CU_MEM_ADVISE_SET_READ_MOSTLY          = 1, /**< Data will mostly be read and only occasionally be written to */
+    CU_MEM_ADVISE_UNSET_READ_MOSTLY        = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */
+    CU_MEM_ADVISE_SET_PREFERRED_LOCATION   = 3, /**< Set the preferred location for the data as the specified device */
+    CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data */
+    CU_MEM_ADVISE_SET_ACCESSED_BY          = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */
+    CU_MEM_ADVISE_UNSET_ACCESSED_BY        = 6  /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */
+} CUmem_advise;
+
+typedef enum CUmem_range_attribute_enum {
+    CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY            = 1, /**< Whether the range will mostly be read and only occasionally be written to */
+    CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION     = 2, /**< The preferred location of the range */
+    CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY            = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */
+    CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4  /**< The last location to which the range was prefetched */
+} CUmem_range_attribute;
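+
+/*
+ * A minimal usage sketch: advising the Unified Memory subsystem about a
+ * managed allocation. `managedPtr`, `bytes` and `dev` are assumed to exist,
+ * and the device is assumed to report
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ *
+ * \code
+ *   // Mark the range read-mostly and prefer keeping it resident on `dev`.
+ *   cuMemAdvise(managedPtr, bytes, CU_MEM_ADVISE_SET_READ_MOSTLY, dev);
+ *   cuMemAdvise(managedPtr, bytes, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, dev);
+ * \endcode
+ */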
+
+/**
+ * Online compiler and linker options
+ */
+typedef enum CUjit_option_enum
+{
+    /**
+     * Max number of registers that a thread may use.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_MAX_REGISTERS = 0,
+
+    /**
+     * IN: Specifies minimum number of threads per block to target compilation
+     * for\n
+     * OUT: Returns the number of threads the compiler actually targeted.
+     * This restricts the resource utilization of the compiler (e.g. max
+     * registers) such that a block with the given number of threads should be
+     * able to launch based on register limitations. Note, this option does not
+     * currently take into account any other resource limitations, such as
+     * shared memory utilization.\n
+     * Cannot be combined with ::CU_JIT_TARGET.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_THREADS_PER_BLOCK,
+
+    /**
+     * Overwrites the option value with the total wall clock time, in
+     * milliseconds, spent in the compiler and linker\n
+     * Option type: float\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_WALL_TIME,
+
+    /**
+     * Pointer to a buffer in which to print any log messages
+     * that are informational in nature (the buffer size is specified via
+     * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n
+     * Option type: char *\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_INFO_LOG_BUFFER,
+
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
+
+    /**
+     * Pointer to a buffer in which to print any log messages that
+     * reflect errors (the buffer size is specified via option
+     * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n
+     * Option type: char *\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_ERROR_LOG_BUFFER,
+
+    /**
+     * IN: Log buffer size in bytes.  Log messages will be capped at this size
+     * (including null terminator)\n
+     * OUT: Amount of log buffer filled with messages\n
+     * Option type: unsigned int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
+
+    /**
+     * Level of optimizations to apply to generated code (0 - 4), with 4
+     * being the default and highest level of optimizations.\n
+     * Option type: unsigned int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_OPTIMIZATION_LEVEL,
+
+    /**
+     * No option value required. Determines the target based on the current
+     * attached context (default)\n
+     * Option type: No option value needed\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_TARGET_FROM_CUCONTEXT,
+
+    /**
+     * Target is chosen based on supplied ::CUjit_target.  Cannot be
+     * combined with ::CU_JIT_THREADS_PER_BLOCK.\n
+     * Option type: unsigned int for enumerated type ::CUjit_target\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_TARGET,
+
+    /**
+     * Specifies choice of fallback strategy if matching cubin is not found.
+     * Choice is based on supplied ::CUjit_fallback.  This option cannot be
+     * used with cuLink* APIs as the linker requires exact matches.\n
+     * Option type: unsigned int for enumerated type ::CUjit_fallback\n
+     * Applies to: compiler only
+     */
+    CU_JIT_FALLBACK_STRATEGY,
+
+    /**
+     * Specifies whether to create debug information in output (-g)
+     * (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_GENERATE_DEBUG_INFO,
+
+    /**
+     * Generate verbose log messages (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_LOG_VERBOSE,
+
+    /**
+     * Generate line number information (-lineinfo) (0: false, default)\n
+     * Option type: int\n
+     * Applies to: compiler only
+     */
+    CU_JIT_GENERATE_LINE_INFO,
+
+    /**
+     * Specifies whether to enable caching explicitly (-dlcm) \n
+     * Choice is based on supplied ::CUjit_cacheMode_enum.\n
+     * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n
+     * Applies to: compiler only
+     */
+    CU_JIT_CACHE_MODE,
+
+    /**
+     * \deprecated
+     * This jit option is deprecated and should not be used.
+     */
+    CU_JIT_NEW_SM3X_OPT,
+
+    /**
+     * This jit option is used for internal purpose only.
+     */
+    CU_JIT_FAST_COMPILE,
+
+    /**
+     * Array of device symbol names that will be relocated to the corresponding
+     * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n
+     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
+     * When loading a device module, the driver will relocate all encountered
+     * unresolved symbols to the host addresses.\n
+     * It is only allowed to register symbols that correspond to unresolved
+     * global variables.\n
+     * It is illegal to register the same device symbol at multiple addresses.\n
+     * Option type: const char **\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_GLOBAL_SYMBOL_NAMES,
+
+    /**
+     * Array of host addresses that will be used to relocate corresponding
+     * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n
+     * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
+     * Option type: void **\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_GLOBAL_SYMBOL_ADDRESSES,
+
+    /**
+     * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and
+     * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n
+     * Option type: unsigned int\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_GLOBAL_SYMBOL_COUNT,
+
+    /**
+     * Enable link-time optimization (-dlto) for device code (Disabled by default).\n
+     * This option is not supported on 32-bit platforms.\n
+     * Option type: int\n
+     * Applies to: compiler and linker
+     */
+    CU_JIT_LTO,
+
+    /**
+     * Control single-precision denormals (-ftz) support (0: false, default).
+     * 1 : flushes denormal values to zero
+     * 0 : preserves denormal values
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     */
+    CU_JIT_FTZ,
+
+    /**
+     * Control single-precision floating-point division and reciprocals
+     * (-prec-div) support (1: true, default).
+     * 1 : Enables the IEEE round-to-nearest mode
+     * 0 : Enables the fast approximation mode
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     */
+    CU_JIT_PREC_DIV,
+
+    /**
+     * Control single-precision floating-point square root
+     * (-prec-sqrt) support (1: true, default).
+     * 1 : Enables the IEEE round-to-nearest mode
+     * 0 : Enables the fast approximation mode
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     */
+    CU_JIT_PREC_SQRT,
+
+    /**
+     * Enable/Disable the contraction of floating-point multiplies
+     * and adds/subtracts into floating-point multiply-add (-fma)
+     * operations (1: Enable, default; 0: Disable).
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     */
+    CU_JIT_FMA,
+
+    /**
+     * Array of kernel names that should be preserved at link time while others
+     * can be removed.\n
+     * Must contain ::CU_JIT_REFERENCED_KERNEL_COUNT entries.\n
+     * Note that kernel names can be mangled by the compiler in which case the
+     * mangled name needs to be specified.\n
+     * Wildcard "*" can be used to represent zero or more characters instead of
+     * specifying the full or mangled name.\n
+     * It is important to note that the wildcard "*" is also added implicitly.
+     * For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and
+     * thus preserve all kernels with those names. This can be avoided by providing
+     * a more specific name like "barfoobaz".\n
+     * Option type: const char **\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_REFERENCED_KERNEL_NAMES,
+
+    /**
+     * Number of entries in ::CU_JIT_REFERENCED_KERNEL_NAMES array.\n
+     * Option type: unsigned int\n
+     * Applies to: dynamic linker only
+     */
+    CU_JIT_REFERENCED_KERNEL_COUNT,
+
+    /**
+     * Array of variable names (__device__ and/or __constant__) that should be
+     * preserved at link time while others can be removed.\n
+     * Must contain ::CU_JIT_REFERENCED_VARIABLE_COUNT entries.\n
+     * Note that variable names can be mangled by the compiler in which case the
+     * mangled name needs to be specified.\n
+     * Wildcard "*" can be used to represent zero or more characters instead of
+     * specifying the full or mangled name.\n
+     * It is important to note that the wildcard "*" is also added implicitly.
+     * For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and
+     * thus preserve all variables with those names. This can be avoided by providing
+     * a more specific name like "barfoobaz".\n
+     * Option type: const char **\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     */
+    CU_JIT_REFERENCED_VARIABLE_NAMES,
+
+    /**
+     * Number of entries in ::CU_JIT_REFERENCED_VARIABLE_NAMES array.\n
+     * Option type: unsigned int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     */
+    CU_JIT_REFERENCED_VARIABLE_COUNT,
+
+    /**
+     * This option serves as a hint to enable the JIT compiler/linker
+     * to remove constant (__constant__) and device (__device__) variables
+     * unreferenced in device code (Disabled by default).\n
+     * Note that when this option is specified, host references to constant and
+     * device variables through APIs like ::cuModuleGetGlobal() may result in
+     * undefined behavior unless the variables are explicitly listed in
+     * ::CU_JIT_REFERENCED_VARIABLE_NAMES.\n
+     * Option type: int\n
+     * Applies to: link-time optimization specified with CU_JIT_LTO
+     */
+    CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES,
+
+    CU_JIT_NUM_OPTIONS
+
+} CUjit_option;
+
+/**
+ * Online compilation targets
+ */
+typedef enum CUjit_target_enum
+{
+    CU_TARGET_COMPUTE_20 = 20,       /**< Compute device class 2.0 */
+    CU_TARGET_COMPUTE_21 = 21,       /**< Compute device class 2.1 */
+    CU_TARGET_COMPUTE_30 = 30,       /**< Compute device class 3.0 */
+    CU_TARGET_COMPUTE_32 = 32,       /**< Compute device class 3.2 */
+    CU_TARGET_COMPUTE_35 = 35,       /**< Compute device class 3.5 */
+    CU_TARGET_COMPUTE_37 = 37,       /**< Compute device class 3.7 */
+    CU_TARGET_COMPUTE_50 = 50,       /**< Compute device class 5.0 */
+    CU_TARGET_COMPUTE_52 = 52,       /**< Compute device class 5.2 */
+    CU_TARGET_COMPUTE_53 = 53,       /**< Compute device class 5.3 */
+    CU_TARGET_COMPUTE_60 = 60,       /**< Compute device class 6.0 */
+    CU_TARGET_COMPUTE_61 = 61,       /**< Compute device class 6.1 */
+    CU_TARGET_COMPUTE_62 = 62,       /**< Compute device class 6.2 */
+    CU_TARGET_COMPUTE_70 = 70,       /**< Compute device class 7.0 */
+    CU_TARGET_COMPUTE_72 = 72,       /**< Compute device class 7.2 */
+    CU_TARGET_COMPUTE_75 = 75,       /**< Compute device class 7.5 */
+    CU_TARGET_COMPUTE_80 = 80,       /**< Compute device class 8.0 */
+    CU_TARGET_COMPUTE_86 = 86,       /**< Compute device class 8.6 */
+    CU_TARGET_COMPUTE_87 = 87,       /**< Compute device class 8.7 */
+    CU_TARGET_COMPUTE_89 = 89,       /**< Compute device class 8.9 */
+    CU_TARGET_COMPUTE_90 = 90        /**< Compute device class 9.0 */
+} CUjit_target;
+
+/**
+ * Cubin matching fallback strategies
+ */
+typedef enum CUjit_fallback_enum
+{
+    CU_PREFER_PTX = 0,  /**< Prefer to compile ptx if exact binary match not found */
+
+    CU_PREFER_BINARY    /**< Prefer to fall back to compatible binary code if exact match not found */
+
+} CUjit_fallback;
+
+/**
+ * Caching modes for dlcm
+ */
+typedef enum CUjit_cacheMode_enum
+{
+    CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile with no -dlcm flag specified */
+    CU_JIT_CACHE_OPTION_CG,       /**< Compile with L1 cache disabled */
+    CU_JIT_CACHE_OPTION_CA        /**< Compile with L1 cache enabled */
+} CUjit_cacheMode;
+
+/**
+ * Device code formats
+ */
+typedef enum CUjitInputType_enum
+{
+    /**
+     * Compiled device-class-specific device code\n
+     * Applicable options: none
+     */
+    CU_JIT_INPUT_CUBIN = 0,
+
+    /**
+     * PTX source code\n
+     * Applicable options: PTX compiler options
+     */
+    CU_JIT_INPUT_PTX,
+
+    /**
+     * Bundle of multiple cubins and/or PTX of some device code\n
+     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+     */
+    CU_JIT_INPUT_FATBINARY,
+
+    /**
+     * Host object with embedded device code\n
+     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+     */
+    CU_JIT_INPUT_OBJECT,
+
+    /**
+     * Archive of host objects with embedded device code\n
+     * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY
+     */
+    CU_JIT_INPUT_LIBRARY,
+
+    /**
+     * High-level intermediate code for link-time optimization\n
+     * Applicable options: NVVM compiler options, PTX compiler options
+     */
+    CU_JIT_INPUT_NVVM,
+
+    CU_JIT_NUM_INPUT_TYPES
+} CUjitInputType;
+
+typedef struct CUlinkState_st *CUlinkState;
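+
+/*
+ * Illustrative sketch (not part of the original header): driving the JIT
+ * linker with the ::CUjit_option values above. Error checking is omitted;
+ * the PTX buffer (ptxImage/ptxSize) is an assumption supplied by the caller.
+ *
+ * \code
+ *   CUjit_option opts[] = { CU_JIT_TARGET, CU_JIT_GENERATE_DEBUG_INFO };
+ *   void *optVals[]     = { (void *)(uintptr_t)CU_TARGET_COMPUTE_75,
+ *                           (void *)(uintptr_t)1 };
+ *   CUlinkState link;
+ *   CUmodule module;
+ *   void *cubin; size_t cubinSize;
+ *
+ *   cuLinkCreate(2, opts, optVals, &link);
+ *   cuLinkAddData(link, CU_JIT_INPUT_PTX, ptxImage, ptxSize,
+ *                 "kernels.ptx", 0, NULL, NULL);
+ *   cuLinkComplete(link, &cubin, &cubinSize);   // cubin is owned by the link state
+ *   cuModuleLoadData(&module, cubin);
+ *   cuLinkDestroy(link);
+ * \endcode
+ */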
+
+/**
+ * Flags to register a graphics resource
+ */
+typedef enum CUgraphicsRegisterFlags_enum {
+    CU_GRAPHICS_REGISTER_FLAGS_NONE           = 0x00,
+    CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY      = 0x01,
+    CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD  = 0x02,
+    CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST   = 0x04,
+    CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08
+} CUgraphicsRegisterFlags;
+
+/**
+ * Flags for mapping and unmapping interop resources
+ */
+typedef enum CUgraphicsMapResourceFlags_enum {
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE          = 0x00,
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01,
+    CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02
+} CUgraphicsMapResourceFlags;
+
+/**
+ * Array indices for cube faces
+ */
+typedef enum CUarray_cubemap_face_enum {
+    CU_CUBEMAP_FACE_POSITIVE_X  = 0x00, /**< Positive X face of cubemap */
+    CU_CUBEMAP_FACE_NEGATIVE_X  = 0x01, /**< Negative X face of cubemap */
+    CU_CUBEMAP_FACE_POSITIVE_Y  = 0x02, /**< Positive Y face of cubemap */
+    CU_CUBEMAP_FACE_NEGATIVE_Y  = 0x03, /**< Negative Y face of cubemap */
+    CU_CUBEMAP_FACE_POSITIVE_Z  = 0x04, /**< Positive Z face of cubemap */
+    CU_CUBEMAP_FACE_NEGATIVE_Z  = 0x05  /**< Negative Z face of cubemap */
+} CUarray_cubemap_face;
+
+/**
+ * Limits
+ */
+typedef enum CUlimit_enum {
+    CU_LIMIT_STACK_SIZE                       = 0x00, /**< GPU thread stack size */
+    CU_LIMIT_PRINTF_FIFO_SIZE                 = 0x01, /**< GPU printf FIFO size */
+    CU_LIMIT_MALLOC_HEAP_SIZE                 = 0x02, /**< GPU malloc heap size */
+    CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH           = 0x03, /**< GPU device runtime launch synchronize depth */
+    CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */
+    CU_LIMIT_MAX_L2_FETCH_GRANULARITY         = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */
+    CU_LIMIT_PERSISTING_L2_CACHE_SIZE         = 0x06, /**< A size in bytes for L2 persisting lines cache size */
+    CU_LIMIT_MAX
+} CUlimit;
+
+/**
+ * Resource types
+ */
+typedef enum CUresourcetype_enum {
+    CU_RESOURCE_TYPE_ARRAY           = 0x00, /**< Array resource */
+    CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
+    CU_RESOURCE_TYPE_LINEAR          = 0x02, /**< Linear resource */
+    CU_RESOURCE_TYPE_PITCH2D         = 0x03  /**< Pitch 2D resource */
+} CUresourcetype;
+
+#ifdef _WIN32
+#define CUDA_CB __stdcall
+#else
+#define CUDA_CB
+#endif
+
+/**
+ * CUDA host function
+ * \param userData Argument value passed to the function
+ */
+typedef void (CUDA_CB *CUhostFn)(void *userData);
+
+/**
+ * Specifies performance hint with ::CUaccessPolicyWindow for hitProp and missProp members.
+ */
+typedef enum CUaccessProperty_enum {
+    CU_ACCESS_PROPERTY_NORMAL           = 0,    /**< Normal cache persistence. */
+    CU_ACCESS_PROPERTY_STREAMING        = 1,    /**< Streaming access is less likely to persist in the cache. */
+    CU_ACCESS_PROPERTY_PERSISTING       = 2     /**< Persisting access is more likely to persist in cache.*/
+} CUaccessProperty;
+
+/**
+ * Specifies an access policy for a window, a contiguous extent of memory
+ * beginning at base_ptr and ending at base_ptr + num_bytes.
+ * num_bytes is limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE.
+ * Partition into many segments and assign segments such that:
+ * sum of "hit segments" / window == approx. ratio.
+ * sum of "miss segments" / window == approx 1-ratio.
+ * Segments and ratio specifications are fitted to the capabilities of
+ * the architecture.
+ * Accesses in a hit segment apply the hitProp access policy.
+ * Accesses in a miss segment apply the missProp access policy.
+ */
+typedef struct CUaccessPolicyWindow_st {
+    void *base_ptr;                     /**< Starting address of the access policy window. CUDA driver may align it. */
+    size_t num_bytes;                   /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */
+    float hitRatio;                     /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */
+    CUaccessProperty hitProp;           /**< ::CUaccessProperty set for hit. */
+    CUaccessProperty missProp;          /**< ::CUaccessProperty set for miss. Must be either NORMAL or STREAMING */
+} CUaccessPolicyWindow_v1;
+typedef CUaccessPolicyWindow_v1 CUaccessPolicyWindow;
+
+/**
+ * GPU kernel node parameters
+ */
+typedef struct CUDA_KERNEL_NODE_PARAMS_st {
+    CUfunction func;             /**< Kernel to launch */
+    unsigned int gridDimX;       /**< Width of grid in blocks */
+    unsigned int gridDimY;       /**< Height of grid in blocks */
+    unsigned int gridDimZ;       /**< Depth of grid in blocks */
+    unsigned int blockDimX;      /**< X dimension of each thread block */
+    unsigned int blockDimY;      /**< Y dimension of each thread block */
+    unsigned int blockDimZ;      /**< Z dimension of each thread block */
+    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
+    void **kernelParams;         /**< Array of pointers to kernel parameters */
+    void **extra;                /**< Extra options */
+} CUDA_KERNEL_NODE_PARAMS_v1;
+typedef CUDA_KERNEL_NODE_PARAMS_v1 CUDA_KERNEL_NODE_PARAMS;
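+
+/*
+ * Illustrative sketch (not part of the original header): adding a kernel node
+ * to a graph with the parameter struct above. graph, kernelFunc and
+ * kernelArgs are assumptions set up by the caller.
+ *
+ * \code
+ *   CUDA_KERNEL_NODE_PARAMS np = { 0 };
+ *   np.func           = kernelFunc;      // CUfunction from cuModuleGetFunction()
+ *   np.gridDimX       = 256; np.gridDimY  = np.gridDimZ  = 1;
+ *   np.blockDimX      = 128; np.blockDimY = np.blockDimZ = 1;
+ *   np.sharedMemBytes = 0;
+ *   np.kernelParams   = kernelArgs;      // void *kernelArgs[] set up by caller
+ *
+ *   CUgraphNode node;
+ *   cuGraphAddKernelNode(&node, graph, NULL, 0, &np);
+ * \endcode
+ */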
+
+/**
+ * Memset node parameters
+ */
+typedef struct CUDA_MEMSET_NODE_PARAMS_st {
+    CUdeviceptr dst;                        /**< Destination device pointer */
+    size_t pitch;                           /**< Pitch of destination device pointer. Unused if height is 1 */
+    unsigned int value;                     /**< Value to be set */
+    unsigned int elementSize;               /**< Size of each element in bytes. Must be 1, 2, or 4. */
+    size_t width;                           /**< Width of the row in elements */
+    size_t height;                          /**< Number of rows */
+} CUDA_MEMSET_NODE_PARAMS_v1;
+typedef CUDA_MEMSET_NODE_PARAMS_v1 CUDA_MEMSET_NODE_PARAMS;
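+
+/*
+ * Illustrative sketch (not part of the original header): zero-filling a linear
+ * device buffer through a memset node. graph, ctx, dptr and numElements are
+ * assumptions supplied by the caller.
+ *
+ * \code
+ *   CUDA_MEMSET_NODE_PARAMS mp = { 0 };
+ *   mp.dst         = dptr;            // CUdeviceptr from cuMemAlloc()
+ *   mp.value       = 0;
+ *   mp.elementSize = 4;               // 32-bit elements
+ *   mp.width       = numElements;
+ *   mp.height      = 1;               // pitch is ignored when height == 1
+ *
+ *   CUgraphNode node;
+ *   cuGraphAddMemsetNode(&node, graph, NULL, 0, &mp, ctx);
+ * \endcode
+ */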
+
+/**
+ * Host node parameters
+ */
+typedef struct CUDA_HOST_NODE_PARAMS_st {
+    CUhostFn fn;    /**< The function to call when the node executes */
+    void* userData; /**< Argument to pass to the function */
+} CUDA_HOST_NODE_PARAMS_v1;
+typedef CUDA_HOST_NODE_PARAMS_v1 CUDA_HOST_NODE_PARAMS;
+
+/**
+ * Graph node types
+ */
+typedef enum CUgraphNodeType_enum {
+    CU_GRAPH_NODE_TYPE_KERNEL           = 0, /**< GPU kernel node */
+    CU_GRAPH_NODE_TYPE_MEMCPY           = 1, /**< Memcpy node */
+    CU_GRAPH_NODE_TYPE_MEMSET           = 2, /**< Memset node */
+    CU_GRAPH_NODE_TYPE_HOST             = 3, /**< Host (executable) node */
+    CU_GRAPH_NODE_TYPE_GRAPH            = 4, /**< Node which executes an embedded graph */
+    CU_GRAPH_NODE_TYPE_EMPTY            = 5, /**< Empty (no-op) node */
+    CU_GRAPH_NODE_TYPE_WAIT_EVENT       = 6, /**< External event wait node */
+    CU_GRAPH_NODE_TYPE_EVENT_RECORD     = 7, /**< External event record node */
+    CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8, /**< External semaphore signal node */
+    CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT   = 9, /**< External semaphore wait node */
+    CU_GRAPH_NODE_TYPE_MEM_ALLOC        = 10,/**< Memory Allocation Node */
+    CU_GRAPH_NODE_TYPE_MEM_FREE         = 11, /**< Memory Free Node */
+    CU_GRAPH_NODE_TYPE_BATCH_MEM_OP     = 12  /**< Batch MemOp Node */
+} CUgraphNodeType;
+
+typedef enum CUsynchronizationPolicy_enum {
+    CU_SYNC_POLICY_AUTO = 1,
+    CU_SYNC_POLICY_SPIN = 2,
+    CU_SYNC_POLICY_YIELD = 3,
+    CU_SYNC_POLICY_BLOCKING_SYNC = 4
+} CUsynchronizationPolicy;
+
+/**
+ * Cluster scheduling policies. These may be passed to ::cuFuncSetAttribute
+ */
+typedef enum CUclusterSchedulingPolicy_enum {
+    CU_CLUSTER_SCHEDULING_POLICY_DEFAULT        = 0, /**< the default policy */
+    CU_CLUSTER_SCHEDULING_POLICY_SPREAD         = 1, /**< spread the blocks within a cluster to the SMs */
+    CU_CLUSTER_SCHEDULING_POLICY_LOAD_BALANCING = 2  /**< allow the hardware to load-balance the blocks in a cluster to the SMs */
+} CUclusterSchedulingPolicy;
+
+typedef enum CUlaunchAttributeID_enum {
+    CU_LAUNCH_ATTRIBUTE_IGNORE = 0 /**< Ignored entry, for convenient composition */
+  , CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW   = 1 /**< Valid for streams, graph nodes, launches. */
+  , CU_LAUNCH_ATTRIBUTE_COOPERATIVE            = 2 /**< Valid for graph nodes, launches. */
+  , CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3 /**< Valid for streams. */
+  , CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION                    = 4 /**< Valid for graph nodes, launches. */
+  , CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5 /**< Valid for graph nodes, launches. */
+  , CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION    = 6 /**< Valid for launches. Setting
+                                                                      programmaticStreamSerializationAllowed to non-0
+                                                                      signals that the kernel will use programmatic
+                                                                      means to resolve its stream dependency, so that
+                                                                      the CUDA runtime should opportunistically allow
+                                                                      the grid's execution to overlap with the previous
+                                                                      kernel in the stream, if that kernel requests the
+                                                                      overlap. */
+  , CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT                   = 7 /**< Valid for launches. Event recorded through this
+                                                                      launch attribute is guaranteed to only trigger
+                                                                      after all blocks in the associated kernel trigger
+                                                                      the event. A block can trigger the event through
+                                                                      PTX griddepcontrol.launch_dependents. A trigger
+                                                                      can also be inserted at the beginning of each
+                                                                      block's execution if triggerAtBlockStart is set to
+                                                                      non-0. Note that dependents (including the CPU
+                                                                      thread calling cuEventSynchronize()) are not
+                                                                      guaranteed to observe the release precisely when
+                                                                      it is released. For example, cuEventSynchronize()
+                                                                      may only observe the event trigger long after the
+                                                                      associated kernel has completed. This recording
+                                                                      type is primarily meant for establishing
+                                                                      programmatic dependency between device tasks. The
+                                                                      event supplied must not be an interprocess or
+                                                                      interop event. The event must disable timing
+                                                                      (i.e. created with ::CU_EVENT_DISABLE_TIMING flag
+                                                                      set). */
+  , CU_LAUNCH_ATTRIBUTE_PRIORITY               = 8 /**< Valid for graph nodes. */
+} CUlaunchAttributeID;
+
+typedef union CUlaunchAttributeValue_union {
+    char pad[64]; /**< Pad to 64 bytes */
+    CUaccessPolicyWindow accessPolicyWindow;
+    int cooperative;
+    CUsynchronizationPolicy syncPolicy;
+    struct {
+        unsigned int x;
+        unsigned int y;
+        unsigned int z;
+    } clusterDim;
+    CUclusterSchedulingPolicy clusterSchedulingPolicyPreference;
+    int programmaticStreamSerializationAllowed;
+    struct {
+        CUevent event;
+        int flags;                      /* Does not accept ::CU_EVENT_RECORD_EXTERNAL */
+        int triggerAtBlockStart;
+    } programmaticEvent;
+    int priority;
+} CUlaunchAttributeValue;
+
+typedef struct CUlaunchAttribute_st {
+    CUlaunchAttributeID id;
+    char pad[8 - sizeof(CUlaunchAttributeID)];
+    CUlaunchAttributeValue value;
+} CUlaunchAttribute;
+
+typedef struct CUlaunchConfig_st {
+    unsigned int gridDimX;       /**< Width of grid in blocks */
+    unsigned int gridDimY;       /**< Height of grid in blocks */
+    unsigned int gridDimZ;       /**< Depth of grid in blocks */
+    unsigned int blockDimX;      /**< X dimension of each thread block */
+    unsigned int blockDimY;      /**< Y dimension of each thread block */
+    unsigned int blockDimZ;      /**< Z dimension of each thread block */
+    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
+    CUstream hStream;            /**< Stream identifier */
+    CUlaunchAttribute *attrs;    /**< nullable if numAttrs == 0 */
+    unsigned int numAttrs;       /**< number of attributes populated in attrs */
+} CUlaunchConfig;
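+
+/*
+ * Illustrative sketch (not part of the original header): launching through
+ * ::cuLaunchKernelEx (available on newer drivers) with a cluster-dimension
+ * attribute. kernelFunc, kernelArgs and stream are caller-side assumptions.
+ *
+ * \code
+ *   CUlaunchAttribute attr  = { 0 };
+ *   attr.id                 = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+ *   attr.value.clusterDim.x = 2;
+ *   attr.value.clusterDim.y = 1;
+ *   attr.value.clusterDim.z = 1;
+ *
+ *   CUlaunchConfig cfg = { 0 };
+ *   cfg.gridDimX  = 256; cfg.gridDimY  = cfg.gridDimZ  = 1;
+ *   cfg.blockDimX = 128; cfg.blockDimY = cfg.blockDimZ = 1;
+ *   cfg.hStream   = stream;
+ *   cfg.attrs     = &attr;
+ *   cfg.numAttrs  = 1;
+ *
+ *   cuLaunchKernelEx(&cfg, kernelFunc, kernelArgs, NULL);
+ * \endcode
+ */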
+
+/**
+ * Graph kernel node Attributes
+ */
+typedef CUlaunchAttributeID CUkernelNodeAttrID;
+#define CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW
+#define CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE          CU_LAUNCH_ATTRIBUTE_COOPERATIVE
+#define CU_KERNEL_NODE_ATTRIBUTE_CLUSTER_DIMENSION                    CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
+#define CU_KERNEL_NODE_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE
+#define CU_KERNEL_NODE_ATTRIBUTE_PRIORITY             CU_LAUNCH_ATTRIBUTE_PRIORITY
+
+/**
+ * Graph kernel node attributes union, used with ::cuKernelNodeSetAttribute/::cuKernelNodeGetAttribute
+ */
+typedef CUlaunchAttributeValue CUkernelNodeAttrValue_v1;
+typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue;
+
+/**
+ * Possible stream capture statuses returned by ::cuStreamIsCapturing
+ */
+typedef enum CUstreamCaptureStatus_enum {
+    CU_STREAM_CAPTURE_STATUS_NONE        = 0, /**< Stream is not capturing */
+    CU_STREAM_CAPTURE_STATUS_ACTIVE      = 1, /**< Stream is actively capturing */
+    CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2  /**< Stream is part of a capture sequence that
+                                                   has been invalidated, but not terminated */
+} CUstreamCaptureStatus;
+
+/**
+ * Possible modes for stream capture thread interactions. For more details see
+ * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode
+ */
+typedef enum CUstreamCaptureMode_enum {
+    CU_STREAM_CAPTURE_MODE_GLOBAL       = 0,
+    CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1,
+    CU_STREAM_CAPTURE_MODE_RELAXED      = 2
+} CUstreamCaptureMode;
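+
+/*
+ * Illustrative sketch (not part of the original header): recording stream work
+ * into a graph with the capture modes above. stream is assumed to be a
+ * non-default stream created by the caller.
+ *
+ * \code
+ *   CUgraph graph;
+ *   cuStreamBeginCapture(stream, CU_STREAM_CAPTURE_MODE_THREAD_LOCAL);
+ *   // ... enqueue kernels / memcpys into `stream` here ...
+ *   CUstreamCaptureStatus st;
+ *   cuStreamIsCapturing(stream, &st);   // expected: CU_STREAM_CAPTURE_STATUS_ACTIVE
+ *   cuStreamEndCapture(stream, &graph);
+ * \endcode
+ */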
+
+/**
+ * Stream Attributes 
+ */
+typedef CUlaunchAttributeID CUstreamAttrID;
+#define CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW   CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW
+#define CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY
+
+/**
+ * Stream attributes union, used with ::cuStreamSetAttribute/::cuStreamGetAttribute
+ */
+typedef CUlaunchAttributeValue CUstreamAttrValue_v1;
+typedef CUstreamAttrValue_v1 CUstreamAttrValue;
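+
+/*
+ * Illustrative sketch (not part of the original header): pinning an L2
+ * persistence window onto a stream via the attribute aliases above. dptr,
+ * windowBytes and stream are assumptions supplied by the caller.
+ *
+ * \code
+ *   CUstreamAttrValue v = { 0 };
+ *   v.accessPolicyWindow.base_ptr  = (void *)dptr;
+ *   v.accessPolicyWindow.num_bytes = windowBytes;
+ *   v.accessPolicyWindow.hitRatio  = 0.6f;
+ *   v.accessPolicyWindow.hitProp   = CU_ACCESS_PROPERTY_PERSISTING;
+ *   v.accessPolicyWindow.missProp  = CU_ACCESS_PROPERTY_STREAMING;
+ *
+ *   cuStreamSetAttribute(stream, CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW, &v);
+ * \endcode
+ */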
+
+/**
+ * Flags to specify search options. For more details see ::cuGetProcAddress
+ */
+typedef enum CUdriverProcAddress_flags_enum {
+    CU_GET_PROC_ADDRESS_DEFAULT = 0,                        /**< Default search mode for driver symbols. */
+    CU_GET_PROC_ADDRESS_LEGACY_STREAM = 1 << 0,             /**< Search for legacy versions of driver symbols. */
+    CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM = 1 << 1  /**< Search for per-thread versions of driver symbols. */ 
+} CUdriverProcAddress_flags;
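+
+/*
+ * Illustrative sketch (not part of the original header): querying a driver
+ * entry point with the flags above. The CUDA 11-era signature of
+ * ::cuGetProcAddress is assumed here; the requested version (11040) and the
+ * function-pointer typedef are illustrative choices.
+ *
+ * \code
+ *   typedef CUresult (*PFN_cuMemAlloc)(CUdeviceptr *dptr, size_t bytesize);
+ *   void *fn = NULL;
+ *   cuGetProcAddress("cuMemAlloc", &fn, 11040, CU_GET_PROC_ADDRESS_DEFAULT);
+ *   PFN_cuMemAlloc pfnMemAlloc = (PFN_cuMemAlloc)fn;
+ * \endcode
+ */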
+
+/**
+ * Execution Affinity Types 
+ */
+typedef enum CUexecAffinityType_enum {
+    CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0,  /**< Create a context with limited SMs. */
+    CU_EXEC_AFFINITY_TYPE_MAX
+} CUexecAffinityType;
+
+/**
+ * Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT
+ */
+typedef struct CUexecAffinitySmCount_st {
+    unsigned int val;    /**< The number of SMs the context is limited to use. */
+} CUexecAffinitySmCount_v1;
+typedef CUexecAffinitySmCount_v1 CUexecAffinitySmCount;
+
+/**
+ * Execution Affinity Parameters 
+ */
+typedef struct CUexecAffinityParam_st {
+    CUexecAffinityType type;
+    union {
+        CUexecAffinitySmCount smCount;    /**< Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */
+    } param;
+} CUexecAffinityParam_v1;
+typedef CUexecAffinityParam_v1 CUexecAffinityParam;
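+
+/*
+ * Illustrative sketch (not part of the original header): creating a context
+ * restricted to a subset of SMs via ::cuCtxCreate_v3 (newer drivers). dev is
+ * a CUdevice assumed to come from cuDeviceGet() in the caller.
+ *
+ * \code
+ *   CUexecAffinityParam affinity;
+ *   affinity.type              = CU_EXEC_AFFINITY_TYPE_SM_COUNT;
+ *   affinity.param.smCount.val = 16;     // limit the context to 16 SMs
+ *
+ *   CUcontext ctx;
+ *   cuCtxCreate_v3(&ctx, &affinity, 1, 0, dev);
+ * \endcode
+ */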
+
+/**
+ * Error codes
+ */
+typedef enum cudaError_enum {
+    /**
+     * The API call returned with no errors. In the case of query calls, this
+     * also means that the operation being queried is complete (see
+     * ::cuEventQuery() and ::cuStreamQuery()).
+     */
+    CUDA_SUCCESS                              = 0,
+
+    /**
+     * This indicates that one or more of the parameters passed to the API call
+     * is not within an acceptable range of values.
+     */
+    CUDA_ERROR_INVALID_VALUE                  = 1,
+
+    /**
+     * The API call failed because it was unable to allocate enough memory to
+     * perform the requested operation.
+     */
+    CUDA_ERROR_OUT_OF_MEMORY                  = 2,
+
+    /**
+     * This indicates that the CUDA driver has not been initialized with
+     * ::cuInit() or that initialization has failed.
+     */
+    CUDA_ERROR_NOT_INITIALIZED                = 3,
+
+    /**
+     * This indicates that the CUDA driver is in the process of shutting down.
+     */
+    CUDA_ERROR_DEINITIALIZED                  = 4,
+
+    /**
+     * This indicates profiler is not initialized for this run. This can
+     * happen when the application is running with external profiling tools
+     * like visual profiler.
+     */
+    CUDA_ERROR_PROFILER_DISABLED              = 5,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to attempt to enable/disable the profiling via ::cuProfilerStart or
+     * ::cuProfilerStop without initialization.
+     */
+    CUDA_ERROR_PROFILER_NOT_INITIALIZED       = 6,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cuProfilerStart() when profiling is already enabled.
+     */
+    CUDA_ERROR_PROFILER_ALREADY_STARTED       = 7,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cuProfilerStop() when profiling is already disabled.
+     */
+    CUDA_ERROR_PROFILER_ALREADY_STOPPED       = 8,
+
+    /**
+     * This indicates that the CUDA driver loaded by the application is a
+     * stub library. Applications running against the stub rather than a real
+     * driver will receive this error from CUDA API calls.
+     */
+    CUDA_ERROR_STUB_LIBRARY                   = 34,
+
+    /**  
+     * This indicates that requested CUDA device is unavailable at the current
+     * time. Devices are often unavailable due to use of
+     * ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED.
+     */
+    CUDA_ERROR_DEVICE_UNAVAILABLE            = 46,
+
+    /**
+     * This indicates that no CUDA-capable devices were detected by the installed
+     * CUDA driver.
+     */
+    CUDA_ERROR_NO_DEVICE                      = 100,
+
+    /**
+     * This indicates that the device ordinal supplied by the user does not
+     * correspond to a valid CUDA device or that the action requested is
+     * invalid for the specified device.
+     */
+    CUDA_ERROR_INVALID_DEVICE                 = 101,
+
+    /**
+     * This error indicates that the Grid license is not applied.
+     */
+    CUDA_ERROR_DEVICE_NOT_LICENSED            = 102,
+
+    /**
+     * This indicates that the device kernel image is invalid. This can also
+     * indicate an invalid CUDA module.
+     */
+    CUDA_ERROR_INVALID_IMAGE                  = 200,
+
+    /**
+     * This most frequently indicates that there is no context bound to the
+     * current thread. This can also be returned if the context passed to an
+     * API call is not a valid handle (such as a context that has had
+     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
+     * mixes different API versions (i.e. 3010 context with 3020 API calls).
+     * See ::cuCtxGetApiVersion() for more details.
+     */
+    CUDA_ERROR_INVALID_CONTEXT                = 201,
+
+    /**
+     * This indicated that the context being supplied as a parameter to the
+     * API call was already the active context.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.2. It is no longer an
+     * error to attempt to push the active context via ::cuCtxPushCurrent().
+     */
+    CUDA_ERROR_CONTEXT_ALREADY_CURRENT        = 202,
+
+    /**
+     * This indicates that a map or register operation has failed.
+     */
+    CUDA_ERROR_MAP_FAILED                     = 205,
+
+    /**
+     * This indicates that an unmap or unregister operation has failed.
+     */
+    CUDA_ERROR_UNMAP_FAILED                   = 206,
+
+    /**
+     * This indicates that the specified array is currently mapped and thus
+     * cannot be destroyed.
+     */
+    CUDA_ERROR_ARRAY_IS_MAPPED                = 207,
+
+    /**
+     * This indicates that the resource is already mapped.
+     */
+    CUDA_ERROR_ALREADY_MAPPED                 = 208,
+
+    /**
+     * This indicates that there is no kernel image available that is suitable
+     * for the device. This can occur when a user specifies code generation
+     * options for a particular CUDA source file that do not include the
+     * corresponding device configuration.
+     */
+    CUDA_ERROR_NO_BINARY_FOR_GPU              = 209,
+
+    /**
+     * This indicates that a resource has already been acquired.
+     */
+    CUDA_ERROR_ALREADY_ACQUIRED               = 210,
+
+    /**
+     * This indicates that a resource is not mapped.
+     */
+    CUDA_ERROR_NOT_MAPPED                     = 211,
+
+    /**
+     * This indicates that a mapped resource is not available for access as an
+     * array.
+     */
+    CUDA_ERROR_NOT_MAPPED_AS_ARRAY            = 212,
+
+    /**
+     * This indicates that a mapped resource is not available for access as a
+     * pointer.
+     */
+    CUDA_ERROR_NOT_MAPPED_AS_POINTER          = 213,
+
+    /**
+     * This indicates that an uncorrectable ECC error was detected during
+     * execution.
+     */
+    CUDA_ERROR_ECC_UNCORRECTABLE              = 214,
+
+    /**
+     * This indicates that the ::CUlimit passed to the API call is not
+     * supported by the active device.
+     */
+    CUDA_ERROR_UNSUPPORTED_LIMIT              = 215,
+
+    /**
+     * This indicates that the ::CUcontext passed to the API call can
+     * only be bound to a single CPU thread at a time but is already
+     * bound to a CPU thread.
+     */
+    CUDA_ERROR_CONTEXT_ALREADY_IN_USE         = 216,
+
+    /**
+     * This indicates that peer access is not supported across the given
+     * devices.
+     */
+    CUDA_ERROR_PEER_ACCESS_UNSUPPORTED        = 217,
+
+    /**
+     * This indicates that a PTX JIT compilation failed.
+     */
+    CUDA_ERROR_INVALID_PTX                    = 218,
+
+    /**
+     * This indicates an error with OpenGL or DirectX context.
+     */
+    CUDA_ERROR_INVALID_GRAPHICS_CONTEXT       = 219,
+
+    /**
+    * This indicates that an uncorrectable NVLink error was detected during the
+    * execution.
+    */
+    CUDA_ERROR_NVLINK_UNCORRECTABLE           = 220,
+
+    /**
+    * This indicates that the PTX JIT compiler library was not found.
+    */
+    CUDA_ERROR_JIT_COMPILER_NOT_FOUND         = 221,
+
+    /**
+     * This indicates that the provided PTX was compiled with an unsupported toolchain.
+     */
+    CUDA_ERROR_UNSUPPORTED_PTX_VERSION        = 222,
+
+    /**
+     * This indicates that the PTX JIT compilation was disabled.
+     */
+    CUDA_ERROR_JIT_COMPILATION_DISABLED       = 223,
+
+    /**
+     * This indicates that the ::CUexecAffinityType passed to the API call is not
+     * supported by the active device.
+     */ 
+    CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY      = 224,
+
+    /**
+     * This indicates that the device kernel source is invalid. This includes
+     * compilation/linker errors encountered in device code or user error.
+     */
+    CUDA_ERROR_INVALID_SOURCE                 = 300,
+
+    /**
+     * This indicates that the file specified was not found.
+     */
+    CUDA_ERROR_FILE_NOT_FOUND                 = 301,
+
+    /**
+     * This indicates that a link to a shared object failed to resolve.
+     */
+    CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302,
+
+    /**
+     * This indicates that initialization of a shared object failed.
+     */
+    CUDA_ERROR_SHARED_OBJECT_INIT_FAILED      = 303,
+
+    /**
+     * This indicates that an OS call failed.
+     */
+    CUDA_ERROR_OPERATING_SYSTEM               = 304,
+
+    /**
+     * This indicates that a resource handle passed to the API call was not
+     * valid. Resource handles are opaque types like ::CUstream and ::CUevent.
+     */
+    CUDA_ERROR_INVALID_HANDLE                 = 400,
+
+    /**
+     * This indicates that a resource required by the API call is not in a
+     * valid state to perform the requested operation.
+     */
+    CUDA_ERROR_ILLEGAL_STATE                  = 401,
+
+    /**
+     * This indicates that a named symbol was not found. Examples of symbols
+     * are global/constant variable names, driver function names, texture names,
+     * and surface names.
+     */
+    CUDA_ERROR_NOT_FOUND                      = 500,
+
+    /**
+     * This indicates that asynchronous operations issued previously have not
+     * completed yet. This result is not actually an error, but must be indicated
+     * differently than ::CUDA_SUCCESS (which indicates completion). Calls that
+     * may return this value include ::cuEventQuery() and ::cuStreamQuery().
+     */
+    CUDA_ERROR_NOT_READY                      = 600,
+
+    /**
+     * While executing a kernel, the device encountered a
+     * load or store instruction on an invalid memory address.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_ILLEGAL_ADDRESS                = 700,
+
+    /**
+     * This indicates that a launch did not occur because it did not have
+     * appropriate resources. This error usually indicates that the user has
+     * attempted to pass too many arguments to the device kernel, or the
+     * kernel launch specifies too many threads for the kernel's register
+     * count. Passing arguments of the wrong size (i.e. a 64-bit pointer
+     * when a 32-bit int is expected) is equivalent to passing too many
+     * arguments and can also result in this error.
+     */
+    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES        = 701,
+
+    /**
+     * This indicates that the device kernel took too long to execute. This can
+     * only occur if timeouts are enabled - see the device attribute
+     * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_LAUNCH_TIMEOUT                 = 702,
+
+    /**
+     * This error indicates a kernel launch that uses an incompatible texturing
+     * mode.
+     */
+    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING  = 703,
+
+    /**
+     * This error indicates that a call to ::cuCtxEnablePeerAccess() is
+     * trying to re-enable peer access to a context which has already
+     * had peer access to it enabled.
+     */
+    CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED    = 704,
+
+    /**
+     * This error indicates that ::cuCtxDisablePeerAccess() is
+     * trying to disable peer access which has not been enabled yet
+     * via ::cuCtxEnablePeerAccess().
+     */
+    CUDA_ERROR_PEER_ACCESS_NOT_ENABLED        = 705,
+
+    /**
+     * This error indicates that the primary context for the specified device
+     * has already been initialized.
+     */
+    CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE         = 708,
+
+    /**
+     * This error indicates that the context current to the calling thread
+     * has been destroyed using ::cuCtxDestroy, or is a primary context which
+     * has not yet been initialized.
+     */
+    CUDA_ERROR_CONTEXT_IS_DESTROYED           = 709,
+
+    /**
+     * A device-side assert triggered during kernel execution. The context
+     * cannot be used anymore, and must be destroyed. All existing device
+     * memory allocations from this context are invalid and must be
+     * reconstructed if the program is to continue using CUDA.
+     */
+    CUDA_ERROR_ASSERT                         = 710,
+
+    /**
+     * This error indicates that the hardware resources required to enable
+     * peer access have been exhausted for one or more of the devices
+     * passed to ::cuCtxEnablePeerAccess().
+     */
+    CUDA_ERROR_TOO_MANY_PEERS                 = 711,
+
+    /**
+     * This error indicates that the memory range passed to ::cuMemHostRegister()
+     * has already been registered.
+     */
+    CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712,
+
+    /**
+     * This error indicates that the pointer passed to ::cuMemHostUnregister()
+     * does not correspond to any currently registered memory region.
+     */
+    CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED     = 713,
+
+    /**
+     * While executing a kernel, the device encountered a stack error.
+     * This can be due to stack corruption or exceeding the stack size limit.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_HARDWARE_STACK_ERROR           = 714,
+
+    /**
+     * While executing a kernel, the device encountered an illegal instruction.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_ILLEGAL_INSTRUCTION            = 715,
+
+    /**
+     * While executing a kernel, the device encountered a load or store instruction
+     * on a memory address which is not aligned.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_MISALIGNED_ADDRESS             = 716,
+
+    /**
+     * While executing a kernel, the device encountered an instruction
+     * which can only operate on memory locations in certain address spaces
+     * (global, shared, or local), but was supplied a memory address not
+     * belonging to an allowed address space.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_INVALID_ADDRESS_SPACE          = 717,
+
+    /**
+     * While executing a kernel, the device program counter wrapped its address space.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_INVALID_PC                     = 718,
+
+    /**
+     * An exception occurred on the device while executing a kernel. Common
+     * causes include dereferencing an invalid device pointer and accessing
+     * out of bounds shared memory. Less common cases can be system specific - more
+     * information about these cases can be found in the system specific user guide.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    CUDA_ERROR_LAUNCH_FAILED                  = 719,
+
+    /**
+     * This error indicates that the number of blocks launched per grid for a kernel that was
+     * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice
+     * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor
+     * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
+     * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
+     */
+    CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE   = 720,
+
+    /**
+     * This error indicates that the attempted operation is not permitted.
+     */
+    CUDA_ERROR_NOT_PERMITTED                  = 800,
+
+    /**
+     * This error indicates that the attempted operation is not supported
+     * on the current system or device.
+     */
+    CUDA_ERROR_NOT_SUPPORTED                  = 801,
+
+    /**
+     * This error indicates that the system is not yet ready to start any CUDA
+     * work.  To continue using CUDA, verify the system configuration is in a
+     * valid state and all required driver daemons are actively running.
+     * More information about this error can be found in the system specific
+     * user guide.
+     */
+    CUDA_ERROR_SYSTEM_NOT_READY               = 802,
+
+    /**
+     * This error indicates that there is a mismatch between the versions of
+     * the display driver and the CUDA driver. Refer to the compatibility documentation
+     * for supported versions.
+     */
+    CUDA_ERROR_SYSTEM_DRIVER_MISMATCH         = 803,
+
+    /**
+     * This error indicates that the system was upgraded to run with forward compatibility
+     * but the visible hardware detected by CUDA does not support this configuration.
+     * Refer to the compatibility documentation for the supported hardware matrix or ensure
+     * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
+     * environment variable.
+     */
+    CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804,
+
+    /**
+     * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
+     */
+    CUDA_ERROR_MPS_CONNECTION_FAILED          = 805,
+
+    /**
+     * This error indicates that the remote procedural call between the MPS server and the MPS client failed.
+     */
+    CUDA_ERROR_MPS_RPC_FAILURE                = 806,
+
+    /**
+     * This error indicates that the MPS server is not ready to accept new MPS client requests.
+     * This error can be returned when the MPS server is in the process of recovering from a fatal failure.
+     */
+    CUDA_ERROR_MPS_SERVER_NOT_READY           = 807,
+
+    /**
+     * This error indicates that the hardware resources required to create MPS client have been exhausted.
+     */
+    CUDA_ERROR_MPS_MAX_CLIENTS_REACHED        = 808,
+
+    /**
+     * This error indicates that the hardware resources required to support device connections have been exhausted.
+     */
+    CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED    = 809,
+
+    /**
+     * This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.
+     */
+    CUDA_ERROR_MPS_CLIENT_TERMINATED          = 810,
+
+    /**
+     * This error indicates that the operation is not permitted when
+     * the stream is capturing.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED     = 900,
+
+    /**
+     * This error indicates that the current capture sequence on the stream
+     * has been invalidated due to a previous error.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_INVALIDATED     = 901,
+
+    /**
+     * This error indicates that the operation would have resulted in a merge
+     * of two independent capture sequences.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_MERGE           = 902,
+
+    /**
+     * This error indicates that the capture was not initiated in this stream.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_UNMATCHED       = 903,
+
+    /**
+     * This error indicates that the capture sequence contains a fork that was
+     * not joined to the primary stream.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_UNJOINED        = 904,
+
+    /**
+     * This error indicates that a dependency would have been created which
+     * crosses the capture sequence boundary. Only implicit in-stream ordering
+     * dependencies are allowed to cross the boundary.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_ISOLATION       = 905,
+
+    /**
+     * This error indicates a disallowed implicit dependency on a current capture
+     * sequence from cudaStreamLegacy.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_IMPLICIT        = 906,
+
+    /**
+     * This error indicates that the operation is not permitted on an event which
+     * was last recorded in a capturing stream.
+     */
+    CUDA_ERROR_CAPTURED_EVENT                 = 907,
+
+    /**
+     * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED
+     * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a
+     * different thread.
+     */
+    CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD    = 908,
+
+    /**
+     * This error indicates that the timeout specified for the wait operation has lapsed.
+     */
+    CUDA_ERROR_TIMEOUT                        = 909,
+
+    /**
+     * This error indicates that the graph update was not performed because it included 
+     * changes which violated constraints specific to instantiated graph update.
+     */
+    CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE      = 910,
+
+    /**
+     * This indicates that an async error has occurred in a device outside of CUDA.
+     * If CUDA was waiting for an external device's signal before consuming shared data,
+     * the external device signaled an error indicating that the data is not valid for
+     * consumption. This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must be
+     * terminated and relaunched.
+     */
+    CUDA_ERROR_EXTERNAL_DEVICE               = 911,
+
+    /**
+     * Indicates a kernel launch error due to cluster misconfiguration.
+     */
+    CUDA_ERROR_INVALID_CLUSTER_SIZE           = 912,
+
+    /**
+     * This indicates that an unknown internal error has occurred.
+     */
+    CUDA_ERROR_UNKNOWN                        = 999
+} CUresult;
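+
+/*
+ * Illustrative sketch (not part of the original header): a typical ::CUresult
+ * check wrapper built on ::cuGetErrorName. The macro name is an arbitrary
+ * choice, not part of the API; fprintf requires <stdio.h>.
+ *
+ * \code
+ *   #define CU_CHECK(call)                                            \
+ *       do {                                                          \
+ *           CUresult err_ = (call);                                   \
+ *           if (err_ != CUDA_SUCCESS) {                               \
+ *               const char *name_ = NULL;                             \
+ *               cuGetErrorName(err_, &name_);                         \
+ *               fprintf(stderr, "%s failed: %s\n", #call,             \
+ *                       name_ ? name_ : "unknown");                   \
+ *           }                                                         \
+ *       } while (0)
+ *
+ *   CU_CHECK(cuInit(0));
+ * \endcode
+ */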
+
+/**
+ * P2P Attributes
+ */
+typedef enum CUdevice_P2PAttribute_enum {
+    CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK                     = 0x01,  /**< A relative value indicating the performance of the link between two devices */
+    CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED                     = 0x02,  /**< P2P Access is enabled */
+    CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED              = 0x03,  /**< Atomic operation over the link supported */
+    CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED              = 0x04,  /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */
+    CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED          = 0x04   /**< Accessing CUDA arrays over the link supported */
+} CUdevice_P2PAttribute;
+
+/**
+ * CUDA stream callback
+ * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback.  May be NULL.
+ * \param status ::CUDA_SUCCESS or any persistent error on the stream.
+ * \param userData User parameter provided at registration.
+ */
+typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData);
+
+/**
+ * Block size to per-block dynamic shared memory mapping for a certain kernel.
+ *
+ * \param blockSize Block size of the kernel.
+ * \return The dynamic shared memory needed by a block.
+ */
+typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize);
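+
+/*
+ * Illustrative sketch (not part of the original header): letting the driver
+ * pick a block size when dynamic shared memory scales with the block size.
+ * kernelFunc is a CUfunction assumed to exist in the caller.
+ *
+ * \code
+ *   static size_t CUDA_CB smemForBlock(int blockSize)
+ *   {
+ *       return (size_t)blockSize * sizeof(float);   // one float per thread
+ *   }
+ *
+ *   int minGrid = 0, blockSize = 0;
+ *   cuOccupancyMaxPotentialBlockSize(&minGrid, &blockSize, kernelFunc,
+ *                                    smemForBlock, 0, 0);
+ * \endcode
+ */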
+
+/**
+ * If set, host memory is portable between CUDA contexts.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_PORTABLE        0x01
+
+/**
+ * If set, host memory is mapped into CUDA address space and
+ * ::cuMemHostGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_DEVICEMAP       0x02
+
+/**
+ * If set, host memory is allocated as write-combined - fast to write,
+ * faster to DMA, slow to read except via SSE4 streaming load instruction
+ * (MOVNTDQA).
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_WRITECOMBINED   0x04
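+
+/*
+ * Illustrative sketch (not part of the original header): allocating mapped,
+ * portable pinned memory with the ::cuMemHostAlloc() flags above and fetching
+ * its device pointer. bytes is an assumption supplied by the caller.
+ *
+ * \code
+ *   void *hostPtr = NULL;
+ *   cuMemHostAlloc(&hostPtr, bytes,
+ *                  CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_DEVICEMAP);
+ *
+ *   CUdeviceptr devPtr;
+ *   cuMemHostGetDevicePointer(&devPtr, hostPtr, 0);   // flags must be 0
+ *   // ... use devPtr in kernels, hostPtr on the CPU ...
+ *   cuMemFreeHost(hostPtr);
+ * \endcode
+ */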
+
+/**
+ * If set, host memory is portable between CUDA contexts.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_PORTABLE     0x01
+
+/**
+ * If set, host memory is mapped into CUDA address space and
+ * ::cuMemHostGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_DEVICEMAP    0x02
+
+/**
+ * If set, the passed memory pointer is treated as pointing to some
+ * memory-mapped I/O space, e.g. belonging to a third-party PCIe device.
+ * On Windows the flag is a no-op.
+ * On Linux that memory is marked as non cache-coherent for the GPU and
+ * is expected to be physically contiguous. It may return
+ * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user,
+ * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions.
+ * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED
+ * is returned.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_IOMEMORY     0x04
+
+/**
+* If set, the passed memory pointer is treated as pointing to memory that is
+* considered read-only by the device.  On platforms without
+* ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
+* required in order to register memory mapped to the CPU as read-only.  Support
+* for the use of this flag can be queried from the device attribute
+* ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.  Using this flag with
+* a current context associated with a device that does not have this attribute
+* set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED.
+*/
+#define CU_MEMHOSTREGISTER_READ_ONLY    0x08
+
+/**
+ * 2D memory copy parameters
+ */
+typedef struct CUDA_MEMCPY2D_st {
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+
+    size_t WidthInBytes;        /**< Width of 2D memory copy in bytes */
+    size_t Height;              /**< Height of 2D memory copy */
+} CUDA_MEMCPY2D_v2;
+typedef CUDA_MEMCPY2D_v2 CUDA_MEMCPY2D;
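+
+/*
+ * Illustrative sketch (not part of the original header): copying a tightly
+ * packed host image into a pitched device allocation. hostImg, dptr, devPitch
+ * (from cuMemAllocPitch()), width and height are caller-side assumptions.
+ *
+ * \code
+ *   CUDA_MEMCPY2D cpy = { 0 };
+ *   cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
+ *   cpy.srcHost       = hostImg;
+ *   cpy.srcPitch      = width * sizeof(float);
+ *   cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ *   cpy.dstDevice     = dptr;
+ *   cpy.dstPitch      = devPitch;
+ *   cpy.WidthInBytes  = width * sizeof(float);
+ *   cpy.Height        = height;
+ *
+ *   cuMemcpy2D(&cpy);
+ * \endcode
+ */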
+
+/**
+ * 3D memory copy parameters
+ */
+typedef struct CUDA_MEMCPY3D_st {
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+    size_t srcZ;                /**< Source Z */
+    size_t srcLOD;              /**< Source LOD */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    void *reserved0;            /**< Must be NULL */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+    size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+    size_t dstZ;                /**< Destination Z */
+    size_t dstLOD;              /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    void *reserved1;            /**< Must be NULL */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
+    size_t Height;              /**< Height of 3D memory copy */
+    size_t Depth;               /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D_v2;
+typedef CUDA_MEMCPY3D_v2 CUDA_MEMCPY3D;
+
+/**
+ * 3D memory cross-context copy parameters
+ */
+typedef struct CUDA_MEMCPY3D_PEER_st {
+    size_t srcXInBytes;         /**< Source X in bytes */
+    size_t srcY;                /**< Source Y */
+    size_t srcZ;                /**< Source Z */
+    size_t srcLOD;              /**< Source LOD */
+    CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+    const void *srcHost;        /**< Source host pointer */
+    CUdeviceptr srcDevice;      /**< Source device pointer */
+    CUarray srcArray;           /**< Source array reference */
+    CUcontext srcContext;       /**< Source context (ignored when srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+    size_t srcPitch;            /**< Source pitch (ignored when src is array) */
+    size_t srcHeight;           /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+    size_t dstXInBytes;         /**< Destination X in bytes */
+    size_t dstY;                /**< Destination Y */
+    size_t dstZ;                /**< Destination Z */
+    size_t dstLOD;              /**< Destination LOD */
+    CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+    void *dstHost;              /**< Destination host pointer */
+    CUdeviceptr dstDevice;      /**< Destination device pointer */
+    CUarray dstArray;           /**< Destination array reference */
+    CUcontext dstContext;       /**< Destination context (ignored when dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */
+    size_t dstPitch;            /**< Destination pitch (ignored when dst is array) */
+    size_t dstHeight;           /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+    size_t WidthInBytes;        /**< Width of 3D memory copy in bytes */
+    size_t Height;              /**< Height of 3D memory copy */
+    size_t Depth;               /**< Depth of 3D memory copy */
+} CUDA_MEMCPY3D_PEER_v1;
+typedef CUDA_MEMCPY3D_PEER_v1 CUDA_MEMCPY3D_PEER;
+
+/**
+ * Array descriptor
+ */
+typedef struct CUDA_ARRAY_DESCRIPTOR_st
+{
+    size_t Width;             /**< Width of array */
+    size_t Height;            /**< Height of array */
+
+    CUarray_format Format;    /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+} CUDA_ARRAY_DESCRIPTOR_v2;
+typedef CUDA_ARRAY_DESCRIPTOR_v2 CUDA_ARRAY_DESCRIPTOR;
+
+/**
+ * 3D array descriptor
+ */
+typedef struct CUDA_ARRAY3D_DESCRIPTOR_st
+{
+    size_t Width;             /**< Width of 3D array */
+    size_t Height;            /**< Height of 3D array */
+    size_t Depth;             /**< Depth of 3D array */
+
+    CUarray_format Format;    /**< Array format */
+    unsigned int NumChannels; /**< Channels per array element */
+    unsigned int Flags;       /**< Flags */
+} CUDA_ARRAY3D_DESCRIPTOR_v2;
+typedef CUDA_ARRAY3D_DESCRIPTOR_v2 CUDA_ARRAY3D_DESCRIPTOR;
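+
+/*
+ * Illustrative sketch (not part of the original header): creating a 3D,
+ * single-channel float array from the descriptor above. The 64^3 extent is
+ * an arbitrary example value.
+ *
+ * \code
+ *   CUDA_ARRAY3D_DESCRIPTOR desc = { 0 };
+ *   desc.Width       = 64;
+ *   desc.Height      = 64;
+ *   desc.Depth       = 64;
+ *   desc.Format      = CU_AD_FORMAT_FLOAT;
+ *   desc.NumChannels = 1;
+ *   desc.Flags       = 0;
+ *
+ *   CUarray array;
+ *   cuArray3DCreate(&array, &desc);
+ * \endcode
+ */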
+
+/**
+ * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers
+ */
+#define CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL 0x1
+
+/**
+ * CUDA array sparse properties
+ */
+typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st {
+    struct {
+        unsigned int width;     /**< Width of sparse tile in elements */
+        unsigned int height;    /**< Height of sparse tile in elements */
+        unsigned int depth;     /**< Depth of sparse tile in elements */
+    } tileExtent;
+
+    /**
+     * First mip level at which the mip tail begins.
+     */
+    unsigned int miptailFirstLevel;
+    /**
+     * Total size of the mip tail.
+     */
+    unsigned long long miptailSize;
+    /**
+     * Flags will either be zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL
+     */
+    unsigned int flags;
+    unsigned int reserved[4];
+} CUDA_ARRAY_SPARSE_PROPERTIES_v1;
+typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES;
+
+/**
+ * CUDA array memory requirements
+ */
+typedef struct CUDA_ARRAY_MEMORY_REQUIREMENTS_st {
+    size_t size;                /**< Total required memory size */
+    size_t alignment;           /**< Alignment requirement */
+    unsigned int reserved[4];
+} CUDA_ARRAY_MEMORY_REQUIREMENTS_v1;
+typedef CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 CUDA_ARRAY_MEMORY_REQUIREMENTS;
+
+/**
+ * CUDA Resource descriptor
+ */
+typedef struct CUDA_RESOURCE_DESC_st
+{
+    CUresourcetype resType;                   /**< Resource type */
+
+    union {
+        struct {
+            CUarray hArray;                   /**< CUDA array */
+        } array;
+        struct {
+            CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */
+        } mipmap;
+        struct {
+            CUdeviceptr devPtr;               /**< Device pointer */
+            CUarray_format format;            /**< Array format */
+            unsigned int numChannels;         /**< Channels per array element */
+            size_t sizeInBytes;               /**< Size in bytes */
+        } linear;
+        struct {
+            CUdeviceptr devPtr;               /**< Device pointer */
+            CUarray_format format;            /**< Array format */
+            unsigned int numChannels;         /**< Channels per array element */
+            size_t width;                     /**< Width of the array in elements */
+            size_t height;                    /**< Height of the array in elements */
+            size_t pitchInBytes;              /**< Pitch between two rows in bytes */
+        } pitch2D;
+        struct {
+            int reserved[32];
+        } reserved;
+    } res;
+
+    unsigned int flags;                       /**< Flags (must be zero) */
+} CUDA_RESOURCE_DESC_v1;
+typedef CUDA_RESOURCE_DESC_v1 CUDA_RESOURCE_DESC;
+
+/**
+ * Texture descriptor
+ */
+typedef struct CUDA_TEXTURE_DESC_st {
+    CUaddress_mode addressMode[3];  /**< Address modes */
+    CUfilter_mode filterMode;       /**< Filter mode */
+    unsigned int flags;             /**< Flags */
+    unsigned int maxAnisotropy;     /**< Maximum anisotropy ratio */
+    CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */
+    float mipmapLevelBias;          /**< Mipmap level bias */
+    float minMipmapLevelClamp;      /**< Mipmap minimum level clamp */
+    float maxMipmapLevelClamp;      /**< Mipmap maximum level clamp */
+    float borderColor[4];           /**< Border Color */
+    int reserved[12];
+} CUDA_TEXTURE_DESC_v1;
+typedef CUDA_TEXTURE_DESC_v1 CUDA_TEXTURE_DESC;
+
+/**
+ * Resource view format
+ */
+typedef enum CUresourceViewFormat_enum
+{
+    CU_RES_VIEW_FORMAT_NONE          = 0x00, /**< No resource view format (use underlying resource format) */
+    CU_RES_VIEW_FORMAT_UINT_1X8      = 0x01, /**< 1 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X8      = 0x02, /**< 2 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X8      = 0x03, /**< 4 channel unsigned 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X8      = 0x04, /**< 1 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X8      = 0x05, /**< 2 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X8      = 0x06, /**< 4 channel signed 8-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X16     = 0x07, /**< 1 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X16     = 0x08, /**< 2 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X16     = 0x09, /**< 4 channel unsigned 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X16     = 0x0a, /**< 1 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X16     = 0x0b, /**< 2 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X16     = 0x0c, /**< 4 channel signed 16-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_1X32     = 0x0d, /**< 1 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_2X32     = 0x0e, /**< 2 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_UINT_4X32     = 0x0f, /**< 4 channel unsigned 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_1X32     = 0x10, /**< 1 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_2X32     = 0x11, /**< 2 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_SINT_4X32     = 0x12, /**< 4 channel signed 32-bit integers */
+    CU_RES_VIEW_FORMAT_FLOAT_1X16    = 0x13, /**< 1 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X16    = 0x14, /**< 2 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X16    = 0x15, /**< 4 channel 16-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_1X32    = 0x16, /**< 1 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_2X32    = 0x17, /**< 2 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_FLOAT_4X32    = 0x18, /**< 4 channel 32-bit floating point */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC1  = 0x19, /**< Block compressed 1 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC2  = 0x1a, /**< Block compressed 2 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC3  = 0x1b, /**< Block compressed 3 */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC4  = 0x1c, /**< Block compressed 4 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC4    = 0x1d, /**< Block compressed 4 signed */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC5  = 0x1e, /**< Block compressed 5 unsigned */
+    CU_RES_VIEW_FORMAT_SIGNED_BC5    = 0x1f, /**< Block compressed 5 signed */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */
+    CU_RES_VIEW_FORMAT_SIGNED_BC6H   = 0x21, /**< Block compressed 6 signed half-float */
+    CU_RES_VIEW_FORMAT_UNSIGNED_BC7  = 0x22  /**< Block compressed 7 */
+} CUresourceViewFormat;
+
+/**
+ * Resource view descriptor
+ */
+typedef struct CUDA_RESOURCE_VIEW_DESC_st
+{
+    CUresourceViewFormat format;   /**< Resource view format */
+    size_t width;                  /**< Width of the resource view */
+    size_t height;                 /**< Height of the resource view */
+    size_t depth;                  /**< Depth of the resource view */
+    unsigned int firstMipmapLevel; /**< First defined mipmap level */
+    unsigned int lastMipmapLevel;  /**< Last defined mipmap level */
+    unsigned int firstLayer;       /**< First layer index */
+    unsigned int lastLayer;        /**< Last layer index */
+    unsigned int reserved[16];
+} CUDA_RESOURCE_VIEW_DESC_v1;
+typedef CUDA_RESOURCE_VIEW_DESC_v1 CUDA_RESOURCE_VIEW_DESC;
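+
+/*
+ * Illustrative usage sketch: creating a texture object over an existing CUDA
+ * array with ::cuTexObjectCreate, using the resource and texture descriptors
+ * above. The array handle arr is assumed to exist; the resource view
+ * descriptor is optional and passed as NULL here; error checking is omitted.
+ *
+ *   CUDA_RESOURCE_DESC res;
+ *   memset(&res, 0, sizeof(res));
+ *   res.resType          = CU_RESOURCE_TYPE_ARRAY;
+ *   res.res.array.hArray = arr;
+ *
+ *   CUDA_TEXTURE_DESC tex;
+ *   memset(&tex, 0, sizeof(tex));
+ *   tex.addressMode[0] = CU_TR_ADDRESS_MODE_CLAMP;
+ *   tex.addressMode[1] = CU_TR_ADDRESS_MODE_CLAMP;
+ *   tex.filterMode     = CU_TR_FILTER_MODE_LINEAR;
+ *   tex.flags          = CU_TRSF_NORMALIZED_COORDINATES;
+ *
+ *   CUtexObject texObj;
+ *   cuTexObjectCreate(&texObj, &res, &tex, NULL);
+ */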
+
+/**
+ * GPU Direct v3 tokens
+ */
+typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st {
+    unsigned long long p2pToken;
+    unsigned int vaSpaceToken;
+} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1;
+typedef CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 CUDA_POINTER_ATTRIBUTE_P2P_TOKENS;
+
+/**
+* Access flags that specify the level of access the current context's device has
+* on the memory referenced.
+*/
+typedef enum CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS_enum {
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE      = 0x0,   /**< No access, meaning the device cannot access this memory at all, thus must be staged through accessible memory in order to complete certain operations */
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READ      = 0x1,   /**< Read-only access, meaning writes to this memory are considered invalid accesses and thus return an error. */
+    CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READWRITE = 0x3    /**< Read-write access, the device has full read-write access to the memory */
+} CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS;
+
+/**
+ * Kernel launch parameters
+ */
+typedef struct CUDA_LAUNCH_PARAMS_st {
+    CUfunction function;         /**< Kernel to launch */
+    unsigned int gridDimX;       /**< Width of grid in blocks */
+    unsigned int gridDimY;       /**< Height of grid in blocks */
+    unsigned int gridDimZ;       /**< Depth of grid in blocks */
+    unsigned int blockDimX;      /**< X dimension of each thread block */
+    unsigned int blockDimY;      /**< Y dimension of each thread block */
+    unsigned int blockDimZ;      /**< Z dimension of each thread block */
+    unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */
+    CUstream hStream;            /**< Stream identifier */
+    void **kernelParams;         /**< Array of pointers to kernel parameters */
+} CUDA_LAUNCH_PARAMS_v1;
+typedef CUDA_LAUNCH_PARAMS_v1 CUDA_LAUNCH_PARAMS;
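+
+/*
+ * Illustrative usage sketch: launching the same cooperative kernel on two
+ * devices with ::cuLaunchCooperativeKernelMultiDevice. fn[i], stream[i] and
+ * args[i] are assumed to be per-device function handles, non-default streams
+ * and kernel argument arrays created by the caller; error checking is omitted.
+ *
+ *   CUDA_LAUNCH_PARAMS lp[2];
+ *   memset(lp, 0, sizeof(lp));
+ *   for (int i = 0; i < 2; ++i) {
+ *       lp[i].function       = fn[i];
+ *       lp[i].gridDimX       = 64;
+ *       lp[i].gridDimY       = 1;
+ *       lp[i].gridDimZ       = 1;
+ *       lp[i].blockDimX      = 256;
+ *       lp[i].blockDimY      = 1;
+ *       lp[i].blockDimZ      = 1;
+ *       lp[i].sharedMemBytes = 0;
+ *       lp[i].hStream        = stream[i];
+ *       lp[i].kernelParams   = args[i];
+ *   }
+ *   cuLaunchCooperativeKernelMultiDevice(lp, 2, 0);
+ */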
+
+/**
+ * External memory handle types
+ */
+typedef enum CUexternalMemoryHandleType_enum {
+    /**
+     * Handle is an opaque file descriptor
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD          = 1,
+    /**
+     * Handle is an opaque shared NT handle
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32       = 2,
+    /**
+     * Handle is an opaque, globally shared handle
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT   = 3,
+    /**
+     * Handle is a D3D12 heap object
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP         = 4,
+    /**
+     * Handle is a D3D12 committed resource
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE     = 5,
+    /**
+     * Handle is a shared NT handle to a D3D11 resource
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE     = 6,
+    /**
+     * Handle is a globally shared handle to a D3D11 resource
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
+    /**
+     * Handle is an NvSciBuf object
+     */
+    CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8
+} CUexternalMemoryHandleType;
+
+/**
+ * Indicates that the external memory object is a dedicated resource
+ */
+#define CUDA_EXTERNAL_MEMORY_DEDICATED   0x1
+
+/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS
+ * contains this flag, it indicates that signaling an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF,
+ * which otherwise are performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC 0x01
+
+/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS
+ * contains this flag, it indicates that waiting on an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF,
+ * which otherwise are performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC 0x02
+
+/**
+ * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that the application needs signaler-specific NvSciSyncAttr
+ * to be filled by ::cuDeviceGetNvSciSyncAttributes.
+ */
+#define CUDA_NVSCISYNC_ATTR_SIGNAL 0x1
+
+/**
+ * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that the application needs waiter-specific NvSciSyncAttr
+ * to be filled by ::cuDeviceGetNvSciSyncAttributes.
+ */
+#define CUDA_NVSCISYNC_ATTR_WAIT 0x2
+/**
+ * External memory handle descriptor
+ */
+typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
+    /**
+     * Type of the handle
+     */
+    CUexternalMemoryHandleType type;
+    union {
+        /**
+         * File descriptor referencing the memory object. Valid
+         * when type is
+         * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD
+         */
+        int fd;
+        /**
+         * Win32 handle referencing the memory object. Valid when
+         * type is one of the following:
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
+         * Exactly one of 'handle' and 'name' must be non-NULL. If
+         * type is one of the following:
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
+         * then 'name' must be NULL.
+         */
+        struct {
+            /**
+             * Valid NT handle. Must be NULL if 'name' is non-NULL
+             */
+            void *handle;
+            /**
+             * Name of a valid memory object.
+             * Must be NULL if 'handle' is non-NULL.
+             */
+            const void *name;
+        } win32;
+        /**
+         * A handle representing an NvSciBuf Object. Valid when type
+         * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF
+         */
+        const void *nvSciBufObject;
+    } handle;
+    /**
+     * Size of the memory allocation
+     */
+    unsigned long long size;
+    /**
+     * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+} CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1;
+typedef CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 CUDA_EXTERNAL_MEMORY_HANDLE_DESC;
+
+/**
+ * External memory buffer descriptor
+ */
+typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st {
+    /**
+     * Offset into the memory object where the buffer's base is
+     */
+    unsigned long long offset;
+    /**
+     * Size of the buffer
+     */
+    unsigned long long size;
+    /**
+     * Flags reserved for future use. Must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+} CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1;
+typedef CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 CUDA_EXTERNAL_MEMORY_BUFFER_DESC;
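+
+/*
+ * Illustrative usage sketch: importing externally allocated memory through an
+ * opaque file descriptor and mapping it as a device-accessible buffer,
+ * combining the two descriptors above with ::cuImportExternalMemory and
+ * ::cuExternalMemoryGetMappedBuffer. fd and sizeInBytes are assumed to come
+ * from the exporting API (e.g. Vulkan); error checking is omitted.
+ *
+ *   CUDA_EXTERNAL_MEMORY_HANDLE_DESC hd;
+ *   memset(&hd, 0, sizeof(hd));
+ *   hd.type      = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
+ *   hd.handle.fd = fd;
+ *   hd.size      = sizeInBytes;
+ *
+ *   CUexternalMemory extMem;
+ *   cuImportExternalMemory(&extMem, &hd);
+ *
+ *   CUDA_EXTERNAL_MEMORY_BUFFER_DESC bd;
+ *   memset(&bd, 0, sizeof(bd));
+ *   bd.offset = 0;
+ *   bd.size   = sizeInBytes;
+ *
+ *   CUdeviceptr dptr;
+ *   cuExternalMemoryGetMappedBuffer(&dptr, extMem, &bd);
+ */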
+
+/**
+ * External memory mipmap descriptor
+ */
+typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st {
+    /**
+     * Offset into the memory object where the base level of the
+     * mipmap chain is.
+     */
+    unsigned long long offset;
+    /**
+     * Format, dimension and type of base level of the mipmap chain
+     */
+    CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+    /**
+     * Total number of levels in the mipmap chain
+     */
+    unsigned int numLevels;
+    unsigned int reserved[16];
+} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1;
+typedef CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC;
+
+/**
+ * External semaphore handle types
+ */
+typedef enum CUexternalSemaphoreHandleType_enum {
+    /**
+     * Handle is an opaque file descriptor
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD             = 1,
+    /**
+     * Handle is an opaque shared NT handle
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32          = 2,
+    /**
+     * Handle is an opaque, globally shared handle
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT      = 3,
+    /**
+     * Handle is a shared NT handle referencing a D3D12 fence object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE           = 4,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 fence object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE           = 5,
+    /**
+     * Opaque handle to NvSciSync Object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC             = 6,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 keyed mutex object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX     = 7,
+    /**
+     * Handle is a globally shared handle referencing a D3D11 keyed mutex object
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8,
+    /**
+     * Handle is an opaque file descriptor referencing a timeline semaphore
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9,
+    /**
+     * Handle is an opaque shared NT handle referencing a timeline semaphore
+     */
+    CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10
+} CUexternalSemaphoreHandleType;
+
+/**
+ * External semaphore handle descriptor
+ */
+typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
+    /**
+     * Type of the handle
+     */
+    CUexternalSemaphoreHandleType type;
+    union {
+        /**
+         * File descriptor referencing the semaphore object. Valid
+         * when type is one of the following:
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD
+         */
+        int fd;
+        /**
+         * Win32 handle referencing the semaphore object. Valid when
+         * type is one of the following:
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+         * Exactly one of 'handle' and 'name' must be non-NULL. If
+         * type is one of the following:
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+         * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+         * then 'name' must be NULL.
+         */
+        struct {
+            /**
+             * Valid NT handle. Must be NULL if 'name' is non-NULL
+             */
+            void *handle;
+            /**
+             * Name of a valid synchronization primitive.
+             * Must be NULL if 'handle' is non-NULL.
+             */
+            const void *name;
+        } win32;
+        /**
+         * Valid NvSciSyncObj. Must be non-NULL
+         */
+        const void* nvSciSyncObj;
+    } handle;
+    /**
+     * Flags reserved for the future. Must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1;
+typedef CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC;
+
+/**
+ * External semaphore signal parameters
+ */
+typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st {
+    struct {
+        /**
+         * Parameters for fence objects
+         */
+        struct {
+            /**
+             * Value of fence to be signaled
+             */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType
+             * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to release the mutex with
+             */
+            unsigned long long key;
+        } keyedMutex;
+        unsigned int reserved[12];
+    } params;
+    /**
+     * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to
+     * signal a ::CUexternalSemaphore of type
+     * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
+     * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates
+     * that while signaling the ::CUexternalSemaphore, no memory synchronization
+     * operations should be performed for any external memory object imported
+     * as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
+     * For all other types of ::CUexternalSemaphore, flags must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1;
+typedef CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS;
+
+/**
+ * External semaphore wait parameters
+ */
+typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st {
+    struct {
+        /**
+         * Parameters for fence objects
+         */
+        struct {
+            /**
+             * Value of fence to be waited on
+             */
+            unsigned long long value;
+        } fence;
+        /**
+         * Pointer to NvSciSyncFence. Valid if CUexternalSemaphoreHandleType
+         * is of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC.
+         */
+        union {
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to acquire the mutex with
+             */
+            unsigned long long key;
+            /**
+             * Timeout in milliseconds to wait to acquire the mutex
+             */
+            unsigned int timeoutMs;
+        } keyedMutex;
+        unsigned int reserved[10];
+    } params;
+    /**
+     * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on
+     * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+     * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC
+     * which indicates that while waiting for the ::CUexternalSemaphore, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
+     * For all other types of ::CUexternalSemaphore, flags must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1;
+typedef CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS;
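+
+/*
+ * Illustrative usage sketch: waiting on and then signaling an imported
+ * fence-style external semaphore on a stream via
+ * ::cuWaitExternalSemaphoresAsync and ::cuSignalExternalSemaphoresAsync.
+ * extSem, stream and fenceValue are assumed to exist; error checking is
+ * omitted.
+ *
+ *   CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS wp;
+ *   memset(&wp, 0, sizeof(wp));
+ *   wp.params.fence.value = fenceValue;
+ *   cuWaitExternalSemaphoresAsync(&extSem, &wp, 1, stream);
+ *
+ *   CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sp;
+ *   memset(&sp, 0, sizeof(sp));
+ *   sp.params.fence.value = fenceValue + 1;
+ *   cuSignalExternalSemaphoresAsync(&extSem, &sp, 1, stream);
+ */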
+
+/**
+ * Semaphore signal node parameters
+ */
+typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st {
+    CUexternalSemaphore* extSemArray;                         /**< Array of external semaphore handles. */
+    const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */
+    unsigned int numExtSems;                                  /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+} CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1;
+typedef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 CUDA_EXT_SEM_SIGNAL_NODE_PARAMS;
+
+/**
+ * Semaphore wait node parameters
+ */
+typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st {
+    CUexternalSemaphore* extSemArray;                       /**< Array of external semaphore handles. */
+    const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */
+    unsigned int numExtSems;                                /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+} CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1;
+typedef CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 CUDA_EXT_SEM_WAIT_NODE_PARAMS;
+
+typedef unsigned long long CUmemGenericAllocationHandle_v1;
+typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle;
+
+/**
+ * Flags for specifying particular handle types
+ */
+typedef enum CUmemAllocationHandleType_enum {
+    CU_MEM_HANDLE_TYPE_NONE                  = 0x0,  /**< Does not allow any export mechanism. */
+    CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1,  /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */
+    CU_MEM_HANDLE_TYPE_WIN32                 = 0x2,  /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */
+    CU_MEM_HANDLE_TYPE_WIN32_KMT             = 0x4,  /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */
+    CU_MEM_HANDLE_TYPE_MAX                   = 0x7FFFFFFF
+} CUmemAllocationHandleType;
+
+/**
+ * Specifies the memory protection flags for mapping.
+ */
+typedef enum CUmemAccess_flags_enum {
+    CU_MEM_ACCESS_FLAGS_PROT_NONE        = 0x0,  /**< Default, make the address range not accessible */
+    CU_MEM_ACCESS_FLAGS_PROT_READ        = 0x1,  /**< Make the address range read accessible */
+    CU_MEM_ACCESS_FLAGS_PROT_READWRITE   = 0x3,  /**< Make the address range read-write accessible */
+    CU_MEM_ACCESS_FLAGS_PROT_MAX         = 0x7FFFFFFF
+} CUmemAccess_flags;
+
+/**
+ * Specifies the type of location
+ */
+typedef enum CUmemLocationType_enum {
+    CU_MEM_LOCATION_TYPE_INVALID = 0x0,
+    CU_MEM_LOCATION_TYPE_DEVICE  = 0x1,  /**< Location is a device location, thus id is a device ordinal */
+    CU_MEM_LOCATION_TYPE_MAX     = 0x7FFFFFFF
+} CUmemLocationType;
+
+/**
+* Defines the allocation types available
+*/
+typedef enum CUmemAllocationType_enum {
+    CU_MEM_ALLOCATION_TYPE_INVALID = 0x0,
+
+    /** This allocation type is 'pinned', i.e. cannot migrate from its current
+      * location while the application is actively using it
+      */
+    CU_MEM_ALLOCATION_TYPE_PINNED  = 0x1,
+    CU_MEM_ALLOCATION_TYPE_MAX     = 0x7FFFFFFF
+} CUmemAllocationType;
+
+/**
+* Flag for requesting different optimal and required granularities for an allocation.
+*/
+typedef enum CUmemAllocationGranularity_flags_enum {
+    CU_MEM_ALLOC_GRANULARITY_MINIMUM     = 0x0,     /**< Minimum required granularity for allocation */
+    CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1      /**< Recommended granularity for allocation for best performance */
+} CUmemAllocationGranularity_flags;
+
+/**
+* Specifies the handle type for address range
+*/
+typedef enum CUmemRangeHandleType_enum
+{
+    CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD = 0x1,
+    CU_MEM_RANGE_HANDLE_TYPE_MAX        = 0x7FFFFFFF
+} CUmemRangeHandleType;
+
+/**
+ * Sparse subresource types
+ */
+typedef enum CUarraySparseSubresourceType_enum {
+    CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0,
+    CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1
+} CUarraySparseSubresourceType;
+
+/**
+ * Memory operation types
+ */
+typedef enum CUmemOperationType_enum {
+    CU_MEM_OPERATION_TYPE_MAP = 1,
+    CU_MEM_OPERATION_TYPE_UNMAP = 2
+} CUmemOperationType;
+
+/**
+ * Memory handle types
+ */
+typedef enum CUmemHandleType_enum {
+    CU_MEM_HANDLE_TYPE_GENERIC = 0
+} CUmemHandleType;
+
+/**
+ * Specifies the CUDA array or CUDA mipmapped array memory mapping information
+ */
+typedef struct CUarrayMapInfo_st {    
+    CUresourcetype resourceType;                    /**< Resource type */
+
+    union {
+        CUmipmappedArray mipmap;
+        CUarray array;
+    } resource;
+
+    CUarraySparseSubresourceType subresourceType;   /**< Sparse subresource type */
+
+    union {
+        struct {
+            unsigned int level;                     /**< For CUDA mipmapped arrays must be a valid mipmap level. For CUDA arrays must be zero */
+            unsigned int layer;                     /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */
+            unsigned int offsetX;                   /**< Starting X offset in elements */
+            unsigned int offsetY;                   /**< Starting Y offset in elements */
+            unsigned int offsetZ;                   /**< Starting Z offset in elements */            
+            unsigned int extentWidth;               /**< Width in elements */
+            unsigned int extentHeight;              /**< Height in elements */
+            unsigned int extentDepth;               /**< Depth in elements */
+        } sparseLevel;
+        struct {
+            unsigned int layer;                     /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */
+            unsigned long long offset;              /**< Offset within mip tail */
+            unsigned long long size;                /**< Extent in bytes */
+        } miptail;
+    } subresource;
+    
+    CUmemOperationType memOperationType;            /**< Memory operation type */
+    CUmemHandleType memHandleType;                  /**< Memory handle type */
+
+    union {
+        CUmemGenericAllocationHandle memHandle;
+    } memHandle;
+    
+    unsigned long long offset;                      /**< Offset within the memory */
+    unsigned int deviceBitMask;                     /**< Device ordinal bit mask */
+    unsigned int flags;                             /**< flags for future use, must be zero now. */
+    unsigned int reserved[2];                       /**< Reserved for future use, must be zero now. */
+} CUarrayMapInfo_v1;
+typedef CUarrayMapInfo_v1 CUarrayMapInfo;
+
+/**
+ * Specifies a memory location.
+ */
+typedef struct CUmemLocation_st {
+    CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */
+    int id;                 /**< Identifier for the location; its meaning depends on this location's ::CUmemLocationType. */
+} CUmemLocation_v1;
+typedef CUmemLocation_v1 CUmemLocation;
+
+/**
+ * Specifies compression attribute for an allocation.
+ */
+typedef enum CUmemAllocationCompType_enum {
+    CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */
+    CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating compressible memory */
+} CUmemAllocationCompType;
+
+/**
+ * This flag if set indicates that the memory will be used as a tile pool.
+ */
+#define CU_MEM_CREATE_USAGE_TILE_POOL    0x1
+
+/**
+* Specifies the allocation properties for an allocation.
+*/
+typedef struct CUmemAllocationProp_st {
+    /** Allocation type */
+    CUmemAllocationType type;
+    /** requested ::CUmemAllocationHandleType */
+    CUmemAllocationHandleType requestedHandleTypes;
+    /** Location of allocation */
+    CUmemLocation location;
+    /**
+     * Windows-specific POBJECT_ATTRIBUTES required when
+     * ::CU_MEM_HANDLE_TYPE_WIN32 is specified.  This object attributes structure
+     * includes security attributes that define
+     * the scope of which exported allocations may be transferred to other
+     * processes.  In all other cases, this field is required to be zero.
+     */
+    void *win32HandleMetaData;
+    struct {
+         /**
+         * Allocation hint for requesting compressible memory.
+         * On devices that support Compute Data Compression, compressible
+         * memory can be used to accelerate accesses to data with unstructured
+         * sparsity and other compressible data patterns. Applications are 
+         * expected to query allocation property of the handle obtained with 
+         * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to 
+         * validate if the obtained allocation is compressible or not. Note that 
+         * compressed memory may not be mappable on all devices.
+         */
+         unsigned char compressionType;
+         unsigned char gpuDirectRDMACapable;
+         /** Bitmask indicating intended usage for this allocation */
+         unsigned short usage;
+         unsigned char reserved[4];
+    } allocFlags;
+} CUmemAllocationProp_v1;
+typedef CUmemAllocationProp_v1 CUmemAllocationProp;
+
+/**
+ * Memory access descriptor
+ */
+typedef struct CUmemAccessDesc_st {
+    CUmemLocation location;        /**< Location on which the request is to change its accessibility */
+    CUmemAccess_flags flags;       /**< ::CUmemAccess_flags accessibility flags to set on the request */
+} CUmemAccessDesc_v1;
+typedef CUmemAccessDesc_v1 CUmemAccessDesc;
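+
+/*
+ * Illustrative sketch of the virtual memory management flow that ties the
+ * types above together: query granularity, create a physical allocation,
+ * reserve a VA range, map it, and grant read-write access on device 0.
+ * requested is an assumed byte count; error checking is omitted.
+ *
+ *   CUmemAllocationProp prop;
+ *   memset(&prop, 0, sizeof(prop));
+ *   prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
+ *   prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ *   prop.location.id   = 0;
+ *
+ *   size_t gran = 0;
+ *   cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+ *   size_t size = ((requested + gran - 1) / gran) * gran;
+ *
+ *   CUmemGenericAllocationHandle handle;
+ *   cuMemCreate(&handle, size, &prop, 0);
+ *
+ *   CUdeviceptr va;
+ *   cuMemAddressReserve(&va, size, 0, 0, 0);
+ *   cuMemMap(va, size, 0, handle, 0);
+ *
+ *   CUmemAccessDesc access;
+ *   access.location = prop.location;
+ *   access.flags    = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+ *   cuMemSetAccess(va, size, &access, 1);
+ */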
+
+typedef enum CUgraphExecUpdateResult_enum {
+    CU_GRAPH_EXEC_UPDATE_SUCCESS                     = 0x0, /**< The update succeeded */
+    CU_GRAPH_EXEC_UPDATE_ERROR                       = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */
+    CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED      = 0x2, /**< The update failed because the topology changed */
+    CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED     = 0x3, /**< The update failed because a node type changed */
+    CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED      = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */
+    CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED    = 0x5, /**< The update failed because the parameters changed in a way that is not supported */
+    CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED         = 0x6, /**< The update failed because something about the node is not supported */
+    CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 0x7, /**< The update failed because the function of a kernel node changed in an unsupported way */
+    CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED    = 0x8  /**< The update failed because the node attributes changed in a way that is not supported */
+} CUgraphExecUpdateResult;
+
+/**
+ * CUDA memory pool attributes
+ */
+typedef enum CUmemPool_attribute_enum {
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to use memory asynchronously freed
+     * in another stream as long as a stream ordering dependency
+     * of the allocating stream on the free action exists.
+     * CUDA events and null stream interactions can create the required
+     * stream ordered dependencies. (default enabled)
+     */
+    CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1,
+
+    /**
+     * (value type = int)
+     * Allow reuse of already completed frees when there is no dependency
+     * between the free and allocation. (default enabled)
+     */
+    CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
+
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to insert new stream dependencies
+     * in order to establish the stream ordering required to reuse
+     * a piece of memory released by cuMemFreeAsync (default enabled).
+     */
+    CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of reserved memory in bytes to hold onto before trying
+     * to release memory back to the OS. When more than the release
+     * threshold bytes of memory are held by the memory pool, the
+     * allocator will try to release memory back to the OS on the
+     * next call to stream, event or context synchronize. (default 0)
+     */
+    CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of backing memory currently allocated for the mempool.
+     */
+    CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of backing memory allocated for the mempool since the
+     * last time it was reset. High watermark can only be reset to zero.
+     */
+    CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory from the pool that is currently in use by the application.
+     */
+    CU_MEMPOOL_ATTR_USED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of the amount of memory from the pool that was in use by the application since
+     * the last time it was reset. High watermark can only be reset to zero.
+     */
+    CU_MEMPOOL_ATTR_USED_MEM_HIGH
+} CUmemPool_attribute;
+
+/**
+ * Specifies the properties of allocations made from the pool.
+ */
+typedef struct CUmemPoolProps_st {
+    CUmemAllocationType allocType;         /**< Allocation type. Currently must be specified as CU_MEM_ALLOCATION_TYPE_PINNED */
+    CUmemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */
+    CUmemLocation location;                /**< Location where allocations should reside. */
+    /**
+     * Windows-specific LPSECURITYATTRIBUTES required when
+     * ::CU_MEM_HANDLE_TYPE_WIN32 is specified.  This security attribute defines
+     * the scope of which exported allocations may be transferred to other
+     * processes.  In all other cases, this field is required to be zero.
+     */
+    void *win32SecurityAttributes;
+    unsigned char reserved[64]; /**< reserved for future use, must be 0 */
+} CUmemPoolProps_v1;
+typedef CUmemPoolProps_v1 CUmemPoolProps;
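+
+/*
+ * Illustrative usage sketch: creating an explicit memory pool on device 0
+ * with the properties above, raising its release threshold, and
+ * allocating/freeing from it asynchronously. bytes and stream are assumed to
+ * exist; error checking is omitted.
+ *
+ *   CUmemPoolProps props;
+ *   memset(&props, 0, sizeof(props));
+ *   props.allocType     = CU_MEM_ALLOCATION_TYPE_PINNED;
+ *   props.handleTypes   = CU_MEM_HANDLE_TYPE_NONE;
+ *   props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ *   props.location.id   = 0;
+ *
+ *   CUmemoryPool pool;
+ *   cuMemPoolCreate(&pool, &props);
+ *
+ *   cuuint64_t threshold = 64ull << 20;
+ *   cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);
+ *
+ *   CUdeviceptr p;
+ *   cuMemAllocFromPoolAsync(&p, bytes, pool, stream);
+ *   cuMemFreeAsync(p, stream);
+ */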
+
+/**
+ * Opaque data for exporting a pool allocation
+ */
+typedef struct CUmemPoolPtrExportData_st {
+    unsigned char reserved[64];
+} CUmemPoolPtrExportData_v1;
+typedef CUmemPoolPtrExportData_v1 CUmemPoolPtrExportData;
+
+/**
+ * Memory allocation node parameters
+ */
+typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_st {
+    /**
+    * in: location where the allocation should reside (specified in ::location).
+    * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported.
+    */
+    CUmemPoolProps poolProps;
+    const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */
+    size_t accessDescCount; /**< in: number of memory access descriptors.  Must not exceed the number of GPUs. */
+    size_t bytesize; /**< in: size in bytes of the requested allocation */
+    CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */
+} CUDA_MEM_ALLOC_NODE_PARAMS;
+
+typedef enum CUgraphMem_attribute_enum {
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently associated with graphs
+     */
+    CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, associated with graphs since the
+     * last time it was reset.  High watermark can only be reset to zero.
+     */
+    CU_GRAPH_MEM_ATTR_USED_MEM_HIGH,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH
+} CUgraphMem_attribute;
+
+/**
+ * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only
+ * waits for prior work in the stream corresponding to that GPU to complete before the
+ * kernel begins execution.
+ */
+#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC   0x01
+
+/**
+ * If set, any subsequent work pushed in a stream that participated in a call to
+ * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on
+ * the GPU corresponding to that stream to complete before it begins execution.
+ */
+#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC  0x02
+
+/**
+ * If set, the CUDA array is a collection of layers, where each layer is either a 1D
+ * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number
+ * of layers, not the depth of a 3D array.
+ */
+#define CUDA_ARRAY3D_LAYERED        0x01
+
+/**
+ * Deprecated, use CUDA_ARRAY3D_LAYERED
+ */
+#define CUDA_ARRAY3D_2DARRAY        0x01
+
+/**
+ * This flag must be set in order to bind a surface reference
+ * to the CUDA array
+ */
+#define CUDA_ARRAY3D_SURFACE_LDST   0x02
+
+/**
+ * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The
+ * width of such a CUDA array must be equal to its height, and Depth must be six.
+ * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps
+ * and Depth must be a multiple of six.
+ */
+#define CUDA_ARRAY3D_CUBEMAP        0x04
+
+/**
+ * This flag must be set in order to perform texture gather operations
+ * on a CUDA array.
+ */
+#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08
+
+/**
+ * This flag if set indicates that the CUDA
+ * array is a DEPTH_TEXTURE.
+ */
+#define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10
+
+/**
+ * This flag indicates that the CUDA array may be bound as a color target
+ * in an external graphics API
+ */
+#define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20
+
+/**
+ * This flag if set indicates that the CUDA array or CUDA mipmapped array
+ * is a sparse CUDA array or CUDA mipmapped array respectively
+ */
+#define CUDA_ARRAY3D_SPARSE 0x40
+
+/**
+ * This flag if set indicates that the CUDA array or CUDA mipmapped array
+ * will allow deferred memory mapping
+ */
+#define CUDA_ARRAY3D_DEFERRED_MAPPING 0x80
+
+/**
+ * Override the texref format with a format inferred from the array.
+ * Flag for ::cuTexRefSetArray()
+ */
+#define CU_TRSA_OVERRIDE_FORMAT 0x01
+
+/**
+ * Read the texture as integers rather than promoting the values to floats
+ * in the range [0,1].
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
+ */
+#define CU_TRSF_READ_AS_INTEGER         0x01
+
+/**
+ * Use normalized texture coordinates in the range [0,1) instead of [0,dim).
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
+ */
+#define CU_TRSF_NORMALIZED_COORDINATES  0x02
+
+/**
+ * Perform sRGB->linear conversion during texture read.
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
+ */
+#define CU_TRSF_SRGB  0x10
+
+ /**
+  * Disable any trilinear filtering optimizations.
+  * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate()
+  */
+#define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION  0x20
+
+/**
+ * Enable seamless cube map filtering.
+ * Flag for ::cuTexObjectCreate()
+ */
+#define CU_TRSF_SEAMLESS_CUBEMAP  0x40
+
+/**
+ * C++ compile time constant for CU_LAUNCH_PARAM_END
+ */
+#define CU_LAUNCH_PARAM_END_AS_INT     0x00
+
+/**
+ * End of array terminator for the \p extra parameter to
+ * ::cuLaunchKernel
+ */
+#define CU_LAUNCH_PARAM_END            ((void*)CU_LAUNCH_PARAM_END_AS_INT)
+
+/**
+ * C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_POINTER
+ */
+#define CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT 0x01
+
+/**
+ * Indicator that the next value in the \p extra parameter to
+ * ::cuLaunchKernel will be a pointer to a buffer containing all kernel
+ * parameters used for launching kernel \p f.  This buffer needs to
+ * honor all alignment/padding requirements of the individual parameters.
+ * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the
+ * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no
+ * effect.
+ */
+#define CU_LAUNCH_PARAM_BUFFER_POINTER        ((void*)CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT)
+
+/**
+ * C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_SIZE
+ */
+#define CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT 0x02
+
+/**
+ * Indicator that the next value in the \p extra parameter to
+ * ::cuLaunchKernel will be a pointer to a size_t which contains the
+ * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER.
+ * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified
+ * in the \p extra array if the value associated with
+ * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero.
+ */
+#define CU_LAUNCH_PARAM_BUFFER_SIZE        ((void*)CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT)
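+
+/*
+ * Illustrative usage sketch: passing kernel arguments to ::cuLaunchKernel
+ * through the extra array instead of kernelParams. The argument struct layout
+ * must match the kernel's expected parameter layout, including alignment and
+ * padding; fn and dptr are assumed to exist and error checking is omitted.
+ *
+ *   struct { int n; CUdeviceptr data; } args = { 1024, dptr };
+ *   size_t argsSize = sizeof(args);
+ *   void *extra[] = {
+ *       CU_LAUNCH_PARAM_BUFFER_POINTER, &args,
+ *       CU_LAUNCH_PARAM_BUFFER_SIZE,    &argsSize,
+ *       CU_LAUNCH_PARAM_END
+ *   };
+ *   cuLaunchKernel(fn, 1, 1, 1, 256, 1, 1, 0, NULL, NULL, extra);
+ */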
+
+/**
+ * For texture references loaded into the module, use default texunit from
+ * texture reference.
+ */
+#define CU_PARAM_TR_DEFAULT -1
+
+/**
+ * Device that represents the CPU
+ */
+#define CU_DEVICE_CPU               ((CUdevice)-1)
+
+/**
+ * Device that represents an invalid device
+ */
+#define CU_DEVICE_INVALID           ((CUdevice)-2)
+
+/**
+ * Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS
+ */
+typedef enum CUflushGPUDirectRDMAWritesOptions_enum {
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST   = 1<<0, /**< ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. */
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_MEMOPS = 1<<1  /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. */
+} CUflushGPUDirectRDMAWritesOptions;
+
+/**
+ * Platform native ordering for GPUDirect RDMA writes
+ */
+typedef enum CUGPUDirectRDMAWritesOrdering_enum {
+    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE        = 0,   /**< The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. */
+    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER       = 100, /**< Natively, the device can consistently consume remote writes, although other CUDA devices may not. */
+    CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES = 200  /**< Any CUDA device in the system can consistently consume remote writes to this device. */
+} CUGPUDirectRDMAWritesOrdering;
+
+/**
+ * The scopes for ::cuFlushGPUDirectRDMAWrites
+ */
+typedef enum CUflushGPUDirectRDMAWritesScope_enum {
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER       = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES = 200  /**< Blocks until remote writes are visible to all CUDA device contexts. */
+} CUflushGPUDirectRDMAWritesScope;
+ 
+/**
+ * The targets for ::cuFlushGPUDirectRDMAWrites
+ */
+typedef enum CUflushGPUDirectRDMAWritesTarget_enum {
+    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX = 0 /**< Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. */
+} CUflushGPUDirectRDMAWritesTarget;
+
+/**
+ * The additional write options for ::cuGraphDebugDotPrint
+ */
+typedef enum CUgraphDebugDot_flags_enum {
+    CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE                        = 1<<0,  /** Output all debug data as if every debug flag is enabled */
+    CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES                  = 1<<1,  /** Use CUDA Runtime structures for output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS             = 1<<2,  /** Adds CUDA_KERNEL_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS             = 1<<3,  /** Adds CUDA_MEMCPY3D values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS             = 1<<4,  /** Adds CUDA_MEMSET_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS               = 1<<5,  /** Adds CUDA_HOST_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS              = 1<<6,  /** Adds CUevent handle from record and wait nodes to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS   = 1<<7,  /** Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS     = 1<<8,  /** Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES         = 1<<9,  /** Adds CUkernelNodeAttrValue values to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES                        = 1<<10, /** Adds node handles and every kernel function handle to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS          = 1<<11, /** Adds memory alloc node parameters to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS           = 1<<12, /** Adds memory free node parameters to output */
+    CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS       = 1<<13  /** Adds batch mem op node parameters to output */
+} CUgraphDebugDot_flags;
+
+/**
+ * Flags for user objects for graphs
+ */
+typedef enum CUuserObject_flags_enum {
+    CU_USER_OBJECT_NO_DESTRUCTOR_SYNC = 1  /**< Indicates the destructor execution is not synchronized by any CUDA handle. */
+} CUuserObject_flags;
+
+/**
+ * Flags for retaining user object references for graphs
+ */
+typedef enum CUuserObjectRetain_flags_enum {
+    CU_GRAPH_USER_OBJECT_MOVE = 1  /**< Transfer references from the caller rather than creating new references. */
+} CUuserObjectRetain_flags;
+
+/**
+ * Flags for instantiating a graph
+ */
+typedef enum CUgraphInstantiate_flags_enum {
+    CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH = 1, /**< Automatically free memory allocated in a graph before relaunching. */
+    CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY   = 8  /**< Run the graph using the per-node priority attributes rather than the
+                                                              priority of the stream it is launched into. */
+} CUgraphInstantiate_flags;
+
+/** @} */ /* END CUDA_TYPES */
+
+#if defined(__GNUC__)
+  #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT)
+    #pragma GCC visibility push(default)
+  #endif
+#endif
+
+#ifdef _WIN32
+#define CUDAAPI __stdcall
+#else
+#define CUDAAPI
+#endif
+
+/**
+ * \defgroup CUDA_ERROR Error Handling
+ *
+ * ___MANBRIEF___ error handling functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the error handling functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets the string description of an error code
+ *
+ * Sets \p *pStr to the address of a NULL-terminated string description
+ * of the error code \p error.
+ * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE
+ * will be returned and \p *pStr will be set to the NULL address.
+ *
+ * \param error - Error code to convert to string
+ * \param pStr - Address of the string pointer.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::CUresult,
+ * ::cudaGetErrorString
+ */
+CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr);
+
+/**
+ * \brief Gets the string representation of an error code enum name
+ *
+ * Sets \p *pStr to the address of a NULL-terminated string representation
+ * of the name of the enum error code \p error.
+ * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE
+ * will be returned and \p *pStr will be set to the NULL address.
+ *
+ * \param error - Error code to convert to string
+ * \param pStr - Address of the string pointer.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::CUresult,
+ * ::cudaGetErrorName
+ */
+CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr);
+
+/** @} */ /* END CUDA_ERROR */
+
+/**
+ * \defgroup CUDA_INITIALIZE Initialization
+ *
+ * ___MANBRIEF___ initialization functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the initialization functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Initialize the CUDA driver API
+ *
+ * Initializes the driver API and must be called before any other function from
+ * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit()
+ * has not been called, any function from the driver API will return
+ * ::CUDA_ERROR_NOT_INITIALIZED.
+ *
+ * \param Flags - Initialization flag for CUDA.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH,
+ * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE
+ * \notefnerr
+ */
+CUresult CUDAAPI cuInit(unsigned int Flags);
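+
+/*
+ * Illustrative usage sketch: initializing the driver API and reporting a
+ * failure with ::cuGetErrorName and ::cuGetErrorString (declared above);
+ * assumes <stdio.h> is available.
+ *
+ *   CUresult err = cuInit(0);
+ *   if (err != CUDA_SUCCESS) {
+ *       const char *name = NULL, *desc = NULL;
+ *       cuGetErrorName(err, &name);
+ *       cuGetErrorString(err, &desc);
+ *       fprintf(stderr, "cuInit failed: %s (%s)\n",
+ *               name ? name : "?", desc ? desc : "?");
+ *   }
+ */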
+
+/** @} */ /* END CUDA_INITIALIZE */
+
+/**
+ * \defgroup CUDA_VERSION Version Management
+ *
+ * ___MANBRIEF___ version management functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the version management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the latest CUDA version supported by the driver
+ *
+ * Returns in \p *driverVersion the version of CUDA supported by
+ * the driver.  The version is returned as
+ * (1000 &times; major + 10 &times; minor). For example, CUDA 9.2
+ * would be represented by 9020.
+ *
+ * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if
+ * \p driverVersion is NULL.
+ *
+ * \param driverVersion - Returns the CUDA driver version
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaDriverGetVersion,
+ * ::cudaRuntimeGetVersion
+ */
+CUresult CUDAAPI cuDriverGetVersion(int *driverVersion);
+
+/** @} */ /* END CUDA_VERSION */
+
+/**
+ * \defgroup CUDA_DEVICE Device Management
+ *
+ * ___MANBRIEF___ device management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the device management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns a handle to a compute device
+ *
+ * Returns in \p *device a device handle given an ordinal in the range <b>[0,
+ * ::cuDeviceGetCount()-1]</b>.
+ *
+ * \param device  - Returned device handle
+ * \param ordinal - Device number to get handle for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport
+ */
+CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
+
+/**
+ * \brief Returns the number of compute-capable devices
+ *
+ * Returns in \p *count the number of devices with compute capability greater
+ * than or equal to 2.0 that are available for execution. If there is no such
+ * device, ::cuDeviceGetCount() returns 0.
+ *
+ * \param count - Returned number of compute-capable devices
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaGetDeviceCount
+ */
+CUresult CUDAAPI cuDeviceGetCount(int *count);
+
+/**
+ * \brief Returns an identifier string for the device
+ *
+ * Returns an ASCII string identifying the device \p dev in the NULL-terminated
+ * string pointed to by \p name. \p len specifies the maximum length of the
+ * string that may be returned.
+ *
+ * \param name - Returned identifier string for the device
+ * \param len  - Maximum length of string to store in \p name
+ * \param dev  - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
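+
+/*
+ * Illustrative usage sketch: enumerating all devices and printing their names
+ * with ::cuDeviceGetCount, ::cuDeviceGet and ::cuDeviceGetName. Assumes the
+ * driver has been initialized with ::cuInit and <stdio.h> is available; error
+ * checking is omitted.
+ *
+ *   int count = 0;
+ *   cuDeviceGetCount(&count);
+ *   for (int i = 0; i < count; ++i) {
+ *       CUdevice dev;
+ *       char name[256];
+ *       cuDeviceGet(&dev, i);
+ *       cuDeviceGetName(name, (int)sizeof(name), dev);
+ *       printf("device %d: %s\n", i, name);
+ *   }
+ */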
+
+/**
+ * \brief Return a UUID for the device
+ *
+ * Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will
+ * supplant this version in 12.0, which is retained for minor version compatibility.
+ *
+ * Returns a 16-octet UUID identifying the device \p dev in the structure
+ * pointed to by \p uuid.
+ *
+ * \param uuid - Returned UUID
+ * \param dev  - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetUuid_v2
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev);
+
+/**
+ * \brief Return a UUID for the device (11.4+)
+ *
+ * Returns a 16-octet UUID identifying the device \p dev in the structure
+ * pointed to by \p uuid. If the device is in MIG mode, returns its
+ * MIG UUID which uniquely identifies the subscribed MIG compute instance.
+ *
+ * \param uuid - Returned UUID
+ * \param dev  - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetLuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev);
+
+/**
+ * \brief Return an LUID and device node mask for the device
+ *
+ * Return identifying information (\p luid and \p deviceNodeMask) to allow
+ * matching device with graphics APIs.
+ *
+ * \param luid - Returned LUID
+ * \param deviceNodeMask - Returned device node mask
+ * \param dev  - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev);
+
+/**
+ * \brief Returns the total amount of memory on the device
+ *
+ * Returns in \p *bytes the total amount of memory available on the device
+ * \p dev in bytes.
+ *
+ * \param bytes - Returned memory available on device in bytes
+ * \param dev   - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaMemGetInfo
+ */
+CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev);
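+
+/*
+ * Illustrative sketch: querying the total memory of device ordinal 0 with
+ * cuDeviceTotalMem() and reporting it in MiB. Assumes cuInit() has already
+ * succeeded; the variable names are arbitrary.
+ *
+ *   CUdevice dev;
+ *   size_t bytes = 0;
+ *   if (cuDeviceGet(&dev, 0) == CUDA_SUCCESS &&
+ *       cuDeviceTotalMem(&bytes, dev) == CUDA_SUCCESS) {
+ *       printf("total device memory: %zu MiB\n", bytes / (1024 * 1024));
+ *   }
+ */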
+
+/**
+ * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size.
+ *
+ * Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture
+ * for given \p format and \p numChannels.
+ *
+ * \param maxWidthInElements    - Returned maximum number of texture elements allocatable for given \p format and \p numChannels.
+ * \param format                - Texture format.
+ * \param numChannels           - Number of channels per texture element.
+ * \param dev                   - Device handle.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cudaMemGetInfo,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev);
+
+/**
+ * \brief Returns information about the device
+ *
+ * Returns in \p *pi the integer value of the attribute \p attrib on device
+ * \p dev. The supported attributes are:
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per
+ *   block;
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of
+ *   shared memory available to a thread block in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for
+ *   __constant__ variables in a CUDA C kernel in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the
+ *   memory copy functions that involve memory regions allocated through
+ *   ::cuMemAllocPitch()
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D
+ *  texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width
+ *  for a 1D texture bound to linear memory
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum
+ *  mipmapped 1D texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D
+ *  texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D
+ *  texture height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width
+ *  for a 2D texture bound to linear memory
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height
+ *  for a 2D texture bound to linear memory
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch
+ *  in bytes for a 2D texture bound to linear memory
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum
+ *  mipmapped 2D texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum
+ *  mipmapped 2D texture height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D
+ *  texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D
+ *  texture height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D
+ *  texture depth
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE:
+ *  Alternate maximum 3D texture width, 0 if no alternate
+ *  maximum 3D texture size is supported
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE:
+ *  Alternate maximum 3D texture height, 0 if no alternate
+ *  maximum 3D texture size is supported
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE:
+ *  Alternate maximum 3D texture depth, 0 if no alternate
+ *  maximum 3D texture size is supported
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH:
+ *  Maximum cubemap texture width or height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH:
+ *  Maximum 1D layered texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:
+ *   Maximum layers in a 1D layered texture
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH:
+ *  Maximum 2D layered texture width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:
+ *   Maximum 2D layered texture height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:
+ *   Maximum layers in a 2D layered texture
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH:
+ *   Maximum cubemap layered texture width or height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS:
+ *   Maximum layers in a cubemap layered texture
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH:
+ *   Maximum 1D surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH:
+ *   Maximum 2D surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT:
+ *   Maximum 2D surface height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH:
+ *   Maximum 3D surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT:
+ *   Maximum 3D surface height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH:
+ *   Maximum 3D surface depth
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH:
+ *   Maximum 1D layered surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS:
+ *   Maximum layers in a 1D layered surface
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH:
+ *   Maximum 2D layered surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT:
+ *   Maximum 2D layered surface height
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS:
+ *   Maximum layers in a 2D layered surface
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH:
+ *   Maximum cubemap surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH:
+ *   Maximum cubemap layered surface width
+ * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS:
+ *   Maximum layers in a cubemap layered surface
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit
+ *   registers available to a thread block
+ * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz
+ * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture
+ *   base addresses aligned to ::textureAlign bytes do not need an offset
+ *   applied to texture fetches
+ * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement
+ *   for 2D texture references bound to pitched memory
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy
+ *   memory between host and device while executing a kernel, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on
+ *   the device
+ * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit
+ *   for kernels executed on the device, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the
+ *   memory subsystem, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host
+ *   memory into the CUDA address space, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently
+ *   in. Available modes are as follows:
+ *   - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and
+ *     can have multiple CUDA contexts present at a single time.
+ *   - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is
+ *     prohibited from creating new CUDA contexts.
+ *   - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS:  Compute-exclusive-process mode - Device
+ *     can have only one context used by a single process at a time.
+ * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports
+ *   executing multiple kernels within the same context simultaneously, or 0 if
+ *   not. It is not guaranteed that multiple kernels will be resident
+ *   on the device concurrently so this feature should not be relied upon for
+ *   correctness.
+ * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the
+ *    device, 0 if error correction is disabled or not supported by the device
+ * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device
+ * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier
+ *   of the device
+ * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device
+ * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC
+ *    is only available on Tesla hardware running Windows Vista or later
+ * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz
+ * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits
+ * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor
+ * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with
+ *   the host, or 0 if not
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number
+ * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals
+ *    in L1 cache, 0 if caching globals in L1 cache is not supported by the device
+ * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals
+ *    in L1 cache, 0 if caching locals in L1 cache is not supported by the device
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of
+ *   shared memory available to a multiprocessor in bytes; this amount is shared
+ *   by all thread blocks simultaneously resident on a multiprocessor
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit
+ *   registers available to a multiprocessor; this number is shared by all thread
+ *   blocks simultaneously resident on a multiprocessor
+ * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory
+ *   on this system, 0 if allocating managed memory is not supported by the device on this system.
+ * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not.
+ * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices
+ *   associated with the same board. Devices on the same multi-GPU board will share the same identifier.
+ * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if the link between the device and the host
+ *   supports native atomic operations.
+ * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance
+ *   (in floating-point operations per second) to double precision performance.
+ * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing
+ *   pageable memory without calling cudaHostRegister on it.
+ * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory
+ *   concurrently with the CPU.
+ * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption.
+ * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered
+ *   memory at the same virtual address as the CPU.
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per-block shared memory size
+ *   supported on this device. This is the maximum value that can be opted into when using the ::cuFuncSetAttribute() call.
+ *   For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
+ * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's
+ *   page tables.
+ * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration.
+ * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED:  Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs
+ * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED:  Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor
+ * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes 
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate.
+ * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes
+ * - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. 
+ * - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU
+ * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information)
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum
+ * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
+ * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC
+ * - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays.
+ *
+ * \param pi     - Returned device attribute value
+ * \param attrib - Device attribute to query
+ * \param dev    - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem,
+ * ::cuDeviceGetExecAffinitySupport,
+ * ::cudaDeviceGetAttribute,
+ * ::cudaGetDeviceProperties
+ */
+CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
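+
+/*
+ * Illustrative sketch: reading a few attributes with cuDeviceGetAttribute().
+ * Assumes cuInit() has succeeded and dev is a valid CUdevice handle; the
+ * variable names are arbitrary.
+ *
+ *   int major = 0, minor = 0, smCount = 0;
+ *   cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
+ *   cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
+ *   cuDeviceGetAttribute(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+ *   printf("compute capability %d.%d, %d multiprocessors\n", major, minor, smCount);
+ */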
+
+/**
+ * \brief Return NvSciSync attributes that this device can support.
+ *
+ * Returns in \p nvSciSyncAttrList, the properties of NvSciSync that
+ * this CUDA device, \p dev can support. The returned \p nvSciSyncAttrList
+ * can be used to create an NvSciSync object that matches this device's capabilities.
+ * 
+ * If the NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is
+ * already set, this API will return ::CUDA_ERROR_INVALID_VALUE.
+ * 
+ * The application should set \p nvSciSyncAttrList to a valid
+ * NvSciSyncAttrList, failing which this API will return
+ * ::CUDA_ERROR_INVALID_HANDLE.
+ * 
+ * The \p flags parameter controls how the application intends to use
+ * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are:
+ * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the application intends to
+ * signal an NvSciSync on this CUDA device.
+ * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the application intends to
+ * wait on an NvSciSync on this CUDA device.
+ *
+ * At least one of these flags must be set, failing which the API
+ * returns ::CUDA_ERROR_INVALID_VALUE. The two flags are orthogonal
+ * to one another: a developer may set both flags, which allows setting
+ * both wait- and signal-specific attributes in the same \p nvSciSyncAttrList.
+ *
+ * \param nvSciSyncAttrList     - Return NvSciSync attributes supported.
+ * \param dev                   - Valid CUDA device to get NvSciSync attributes for.
+ * \param flags                 - Flags describing NvSciSync usage.
+ *
+ * \return
+ *
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa
+ * ::cuImportExternalSemaphore,
+ * ::cuDestroyExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags);
+
+/**
+ * \brief Sets the current memory pool of a device
+ *
+ * The memory pool must be local to the specified device.
+ * ::cuMemAllocAsync allocates from the current mempool of the provided stream's device.
+ * By default, a device's current memory pool is its default memory pool.
+ *
+ * \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device other
+ * than the one the stream runs on.
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync
+ */
+CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool);
+
+/**
+ * \brief Gets the current mempool for a device
+ *
+ * Returns the last pool provided to ::cuDeviceSetMemPool for this device
+ * or the device's default memory pool if ::cuDeviceSetMemPool has never been called.
+ * By default the current mempool is the default mempool for a device.
+ * Otherwise the returned pool must have been set with ::cuDeviceSetMemPool.
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool
+ */
+CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev);
+
+/**
+ * \brief Returns the default mempool of a device
+ *
+ * The default mempool of a device contains device memory from that device.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev);
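+
+/*
+ * Illustrative sketch tying the three mempool entry points above together:
+ * fetch a device's default pool, install it as the current pool, and read
+ * the current pool back. Assumes dev is a valid CUdevice handle and that
+ * CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED reports nonzero for it.
+ *
+ *   CUmemoryPool defaultPool, currentPool;
+ *   if (cuDeviceGetDefaultMemPool(&defaultPool, dev) == CUDA_SUCCESS) {
+ *       (void)cuDeviceSetMemPool(dev, defaultPool);   // no-op if never changed
+ *       (void)cuDeviceGetMemPool(&currentPool, dev);  // now equals defaultPool
+ *   }
+ */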
+
+/**
+ * \brief Blocks until remote writes are visible to the specified scope
+ *
+ * Blocks until GPUDirect RDMA writes to the target context via mappings
+ * created through APIs like nvidia_p2p_get_pages (see
+ * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are
+ * visible to the specified scope.
+ *
+ * If the scope equals or lies within the scope indicated by
+ * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call
+ * will be a no-op and can be safely omitted for performance. This can be
+ * determined by comparing the numerical values between the two enums, with
+ * smaller scopes having smaller values.
+ *
+ * Users may query support for this API via
+ * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS.
+ *
+ * \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget
+ * \param scope  - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
+
+/** @} */ /* END CUDA_DEVICE */
+
+/**
+ * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated device management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the device management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns properties for a selected device
+ *
+ * \deprecated
+ *
+ * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute().
+ *
+ * Returns in \p *prop the properties of device \p dev. The ::CUdevprop
+ * structure is defined as:
+ *
+ * \code
+     typedef struct CUdevprop_st {
+         int maxThreadsPerBlock;
+         int maxThreadsDim[3];
+         int maxGridSize[3];
+         int sharedMemPerBlock;
+         int totalConstantMemory;
+         int SIMDWidth;
+         int memPitch;
+         int regsPerBlock;
+         int clockRate;
+         int textureAlign;
+     } CUdevprop;
+ * \endcode
+ * where:
+ *
+ * - ::maxThreadsPerBlock is the maximum number of threads per block;
+ * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block;
+ * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid;
+ * - ::sharedMemPerBlock is the total amount of shared memory available per
+ *   block in bytes;
+ * - ::totalConstantMemory is the total amount of constant memory available on
+ *   the device in bytes;
+ * - ::SIMDWidth is the warp size;
+ * - ::memPitch is the maximum pitch allowed by the memory copy functions that
+ *   involve memory regions allocated through ::cuMemAllocPitch();
+ * - ::regsPerBlock is the total number of registers available per block;
+ * - ::clockRate is the clock frequency in kilohertz;
+ * - ::textureAlign is the alignment requirement; texture base addresses that
+ *   are aligned to ::textureAlign bytes do not need an offset applied to
+ *   texture fetches.
+ *
+ * \param prop - Returned properties of device
+ * \param dev  - Device to get properties for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev);
+
+/**
+ * \brief Returns the compute capability of the device
+ *
+ * \deprecated
+ *
+ * This function was deprecated as of CUDA 5.0 and its functionality superseded
+ * by ::cuDeviceGetAttribute().
+ *
+ * Returns in \p *major and \p *minor the major and minor revision numbers that
+ * define the compute capability of the device \p dev.
+ *
+ * \param major - Major revision number
+ * \param minor - Minor revision number
+ * \param dev   - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev);
+
+/** @} */ /* END CUDA_DEVICE_DEPRECATED */
+
+/**
+ * \defgroup CUDA_PRIMARY_CTX Primary Context Management
+ *
+ * ___MANBRIEF___ primary context management functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the primary context management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * The primary context is unique per device and shared with the CUDA runtime API.
+ * These functions allow integration with other libraries using CUDA.
+ *
+ * @{
+ */
+
+/**
+ * \brief Retain the primary context on the GPU
+ *
+ * Retains the primary context on the device.
+ * Once the user successfully retains the primary context, the primary context
+ * will be active and available to the user until the user releases it
+ * with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset().
+ * Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack.
+ *
+ * Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN
+ * if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function
+ * ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to
+ * determine the compute mode of the device.
+ * The <i>nvidia-smi</i> tool can be used to set the compute mode for
+ * devices. Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * Please note that the primary context always supports pinned allocations. Other
+ * flags can be specified by ::cuDevicePrimaryCtxSetFlags().
+ *
+ * \param pctx  - Returned context handle of the new context
+ * \param dev   - Device for which primary context is requested
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRelease,
+ * ::cuDevicePrimaryCtxSetFlags,
+ * ::cuCtxCreate,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev);
+
+/**
+ * \brief Release the primary context on the GPU
+ *
+ * Releases the primary context on the device.
+ * A retained context should always be released once the user is done using
+ * it. The context is automatically reset once the last reference to it is
+ * released. This behavior is different when the primary context was retained
+ * by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary
+ * context remains always active.
+ *
+ * Releasing a primary context that has not been previously retained will
+ * fail with ::CUDA_ERROR_INVALID_CONTEXT.
+ *
+ * Please note that, unlike ::cuCtxDestroy(), this method does not pop the context
+ * from the stack under any circumstances.
+ *
+ * \param dev - Device whose primary context is released
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
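+
+/*
+ * Illustrative sketch of the retain/release pairing described above: retain
+ * the primary context of dev, bind it to this thread, do some work, then
+ * release it. Assumes cuInit() has succeeded; do_work() is a hypothetical
+ * placeholder for driver API usage.
+ *
+ *   CUcontext primary;
+ *   if (cuDevicePrimaryCtxRetain(&primary, dev) == CUDA_SUCCESS) {
+ *       cuCtxSetCurrent(primary);        // retaining does not bind the context
+ *       do_work();                       // kernels, memcpys, ... on this thread
+ *       cuCtxSetCurrent(NULL);           // unbind before dropping the reference
+ *       cuDevicePrimaryCtxRelease(dev);  // pair every successful retain
+ *   }
+ */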
+
+/**
+ * \brief Set flags for the primary context
+ *
+ * Sets the flags for the primary context on the device, overwriting previously
+ * set ones.
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ *
+ * \param dev   - Device for which the primary context flags are set
+ * \param flags - New flags for the device
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuDevicePrimaryCtxGetState,
+ * ::cuCtxCreate,
+ * ::cuCtxGetFlags,
+ * ::cudaSetDeviceFlags
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
+
+/**
+ * \brief Get the state of the primary context
+ *
+ * Returns in \p *flags the flags for the primary context of \p dev, and in
+ * \p *active whether it is active.  See ::cuDevicePrimaryCtxSetFlags for flag
+ * values.
+ *
+ * \param dev    - Device to get primary context flags for
+ * \param flags  - Pointer to store flags
+ * \param active - Pointer to store context state; 0 = inactive, 1 = active
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDevicePrimaryCtxSetFlags,
+ * ::cuCtxGetFlags,
+ * ::cudaGetDeviceFlags
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active);
+
+/**
+ * \brief Destroy all allocations and reset all state on the primary context
+ *
+ * Explicitly destroys and cleans up all resources associated with the current
+ * device in the current process.
+ *
+ * Note that it is the responsibility of the calling function to ensure that no
+ * other module in the process is using the device any more. For that reason
+ * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases.
+ * However, it is safe for other modules to call ::cuDevicePrimaryCtxRelease()
+ * even after resetting the device.
+ * Resetting the primary context does not release it; an application that has
+ * retained the primary context should still explicitly release its usage.
+ *
+ * \param dev - Device for which primary context is destroyed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE
+ * \notefnerr
+ *
+ * \sa ::cuDevicePrimaryCtxRetain,
+ * ::cuDevicePrimaryCtxRelease,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cudaDeviceReset
+ */
+CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
+
+/** @} */ /* END CUDA_PRIMARY_CTX */
+
+/**
+ * \brief Returns information about the execution affinity support of the device.
+ *
+ * Returns in \p *pi whether execution affinity type \p type is supported by device \p dev.
+ * The supported types are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device,
+ *   or 0 if not;
+ *
+ * \param pi   - 1 if the execution affinity type \p type is supported by the device, or 0 if not
+ * \param type - Execution affinity type to query
+ * \param dev  - Device handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetCount,
+ * ::cuDeviceGetName,
+ * ::cuDeviceGetUuid,
+ * ::cuDeviceGet,
+ * ::cuDeviceTotalMem
+ */
+CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev);
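+
+/*
+ * Illustrative sketch: checking whether SM-count execution affinity is
+ * available before trying to create a context with it (see cuCtxCreate_v3
+ * below). Assumes dev is a valid CUdevice handle.
+ *
+ *   int smAffinitySupported = 0;
+ *   cuDeviceGetExecAffinitySupport(&smAffinitySupported,
+ *                                  CU_EXEC_AFFINITY_TYPE_SM_COUNT, dev);
+ *   if (!smAffinitySupported) {
+ *       // fall back to cuCtxCreate() or the primary context
+ *   }
+ */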
+
+/**
+ * \defgroup CUDA_CTX Context Management
+ *
+ * ___MANBRIEF___ context management functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the context management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * Please note that some functions are described in
+ * \ref CUDA_PRIMARY_CTX "Primary Context Management" section.
+ *
+ * @{
+ */
+
+/**
+ * \brief Create a CUDA context
+ *
+ * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain.
+ *
+ * Creates a new CUDA context and associates it with the calling thread. The
+ * \p flags parameter is described below. The context is created with a usage
+ * count of 1, and the caller of ::cuCtxCreate() must call ::cuCtxDestroy()
+ * when done using the context. If a context is already current to the thread,
+ * it is supplanted by the newly created context and may be restored by a subsequent
+ * call to ::cuCtxPopCurrent().
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ * the compute mode for devices.
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * \param pctx  - Returned context handle of the new context
+ * \param flags - Context creation flags
+ * \param dev   - Device to create context on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
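+
+/*
+ * Illustrative sketch of the context lifetime described above: create a
+ * context with a blocking-sync scheduling policy, use it, then destroy it.
+ * Assumes cuInit() has succeeded and dev is a valid CUdevice handle.
+ *
+ *   CUcontext ctx;
+ *   if (cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev) == CUDA_SUCCESS) {
+ *       // ... issue work in ctx, which is now current to this thread ...
+ *       cuCtxSynchronize();   // wait for all preceding work in ctx
+ *       cuCtxDestroy(ctx);    // also pops ctx from this thread's stack
+ *   }
+ */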
+
+/**
+ * \brief Create a CUDA context with execution affinity
+ *
+ * Creates a new CUDA context with execution affinity and associates it with
+ * the calling thread. The \p paramsArray and \p flags parameter are described below.
+ * The context is created with a usage count of 1, and the caller of ::cuCtxCreate() must
+ * call ::cuCtxDestroy() when done using the context. If a context is already
+ * current to the thread, it is supplanted by the newly created context and may
+ * be restored by a subsequent call to ::cuCtxPopCurrent().
+ *
+ * The type and the amount of execution resource the context can use is limited by \p paramsArray
+ * and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams
+ * describes the size of the array. If two \p CUexecAffinityParam in the array have the same type,
+ * the latter execution affinity parameter overrides the former execution affinity parameter.
+ * The supported execution affinity types are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion
+ *   of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally
+ *   rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution
+ *   affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute
+ *   is only supported under Volta+ MPS.
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ * the compute mode for devices.
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * \param pctx        - Returned context handle of the new context
+ * \param paramsArray - Execution affinity parameters
+ * \param numParams   - Number of execution affinity parameters
+ * \param flags       - Context creation flags
+ * \param dev         - Device to create context on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::CUexecAffinityParam
+ */
+CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev);
+
+/**
+ * \brief Destroy a CUDA context
+ *
+ * Destroys the CUDA context specified by \p ctx.  The context \p ctx will be
+ * destroyed regardless of how many threads it is current to.
+ * It is the responsibility of the calling function to ensure that no API
+ * calls are issued using \p ctx while ::cuCtxDestroy() is executing.
+ *
+ * Destroys and cleans up all resources associated with the context.
+ * It is the caller's responsibility to ensure that the context or its resources
+ * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior.
+ * These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent,
+ * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref,
+ * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore.
+ *
+ * If \p ctx is current to the calling thread then \p ctx will also be
+ * popped from the current thread's context stack (as though ::cuCtxPopCurrent()
+ * were called).  If \p ctx is current to other threads, then \p ctx will
+ * remain current to those threads, and attempting to access \p ctx from
+ * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED.
+ *
+ * \param ctx - Context to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
+
+/**
+ * \brief Pushes a context on the current CPU thread
+ *
+ * Pushes the given context \p ctx onto the CPU thread's stack of current
+ * contexts. The specified context becomes the CPU thread's current context, so
+ * all CUDA functions that operate on the current context are affected.
+ *
+ * The previous current context may be made current again by calling
+ * ::cuCtxDestroy() or ::cuCtxPopCurrent().
+ *
+ * \param ctx - Context to push
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
+
+/**
+ * \brief Pops the current CUDA context from the current CPU thread.
+ *
+ * Pops the current CUDA context from the CPU thread and passes back the
+ * old context handle in \p *pctx. That context may then be made current
+ * to a different CPU thread by calling ::cuCtxPushCurrent().
+ *
+ * If a context was current to the CPU thread before ::cuCtxCreate() or
+ * ::cuCtxPushCurrent() was called, this function makes that context current to
+ * the CPU thread again.
+ *
+ * \param pctx - Returned popped context handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
+
+/**
+ * \brief Binds the specified CUDA context to the calling CPU thread
+ *
+ * Binds the specified CUDA context to the calling CPU thread.
+ * If \p ctx is NULL then the CUDA context previously bound to the
+ * calling CPU thread is unbound and ::CUDA_SUCCESS is returned.
+ *
+ * If there exists a CUDA context stack on the calling CPU thread, this
+ * will replace the top of that stack with \p ctx.
+ * If \p ctx is NULL then this will be equivalent to popping the top
+ * of the calling CPU thread's CUDA context stack (or a no-op if the
+ * calling CPU thread's CUDA context stack is empty).
+ *
+ * \param ctx - Context to bind to the calling CPU thread
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuCtxGetCurrent,
+ * ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cudaSetDevice
+ */
+CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx);
+
+/**
+ * \brief Returns the CUDA context bound to the calling CPU thread.
+ *
+ * Returns in \p *pctx the CUDA context bound to the calling CPU thread.
+ * If no context is bound to the calling CPU thread then \p *pctx is
+ * set to NULL and ::CUDA_SUCCESS is returned.
+ *
+ * \param pctx - Returned context handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * \notefnerr
+ *
+ * \sa
+ * ::cuCtxSetCurrent,
+ * ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cudaGetDevice
+ */
+CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx);
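+
+/*
+ * Illustrative sketch combining cuCtxSetCurrent() and cuCtxGetCurrent():
+ * bind a context obtained elsewhere (for example via cuDevicePrimaryCtxRetain)
+ * to this thread and verify the binding. Assumes ctx is a valid CUcontext
+ * and <assert.h> is included.
+ *
+ *   CUcontext bound = NULL;
+ *   if (cuCtxSetCurrent(ctx) == CUDA_SUCCESS &&
+ *       cuCtxGetCurrent(&bound) == CUDA_SUCCESS) {
+ *       assert(bound == ctx);
+ *   }
+ */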
+
+/**
+ * \brief Returns the device ID for the current context
+ *
+ * Returns in \p *device the ordinal of the current context's device.
+ *
+ * \param device - Returned device ID for the current context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cudaGetDevice
+ */
+CUresult CUDAAPI cuCtxGetDevice(CUdevice *device);
+
+/**
+ * \brief Returns the flags for the current context
+ *
+ * Returns in \p *flags the flags of the current context. See ::cuCtxCreate
+ * for flag values.
+ *
+ * \param flags - Pointer to store flags of current context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetCurrent,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetLimit,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cudaGetDeviceFlags
+ */
+CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags);
+
+/**
+ * \brief Block for a context's tasks to complete
+ *
+ * Blocks until the device has completed all preceding requested tasks.
+ * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed.
+ * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the
+ * CPU thread will block until the GPU context has finished its work.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cudaDeviceSynchronize
+ */
+CUresult CUDAAPI cuCtxSynchronize(void);
+
+/**
+ * \brief Set resource limits
+ *
+ * Setting \p limit to \p value is a request by the application to update
+ * the current limit maintained by the context. The driver is free to
+ * modify the requested value to meet h/w requirements (this could be
+ * clamping to minimum or maximum values, rounding up to nearest element
+ * size, etc). The application can use ::cuCtxGetLimit() to find out exactly
+ * what the limit has been set to.
+ *
+ * Setting each ::CUlimit has its own specific restrictions, so each is
+ * discussed here.
+ *
+ * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread.
+ *   The driver automatically increases the per-thread stack size
+ *   for each kernel launch as needed. This size isn't reset back to the
+ *   original value after each launch. Setting this value will take effect 
+ *   immediately, and if necessary, the device will block until all preceding 
+ *   requested tasks are complete.
+ *
+ * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used
+ *   by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE
+ *   must be performed before launching any kernel that uses the ::printf()
+ *   device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used
+ *   by the ::malloc() and ::free() device system calls. Setting
+ *   ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel
+ *   that uses the ::malloc() or ::free() device system calls, otherwise
+ *   ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of
+ *   a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
+ *   this limit must be performed before any launch of a kernel that uses the
+ *   device runtime and calls ::cudaDeviceSynchronize() above the default sync
+ *   depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
+ *   with error code ::cudaErrorSyncDepthExceeded if the limitation is
+ *   violated. This limit can be set smaller than the default or up to the maximum
+ *   launch depth of 24. When setting this limit, keep in mind that additional
+ *   levels of sync depth require the driver to reserve large amounts of device
+ *   memory which can no longer be used for user allocations. If these
+ *   reservations of device memory fail, ::cuCtxSetLimit() will return
+ *   ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
+ *   This limit is only applicable to devices of compute capability 3.5 and
+ *   higher. Attempting to set this limit on devices of compute capability less
+ *   than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
+ *   returned.
+ *
+ * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of
+ *   outstanding device runtime launches that can be made from the current
+ *   context. A grid is outstanding from the point of launch up until the grid
+ *   is known to have been completed. Device runtime launches which violate
+ *   this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
+ *   ::cudaGetLastError() is called after launch. If more pending launches than
+ *   the default (2048 launches) are needed for a module using the device
+ *   runtime, this limit can be increased. Keep in mind that being able to
+ *   sustain additional pending launches will require the driver to reserve
+ *   larger amounts of device memory upfront which can no longer be used for
+ *   allocations. If these reservations fail, ::cuCtxSetLimit() will return
+ *   ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
+ *   This limit is only applicable to devices of compute capability 3.5 and
+ *   higher. Attempting to set this limit on devices of compute capability less
+ *   than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
+ *   returned.
+ *
+ * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity.
+ *   Values can range from 0B to 128B. This is purely a performance hint and
+ *   it can be ignored or clamped depending on the platform.
+ *
+ * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls the size in bytes available for
+ *   persisting L2 cache. This is purely a performance hint and it can be
+ *   ignored or clamped depending on the platform.
+ *
+ * \param limit - Limit to set
+ * \param value - Size of limit
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNSUPPORTED_LIMIT,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSynchronize,
+ * ::cudaDeviceSetLimit
+ */
+CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value);
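+
+/*
+ * A minimal usage sketch for ::cuCtxSetLimit / ::cuCtxGetLimit (assumes an
+ * active context and omits error handling; the 8 MiB heap size is an
+ * arbitrary illustrative value):
+ *
+ * \code
+   size_t heapSize = 0;
+   cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, 8u * 1024u * 1024u);
+   cuCtxGetLimit(&heapSize, CU_LIMIT_MALLOC_HEAP_SIZE);   // may differ from the requested value
+ * \endcode
+ */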
+
+/**
+ * \brief Returns resource limits
+ *
+ * Returns in \p *pvalue the current size of \p limit.  The supported
+ * ::CUlimit values are:
+ * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread.
+ * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the
+ *   ::printf() device system call.
+ * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the
+ *   ::malloc() and ::free() device system calls.
+ * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread
+ *   can issue the device runtime call ::cudaDeviceSynchronize() to wait on
+ *   child grid launches to complete.
+ * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding
+ *   device runtime launches that can be made from this context.
+ * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity.
+ * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: Persisting L2 cache size in bytes
+ *
+ * \param limit  - Limit to query
+ * \param pvalue - Returned size of limit
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNSUPPORTED_LIMIT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cudaDeviceGetLimit
+ */
+CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit);
+
+/**
+ * \brief Returns the preferred cache configuration for the current context.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this function returns through \p pconfig the preferred cache configuration
+ * for the current context. This is only a preference. The driver will use
+ * the requested configuration if possible, but it is free to choose a different
+ * configuration if required to execute functions.
+ *
+ * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices
+ * where the size of the L1 cache and shared memory are fixed.
+ *
+ * The supported cache configurations are:
+ * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+ *
+ * \param pconfig - Returned cache configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceGetCacheConfig
+ */
+CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig);
+
+/**
+ * \brief Sets the preferred cache configuration for the current context.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p config the preferred cache configuration for
+ * the current context. This is only a preference. The driver will use
+ * the requested configuration if possible, but it is free to choose a different
+ * configuration if required to execute the function. Any function preference
+ * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide
+ * setting. Setting the context-wide cache configuration to
+ * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer
+ * to not change the cache configuration unless required to launch the kernel.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+ *
+ * \param config - Requested cache configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceSetCacheConfig
+ */
+CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config);
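+
+/*
+ * A minimal usage sketch (assumes an active context; error handling omitted):
+ *
+ * \code
+   CUfunc_cache cacheCfg;
+   cuCtxSetCacheConfig(CU_FUNC_CACHE_PREFER_SHARED);   // request larger shared memory
+   cuCtxGetCacheConfig(&cacheCfg);                     // read back the current preference
+ * \endcode
+ */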
+
+/**
+ * \brief Returns the current shared memory configuration for the current context.
+ *
+ * This function will return in \p pConfig the current size of shared memory banks
+ * in the current context. On devices with configurable shared memory banks,
+ * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all
+ * subsequent kernel launches will by default use the new bank size. When
+ * ::cuCtxGetSharedMemConfig is called on devices without configurable shared
+ * memory, it will return the fixed bank size of the hardware.
+ *
+ * The returned bank configurations can be either:
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE:  shared memory bank width is
+ *   four bytes.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width is
+ *   eight bytes.
+ *
+ * \param pConfig - returned shared memory configuration
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceGetSharedMemConfig
+ */
+CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
+
+/**
+ * \brief Sets the shared memory configuration for the current context.
+ *
+ * On devices with configurable shared memory banks, this function will set
+ * the context's shared memory bank size which is used for subsequent kernel
+ * launches.
+ *
+ * Changing the shared memory configuration between launches may insert a
+ * device-side synchronization point between those launches.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance.
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * The supported bank configurations are:
+ * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial
+ *   setting (currently, four bytes).
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively four bytes.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively eight bytes.
+ *
+ * \param config - requested shared memory configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceSetSharedMemConfig
+ */
+CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config);
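+
+/*
+ * A minimal usage sketch (assumes an active context; error handling omitted):
+ *
+ * \code
+   CUsharedconfig bankCfg;
+   cuCtxSetSharedMemConfig(CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE);  // e.g. for double-heavy kernels
+   cuCtxGetSharedMemConfig(&bankCfg);                                   // read back the active bank size
+ * \endcode
+ */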
+
+/**
+ * \brief Gets the context's API version.
+ *
+ * Returns a version number in \p version corresponding to the capabilities of
+ * the context (e.g. 3010 or 3020), which library developers can use to direct
+ * callers to a specific API version. If \p ctx is NULL, returns the API version
+ * used to create the currently bound context.
+ *
+ * Note that new API versions are only introduced when context capabilities are
+ * changed that break binary compatibility, so the API version and driver version
+ * may be different. For example, it is valid for the API version to be 3020 while
+ * the driver version is 4020.
+ *
+ * \param ctx     - Context to check
+ * \param version - Pointer to version
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version);
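+
+/*
+ * A minimal usage sketch (passing NULL queries the API version of the
+ * currently bound context; error handling omitted):
+ *
+ * \code
+   unsigned int apiVersion = 0;
+   cuCtxGetApiVersion(NULL, &apiVersion);   // e.g. 3020
+ * \endcode
+ */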
+
+/**
+ * \brief Returns numerical values that correspond to the least and
+ * greatest stream priorities.
+ *
+ * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond
+ * to the least and greatest stream priorities respectively. Stream priorities
+ * follow a convention where lower numbers imply greater priorities. The range of
+ * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority].
+ * If the user attempts to create a stream with a priority value that is
+ * outside the meaningful range as specified by this API, the priority is
+ * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority
+ * respectively. See ::cuStreamCreateWithPriority for details on creating a
+ * priority stream.
+ * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value
+ * is not desired.
+ *
+ * This function will return '0' in both \p *leastPriority and \p *greatestPriority if
+ * the current context's device does not support stream priorities
+ * (see ::cuDeviceGetAttribute).
+ *
+ * \param leastPriority    - Pointer to an int in which the numerical value for least
+ *                           stream priority is returned
+ * \param greatestPriority - Pointer to an int in which the numerical value for greatest
+ *                           stream priority is returned
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreateWithPriority,
+ * ::cuStreamGetPriority,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cudaDeviceGetStreamPriorityRange
+ */
+CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority);
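+
+/*
+ * A minimal usage sketch that creates a stream with the greatest available
+ * priority (assumes an active context; error handling omitted):
+ *
+ * \code
+   int leastPrio = 0, greatestPrio = 0;
+   CUstream highPrioStream;
+   cuCtxGetStreamPriorityRange(&leastPrio, &greatestPrio);
+   cuStreamCreateWithPriority(&highPrioStream, CU_STREAM_NON_BLOCKING, greatestPrio);
+ * \endcode
+ */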
+
+/**
+ * \brief Resets all persisting lines in cache to normal status.
+ *
+ * ::cuCtxResetPersistingL2Cache resets all persisting lines in cache to normal
+ * status. The reset takes effect on function return.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuCtxResetPersistingL2Cache(void);
+
+/**
+ * \brief Returns the execution affinity setting for the current context.
+ *
+ * Returns in \p *pExecAffinity the current value of \p type. The supported
+ * ::CUexecAffinityType values are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use.
+ *
+ * \param type          - Execution affinity type to query
+ * \param pExecAffinity - Returned execution affinity
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY
+ * \notefnerr
+ *
+ * \sa
+ * ::CUexecAffinityParam
+ */
+CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
+
+
+/** @} */ /* END CUDA_CTX */
+
+/**
+ * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated context management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated context management functions of the low-level
+ * CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Increment a context's usage-count
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated and should not be used.
+ *
+ * Increments the usage count of the context and passes back a context handle
+ * in \p *pctx that must be passed to ::cuCtxDetach() when the application is
+ * done with the context. ::cuCtxAttach() fails if there is no context current
+ * to the thread.
+ *
+ * Currently, the \p flags parameter must be 0.
+ *
+ * \param pctx  - Returned context handle of the current context
+ * \param flags - Context attach flags (must be 0)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxDetach,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags);
+
+/**
+ * \brief Decrement a context's usage-count
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated and should not be used.
+ *
+ * Decrements the usage count of the context \p ctx, and destroys the context
+ * if the usage count goes to 0. The context must be a handle that was passed
+ * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the
+ * calling thread.
+ *
+ * \param ctx - Context to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx);
+
+/** @} */ /* END CUDA_CTX_DEPRECATED */
+
+
+/**
+ * \defgroup CUDA_MODULE Module Management
+ *
+ * ___MANBRIEF___ module management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the module management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Loads a compute module
+ *
+ * Takes a filename \p fname and loads the corresponding module \p module into
+ * the current context. The CUDA driver API does not attempt to lazily
+ * allocate the resources needed by a module; if the memory for functions and
+ * data (constant and global) needed by the module cannot be allocated,
+ * ::cuModuleLoad() fails. The file should be a \e cubin file as output by
+ * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or
+ * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later.
+ *
+ * \param module - Returned module
+ * \param fname  - Filename of module to load
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_FILE_NOT_FOUND,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname);
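+
+/*
+ * A minimal usage sketch that loads a module from disk and launches a kernel
+ * from it (the file name "kernels.ptx" and the kernel name "scale" are
+ * placeholders, and the kernel is assumed to take no parameters; error
+ * handling omitted):
+ *
+ * \code
+   CUmodule mod;
+   CUfunction fn;
+   cuModuleLoad(&mod, "kernels.ptx");
+   cuModuleGetFunction(&fn, mod, "scale");
+   cuLaunchKernel(fn, 1, 1, 1, 256, 1, 1, 0, NULL, NULL, NULL);  // no kernel arguments
+   cuModuleUnload(mod);
+ * \endcode
+ */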
+
+/**
+ * \brief Load a module's data
+ *
+ * Takes a pointer \p image and loads the corresponding module \p module into
+ * the current context. The pointer may be obtained by mapping a \e cubin or
+ * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file
+ * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin
+ * object into the executable resources and using operating system calls such
+ * as Windows \c FindResource() to obtain the pointer.
+ *
+ * \param module - Returned module
+ * \param image  - Module data to load
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image);
+
+/**
+ * \brief Load a module's data with options
+ *
+ * Takes a pointer \p image and loads the corresponding module \p module into
+ * the current context. The pointer may be obtained by mapping a \e cubin or
+ * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file
+ * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin
+ * object into the executable resources and using operating system calls such
+ * as Windows \c FindResource() to obtain the pointer. Options are passed as
+ * an array via \p options and any corresponding parameters are passed in
+ * \p optionValues. The number of total options is supplied via \p numOptions.
+ * Any outputs will be returned via \p optionValues.
+ *
+ * \param module       - Returned module
+ * \param image        - Module data to load
+ * \param numOptions   - Number of options
+ * \param options      - Options for JIT
+ * \param optionValues - Option values for JIT
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
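+
+/*
+ * A minimal sketch passing JIT options to ::cuModuleLoadDataEx so that
+ * compile errors are captured in a caller-provided buffer (ptxImage stands
+ * for a NULL-terminated PTX string; error handling omitted):
+ *
+ * \code
+   char errLog[4096] = {0};
+   CUjit_option opts[] = { CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES };
+   void *optVals[]    = { errLog, (void *)(size_t)sizeof(errLog) };
+   CUmodule mod;
+   cuModuleLoadDataEx(&mod, ptxImage, 2, opts, optVals);   // errLog holds messages on failure
+ * \endcode
+ */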
+
+/**
+ * \brief Load a module's data
+ *
+ * Takes a pointer \p fatCubin and loads the corresponding module \p module
+ * into the current context. The pointer represents a <i>fat binary</i> object,
+ * which is a collection of different \e cubin and/or \e PTX files, all
+ * representing the same device code, but compiled and optimized for different
+ * architectures.
+ *
+ * Prior to CUDA 4.0, there was no documented API for constructing and using
+ * fat binary objects by programmers.  Starting with CUDA 4.0, fat binary
+ * objects can be constructed by providing the <i>-fatbin option</i> to \b nvcc.
+ * More information can be found in the \b nvcc document.
+ *
+ * \param module   - Returned module
+ * \param fatCubin - Fat binary to load
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU,
+ * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
+
+/**
+ * \brief Unloads a module
+ *
+ * Unloads a module \p hmod from the current context.
+ *
+ * \param hmod - Module to unload
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_destroy_ub
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary
+ */
+CUresult CUDAAPI cuModuleUnload(CUmodule hmod);
+
+/**
+ * CUDA Lazy Loading status
+ */
+typedef enum CUmoduleLoadingMode_enum {
+    CU_MODULE_EAGER_LOADING = 0x1, /**< Lazy Kernel Loading is not enabled */
+    CU_MODULE_LAZY_LOADING  = 0x2, /**< Lazy Kernel Loading is enabled */
+} CUmoduleLoadingMode;
+
+/**
+ * \brief Query lazy loading mode
+ *
+ * Returns the lazy loading mode.
+ * The module loading mode is controlled by the CUDA_MODULE_LOADING environment variable.
+ *
+ * \param mode      - Returns the lazy loading mode
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \notefnerr
+ *
+ * \sa
+ * ::cuModuleLoad,
+ */
+CUresult CUDAAPI cuModuleGetLoadingMode(CUmoduleLoadingMode *mode);
+
+/**
+ * \brief Returns a function handle
+ *
+ * Returns in \p *hfunc the handle of the function of name \p name located in
+ * module \p hmod. If no function of that name exists, ::cuModuleGetFunction()
+ * returns ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param hfunc - Returned function handle
+ * \param hmod  - Module to retrieve function from
+ * \param name  - Name of function to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload
+ */
+CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
+
+/**
+ * \brief Returns a global pointer from a module
+ *
+ * Returns in \p *dptr and \p *bytes the base pointer and size of the
+ * global of name \p name located in module \p hmod. If no variable of that name
+ * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both
+ * parameters \p dptr and \p bytes are optional. If one of them is
+ * NULL, it is ignored.
+ *
+ * \param dptr  - Returned global device pointer
+ * \param bytes - Returned global size in bytes
+ * \param hmod  - Module to retrieve global from
+ * \param name  - Name of global to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuModuleGetFunction,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload,
+ * ::cudaGetSymbolAddress,
+ * ::cudaGetSymbolSize
+ */
+CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name);
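+
+/*
+ * A minimal sketch that writes to a module-scope device variable via
+ * ::cuModuleGetGlobal (mod is a module previously loaded with ::cuModuleLoad,
+ * and the global name "g_scale" is a placeholder; error handling omitted):
+ *
+ * \code
+   CUdeviceptr symAddr;
+   size_t symSize = 0;
+   float scale = 2.0f;
+   cuModuleGetGlobal(&symAddr, &symSize, mod, "g_scale");
+   cuMemcpyHtoD(symAddr, &scale, sizeof(scale));
+ * \endcode
+ */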
+
+/**
+ * \brief Returns a handle to a texture reference
+ *
+ * Returns in \p *pTexRef the handle of the texture reference of name \p name
+ * in the module \p hmod. If no texture reference of that name exists,
+ * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference
+ * handle should not be destroyed, since it will be destroyed when the module
+ * is unloaded.
+ *
+ * \param pTexRef  - Returned texture reference
+ * \param hmod     - Module to retrieve texture reference from
+ * \param name     - Name of texture reference to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa
+ * ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetSurfRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload,
+ * ::cudaGetTextureReference
+ */
+CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name);
+
+/**
+ * \brief Returns a handle to a surface reference
+ *
+ * Returns in \p *pSurfRef the handle of the surface reference of name \p name
+ * in the module \p hmod. If no surface reference of that name exists,
+ * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND.
+ *
+ * \param pSurfRef  - Returned surface reference
+ * \param hmod     - Module to retrieve surface reference from
+ * \param name     - Name of surface reference to retrieve
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa
+ * ::cuModuleGetFunction,
+ * ::cuModuleGetGlobal,
+ * ::cuModuleGetTexRef,
+ * ::cuModuleLoad,
+ * ::cuModuleLoadData,
+ * ::cuModuleLoadDataEx,
+ * ::cuModuleLoadFatBinary,
+ * ::cuModuleUnload,
+ * ::cudaGetSurfaceReference
+ */
+CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
+
+/**
+ * \brief Creates a pending JIT linker invocation.
+ *
+ * If the call is successful, the caller owns the returned CUlinkState, which
+ * should eventually be destroyed with ::cuLinkDestroy.  The
+ * device code machine size (32 or 64 bit) will match the calling application.
+ *
+ * Both linker and compiler options may be specified.  Compiler options will
+ * be applied to inputs to this linker action which must be compiled from PTX.
+ * The options ::CU_JIT_WALL_TIME,
+ * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
+ * will accumulate data until the CUlinkState is destroyed.
+ *
+ * \p optionValues must remain valid for the life of the CUlinkState if output
+ * options are used.  No other references to inputs are maintained after this
+ * call returns.
+ *
+ * \param numOptions   Size of options arrays
+ * \param options      Array of linker and compiler options
+ * \param optionValues Array of option values, each cast to void *
+ * \param stateOut     On success, this will contain a CUlinkState to specify
+ *                     and complete this action
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND
+ * \notefnerr
+ *
+ * \sa ::cuLinkAddData,
+ * ::cuLinkAddFile,
+ * ::cuLinkComplete,
+ * ::cuLinkDestroy
+ */
+CUresult CUDAAPI
+cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+
+/**
+ * \brief Add an input to a pending linker invocation
+ *
+ * Ownership of \p data is retained by the caller.  No reference is retained to any
+ * inputs after this call returns.
+ *
+ * This method accepts only compiler options, which are used if the data must
+ * be compiled from PTX, and does not accept any of
+ * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER,
+ * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET.
+ *
+ * \param state        A pending linker action.
+ * \param type         The type of the input data.
+ * \param data         The input data.  PTX must be NULL-terminated.
+ * \param size         The length of the input data.
+ * \param name         An optional name for this input in log messages.
+ * \param numOptions   Size of options.
+ * \param options      Options to be applied only for this input (overrides options from ::cuLinkCreate).
+ * \param optionValues Array of option values, each cast to void *.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU
+ *
+ * \sa ::cuLinkCreate,
+ * ::cuLinkAddFile,
+ * ::cuLinkComplete,
+ * ::cuLinkDestroy
+ */
+CUresult CUDAAPI
+cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name,
+    unsigned int numOptions, CUjit_option *options, void **optionValues);
+
+/**
+ * \brief Add a file input to a pending linker invocation
+ *
+ * No reference is retained to any inputs after this call returns.
+ *
+ * This method accepts only compiler options, which are used if the input
+ * must be compiled from PTX, and does not accept any of
+ * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER,
+ * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET.
+ *
+ * This method is equivalent to invoking ::cuLinkAddData on the contents
+ * of the file.
+ *
+ * \param state        A pending linker action
+ * \param type         The type of the input data
+ * \param path         Path to the input file
+ * \param numOptions   Size of options
+ * \param options      Options to be applied only for this input (overrides options from ::cuLinkCreate)
+ * \param optionValues Array of option values, each cast to void *
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_FILE_NOT_FOUND
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_PTX,
+ * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NO_BINARY_FOR_GPU
+ *
+ * \sa ::cuLinkCreate,
+ * ::cuLinkAddData,
+ * ::cuLinkComplete,
+ * ::cuLinkDestroy
+ */
+CUresult CUDAAPI
+cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path,
+    unsigned int numOptions, CUjit_option *options, void **optionValues);
+
+/**
+ * \brief Complete a pending linker invocation
+ *
+ * Completes the pending linker action and returns the cubin image for the linked
+ * device code, which can be used with ::cuModuleLoadData.  The cubin is owned by
+ * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy.
+ * This call does not destroy \p state.
+ *
+ * \param state    A pending linker invocation
+ * \param cubinOut On success, this will point to the output image
+ * \param sizeOut  Optional parameter to receive the size of the generated image
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuLinkCreate,
+ * ::cuLinkAddData,
+ * ::cuLinkAddFile,
+ * ::cuLinkDestroy,
+ * ::cuModuleLoadData
+ */
+CUresult CUDAAPI
+cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut);
+
+/**
+ * \brief Destroys state for a JIT linker invocation.
+ *
+ * \param state State object for the linker invocation
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ *
+ * \sa ::cuLinkCreate
+ */
+CUresult CUDAAPI
+cuLinkDestroy(CUlinkState state);
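+
+/*
+ * A minimal end-to-end linker sketch using ::cuLinkCreate, ::cuLinkAddFile,
+ * ::cuLinkComplete and ::cuLinkDestroy (the file names are placeholders;
+ * error handling omitted):
+ *
+ * \code
+   CUlinkState link;
+   void *cubin = NULL;
+   size_t cubinSize = 0;
+   CUmodule mod;
+   cuLinkCreate(0, NULL, NULL, &link);
+   cuLinkAddFile(link, CU_JIT_INPUT_PTX, "a.ptx", 0, NULL, NULL);
+   cuLinkAddFile(link, CU_JIT_INPUT_PTX, "b.ptx", 0, NULL, NULL);
+   cuLinkComplete(link, &cubin, &cubinSize);
+   cuModuleLoadData(&mod, cubin);     // load before destroying the link state
+   cuLinkDestroy(link);               // frees the cubin owned by the link state
+ * \endcode
+ */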
+
+/** @} */ /* END CUDA_MODULE */
+
+
+/**
+ * \defgroup CUDA_MEM Memory Management
+ *
+ * ___MANBRIEF___ memory management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the memory management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets free and total memory
+ *
+ * Returns in \p *total the total amount of memory available to the current context.
+ * Returns in \p *free the amount of memory on the device that is free according to the OS.
+ * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
+ * In a multi-tenant situation, the returned free estimate is prone to a race
+ * condition: an allocation or free performed by a different process, or by a
+ * different thread in the same process, between the time free memory is
+ * estimated and reported will cause the reported free value to deviate from
+ * the actual free memory.
+ *
+ * The integrated GPU on Tegra shares memory with the CPU and other components
+ * of the SoC. The free and total values returned by the API exclude
+ * the SWAP memory space maintained by the OS on some platforms.
+ * The OS may move some of the memory pages into swap area as the GPU or
+ * CPU allocate or access memory. See Tegra app note on how to calculate
+ * total and free memory on Tegra.
+ *
+ * \param free  - Returned free memory in bytes
+ * \param total - Returned total memory in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemGetInfo
+ */
+CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total);
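+
+/*
+ * A minimal usage sketch (assumes an active context; error handling omitted):
+ *
+ * \code
+   size_t freeBytes = 0, totalBytes = 0;
+   cuMemGetInfo(&freeBytes, &totalBytes);
+   // freeBytes is only an estimate; a subsequent allocation of that size may still fail
+ * \endcode
+ */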
+
+/**
+ * \brief Allocates device memory
+ *
+ * Allocates \p bytesize bytes of linear memory on the device and returns in
+ * \p *dptr a pointer to the allocated memory. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p bytesize
+ * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * \param dptr     - Returned device pointer
+ * \param bytesize - Requested allocation size in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMalloc
+ */
+CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize);
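+
+/*
+ * A minimal round-trip sketch using ::cuMemAlloc, ::cuMemcpyHtoD,
+ * ::cuMemcpyDtoH and ::cuMemFree (assumes an active context; error handling
+ * omitted):
+ *
+ * \code
+   float host[256] = {0};
+   CUdeviceptr dev;
+   cuMemAlloc(&dev, sizeof(host));
+   cuMemcpyHtoD(dev, host, sizeof(host));
+   cuMemcpyDtoH(host, dev, sizeof(host));
+   cuMemFree(dev);
+ * \endcode
+ */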
+
+/**
+ * \brief Allocates pitched device memory
+ *
+ * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on
+ * the device and returns in \p *dptr a pointer to the allocated memory. The
+ * function may pad the allocation to ensure that corresponding pointers in
+ * any given row will continue to meet the alignment requirements for
+ * coalescing as the address is updated from row to row. \p ElementSizeBytes
+ * specifies the size of the largest reads and writes that will be performed
+ * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced
+ * memory transactions are not possible on other data sizes). If
+ * \p ElementSizeBytes is smaller than the actual read/write size of a kernel,
+ * the kernel will run correctly, but possibly at reduced speed. The pitch
+ * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the
+ * allocation. The intended usage of pitch is as a separate parameter of the
+ * allocation, used to compute addresses within the 2D array. Given the row
+ * and column of an array element of type \b T, the address is computed as:
+ * \code
+   T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column;
+ * \endcode
+ *
+ * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with
+ * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is
+ * recommended that programmers consider performing pitch allocations using
+ * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is
+ * especially true if the application will be performing 2D memory copies
+ * between different regions of device memory (whether linear memory or CUDA
+ * arrays).
+ *
+ * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed
+ * to match or exceed the alignment requirement for texture binding with
+ * ::cuTexRefSetAddress2D().
+ *
+ * \param dptr             - Returned device pointer
+ * \param pPitch           - Returned pitch of allocation in bytes
+ * \param WidthInBytes     - Requested allocation width in bytes
+ * \param Height           - Requested allocation height in rows
+ * \param ElementSizeBytes - Size of largest reads/writes for range
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMallocPitch
+ */
+CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
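+
+/*
+ * A minimal sketch that allocates a pitched 2D region of 640x480 floats and
+ * addresses one element using the returned pitch, following the addressing
+ * formula above (assumes an active context; error handling omitted):
+ *
+ * \code
+   CUdeviceptr dev;
+   size_t pitch = 0;
+   cuMemAllocPitch(&dev, &pitch, 640 * sizeof(float), 480, sizeof(float));
+   CUdeviceptr elem = dev + 10 * pitch + 20 * sizeof(float);   // row 10, column 20
+   cuMemFree(dev);
+ * \endcode
+ */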
+
+/**
+ * \brief Frees device memory
+ *
+ * Frees the memory space pointed to by \p dptr, which must have been returned
+ * by a previous call to one of the following memory allocation APIs - ::cuMemAlloc(), 
+ * ::cuMemAllocPitch(), ::cuMemAllocManaged(), ::cuMemAllocAsync(), ::cuMemAllocFromPoolAsync()
+ *
+ * Note - This API will not perform any implicit synchronization when the pointer was allocated with
+ * ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to the
+ * pointer have completed before invoking ::cuMemFree. For best performance and memory reuse, users
+ * should use ::cuMemFreeAsync to free memory allocated via the stream ordered memory allocator.
+ * 
+ * \param dptr - Pointer to memory to free
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemAllocManaged, ::cuMemAllocAsync, ::cuMemAllocFromPoolAsync, 
+ * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, ::cuMemcpy3D, ::cuMemcpy3DAsync,
+ * ::cuMemcpyAtoA, ::cuMemcpyAtoD, ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ * ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA,
+ * ::cuMemcpyHtoAAsync, ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, ::cuMemFreeAsync,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaFree
+ */
+CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
+
+/**
+ * \brief Get information on memory allocations
+ *
+ * Returns the base address in \p *pbase and size in \p *psize of the
+ * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input
+ * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one
+ * of them is NULL, it is ignored.
+ *
+ * \param pbase - Returned base address
+ * \param psize - Returned size of device memory allocation
+ * \param dptr  - Device pointer to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32
+ */
+CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr);
+
+/**
+ * \brief Allocates page-locked host memory
+ *
+ * Allocates \p bytesize bytes of host memory that is page-locked and
+ * accessible to the device. The driver tracks the virtual memory ranges
+ * allocated with this function and automatically accelerates calls to
+ * functions such as ::cuMemcpy(). Since the memory can be accessed directly by
+ * the device, it can be read or written with much higher bandwidth than
+ * pageable memory obtained with functions such as ::malloc(). Allocating
+ * excessive amounts of memory with ::cuMemAllocHost() may degrade system
+ * performance, since it reduces the amount of memory available to the system
+ * for paging. As a result, this function is best used sparingly to allocate
+ * staging areas for data exchange between host and device.
+ *
+ * Note all host memory allocated using ::cuMemHostAlloc() will automatically
+ * be immediately accessible to all contexts on all devices which support unified
+ * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
+ * The device pointer that may be used to access this host memory from those
+ * contexts is always equal to the returned host pointer \p *pp.
+ * See \ref CUDA_UNIFIED for additional details.
+ *
+ * \param pp       - Returned host pointer to page-locked memory
+ * \param bytesize - Requested allocation size in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMallocHost
+ */
+CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize);
+
+/**
+ * \brief Frees page-locked host memory
+ *
+ * Frees the memory space pointed to by \p p, which must have been returned by
+ * a previous call to ::cuMemAllocHost().
+ *
+ * \param p - Pointer to memory to free
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaFreeHost
+ */
+CUresult CUDAAPI cuMemFreeHost(void *p);
+
+/**
+ * \brief Allocates page-locked host memory
+ *
+ * Allocates \p bytesize bytes of host memory that is page-locked and accessible
+ * to the device. The driver tracks the virtual memory ranges allocated with
+ * this function and automatically accelerates calls to functions such as
+ * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device,
+ * it can be read or written with much higher bandwidth than pageable memory
+ * obtained with functions such as ::malloc(). Allocating excessive amounts of
+ * pinned memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to allocate staging areas for data exchange between
+ * host and device.
+ *
+ * The \p Flags parameter enables different options to be specified that
+ * affect the allocation, as follows.
+ *
+ * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be
+ *   considered as pinned memory by all CUDA contexts, not just the one that
+ *   performed the allocation.
+ *
+ * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address
+ *   space. The device pointer to the memory may be obtained by calling
+ *   ::cuMemHostGetDevicePointer().
+ *
+ * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined
+ *   (WC). WC memory can be transferred across the PCI Express bus more
+ *   quickly on some system configurations, but cannot be read efficiently by
+ *   most CPUs. WC memory is a good option for buffers that will be written by
+ *   the CPU and read by the GPU via mapped pinned memory or host->device
+ *   transfers.
+ *
+ * All of these flags are orthogonal to one another: a developer may allocate
+ * memory that is portable, mapped and/or write-combined with no restrictions.
+ *
+ * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for
+ * devices that do not support mapped pinned memory. The failure is deferred
+ * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
+ * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag.
+ *
+ * The memory allocated by this function must be freed with ::cuMemFreeHost().
+ *
+ * Note all host memory allocated using ::cuMemHostAlloc() will automatically
+ * be immediately accessible to all contexts on all devices which support unified
+ * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING).
+ * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer
+ * that may be used to access this host memory from those contexts is always equal
+ * to the returned host pointer \p *pp.  If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED
+ * is specified, then the function ::cuMemHostGetDevicePointer() must be used
+ * to query the device pointer, even if the context supports unified addressing.
+ * See \ref CUDA_UNIFIED for additional details.
+ *
+ * \param pp       - Returned host pointer to page-locked memory
+ * \param bytesize - Requested allocation size in bytes
+ * \param Flags    - Flags for allocation request
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaHostAlloc
+ */
+CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags);
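+
+/*
+ * A minimal sketch that allocates mapped, pinned host memory and retrieves
+ * its device pointer (assumes an active context on a device that supports
+ * mapped pinned memory; error handling omitted):
+ *
+ * \code
+   void *hostBuf = NULL;
+   CUdeviceptr devPtr;
+   cuMemHostAlloc(&hostBuf, 1 << 20, CU_MEMHOSTALLOC_DEVICEMAP);
+   cuMemHostGetDevicePointer(&devPtr, hostBuf, 0);
+   // ... use devPtr in kernels and hostBuf on the CPU ...
+   cuMemFreeHost(hostBuf);
+ * \endcode
+ */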
+
+/**
+ * \brief Passes back device pointer of mapped pinned memory
+ *
+ * Passes back the device pointer \p pdptr corresponding to the mapped, pinned
+ * host buffer \p p allocated by ::cuMemHostAlloc.
+ *
+ * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP
+ * flag was not specified at the time the memory was allocated, or if the
+ * function is called on a GPU that does not support mapped pinned memory.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory
+ * can also be accessed from the device using the host pointer \p p.
+ * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
+ * match the original host pointer \p p and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
+ * will match the original pointer \p p. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * \p Flags is provided for future releases; for now, it must be set to 0.
+ *
+ * \param pdptr - Returned device pointer
+ * \param p     - Host pointer
+ * \param Flags - Options (must be 0)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaHostGetDevicePointer
+ */
+CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags);
+
+/**
+ * \brief Passes back flags that were used for a pinned allocation
+ *
+ * Passes back the flags \p pFlags that were specified when allocating
+ * the pinned host buffer \p p allocated by ::cuMemHostAlloc.
+ *
+ * ::cuMemHostGetFlags() will fail if the pointer does not reside in
+ * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc().
+ *
+ * \param pFlags - Returned flags word
+ * \param p     - Host pointer
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemAllocHost,
+ * ::cuMemHostAlloc,
+ * ::cudaHostGetFlags
+ */
+CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p);
+
+/**
+ * \brief Allocates memory that will be automatically managed by the Unified Memory system
+ *
+ * Allocates \p bytesize bytes of managed memory on the device and returns in
+ * \p *dptr a pointer to the allocated memory. If the device doesn't support
+ * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. Support
+ * for managed memory can be queried using the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p bytesize
+ * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer
+ * is valid on the CPU and on all GPUs in the system that support managed memory.
+ * All accesses to this pointer must obey the Unified Memory programming model.
+ *
+ * \p flags specifies the default stream association for this allocation.
+ * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If
+ * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from
+ * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the
+ * allocation should not be accessed from devices that have a zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to
+ * ::cuStreamAttachMemAsync will be required to enable access on such devices.
+ *
+ * If the association is later changed via ::cuStreamAttachMemAsync to
+ * a single stream, the default association as specified during ::cuMemAllocManaged
+ * is restored when that stream is destroyed. For __managed__ variables, the
+ * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a
+ * stream is an asynchronous operation, and as a result, the change to default
+ * association won't happen until all work in the stream has completed.
+ *
+ * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree.
+ *
+ * Device memory oversubscription is possible for GPUs that have a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on
+ * such GPUs may be evicted from device memory to host memory at any time by the Unified
+ * Memory driver in order to make room for other allocations.
+ *
+ * In a multi-GPU system where all GPUs have a non-zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this
+ * API returns and instead may be populated on access. In such systems, managed memory can
+ * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
+ * maintain data locality and prevent excessive page faults to the extent possible. The application
+ * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application
+ * can also explicitly migrate memory to a desired processor's memory via
+ * ::cuMemPrefetchAsync.
+ *
+ * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support
+ * with each other, the physical storage for managed memory is created on the GPU which is active
+ * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced
+ * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
+ * memory among such GPUs.
+ *
+ * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
+ * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+ * is zero for at least one of those GPUs, the location chosen for physical storage of managed
+ * memory is system-dependent.
+ * - On Linux, the location chosen will be device memory as long as the current set of active
+ * contexts are on devices that either have peer-to-peer support with each other or have a
+ * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * If there is an active context on a GPU that does not have a non-zero value for that device
+ * attribute and it does not have peer-to-peer support with the other devices that have active
+ * contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
+ * Note that this means that managed memory that is located in device memory is migrated to
+ * host memory if a new context is created on a GPU that doesn't have a non-zero value for
+ * the device attribute and does not support peer-to-peer with at least one of the other devices
+ * that has an active context. This in turn implies that context creation may fail if there is
+ * insufficient host memory to migrate all managed allocations.
+ * - On Windows, the physical storage is always created in 'zero-copy' or host memory.
+ * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
+ * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
+ * restrict CUDA to only use those GPUs that have peer-to-peer support.
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a
+ * non-zero value to force the driver to always use device memory for physical storage.
+ * When this environment variable is set to a non-zero value, all contexts created in
+ * that process on devices that support managed memory have to be peer-to-peer compatible
+ * with each other. Context creation will fail if a context is created on a device that
+ * supports managed memory and is not peer-to-peer compatible with any of the other
+ * managed memory supporting devices on which contexts were previously created, even if
+ * those contexts have been destroyed. These environment variables are described
+ * in the CUDA programming guide under the "CUDA environment variables" section.
+ * - On ARM, managed memory is not available on the discrete GPU of Drive PX-2 platforms.
+ *
+ * \param dptr     - Returned device pointer
+ * \param bytesize - Requested allocation size in bytes
+ * \param flags    - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync,
+ * ::cudaMallocManaged
+ */
+CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags);
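+
+/*
+ * Editor's sketch (not part of the original NVIDIA header): allocating and
+ * releasing managed memory. Assumes the current device reports a non-zero
+ * value for ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY; error checking is omitted.
+ *
+ *   CUdeviceptr managed = 0;
+ *   // 1 MiB, accessible from any stream on any device.
+ *   cuMemAllocManaged(&managed, 1 << 20, CU_MEM_ATTACH_GLOBAL);
+ *   cuMemsetD8(managed, 0, 1 << 20);   // touch it from the device
+ *   cuMemFree(managed);                // managed memory is released with cuMemFree
+ */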
+
+/**
+ * \brief Returns a handle to a compute device
+ *
+ * Returns in \p *device a device handle given a PCI bus ID string.
+ *
+ * \param dev      - Returned device handle
+ *
+ * \param pciBusId - String in one of the following forms:
+ * [domain]:[bus]:[device].[function]
+ * [domain]:[bus]:[device]
+ * [bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGet,
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetPCIBusId,
+ * ::cudaDeviceGetByPCIBusId
+ */
+CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId);
+
+/**
+ * \brief Returns a PCI Bus Id string for the device
+ *
+ * Returns an ASCII string identifying the device \p dev in the NULL-terminated
+ * string pointed to by \p pciBusId. \p len specifies the maximum length of the
+ * string that may be returned.
+ *
+ * \param pciBusId - Returned identifier string for the device in the following format
+ * [domain]:[bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
+ * pciBusId should be large enough to store 13 characters including the NULL-terminator.
+ *
+ * \param len      - Maximum length of string to store in \p pciBusId
+ *
+ * \param dev      - Device to get identifier string for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceGet,
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetByPCIBusId,
+ * ::cudaDeviceGetPCIBusId
+ */
+CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev);
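+
+/*
+ * Editor's sketch (not part of the original NVIDIA header): round-tripping a
+ * device handle through its PCI bus ID string. Assumes cuInit() succeeded;
+ * error checking is omitted.
+ *
+ *   CUdevice dev, sameDev;
+ *   char busId[16];                              // 13 characters plus NUL fit easily
+ *   cuDeviceGet(&dev, 0);
+ *   cuDeviceGetPCIBusId(busId, (int)sizeof(busId), dev);
+ *   cuDeviceGetByPCIBusId(&sameDev, busId);      // sameDev == dev
+ */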
+
+/**
+ * \brief Gets an interprocess handle for a previously allocated event
+ *
+ * Takes as input a previously allocated event. This event must have been
+ * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING
+ * flags set. This opaque handle may be copied into other processes and
+ * opened with ::cuIpcOpenEventHandle to allow efficient hardware
+ * synchronization between GPU work in different processes.
+ *
+ * After the event has been opened in the importing process,
+ * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and
+ * ::cuEventQuery may be used in either process. Performing operations
+ * on the imported event after the exported event has been freed
+ * with ::cuEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode.
+ *
+ * \param pHandle - Pointer to a user allocated CUipcEventHandle
+ *                    in which to return the opaque event handle
+ * \param event   - Event allocated with ::CU_EVENT_INTERPROCESS and
+ *                    ::CU_EVENT_DISABLE_TIMING flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuEventCreate,
+ * ::cuEventDestroy,
+ * ::cuEventSynchronize,
+ * ::cuEventQuery,
+ * ::cuStreamWaitEvent,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cudaIpcGetEventHandle
+ */
+CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event);
+
+/**
+ * \brief Opens an interprocess event handle for use in the current process
+ *
+ * Opens an interprocess event handle exported from another process with
+ * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like
+ * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified.
+ * This event must be freed with ::cuEventDestroy.
+ *
+ * Performing operations on the imported event after the exported event has
+ * been freed with ::cuEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode.
+ *
+ * \param phEvent - Returns the imported event
+ * \param handle  - Interprocess handle to open
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuEventCreate,
+ * ::cuEventDestroy,
+ * ::cuEventSynchronize,
+ * ::cuEventQuery,
+ * ::cuStreamWaitEvent,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cudaIpcOpenEventHandle
+ */
+CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle);
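+
+/*
+ * Editor's sketch (not part of the original NVIDIA header): sharing an event
+ * between two processes. The handle transport (send_to_peer/recv_from_peer) is
+ * a hypothetical application-provided channel such as a pipe or socket; error
+ * checking is omitted.
+ *
+ *   // Exporting process:
+ *   CUevent ev;
+ *   CUipcEventHandle h;
+ *   cuEventCreate(&ev, CU_EVENT_INTERPROCESS | CU_EVENT_DISABLE_TIMING);
+ *   cuIpcGetEventHandle(&h, ev);
+ *   send_to_peer(&h, sizeof(h));          // hypothetical transport
+ *
+ *   // Importing process:
+ *   CUipcEventHandle h;
+ *   CUevent imported;
+ *   recv_from_peer(&h, sizeof(h));        // hypothetical transport
+ *   cuIpcOpenEventHandle(&imported, h);
+ *   cuStreamWaitEvent(0, imported, 0);    // synchronize with the exporter's work
+ */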
+
+/**
+ * \brief Gets an interprocess memory handle for an existing device memory
+ * allocation
+ *
+ * Takes a pointer to the base of an existing device memory allocation created
+ * with ::cuMemAlloc and exports it for use in another process. This is a
+ * lightweight operation and may be called multiple times on an allocation
+ * without adverse effects.
+ *
+ * If a region of memory is freed with ::cuMemFree and a subsequent call
+ * to ::cuMemAlloc returns memory with the same device address,
+ * ::cuIpcGetMemHandle will return a unique handle for the
+ * new memory.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode.
+ *
+ * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return
+ *                    the handle in.
+ * \param dptr    - Base pointer to previously allocated device memory
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cudaIpcGetMemHandle
+ */
+CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr);
+
+/**
+ * \brief Opens an interprocess memory handle exported from another process
+ * and returns a device pointer usable in the local process.
+ *
+ * Maps memory exported from another process with ::cuIpcGetMemHandle into
+ * the current device address space. For contexts on different devices
+ * ::cuIpcOpenMemHandle can attempt to enable peer access between the
+ * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is
+ * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag.
+ * ::cuDeviceCanAccessPeer can determine if a mapping is possible.
+ *
+ * Contexts that may open ::CUipcMemHandles are restricted in the following way.
+ * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened
+ * by one ::CUcontext per ::CUdevice per other process.
+ *
+ * If the memory handle has already been opened by the current context, the
+ * reference count on the handle is incremented by 1 and the existing device pointer
+ * is returned.
+ *
+ * Memory returned from ::cuIpcOpenMemHandle must be freed with
+ * ::cuIpcCloseMemHandle.
+ *
+ * Calling ::cuMemFree on an exported memory region before calling
+ * ::cuIpcCloseMemHandle in the importing context will result in undefined
+ * behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode.
+ *
+ * \param pdptr  - Returned device pointer
+ * \param handle - ::CUipcMemHandle to open
+ * \param Flags  - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_TOO_MANY_PEERS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \note No guarantees are made about the address returned in \p *pdptr.
+ * In particular, multiple processes may not receive the same address for the same \p handle.
+ *
+ * \sa
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcCloseMemHandle,
+ * ::cuCtxEnablePeerAccess,
+ * ::cuDeviceCanAccessPeer,
+ * ::cudaIpcOpenMemHandle
+ */
+CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
+
+/**
+ * \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle
+ *
+ * Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1.
+ * When the reference count reaches 0, this API unmaps the memory. The original allocation
+ * in the exporting process as well as imported mappings in other processes
+ * will be unaffected.
+ *
+ * Any resources used to enable peer access will be freed if this is the
+ * last mapping using them.
+ *
+ * IPC functionality is restricted to devices with support for unified
+ * addressing on Linux and Windows operating systems.
+ * IPC functionality on Windows is restricted to GPUs in TCC mode.
+ *
+ * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_MAP_FAILED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \sa
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuIpcGetEventHandle,
+ * ::cuIpcOpenEventHandle,
+ * ::cuIpcGetMemHandle,
+ * ::cuIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle
+ */
+CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr);
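+
+/*
+ * Editor's sketch (not part of the original NVIDIA header): exporting a device
+ * allocation to another process and mapping it there. The handle transport
+ * (send_to_peer/recv_from_peer) is a hypothetical application-provided channel;
+ * error checking is omitted.
+ *
+ *   // Exporting process:
+ *   CUdeviceptr buf;
+ *   CUipcMemHandle mh;
+ *   cuMemAlloc(&buf, 1 << 20);
+ *   cuIpcGetMemHandle(&mh, buf);
+ *   send_to_peer(&mh, sizeof(mh));        // hypothetical transport
+ *
+ *   // Importing process:
+ *   CUipcMemHandle mh;
+ *   CUdeviceptr mapped;
+ *   recv_from_peer(&mh, sizeof(mh));      // hypothetical transport
+ *   cuIpcOpenMemHandle(&mapped, mh, CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS);
+ *   // ... use 'mapped' ...
+ *   cuIpcCloseMemHandle(mapped);          // before the exporter calls cuMemFree
+ */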
+
+/**
+ * \brief Registers an existing host memory range for use by CUDA
+ *
+ * Page-locks the memory range specified by \p p and \p bytesize and maps it
+ * for the device(s) as specified by \p Flags. This memory range is also added
+ * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate
+ * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed
+ * directly by the device, it can be read or written with much higher bandwidth
+ * than pageable memory that has not been registered.  Page-locking excessive
+ * amounts of memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to register staging areas for data exchange between
+ * host and device.
+ *
+ * The \p Flags parameter enables different options to be specified that
+ * affect the allocation, as follows.
+ *
+ * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be
+ *   considered as pinned memory by all CUDA contexts, not just the one that
+ *   performed the allocation.
+ *
+ * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address
+ *   space. The device pointer to the memory may be obtained by calling
+ *   ::cuMemHostGetDevicePointer().
+ *
+ * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some
+ *   I/O memory space, e.g. the PCI Express resource of a 3rd party device.
+ *
+ * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory
+ *   that is considered read-only by the device.  On platforms without
+ *   ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
+ *   required in order to register memory mapped to the CPU as read-only.  Support
+ *   for the use of this flag can be queried from the device attribute
+ *   ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED.  Using this flag with
+ *   a current context associated with a device that does not have this attribute
+ *   set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED.
+ *
+ * All of these flags are orthogonal to one another: a developer may page-lock
+ * memory that is portable or mapped with no restrictions.
+ *
+ * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for
+ * devices that do not support mapped pinned memory. The failure is deferred
+ * to ::cuMemHostGetDevicePointer() because the memory may be mapped into
+ * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory
+ * can also be accessed from the device using the host pointer \p p.
+ * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
+ * match the original host pointer \p p, depending on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
+ * will match the original pointer \p p. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * The memory page-locked by this function must be unregistered with
+ * ::cuMemHostUnregister().
+ *
+ * \param p        - Host pointer to memory to page-lock
+ * \param bytesize - Size in bytes of the address range to page-lock
+ * \param Flags    - Flags for allocation request
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemHostUnregister,
+ * ::cuMemHostGetFlags,
+ * ::cuMemHostGetDevicePointer,
+ * ::cudaHostRegister
+ */
+CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
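+
+/*
+ * Editor's sketch (not part of the original NVIDIA header): page-locking an
+ * existing malloc'd buffer so that copies from it take the pinned fast path.
+ * Assumes a current context; error checking is omitted.
+ *
+ *   size_t bytes = 1 << 20;
+ *   void *buf = malloc(bytes);
+ *   CUdeviceptr dst;
+ *   cuMemAlloc(&dst, bytes);
+ *   cuMemHostRegister(buf, bytes, CU_MEMHOSTREGISTER_PORTABLE);
+ *   cuMemcpyHtoD(dst, buf, bytes);        // accelerated while 'buf' is registered
+ *   cuMemHostUnregister(buf);
+ *   cuMemFree(dst);
+ *   free(buf);
+ */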
+
+/**
+ * \brief Unregisters a memory range that was registered with ::cuMemHostRegister().
+ *
+ * Unmaps the memory range whose base address is specified by \p p, and makes
+ * it pageable again.
+ *
+ * The base address must be the same one specified to ::cuMemHostRegister().
+ *
+ * \param p - Host pointer to memory to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemHostRegister,
+ * ::cudaHostUnregister
+ */
+CUresult CUDAAPI cuMemHostUnregister(void *p);
+
+/**
+ * \brief Copies memory
+ *
+ * Copies data between two pointers.
+ * \p dst and \p src are base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ * Note that this function infers the type of the transfer (host to host, host to
+ *   device, device to device, or device to host) from the pointer values.  This
+ *   function is only allowed in contexts which support unified addressing.
+ *
+ * \param dst - Destination unified virtual address space pointer
+ * \param src - Source unified virtual address space pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol
+ */
+CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+
+/**
+ * \brief Copies device memory between two contexts
+ *
+ * Copies from device memory in one context to device memory in another
+ * context. \p dstDevice is the base device pointer of the destination memory
+ * and \p dstContext is the destination context.  \p srcDevice is the base
+ * device pointer of the source memory and \p srcContext is the source context.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice  - Destination device pointer
+ * \param dstContext - Destination context
+ * \param srcDevice  - Source device pointer
+ * \param srcContext - Source context
+ * \param ByteCount  - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ * ::cuMemcpy3DPeerAsync,
+ * ::cudaMemcpyPeer
+ */
+CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Host to Device
+ *
+ * Copies from host memory to device memory. \p dstDevice and \p srcHost are
+ * the base addresses of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcHost   - Source host pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyToSymbol
+ */
+CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Device to Host
+ *
+ * Copies from device to host memory. \p dstHost and \p srcDevice specify the
+ * base pointers of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstHost   - Destination host pointer
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyFromSymbol
+ */
+CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
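+
+/*
+ * Editor's sketch (not part of the original NVIDIA header): a host-to-device /
+ * device-to-host round trip with ::cuMemcpyHtoD and ::cuMemcpyDtoH. Assumes a
+ * current context; error checking is omitted.
+ *
+ *   float in[256], out[256];
+ *   CUdeviceptr dbuf;
+ *   for (int i = 0; i < 256; ++i) in[i] = (float)i;
+ *   cuMemAlloc(&dbuf, sizeof(in));
+ *   cuMemcpyHtoD(dbuf, in, sizeof(in));
+ *   cuMemcpyDtoH(out, dbuf, sizeof(out));   // 'out' now mirrors 'in'
+ *   cuMemFree(dbuf);
+ */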
+
+/**
+ * \brief Copies memory from Device to Device
+ *
+ * Copies from device memory to device memory. \p dstDevice and \p srcDevice
+ * are the base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol
+ */
+CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Device to Array
+ *
+ * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset
+ * specify the CUDA array handle and starting offset in bytes of the destination data.
+ * \p srcDevice specifies the base pointer of the source. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstArray  - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyToArray
+ */
+CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Device
+ *
+ * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the
+ * base pointer of the destination and must be naturally aligned with the CUDA
+ * array elements. \p srcArray and \p srcOffset specify the CUDA array handle
+ * and the offset in bytes into the array where the copy is to begin.
+ * \p ByteCount specifies the number of bytes to copy and must be evenly
+ * divisible by the array element size.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcArray  - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyFromArray
+ */
+CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Host to Array
+ *
+ * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
+ * specify the CUDA array handle and starting offset in bytes of the destination
+ * data.  \p srcHost specifies the base address of the source. \p ByteCount specifies
+ * the number of bytes to copy.
+ *
+ * \param dstArray  - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcHost   - Source host pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyToArray
+ */
+CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Host
+ *
+ * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
+ * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
+ * array handle and starting offset in bytes of the source data.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstHost   - Destination host pointer
+ * \param srcArray  - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyFromArray
+ */
+CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Array to Array
+ *
+ * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray
+ * specify the handles of the destination and source CUDA arrays for the copy,
+ * respectively. \p dstOffset and \p srcOffset specify the destination and
+ * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of
+ * bytes to be copied. The CUDA arrays need not have the same element format,
+ * but the elements must be the same size, and \p ByteCount must be
+ * evenly divisible by that size.
+ *
+ * \param dstArray  - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcArray  - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpyArrayToArray
+ */
+CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+
+/**
+ * \brief Copies memory for 2D arrays
+ *
+ * Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as:
+ *
+ * \code
+   typedef struct CUDA_MEMCPY2D_st {
+      unsigned int srcXInBytes, srcY;
+      CUmemorytype srcMemoryType;
+          const void *srcHost;
+          CUdeviceptr srcDevice;
+          CUarray srcArray;
+          unsigned int srcPitch;
+
+      unsigned int dstXInBytes, dstY;
+      CUmemorytype dstMemoryType;
+          void *dstHost;
+          CUdeviceptr dstDevice;
+          CUarray dstArray;
+          unsigned int dstPitch;
+
+      unsigned int WidthInBytes;
+      unsigned int Height;
+   } CUDA_MEMCPY2D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ * specify the (host) base address of the source data and the bytes per row to
+ * apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ * specify the (device) base address of the source data and the bytes per row
+ * to apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the destination data
+ *   and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ *   the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ *   for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ *   the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ *
+ * \par
+ * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
+ * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
+ * significantly slower in the cases where ::cuMemcpy2D() would have returned
+ * an error code.
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray,
+ * ::cudaMemcpy2DFromArray
+ */
+CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy);
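+
+/*
+ * Editor's sketch (not part of the original NVIDIA header): a host-to-device
+ * 2D copy into a pitched allocation. Assumes a current context; error checking
+ * is omitted.
+ *
+ *   const size_t widthBytes = 640 * sizeof(float), height = 480;
+ *   float *hostImg = (float *)calloc(640 * 480, sizeof(float));
+ *   CUdeviceptr devImg;
+ *   size_t devPitch;
+ *   cuMemAllocPitch(&devImg, &devPitch, widthBytes, height, sizeof(float));
+ *
+ *   CUDA_MEMCPY2D c;
+ *   memset(&c, 0, sizeof(c));              // zero offsets and unused members
+ *   c.srcMemoryType = CU_MEMORYTYPE_HOST;
+ *   c.srcHost       = hostImg;
+ *   c.srcPitch      = widthBytes;
+ *   c.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ *   c.dstDevice     = devImg;
+ *   c.dstPitch      = devPitch;
+ *   c.WidthInBytes  = widthBytes;
+ *   c.Height        = height;
+ *   cuMemcpy2D(&c);
+ */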
+
+/**
+ * \brief Copies memory for 2D arrays
+ *
+ * Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as:
+ *
+ * \code
+   typedef struct CUDA_MEMCPY2D_st {
+      unsigned int srcXInBytes, srcY;
+      CUmemorytype srcMemoryType;
+      const void *srcHost;
+      CUdeviceptr srcDevice;
+      CUarray srcArray;
+      unsigned int srcPitch;
+      unsigned int dstXInBytes, dstY;
+      CUmemorytype dstMemoryType;
+      void *dstHost;
+      CUdeviceptr dstDevice;
+      CUarray dstArray;
+      unsigned int dstPitch;
+      unsigned int WidthInBytes;
+      unsigned int Height;
+   } CUDA_MEMCPY2D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ * specify the (host) base address of the source data and the bytes per row to
+ * apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ * specify the (device) base address of the source data and the bytes per row
+ * to apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the destination data
+ *   and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ *   the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ *   for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ *   the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ *
+ * \par
+ * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch().
+ * ::cuMemcpy2DUnaligned() does not have this restriction, but may run
+ * significantly slower in the cases where ::cuMemcpy2D() would have returned
+ * an error code.
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray,
+ * ::cudaMemcpy2DFromArray
+ */
+CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy);
+
+/**
+ * \brief Copies memory for 3D arrays
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
+ *
+ * \code
+        typedef struct CUDA_MEMCPY3D_st {
+
+            unsigned int srcXInBytes, srcY, srcZ;
+            unsigned int srcLOD;
+            CUmemorytype srcMemoryType;
+                const void *srcHost;
+                CUdeviceptr srcDevice;
+                CUarray srcArray;
+                unsigned int srcPitch;  // ignored when src is array
+                unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
+
+            unsigned int dstXInBytes, dstY, dstZ;
+            unsigned int dstLOD;
+            CUmemorytype dstMemoryType;
+                void *dstHost;
+                CUdeviceptr dstDevice;
+                CUarray dstArray;
+                unsigned int dstPitch;  // ignored when dst is array
+                unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
+
+            unsigned int WidthInBytes;
+            unsigned int Height;
+            unsigned int Depth;
+        } CUDA_MEMCPY3D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
+ * ::srcHeight specify the (host) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
+ * ::srcHeight specify the (device) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
+ * ::srcHeight are ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the destination data
+ *   and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
+ * ::dstHeight specify the (host) base address of the destination data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
+ * ::dstHeight specify the (device) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
+ * ::dstHeight are ignored.
+ *
+ * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
+ *   data for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
+ *   destination data for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
+ *   and depth of the 3D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ * - If specified, ::srcHeight must be greater than or equal to ::Height +
+ *   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+ *
+ * \par
+ * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
+ *
+ * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
+ * set to 0.
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy3D
+ */
+CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy);
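+
+/*
+ * Editor's sketch (not part of the original NVIDIA header): a host-to-device
+ * 3D copy into a linear allocation treated as a W x H x D volume of bytes.
+ * Assumes a current context; error checking is omitted.
+ *
+ *   const size_t W = 64, H = 32, D = 16;
+ *   unsigned char *vol = (unsigned char *)calloc(W * H * D, 1);
+ *   CUdeviceptr dvol;
+ *   cuMemAlloc(&dvol, W * H * D);
+ *
+ *   CUDA_MEMCPY3D c;
+ *   memset(&c, 0, sizeof(c));     // zero offsets, LODs and unused members
+ *   c.srcMemoryType = CU_MEMORYTYPE_HOST;
+ *   c.srcHost       = vol;
+ *   c.srcPitch      = W;          // bytes per row
+ *   c.srcHeight     = H;          // rows per 2D slice
+ *   c.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ *   c.dstDevice     = dvol;
+ *   c.dstPitch      = W;
+ *   c.dstHeight     = H;
+ *   c.WidthInBytes  = W;
+ *   c.Height        = H;
+ *   c.Depth         = D;
+ *   cuMemcpy3D(&c);
+ */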
+
+/**
+ * \brief Copies memory between contexts
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy.  See the definition of the ::CUDA_MEMCPY3D_PEER structure
+ * for documentation of its parameters.
+ *
+ * \param pCopy - Parameters for the memory copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ * ::cuMemcpy3DPeerAsync,
+ * ::cudaMemcpy3DPeer
+ */
+CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
+
+/**
+ * \brief Copies memory asynchronously
+ *
+ * Copies data between two pointers.
+ * \p dst and \p src are base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ * Note that this function infers the type of the transfer (host to host, host to
+ *   device, device to device, or device to host) from the pointer values.  This
+ *   function is only allowed in contexts which support unified addressing.
+ *
+ * \param dst       - Destination unified virtual address space pointer
+ * \param src       - Source unified virtual address space pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyAsync,
+ * ::cudaMemcpyToSymbolAsync,
+ * ::cudaMemcpyFromSymbolAsync
+ */
+CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);
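+
+/*
+ * A minimal usage sketch (illustrative; `nBytes` and `hStream` are assumed to
+ * be set up by the caller): because the copy direction is inferred from the
+ * pointer values, a page-locked host pointer can simply be cast to
+ * ::CUdeviceptr under unified addressing. Error checking is omitted.
+ *
+ * \code
+    CUdeviceptr dBuf;
+    void       *hBuf;
+    cuMemAlloc(&dBuf, nBytes);
+    cuMemAllocHost(&hBuf, nBytes);      // page-locked, so the copy can overlap
+    cuMemcpyAsync(dBuf, (CUdeviceptr)(uintptr_t)hBuf, nBytes, hStream);
+    cuStreamSynchronize(hStream);       // hBuf must stay valid until here
+ * \endcode
+ */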
+
+/**
+ * \brief Copies device memory between two contexts asynchronously.
+ *
+ * Copies from device memory in one context to device memory in another
+ * context. \p dstDevice is the base device pointer of the destination memory
+ * and \p dstContext is the destination context.  \p srcDevice is the base
+ * device pointer of the source memory and \p srcContext is the source context.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice  - Destination device pointer
+ * \param dstContext - Destination context
+ * \param srcDevice  - Source device pointer
+ * \param srcContext - Source context
+ * \param ByteCount  - Size of memory copy in bytes
+ * \param hStream    - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpy3DPeerAsync,
+ * ::cudaMemcpyPeerAsync
+ */
+CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory from Host to Device
+ *
+ * Copies from host memory to device memory. \p dstDevice and \p srcHost are
+ * the base addresses of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcHost   - Source host pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyAsync,
+ * ::cudaMemcpyToSymbolAsync
+ */
+CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
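+
+/*
+ * A minimal usage sketch (illustrative; `nBytes` is an assumed input):
+ * uploading from page-locked host memory so the transfer can overlap with
+ * other work queued in the same stream. Error checking is omitted.
+ *
+ * \code
+    CUstream    stream;
+    CUdeviceptr dptr;
+    void       *hptr;
+    cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING);
+    cuMemAlloc(&dptr, nBytes);
+    cuMemHostAlloc(&hptr, nBytes, 0);   // page-locked host staging buffer
+    // ... fill hptr ...
+    cuMemcpyHtoDAsync(dptr, hptr, nBytes, stream);
+    cuStreamSynchronize(stream);        // reuse or free hptr only after this
+ * \endcode
+ */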
+
+/**
+ * \brief Copies memory from Device to Host
+ *
+ * Copies from device to host memory. \p dstHost and \p srcDevice specify the
+ * base pointers of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
+ *
+ * \param dstHost   - Destination host pointer
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyAsync,
+ * ::cudaMemcpyFromSymbolAsync
+ */
+CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory from Device to Device
+ *
+ * Copies from device memory to device memory. \p dstDevice and \p srcDevice
+ * are the base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param srcDevice - Source device pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyAsync,
+ * ::cudaMemcpyToSymbolAsync,
+ * ::cudaMemcpyFromSymbolAsync
+ */
+CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory from Host to Array
+ *
+ * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset
+ * specify the CUDA array handle and starting offset in bytes of the
+ * destination data. \p srcHost specifies the base address of the source.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstArray  - Destination array
+ * \param dstOffset - Offset in bytes of destination array
+ * \param srcHost   - Source host pointer
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyToArrayAsync
+ */
+CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
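+
+/*
+ * A minimal usage sketch (illustrative; `stream` is assumed to be a valid
+ * stream): staging 1024 floats from host memory into a 1D CUDA array at
+ * offset 0. Error checking is omitted.
+ *
+ * \code
+    float                 hData[1024];  // assumed filled elsewhere
+    CUarray               arr;
+    CUDA_ARRAY_DESCRIPTOR ad;
+    ad.Format      = CU_AD_FORMAT_FLOAT;
+    ad.NumChannels = 1;
+    ad.Width       = 1024;
+    ad.Height      = 1;
+    cuArrayCreate(&arr, &ad);
+    cuMemcpyHtoAAsync(arr, 0, hData, sizeof(hData), stream);
+    cuStreamSynchronize(stream);
+    cuArrayDestroy(arr);
+ * \endcode
+ */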
+
+/**
+ * \brief Copies memory from Array to Host
+ *
+ * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base
+ * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA
+ * array handle and starting offset in bytes of the source data.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstHost   - Destination pointer
+ * \param srcArray  - Source array
+ * \param srcOffset - Offset in bytes of source array
+ * \param ByteCount - Size of memory copy in bytes
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpyFromArrayAsync
+ */
+CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+
+/**
+ * \brief Copies memory for 2D arrays
+ *
+ * Perform a 2D memory copy according to the parameters specified in \p pCopy.
+ * The ::CUDA_MEMCPY2D structure is defined as:
+ *
+ * \code
+   typedef struct CUDA_MEMCPY2D_st {
+      unsigned int srcXInBytes, srcY;
+      CUmemorytype srcMemoryType;
+      const void *srcHost;
+      CUdeviceptr srcDevice;
+      CUarray srcArray;
+      unsigned int srcPitch;
+      unsigned int dstXInBytes, dstY;
+      CUmemorytype dstMemoryType;
+      void *dstHost;
+      CUdeviceptr dstDevice;
+      CUarray dstArray;
+      unsigned int dstPitch;
+      unsigned int WidthInBytes;
+      unsigned int Height;
+   } CUDA_MEMCPY2D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch
+ * specify the (host) base address of the source data and the bytes per row to
+ * apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch
+ * specify the (device) base address of the source data and the bytes per row
+ * to apply. ::srcArray is ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the destination data
+ *   and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ *   the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ *   for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ *   the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ * - If specified, ::srcHeight must be greater than or equal to ::Height +
+ *   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+ *
+ * \par
+ * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch().
+ *
+ * \param pCopy   - Parameters for the memory copy
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync
+ */
+CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
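+
+/*
+ * A minimal usage sketch (illustrative; `widthBytes`, `height` and `stream`
+ * are assumed inputs): downloading a pitched device image, allocated with
+ * ::cuMemAllocPitch(), into a tightly packed page-locked host buffer. Error
+ * checking is omitted.
+ *
+ * \code
+    CUDA_MEMCPY2D cp;
+    CUdeviceptr   dImg;
+    size_t        dPitch;
+    void         *hImg;
+    cuMemAllocPitch(&dImg, &dPitch, widthBytes, height, 4);
+    cuMemAllocHost(&hImg, widthBytes * height);
+    memset(&cp, 0, sizeof(cp));
+    cp.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+    cp.srcDevice     = dImg;
+    cp.srcPitch      = dPitch;          // pitch returned by cuMemAllocPitch
+    cp.dstMemoryType = CU_MEMORYTYPE_HOST;
+    cp.dstHost       = hImg;
+    cp.dstPitch      = widthBytes;      // host rows are tightly packed
+    cp.WidthInBytes  = widthBytes;
+    cp.Height        = height;
+    cuMemcpy2DAsync(&cp, stream);
+    cuStreamSynchronize(stream);
+ * \endcode
+ */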
+
+/**
+ * \brief Copies memory for 3D arrays
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as:
+ *
+ * \code
+        typedef struct CUDA_MEMCPY3D_st {
+
+            unsigned int srcXInBytes, srcY, srcZ;
+            unsigned int srcLOD;
+            CUmemorytype srcMemoryType;
+                const void *srcHost;
+                CUdeviceptr srcDevice;
+                CUarray srcArray;
+                unsigned int srcPitch;  // ignored when src is array
+                unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1
+
+            unsigned int dstXInBytes, dstY, dstZ;
+            unsigned int dstLOD;
+            CUmemorytype dstMemoryType;
+                void *dstHost;
+                CUdeviceptr dstDevice;
+                CUarray dstArray;
+                unsigned int dstPitch;  // ignored when dst is array
+                unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1
+
+            unsigned int WidthInBytes;
+            unsigned int Height;
+            unsigned int Depth;
+        } CUDA_MEMCPY3D;
+ * \endcode
+ * where:
+ * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the
+ *   source and destination, respectively; ::CUmemorytype_enum is defined as:
+ *
+ * \code
+   typedef enum CUmemorytype_enum {
+      CU_MEMORYTYPE_HOST = 0x01,
+      CU_MEMORYTYPE_DEVICE = 0x02,
+      CU_MEMORYTYPE_ARRAY = 0x03,
+      CU_MEMORYTYPE_UNIFIED = 0x04
+   } CUmemorytype;
+ * \endcode
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::srcArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and
+ * ::srcHeight specify the (host) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and
+ * ::srcHeight specify the (device) base address of the source data, the bytes
+ * per row, and the height of each 2D slice of the 3D array. ::srcArray is
+ * ignored.
+ *
+ * \par
+ * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the
+ * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and
+ * ::srcHeight are ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ *   specify the (unified virtual address space) base address of the source data
+ *   and the bytes per row to apply.  ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ *   context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost, ::dstPitch and
+ * ::dstHeight specify the (host) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice, ::dstPitch and
+ * ::dstHeight specify the (device) base address of the destination data, the
+ * bytes per row, and the height of each 2D slice of the 3D array. ::dstArray
+ * is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and
+ * ::dstHeight are ignored.
+ *
+ * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source
+ *   data for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes, ::dstY and ::dstZ specify the base address of the
+ *   destination data for the copy.
+ *
+ * \par
+ * For host pointers, the base address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height
+ *   and depth of the 3D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ *   ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ *   ::WidthInBytes + ::dstXInBytes.
+ * - If specified, ::srcHeight must be greater than or equal to ::Height +
+ *   ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY.
+ *
+ * \par
+ * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH).
+ *
+ * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be
+ * set to 0.
+ *
+ * \param pCopy - Parameters for the memory copy
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemcpy3DAsync
+ */
+CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
+
+/**
+ * \brief Copies memory between contexts asynchronously.
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p pCopy.  See the definition of the ::CUDA_MEMCPY3D_PEER structure
+ * for documentation of its parameters.
+ *
+ * \param pCopy - Parameters for the memory copy
+ * \param hStream - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ * ::cuMemcpy3DPeerAsync,
+ * ::cudaMemcpy3DPeerAsync
+ */
+CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the memory range of \p N 8-bit values to the specified value
+ * \p uc.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param uc        - Value to set
+ * \param N         - Number of elements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset
+ */
+CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N);
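+
+/*
+ * A minimal usage sketch (illustrative; `nBytes` is an assumed input):
+ * zero-filling a fresh device allocation before use. Error checking is
+ * omitted.
+ *
+ * \code
+    CUdeviceptr dptr;
+    cuMemAlloc(&dptr, nBytes);
+    cuMemsetD8(dptr, 0x00, nBytes);     // nBytes 8-bit elements set to zero
+ * \endcode
+ */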
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the memory range of \p N 16-bit values to the specified value
+ * \p us. The \p dstDevice pointer must be two byte aligned.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param us        - Value to set
+ * \param N         - Number of elements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset
+ */
+CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the memory range of \p N 32-bit values to the specified value
+ * \p ui. The \p dstDevice pointer must be four byte aligned.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param ui        - Value to set
+ * \param N         - Number of elements
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32Async,
+ * ::cudaMemset
+ */
+CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the 2D memory range of \p Width 8-bit values to the specified value
+ * \p uc. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer (unused if \p Height is 1)
+ * \param uc        - Value to set
+ * \param Width     - Width of row
+ * \param Height    - Number of rows
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2D
+ */
+CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
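+
+/*
+ * A minimal usage sketch (illustrative; `widthBytes` and `height` are assumed
+ * inputs): clearing a pitched 2D allocation. Passing the pitch returned by
+ * ::cuMemAllocPitch() straight through is also the fastest case for this
+ * call. Error checking is omitted.
+ *
+ * \code
+    CUdeviceptr dImg;
+    size_t      pitch;
+    cuMemAllocPitch(&dImg, &pitch, widthBytes, height, 4);
+    cuMemsetD2D8(dImg, pitch, 0xFF, widthBytes, height);
+ * \endcode
+ */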
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the 2D memory range of \p Width 16-bit values to the specified value
+ * \p us. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. The \p dstDevice pointer
+ * and \p dstPitch offset must be two byte aligned. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer (unused if \p Height is 1)
+ * \param us        - Value to set
+ * \param Width     - Width of row
+ * \param Height    - Number of rows
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2D
+ */
+CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+
+/**
+ * \brief Initializes device memory
+ *
+ * Sets the 2D memory range of \p Width 32-bit values to the specified value
+ * \p ui. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. The \p dstDevice pointer
+ * and \p dstPitch offset must be four byte aligned. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer (unused if \p Height is 1)
+ * \param ui        - Value to set
+ * \param Width     - Width of row
+ * \param Height    - Number of rows
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2D
+ */
+CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the memory range of \p N 8-bit values to the specified value
+ * \p uc.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param uc        - Value to set
+ * \param N         - Number of elements
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemsetAsync
+ */
+CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the memory range of \p N 16-bit values to the specified value
+ * \p us. The \p dstDevice pointer must be two byte aligned.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param us        - Value to set
+ * \param N         - Number of elements
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemsetAsync
+ */
+CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the memory range of \p N 32-bit values to the specified value
+ * \p ui. The \p dstDevice pointer must be four byte aligned.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param ui        - Value to set
+ * \param N         - Number of elements
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32,
+ * ::cudaMemsetAsync
+ */
+CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
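+
+/*
+ * A minimal usage sketch (illustrative; `dptr`, `count` and `stream` are
+ * assumed to be set up by the caller): filling a buffer with a 32-bit pattern
+ * without blocking the host, then consuming it from the same stream.
+ *
+ * \code
+    cuMemsetD32Async(dptr, 0xDEADBEEF, count, stream);
+    // ... launch kernels in `stream` that read the filled buffer ...
+    cuStreamSynchronize(stream);
+ * \endcode
+ */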
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the 2D memory range of \p Width 8-bit values to the specified value
+ * \p uc. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer (unused if \p Height is 1)
+ * \param uc        - Value to set
+ * \param Width     - Width of row
+ * \param Height    - Number of rows
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2DAsync
+ */
+CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the 2D memory range of \p Width 16-bit values to the specified value
+ * \p us. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. The \p dstDevice pointer
+ * and \p dstPitch offset must be two byte aligned. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer (unused if \p Height is 1)
+ * \param us        - Value to set
+ * \param Width     - Width of row
+ * \param Height    - Number of rows
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2DAsync
+ */
+CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+
+/**
+ * \brief Sets device memory
+ *
+ * Sets the 2D memory range of \p Width 32-bit values to the specified value
+ * \p ui. \p Height specifies the number of rows to set, and \p dstPitch
+ * specifies the number of bytes between each row. The \p dstDevice pointer
+ * and \p dstPitch offset must be four byte aligned. This function performs
+ * fastest when the pitch is one that has been passed back by
+ * ::cuMemAllocPitch().
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstPitch  - Pitch of destination device pointer (unused if \p Height is 1)
+ * \param ui        - Value to set
+ * \param Width     - Width of row
+ * \param Height    - Number of rows
+ * \param hStream   - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32,
+ * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async,
+ * ::cuMemsetD32, ::cuMemsetD32Async,
+ * ::cudaMemset2DAsync
+ */
+CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+
+/**
+ * \brief Creates a 1D or 2D CUDA array
+ *
+ * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure
+ * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
+ * The ::CUDA_ARRAY_DESCRIPTOR is defined as:
+ *
+ * \code
+    typedef struct {
+        unsigned int Width;
+        unsigned int Height;
+        CUarray_format Format;
+        unsigned int NumChannels;
+    } CUDA_ARRAY_DESCRIPTOR;
+ * \endcode
+ * where:
+ *
+ * - \p Width and \p Height are the width and height of the CUDA array (in
+ * elements); the CUDA array is one-dimensional if \p Height is 0, two-dimensional
+ * otherwise;
+ * - ::Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+    typedef enum CUarray_format_enum {
+        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+        CU_AD_FORMAT_HALF = 0x10,
+        CU_AD_FORMAT_FLOAT = 0x20
+    } CUarray_format;
+ *  \endcode
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * Here are examples of CUDA array descriptions:
+ *
+ * Description for a CUDA array of 2048 floats:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 2048;
+    desc.Height = 1;
+ * \endcode
+ *
+ * Description for a 64 x 64 CUDA array of floats:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 64;
+    desc.Height = 64;
+ * \endcode
+ *
+ * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit
+ * float16's:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_HALF;
+    desc.NumChannels = 4;
+    desc.Width = width;
+    desc.Height = height;
+ * \endcode
+ *
+ * Description for a \p width x \p height CUDA array of 16-bit elements, each
+ * of which is two 8-bit unsigned chars:
+ * \code
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+    desc.NumChannels = 2;
+    desc.Width = width;
+    desc.Height = height;
+ * \endcode
+ *
+ * \param pHandle        - Returned array
+ * \param pAllocateArray - Array descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMallocArray
+ */
+CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
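+
+/*
+ * A minimal usage sketch (illustrative): creating the 64 x 64 float array
+ * described above and releasing it again. Error checking is omitted.
+ *
+ * \code
+    CUarray               hArr;
+    CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format      = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width       = 64;
+    desc.Height      = 64;
+    cuArrayCreate(&hArr, &desc);
+    // ... copy into the array, bind it to a texture reference, etc. ...
+    cuArrayDestroy(hArr);
+ * \endcode
+ */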
+
+/**
+ * \brief Get a 1D or 2D CUDA array descriptor
+ *
+ * Returns in \p *pArrayDescriptor a descriptor containing information on the
+ * format and dimensions of the CUDA array \p hArray. It is useful for
+ * subroutines that have been passed a CUDA array, but need to know the CUDA
+ * array parameters for validation or other purposes.
+ *
+ * \param pArrayDescriptor - Returned array descriptor
+ * \param hArray           - Array to get descriptor of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaArrayGetInfo
+ */
+CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
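+
+/*
+ * A minimal usage sketch (illustrative; `rowBytesOfArray` is a hypothetical
+ * helper, not part of the API): recovering the row size in bytes of an
+ * arbitrary 1D or 2D CUDA array passed in by a caller. Error checking is
+ * omitted.
+ *
+ * \code
+    static size_t rowBytesOfArray(CUarray a)
+    {
+        CUDA_ARRAY_DESCRIPTOR d;
+        size_t elem;
+        cuArrayGetDescriptor(&d, a);
+        switch (d.Format) {
+        case CU_AD_FORMAT_UNSIGNED_INT8:  case CU_AD_FORMAT_SIGNED_INT8:  elem = 1; break;
+        case CU_AD_FORMAT_UNSIGNED_INT16: case CU_AD_FORMAT_SIGNED_INT16:
+        case CU_AD_FORMAT_HALF:                                           elem = 2; break;
+        default:                                                          elem = 4; break;
+        }
+        return d.Width * d.NumChannels * elem;
+    }
+ * \endcode
+ */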
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA array
+ *
+ * Returns the layout properties of a sparse CUDA array in \p sparseProperties.
+ * If the CUDA array is not allocated with the flag ::CUDA_ARRAY3D_SPARSE,
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
+ * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero.
+ * Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero.
+ * Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate. For CUDA arrays obtained
+ * using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties 
+ * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs.
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES
+ * \param[in] array - CUDA array to get the sparse properties of
+ * \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array);
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA mipmapped array
+ *
+ * Returns the sparse array layout properties in \p sparseProperties.
+ * If the CUDA mipmapped array is not allocated with the flag ::CUDA_ARRAY3D_SPARSE,
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the
+ * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth
+ * is less than that of the tile.
+ * For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
+ * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined. 
+ * Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer.
+ * The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero.
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES
+ * \param[in] mipmap - CUDA mipmapped array to get the sparse properties of
+ * \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap);
+
+/**
+ * \brief Returns the memory requirements of a CUDA array
+ *
+ * Returns the memory requirements of a CUDA array in \p memoryRequirements.
+ * If the CUDA array is not allocated with the flag ::CUDA_ARRAY3D_DEFERRED_MAPPING,
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size 
+ * represents the total size of the CUDA array.
+ * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment 
+ * represents the alignment necessary for mapping the CUDA array.
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS
+ * \param[in] array - CUDA array to get the memory requirements of
+ * \param[in] device - Device to get the memory requirements for
+ * \sa ::cuMipmappedArrayGetMemoryRequirements, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUarray array, CUdevice device);
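+
+/*
+ * A minimal usage sketch (illustrative; `arr` is assumed to be a CUDA array
+ * created with the ::CUDA_ARRAY3D_DEFERRED_MAPPING flag and `dev` the device
+ * it targets): querying how much backing memory the array needs and how the
+ * mapping must be aligned.
+ *
+ * \code
+    CUDA_ARRAY_MEMORY_REQUIREMENTS req;
+    if (cuArrayGetMemoryRequirements(&req, arr, dev) == CUDA_SUCCESS) {
+        size_t totalSize = req.size;        // total backing size of the array
+        size_t alignment = req.alignment;   // alignment required for mapping
+    }
+ * \endcode
+ */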
+ 
+/**
+ * \brief Returns the memory requirements of a CUDA mipmapped array
+ *
+ * Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements.
+ * If the CUDA mipmapped array is not allocated with the flag ::CUDA_ARRAY3D_DEFERRED_MAPPING,
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size 
+ * represents the total size of the CUDA mipmapped array.
+ * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment 
+ * represents the alignment necessary for mapping the CUDA mipmapped array.
+ *
+ * \return
+ * ::CUDA_SUCCESS
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS
+ * \param[in] mipmap - CUDA mipmapped array to get the memory requirements of
+ * \param[in] device - Device to get the memory requirements for
+ * \sa ::cuArrayGetMemoryRequirements, ::cuMemMapArrayAsync
+ */
+CUresult CUDAAPI cuMipmappedArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUmipmappedArray mipmap, CUdevice device);
+
+/**
+ * \brief Gets a CUDA array plane from a CUDA array
+ *
+ * Returns in \p pPlaneArray a CUDA array that represents a single format plane
+ * of the CUDA array \p hArray.
+ *
+ * If \p planeIdx is greater than the maximum number of planes in this array, or if the array does
+ * not have a multi-planar format (e.g. ::CU_AD_FORMAT_NV12), then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns
+ * a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format.
+ * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width
+ * of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format.
+ *
+ * \param pPlaneArray   - Returned CUDA array referenced by the \p planeIdx
+ * \param hArray        - Multiplanar CUDA array
+ * \param planeIdx      - Plane index
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuArrayCreate,
+ * ::cudaGetArrayPlane
+ */
+CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
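+
+/*
+ * A short sketch of the NV12 case described above; `nv12Array` is assumed to be a
+ * valid CUDA array with format ::CU_AD_FORMAT_NV12, and error handling is omitted:
+ * \code
+    CUarray lumaPlane, chromaPlane;
+    // Plane 0: full-size, 1-channel CU_AD_FORMAT_UNSIGNED_INT8 (Y).
+    cuArrayGetPlane(&lumaPlane, nv12Array, 0);
+    // Plane 1: half width/height, 2-channel CU_AD_FORMAT_UNSIGNED_INT8 (interleaved UV).
+    cuArrayGetPlane(&chromaPlane, nv12Array, 1);
+ * \endcode
+ */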
+
+/**
+ * \brief Destroys a CUDA array
+ *
+ * Destroys the CUDA array \p hArray.
+ *
+ * \param hArray - Array to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ARRAY_IS_MAPPED,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaFreeArray
+ */
+CUresult CUDAAPI cuArrayDestroy(CUarray hArray);
+
+/**
+ * \brief Creates a 3D CUDA array
+ *
+ * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure
+ * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle.
+ * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as:
+ *
+ * \code
+    typedef struct {
+        unsigned int Width;
+        unsigned int Height;
+        unsigned int Depth;
+        CUarray_format Format;
+        unsigned int NumChannels;
+        unsigned int Flags;
+    } CUDA_ARRAY3D_DESCRIPTOR;
+ * \endcode
+ * where:
+ *
+ * - \p Width, \p Height, and \p Depth are the width, height, and depth of the
+ * CUDA array (in elements); the following types of CUDA arrays can be allocated:
+ *     - A 1D array is allocated if \p Height and \p Depth extents are both zero.
+ *     - A 2D array is allocated if only \p Depth extent is zero.
+ *     - A 3D array is allocated if all three extents are non-zero.
+ *     - A 1D layered CUDA array is allocated if only \p Height is zero and the
+ *       ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number
+ *       of layers is determined by the depth extent.
+ *     - A 2D layered CUDA array is allocated if all three extents are non-zero and
+ *       the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number
+ *       of layers is determined by the depth extent.
+ *     - A cubemap CUDA array is allocated if all three extents are non-zero and the
+ *       ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and
+ *       \p Depth must be six. A cubemap is a special type of 2D layered CUDA array,
+ *       where the six layers represent the six faces of a cube. The order of the six
+ *       layers in memory is the same as that listed in ::CUarray_cubemap_face.
+ *     - A cubemap layered CUDA array is allocated if all three extents are non-zero,
+ *       and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set.
+ *       \p Width must be equal to \p Height, and \p Depth must be a multiple of six.
+ *       A cubemap layered CUDA array is a special type of 2D layered CUDA array that
+ *       consists of a collection of cubemaps. The first six layers represent the first
+ *       cubemap, the next six layers form the second cubemap, and so on.
+ *
+ * - ::Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+    typedef enum CUarray_format_enum {
+        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+        CU_AD_FORMAT_HALF = 0x10,
+        CU_AD_FORMAT_FLOAT = 0x20
+    } CUarray_format;
+ *  \endcode
+ *
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * - ::Flags may be set to
+ *   - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set,
+ *     \p Depth specifies the number of layers, not the depth of a 3D array.
+ *   - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array.
+ *     If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array
+ *     to a surface reference.
+ *   - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be
+ *     equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
+ *     then \p Depth must be a multiple of six.
+ *   - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather.
+ *     Texture gather can only be performed on 2D CUDA arrays.
+ *
+ * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
+ * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
+ * is not specified. For example, TEXTURE1D_WIDTH refers to the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH.
+ *
+ * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag
+ * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
+ * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case.
+ *
+ * <table>
+ * <tr><td><b>CUDA array type</b></td>
+ * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
+ * (depth range)}</b></td>
+ * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
+ * {(width range in elements), (height range), (depth range)}</b></td></tr>
+ * <tr><td>1D</td>
+ * <td><small>{ (1,TEXTURE1D_WIDTH), 0, 0 }</small></td>
+ * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
+ * <tr><td>2D</td>
+ * <td><small>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</small></td>
+ * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
+ * <tr><td>3D</td>
+ * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
+ * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
+ * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
+ * (1,SURFACE3D_DEPTH) }</small></td></tr>
+ * <tr><td>1D Layered</td>
+ * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
+ * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
+ * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>2D Layered</td>
+ * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
+ * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
+ * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>Cubemap</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_WIDTH),
+ * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
+ * <tr><td>Cubemap Layered</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
+ * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
+ * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
+ * </table>
+ *
+ * Here are examples of CUDA array descriptions:
+ *
+ * Description for a CUDA array of 2048 floats:
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 2048;
+    desc.Height = 0;
+    desc.Depth = 0;
+ * \endcode
+ *
+ * Description for a 64 x 64 CUDA array of floats:
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 64;
+    desc.Height = 64;
+    desc.Depth = 0;
+ * \endcode
+ *
+ * Description for a \p width x \p height x \p depth CUDA array of 64-bit,
+ * 4x16-bit float16's:
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_HALF;
+    desc.NumChannels = 4;
+    desc.Width = width;
+    desc.Height = height;
+    desc.Depth = depth;
+ * \endcode
+ *
+ * \param pHandle        - Returned array
+ * \param pAllocateArray - 3D array descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMalloc3DArray
+ */
+CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray);
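+
+/*
+ * As a further sketch, a 2D layered array of 16 layers of 512 x 512 float4 elements
+ * might be described and created as follows (error handling omitted):
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc = { 0 };
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 4;
+    desc.Width = 512;
+    desc.Height = 512;
+    desc.Depth = 16;                    // number of layers, because of the flag below
+    desc.Flags = CUDA_ARRAY3D_LAYERED;
+
+    CUarray layeredArray;
+    cuArray3DCreate(&layeredArray, &desc);
+ * \endcode
+ */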
+
+/**
+ * \brief Get a 3D CUDA array descriptor
+ *
+ * Returns in \p *pArrayDescriptor a descriptor containing information on the
+ * format and dimensions of the CUDA array \p hArray. It is useful for
+ * subroutines that have been passed a CUDA array, but need to know the CUDA
+ * array parameters for validation or other purposes.
+ *
+ * This function may be called on 1D and 2D arrays, in which case the \p Height
+ * and/or \p Depth members of the descriptor struct will be set to 0.
+ *
+ * \param pArrayDescriptor - Returned 3D array descriptor
+ * \param hArray           - 3D array to get descriptor of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaArrayGetInfo
+ */
+CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+
+/**
+ * \brief Creates a CUDA mipmapped array
+ *
+ * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure
+ * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle.
+ * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is
+ * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))].
+ *
+ * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as:
+ *
+ * \code
+    typedef struct {
+        unsigned int Width;
+        unsigned int Height;
+        unsigned int Depth;
+        CUarray_format Format;
+        unsigned int NumChannels;
+        unsigned int Flags;
+    } CUDA_ARRAY3D_DESCRIPTOR;
+ * \endcode
+ * where:
+ *
+ * - \p Width, \p Height, and \p Depth are the width, height, and depth of the
+ * CUDA array (in elements); the following types of CUDA arrays can be allocated:
+ *     - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero.
+ *     - A 2D mipmapped array is allocated if only \p Depth extent is zero.
+ *     - A 3D mipmapped array is allocated if all three extents are non-zero.
+ *     - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the
+ *       ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number
+ *       of layers is determined by the depth extent.
+ *     - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and
+ *       the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number
+ *       of layers is determined by the depth extent.
+ *     - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the
+ *       ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and
+ *       \p Depth must be six. A cubemap is a special type of 2D layered CUDA array,
+ *       where the six layers represent the six faces of a cube. The order of the six
+ *       layers in memory is the same as that listed in ::CUarray_cubemap_face.
+ *     - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero,
+ *       and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set.
+ *       \p Width must be equal to \p Height, and \p Depth must be a multiple of six.
+ *       A cubemap layered CUDA array is a special type of 2D layered CUDA array that
+ *       consists of a collection of cubemaps. The first six layers represent the first
+ *       cubemap, the next six layers form the second cubemap, and so on.
+ *
+ * - ::Format specifies the format of the elements; ::CUarray_format is
+ * defined as:
+ * \code
+    typedef enum CUarray_format_enum {
+        CU_AD_FORMAT_UNSIGNED_INT8 = 0x01,
+        CU_AD_FORMAT_UNSIGNED_INT16 = 0x02,
+        CU_AD_FORMAT_UNSIGNED_INT32 = 0x03,
+        CU_AD_FORMAT_SIGNED_INT8 = 0x08,
+        CU_AD_FORMAT_SIGNED_INT16 = 0x09,
+        CU_AD_FORMAT_SIGNED_INT32 = 0x0a,
+        CU_AD_FORMAT_HALF = 0x10,
+        CU_AD_FORMAT_FLOAT = 0x20
+    } CUarray_format;
+ *  \endcode
+ *
+ * - \p NumChannels specifies the number of packed components per CUDA array
+ * element; it may be 1, 2, or 4;
+ *
+ * - ::Flags may be set to
+ *   - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set,
+ *     \p Depth specifies the number of layers, not the depth of a 3D array.
+ *   - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of
+ *     the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to
+ *     bind a mipmap level of the CUDA mipmapped array to a surface reference.
+ *   - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be
+ *     equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set,
+ *     then \p Depth must be a multiple of six.
+ *   - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather.
+ *     Texture gather can only be performed on 2D CUDA mipmapped arrays.
+ *
+ * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table.
+ * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute
+ * is not specified. For example, TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH.
+ *
+ * <table>
+ * <tr><td><b>CUDA array type</b></td>
+ * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range),
+ * (depth range)}</b></td>
+ * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br>
+ * {(width range in elements), (height range), (depth range)}</b></td></tr>
+ * <tr><td>1D</td>
+ * <td><small>{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }</small></td>
+ * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr>
+ * <tr><td>2D</td>
+ * <td><small>{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }</small></td>
+ * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr>
+ * <tr><td>3D</td>
+ * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) }
+ * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE),
+ * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td>
+ * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT),
+ * (1,SURFACE3D_DEPTH) }</small></td></tr>
+ * <tr><td>1D Layered</td>
+ * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0,
+ * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0,
+ * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>2D Layered</td>
+ * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT),
+ * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT),
+ * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr>
+ * <tr><td>Cubemap</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_WIDTH),
+ * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr>
+ * <tr><td>Cubemap Layered</td>
+ * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH),
+ * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td>
+ * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH),
+ * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr>
+ * </table>
+ *
+ *
+ * \param pHandle             - Returned mipmapped array
+ * \param pMipmappedArrayDesc - mipmapped array descriptor
+ * \param numMipmapLevels     - Number of mipmap levels
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMipmappedArrayDestroy,
+ * ::cuMipmappedArrayGetLevel,
+ * ::cuArrayCreate,
+ * ::cudaMallocMipmappedArray
+ */
+CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels);
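+
+/*
+ * A brief sketch tying this to ::cuMipmappedArrayGetLevel below: create a mipmapped
+ * 2D array of 1024 x 1024 float elements with a full mip chain and fetch level 0
+ * (error handling omitted):
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc = { 0 };
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 1;
+    desc.Width = 1024;
+    desc.Height = 1024;
+    desc.Depth = 0;
+
+    CUmipmappedArray mipmapped;
+    // 11 = 1 + floor(log2(1024)); larger values are clamped to this range anyway.
+    cuMipmappedArrayCreate(&mipmapped, &desc, 11);
+
+    CUarray level0;
+    cuMipmappedArrayGetLevel(&level0, mipmapped, 0);
+ * \endcode
+ */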
+
+/**
+ * \brief Gets a mipmap level of a CUDA mipmapped array
+ *
+ * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level
+ * of the CUDA mipmapped array \p hMipmappedArray.
+ *
+ * If \p level is greater than the maximum number of levels in this mipmapped array,
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param pLevelArray     - Returned mipmap level CUDA array
+ * \param hMipmappedArray - CUDA mipmapped array
+ * \param level           - Mipmap level
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMipmappedArrayCreate,
+ * ::cuMipmappedArrayDestroy,
+ * ::cuArrayCreate,
+ * ::cudaGetMipmappedArrayLevel
+ */
+CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
+
+/**
+ * \brief Destroys a CUDA mipmapped array
+ *
+ * Destroys the CUDA mipmapped array \p hMipmappedArray.
+ *
+ * \param hMipmappedArray - Mipmapped array to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ARRAY_IS_MAPPED,
+ * ::CUDA_ERROR_CONTEXT_IS_DESTROYED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMipmappedArrayCreate,
+ * ::cuMipmappedArrayGetLevel,
+ * ::cuArrayCreate,
+ * ::cudaFreeMipmappedArray
+ */
+CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray);
+
+/** 
+* \brief Retrieve handle for an address range 
+* 
+* Get a handle of the specified type to an address range. The address range
+* must have been obtained by a prior call to either ::cuMemAlloc or ::cuMemAddressReserve.
+* If the address range was obtained via ::cuMemAddressReserve, it must also be fully mapped via ::cuMemMap.
+* 
+* Users must ensure the \p dptr and \p size are aligned to the host page size.
+* 
+* When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD,
+* users are expected to query for dma_buf support for the platform
+* by using ::CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED device attribute before calling
+* this API. The \p handle will be interpreted as a pointer to an integer to store the dma_buf file descriptor.
+* Users must ensure the entire address range is backed and mapped when
+* the address range is allocated by ::cuMemAddressReserve. All the physical
+* allocations backing the address range must be resident on the same device and
+* have identical allocation properties. Users are also expected to retrieve a
+* new handle every time the underlying physical allocation(s) corresponding
+* to a previously queried VA range are changed.
+* 
+* \param[out] handle     - Pointer to the location where the returned handle will be stored. 
+* \param[in] dptr        - Pointer to a valid CUDA device allocation. Must be aligned to host page size.
+* \param[in] size        - Length of the address range. Must be aligned to host page size.
+* \param[in] handleType  - Type of handle requested (defines type and size of the \p handle output parameter)
+* \param[in] flags       - Reserved, must be zero 
+* 
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*/
+CUresult CUDAAPI cuMemGetHandleForAddressRange(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags);
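+
+/*
+ * A minimal sketch of the dma_buf path described above, assuming a Linux system,
+ * a current context on device `dev`, and a host-page-aligned allocation `dptr` of
+ * `size` bytes (error handling omitted):
+ * \code
+    int dmaBufSupported = 0;
+    cuDeviceGetAttribute(&dmaBufSupported, CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED, dev);
+    if (dmaBufSupported) {
+        int fd = -1;   // the handle is written back as a dma_buf file descriptor
+        cuMemGetHandleForAddressRange(&fd, dptr, size,
+                                      CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, 0);
+        // ... pass fd to another API or process, then close(fd) when done.
+    }
+ * \endcode
+ */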
+
+/** @} */ /* END CUDA_MEM */
+
+/**
+ * \defgroup CUDA_VA Virtual Memory Management
+ *
+ * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the virtual memory management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+* \brief Allocate an address range reservation. 
+* 
+* Reserves a virtual address range based on the given parameters, giving
+* the starting address of the range in \p ptr.  This API requires a system that
+* supports UVA.  The size and address parameters must be a multiple of the
+* host page size and the alignment must be a power of two or zero for default
+* alignment.
+*
+* \param[out] ptr       - Resulting pointer to start of virtual address range allocated
+* \param[in]  size      - Size of the reserved virtual address range requested
+* \param[in]  alignment - Alignment of the reserved virtual address range requested
+* \param[in]  addr      - Fixed starting address range requested
+* \param[in]  flags     - Currently unused, must be zero
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemAddressFree
+*/
+CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags);
+
+/**
+* \brief Free an address range reservation.
+* 
+* Frees a virtual address range reserved by ::cuMemAddressReserve.  The size
+* must match what was given to ::cuMemAddressReserve and the \p ptr given must
+* match what was returned from ::cuMemAddressReserve.
+*
+* \param[in] ptr  - Starting address of the virtual address range to free
+* \param[in] size - Size of the virtual address region to free
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemAddressReserve
+*/
+CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size);
+
+/**
+* \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties
+*
+* This creates a memory allocation on the target device specified through the
+* \p prop structure. The created allocation will not have any device or host
+* mappings. The generic memory \p handle for the allocation can be
+* mapped to the address space of the calling process via ::cuMemMap. This handle
+* cannot be transmitted directly to other processes (see
+* ::cuMemExportToShareableHandle).  On Windows, the caller must also pass
+* an LPSECURITYATTRIBUTE in \p prop to be associated with this handle, which
+* limits or allows access to this handle for a recipient process (see
+* ::CUmemAllocationProp::win32HandleMetaData for more).  The \p size of this
+* allocation must be a multiple of the value given via
+* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM
+* flag.
+* If ::CUmemAllocationProp::allocFlags::usage contains the ::CU_MEM_CREATE_USAGE_TILE_POOL flag, then
+* the memory allocation is intended only to be used as a backing tile pool for sparse CUDA arrays
+* and sparse CUDA mipmapped arrays.
+* (see ::cuMemMapArrayAsync).
+*
+* \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle.
+* \param[in]  size   - Size of the allocation requested
+* \param[in]  prop   - Properties of the allocation to create.
+* \param[in]  flags  - flags for future use, must be zero now.
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags);
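+
+/*
+ * A minimal sketch of creating a physical allocation sized to the minimum
+ * granularity, assuming a current context on device `dev` (error handling omitted):
+ * \code
+    CUmemAllocationProp prop = { 0 };
+    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    prop.location.id = dev;
+
+    size_t granularity = 0;
+    cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+
+    size_t size = 16 * granularity;          // size must be a granularity multiple
+    CUmemGenericAllocationHandle allocHandle;
+    cuMemCreate(&allocHandle, size, &prop, 0);
+    // ... map with cuMemMap / cuMemSetAccess, then cuMemRelease(allocHandle).
+ * \endcode
+ */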
+
+/**
+* \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate.
+* 
+* Frees the memory that was allocated on a device through cuMemCreate.
+*
+* The memory allocation will be freed when all outstanding mappings to the memory
+* are unmapped and when all outstanding references to the handle (including its
+* shareable counterparts) are also released. The generic memory handle can be
+* freed even when there are still outstanding mappings made with this handle. Each
+* time a recipient process imports a shareable handle, it needs to pair it with
+* ::cuMemRelease for the handle to be freed.  If \p handle is not a valid handle,
+* the behavior is undefined.
+*
+* \param[in] handle Value of handle which was returned previously by cuMemCreate.
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemCreate
+*/
+CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle);
+
+/**
+* \brief Maps an allocation handle to a reserved virtual address range.
+*
+* Maps bytes of memory represented by \p handle starting from byte \p offset to
+* \p size to address range [\p addr, \p addr + \p size]. This range must be an
+* address reservation previously reserved with ::cuMemAddressReserve, and
+* \p offset + \p size must be less than the size of the memory allocation.
+* \p ptr, \p size, and \p offset must each be a multiple of the value given via
+* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag.
+* 
+* Please note calling ::cuMemMap does not make the address accessible,
+* the caller needs to update accessibility of a contiguous mapped VA
+* range by calling ::cuMemSetAccess.
+* 
+* Once a recipient process obtains a shareable memory handle
+* from ::cuMemImportFromShareableHandle, the process must
+* use ::cuMemMap to map the memory into its address ranges before
+* setting accessibility with ::cuMemSetAccess.
+*  
+* ::cuMemMap can only create mappings on VA range reservations 
+* that are not currently mapped.
+* 
+* \param[in] ptr    - Address where memory will be mapped. 
+* \param[in] size   - Size of the memory mapping. 
+* \param[in] offset - Offset into the memory represented by \p handle from which
+*                     to start mapping. Note: currently must be zero.
+* \param[in] handle - Handle to a shareable memory 
+* \param[in] flags  - flags for future use, must be zero now. 
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags);
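+
+/*
+ * A minimal end-to-end sketch of the virtual memory flow (reserve, map, enable
+ * access, tear down), assuming `allocHandle` and `size` come from a prior
+ * ::cuMemCreate and a current context on device `dev` (error handling omitted):
+ * \code
+    CUdeviceptr va = 0;
+    cuMemAddressReserve(&va, size, 0, 0, 0);   // 0 = default alignment, no fixed address
+    cuMemMap(va, size, 0, allocHandle, 0);
+
+    CUmemAccessDesc access = { 0 };
+    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    access.location.id = dev;
+    access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    cuMemSetAccess(va, size, &access, 1);      // now the range is usable from dev
+
+    // ... use va as a normal device pointer ...
+
+    cuMemUnmap(va, size);
+    cuMemRelease(allocHandle);
+    cuMemAddressFree(va, size);
+ * \endcode
+ */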
+
+/**
+ * \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays
+ *
+ * Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays.
+ * Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count.
+ * The structure ::CUarrayMapInfo is defined as follows:
+ \code
+     typedef struct CUarrayMapInfo_st {
+        CUresourcetype resourceType;                   
+        union {
+            CUmipmappedArray mipmap;
+            CUarray array;
+        } resource;
+
+        CUarraySparseSubresourceType subresourceType;   
+        union {
+            struct {
+                unsigned int level;                     
+                unsigned int layer;                     
+                unsigned int offsetX;                   
+                unsigned int offsetY;                   
+                unsigned int offsetZ;                   
+                unsigned int extentWidth;               
+                unsigned int extentHeight;              
+                unsigned int extentDepth;               
+            } sparseLevel;
+            struct {
+                unsigned int layer;
+                unsigned long long offset;              
+                unsigned long long size;                
+            } miptail;
+        } subresource;
+
+        CUmemOperationType memOperationType;
+        
+        CUmemHandleType memHandleType;                  
+        union {
+            CUmemGenericAllocationHandle memHandle;
+        } memHandle;
+
+        unsigned long long offset;                      
+        unsigned int deviceBitMask;                     
+        unsigned int flags;                             
+        unsigned int reserved[2];                       
+    } CUarrayMapInfo;
+ \endcode
+ *
+ * where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on.
+ * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then 
+ * ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle.
+ * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using
+ * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE
+ * or ::CUDA_ARRAY3D_DEFERRED_MAPPING.
+ * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned.
+ * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY 
+ * then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle.
+ * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been
+ * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE
+ * or ::CUDA_ARRAY3D_DEFERRED_MAPPING.
+ *
+ * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. 
+ * ::CUarraySparseSubresourceType_enum is defined as:
+ \code
+    typedef enum CUarraySparseSubresourceType_enum {
+        CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0,
+        CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1
+    } CUarraySparseSubresourceType;
+ \endcode
+ *
+ * where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a
+ * sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which
+ * are too small to span at least one tile in any dimension constitute the mip tail region as indicated by 
+ * ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type.
+ *
+ * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL
+ * then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents.
+ * The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY
+ * and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively.
+ * The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight
+ * and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively.
+ * These offsets and extents must be aligned to the corresponding tile dimension.
+ * For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise,
+ * must be zero.
+ * For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise,
+ * must be zero.
+ * ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth
+ * must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays.
+ * Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties
+ *
+ * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL
+ * then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in 
+ * ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size.
+ * Both the mip tail offset and the mip tail size must be aligned to the tile size.
+ * For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags
+ * as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index.
+ * Otherwise, must be zero.
+ *
+ * If ::CUarrayMapInfo::resource::array or ::CUarrayMapInfo::resource::mipmap was created with ::CUDA_ARRAY3D_DEFERRED_MAPPING
+ * flag set, the ::CUarrayMapInfo::subresourceType and the contents of ::CUarrayMapInfo::subresource will be ignored.
+ *
+ * ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as:
+ \code
+    typedef enum CUmemOperationType_enum {
+        CU_MEM_OPERATION_TYPE_MAP = 1,
+        CU_MEM_OPERATION_TYPE_UNMAP = 2
+    } CUmemOperationType;
+ \endcode
+ * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource 
+ * will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset. 
+ * The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also, 
+ * ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC.
+ * 
+ * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation
+ * is performed. ::CUarrayMapInfo::memHandle must be NULL.
+ *
+ * ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory. 
+ * Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream. 
+ * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match 
+ * the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle.
+ *
+ * ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ *
+ * \param[in] mapInfoList - List of ::CUarrayMapInfo
+ * \param[in] count       - Count of ::CUarrayMapInfo in \p mapInfoList
+ * \param[in] hStream     - Stream identifier for the stream to use for map or unmap operations
+ *
+ * \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties
+ */
+CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo  *mapInfoList, unsigned int count, CUstream hStream);
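+
+/*
+ * A condensed sketch of mapping one tile-aligned subregion of a sparse CUDA array
+ * onto a tile pool. The names `sparseArray`, `tilePool`, `tileWidth`, `tileHeight`,
+ * `dev` and `hStream` are assumptions: `sparseArray` created with ::CUDA_ARRAY3D_SPARSE,
+ * `tilePool` created with ::CU_MEM_CREATE_USAGE_TILE_POOL, and the tile extents taken
+ * from ::cuArrayGetSparseProperties (error handling omitted):
+ * \code
+    CUarrayMapInfo mapInfo = { 0 };
+    mapInfo.resourceType = CU_RESOURCE_TYPE_ARRAY;
+    mapInfo.resource.array = sparseArray;
+    mapInfo.subresourceType = CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL;
+    mapInfo.subresource.sparseLevel.offsetX = 0;
+    mapInfo.subresource.sparseLevel.offsetY = 0;
+    mapInfo.subresource.sparseLevel.extentWidth  = tileWidth;
+    mapInfo.subresource.sparseLevel.extentHeight = tileHeight;
+    mapInfo.subresource.sparseLevel.extentDepth  = 1;     // 2D array
+    mapInfo.memOperationType = CU_MEM_OPERATION_TYPE_MAP;
+    mapInfo.memHandleType = CU_MEM_HANDLE_TYPE_GENERIC;
+    mapInfo.memHandle.memHandle = tilePool;
+    mapInfo.offset = 0;                                   // offset into the tile pool
+    mapInfo.deviceBitMask = 1U << dev;                    // device ordinal of the stream's device
+    cuMemMapArrayAsync(&mapInfo, 1, hStream);
+ * \endcode
+ */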
+
+/**
+* \brief Unmap the backing memory of a given address range.
+*
+* The range must be the entire contiguous address range that was mapped to.  In
+* other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped
+* by ::cuMemCreate / ::cuMemMap.  Any backing memory allocations will be freed
+* if there are no existing mappings and there are no unreleased memory handles.
+*
+* When ::cuMemUnmap returns successfully the address range is converted to an
+* address reservation and can be used for future calls to ::cuMemMap.  Any new
+* mapping to this virtual address will need to have access granted through
+* ::cuMemSetAccess, as all mappings start with no accessibility setup.
+*
+* \param[in] ptr  - Starting address for the virtual address range to unmap
+* \param[in] size - Size of the virtual address range to unmap
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+* \note_sync
+*
+* \sa ::cuMemCreate, ::cuMemAddressReserve
+*/
+CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size);
+
+/**
+* \brief Set the access flags for each location specified in \p desc for the given virtual address range
+* 
+* Given the virtual address range via \p ptr and \p size, and the locations
+* in the array given by \p desc and \p count, set the access flags for the
+* target locations.  The range must be a fully mapped address range
+* containing all allocations created by ::cuMemMap / ::cuMemCreate.
+*
+* \param[in] ptr   - Starting address for the virtual address range
+* \param[in] size  - Length of the virtual address range
+* \param[in] desc  - Array of ::CUmemAccessDesc that describe how to change the
+*                    mapping for each location specified
+* \param[in] count - Number of ::CUmemAccessDesc in \p desc
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+* \note_sync
+*
+* \sa ::cuMemSetAccess, ::cuMemCreate, ::cuMemMap
+*/
+CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count);
+
+/**
+* \brief Get the access \p flags set for the given \p location and \p ptr
+*
+* \param[out] flags   - Flags set for this location
+* \param[in] location - Location to check the flags for
+* \param[in] ptr      - Address to check the access flags for
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemSetAccess
+*/
+CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr);
+
+/**
+* \brief Exports an allocation to a requested shareable handle type
+*
+* Given a CUDA memory handle, create a shareable memory
+* allocation handle that can be used to share the memory with other
+* processes. The recipient process can convert the shareable handle back into a
+* CUDA memory handle using ::cuMemImportFromShareableHandle and map
+* it with ::cuMemMap. The implementation of what this handle is and how it
+* can be transferred is defined by the requested handle type in \p handleType.
+*
+* Once all shareable handles are closed and the allocation is released, the allocated
+* memory referenced will be released back to the OS and uses of the CUDA handle afterward
+* will lead to undefined behavior.
+*
+* This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL)
+* that support importing memory from the shareable type.
+*
+* \param[out] shareableHandle - Pointer to the location in which to store the requested handle type
+* \param[in] handle           - CUDA handle for the memory allocation
+* \param[in] handleType       - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter)
+* \param[in] flags            - Reserved, must be zero
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags);
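+
+/*
+ * A brief sketch of exporting an allocation as a POSIX file descriptor; it assumes
+ * `allocHandle` was created by ::cuMemCreate with
+ * ::CUmemAllocationProp::requestedHandleTypes set to
+ * ::CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR (error handling omitted):
+ * \code
+    int fd = -1;
+    cuMemExportToShareableHandle(&fd, allocHandle, CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR, 0);
+    // Send fd to another process (e.g. over a UNIX domain socket); the importer calls:
+    //     CUmemGenericAllocationHandle imported;
+    //     cuMemImportFromShareableHandle(&imported, (void *)(uintptr_t)fd,
+    //                                    CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR);
+ * \endcode
+ */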
+
+/**
+* \brief Imports an allocation from a requested shareable handle type.
+*
+* If the current process cannot support the memory described by this shareable
+* handle, this API will error as CUDA_ERROR_NOT_SUPPORTED.
+*
+* \note Importing shareable handles exported from some graphics APIs (Vulkan, OpenGL, etc.)
+* created on devices under an SLI group may not be supported, and thus this API will
+* return CUDA_ERROR_NOT_SUPPORTED.
+* There is no guarantee that the contents of \p handle will be the same CUDA memory handle
+* for the same given OS shareable handle, or the same underlying allocation.
+*
+* \param[out] handle       - CUDA Memory handle for the memory allocation.
+* \param[in]  osHandle     - Shareable Handle representing the memory allocation that is to be imported. 
+* \param[in]  shHandleType - handle type of the exported handle ::CUmemAllocationHandleType.
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease
+*/
+CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
+
+/**
+* \brief Calculates either the minimal or recommended granularity 
+*
+* Calculates either the minimal or recommended granularity
+* for a given allocation specification and returns it in \p granularity.  This
+* granularity can be used as a multiple for alignment, size, or address mapping.
+*
+* \param[out] granularity Returned granularity.
+* \param[in]  prop Property for which to determine the granularity
+* \param[in]  option Determines which granularity to return
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemCreate, ::cuMemMap
+*/
+CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option);
+
+/**
+* \brief Retrieve the contents of the property structure defining properties for this handle
+*
+* \param[out] prop  - Pointer to a properties structure which will hold the information about this handle
+* \param[in] handle - Handle which to perform the query on
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemCreate, ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle);
+
+/**
+* \brief Given an address \p addr, returns the allocation handle of the backing memory allocation.
+*
+* The handle is guaranteed to be the same handle value used to map the memory. If the address
+* requested is not mapped, the function will fail. The returned handle must be released with
+* a corresponding number of calls to ::cuMemRelease.
+*
+* \note The address \p addr can be any address in a range previously mapped
+* by ::cuMemMap, and not necessarily the start address.
+*
+* \param[out] handle CUDA Memory handle for the backing memory allocation.
+* \param[in] addr Memory address to query, that has been mapped previously.
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap
+*/
+CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr);
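+
+/*
+ * A small sketch; `mappedPtr` is assumed to be any address inside a range previously
+ * mapped with ::cuMemMap (error handling omitted):
+ * \code
+    CUmemGenericAllocationHandle backing;
+    cuMemRetainAllocationHandle(&backing, (void *)(uintptr_t)mappedPtr);
+    // ... e.g. query properties via cuMemGetAllocationPropertiesFromHandle(&prop, backing) ...
+    cuMemRelease(backing);   // balance the reference taken by the retain call
+ * \endcode
+ */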
+
+/** @} */ /* END CUDA_VA */
+
+/**
+ * \defgroup CUDA_MALLOC_ASYNC Stream Ordered Memory Allocator
+ *
+ * ___MANBRIEF___ Functions for performing allocation and free operations in stream order.
+ *                Functions for controlling the behavior of the underlying allocator.
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the stream ordered memory allocator exposed by the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ *
+ * \section CUDA_MALLOC_ASYNC_overview overview
+ *
+ * The asynchronous allocator allows the user to allocate and free in stream order.
+ * All asynchronous accesses of the allocation must happen between
+ * the stream executions of the allocation and the free. If the memory is accessed
+ * outside of the promised stream order, the resulting use-before-allocation /
+ * use-after-free error causes undefined behavior.
+ *
+ * The allocator is free to reallocate the memory as long as it can guarantee
+ * that compliant memory accesses will not overlap temporally.
+ * The allocator may refer to internal stream ordering as well as inter-stream dependencies
+ * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee.
+ * The allocator may also insert inter-stream dependencies to establish the temporal guarantee. 
+ *
+ * \section CUDA_MALLOC_ASYNC_support Supported Platforms
+ *
+ * Whether or not a device supports the integrated stream ordered memory allocator
+ * may be queried by calling ::cuDeviceGetAttribute() with the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED
+ */
+
+/**
+ * \brief Frees memory with stream ordered semantics
+ *
+ * Inserts a free operation into \p hStream.
+ * The allocation must not be accessed after stream execution reaches the free.
+ * After this API returns, accessing the memory from any subsequent work launched on the GPU
+ * or querying its pointer attributes results in undefined behavior.
+ *
+ * \note During stream capture, this function results in the creation of a free node and
+ *       must therefore be passed the address of a graph allocation.
+ * 
+ * \param dptr - memory to free
+ * \param hStream - The stream establishing the stream ordering contract. 
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream);
+
+/**
+ * \brief Allocates memory with stream ordered semantics
+ *
+ * Inserts an allocation operation into \p hStream.
+ * A pointer to the allocated memory is returned immediately in *dptr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the memory pool current to the stream's device.
+ *
+ * \note The default memory pool of a device contains device memory from that device.
+ * \note Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs. 
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] dptr    - Returned device pointer
+ * \param[in] bytesize - Number of bytes to allocate
+ * \param[in] hStream  - The stream establishing the stream ordering contract and the memory pool to allocate from
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool,
+ *     ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate,
+ *     ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute
+ */
+CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream);
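+
+/*
+ * A minimal stream-ordered allocation sketch; `hStream` and `bytes` are assumed to
+ * be a valid stream and a positive size (error handling omitted):
+ * \code
+    CUdeviceptr buf;
+    cuMemAllocAsync(&buf, bytes, hStream);
+    // ... launch work into hStream that uses buf ...
+    cuMemFreeAsync(buf, hStream);
+    // The host may only rely on the memory being released after the free has
+    // executed, e.g. after cuStreamSynchronize(hStream).
+ * \endcode
+ */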
+
+/**
+ * \brief Tries to release memory back to the OS
+ *
+ * Releases memory back to the OS until the pool contains fewer than minBytesToKeep
+ * reserved bytes, or there is no more memory that the allocator can safely release.
+ * The allocator cannot release OS allocations that back outstanding asynchronous allocations.
+ * The OS allocations may happen at different granularity from the user allocations.
+ *
+ * \note Allocations that have not been freed count as outstanding.
+ * \note Allocations that have been asynchronously freed but whose completion has
+ *       not been observed on the host (e.g. by a synchronize) can count as outstanding.
+ *
+ * \param[in] pool           - The memory pool to trim
+ * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved,
+ * the TrimTo operation is a no-op.  Otherwise the pool will be guaranteed to have
+ * at least minBytesToKeep bytes reserved after the operation.
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep);
+
+/**
+ * \brief Sets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    CUDA events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cuMemFreeAsync (default enabled).
+ * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of backing memory that was
+ *                    allocated for the memory pool. It is illegal to set this attribute to a non-zero value.
+ * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of used memory that was
+ *                    allocated for the memory pool.
+ *
+ * \param[in] pool  - The memory pool to modify
+ * \param[in] attr  - The attribute to modify
+ * \param[in] value - Pointer to the value to assign
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
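+
+/*
+ * A short sketch of raising the release threshold on a device's default pool so
+ * the allocator holds on to memory between synchronizations; `dev` is assumed to
+ * be a valid device (error handling omitted):
+ * \code
+    CUmemoryPool pool;
+    cuDeviceGetDefaultMemPool(&pool, dev);
+
+    cuuint64_t threshold = 64ull << 20;   // keep up to 64 MiB cached in the pool
+    cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);
+ * \endcode
+ */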
+
+/**
+ * \brief Gets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    CUDA events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int)
+ *                    Allow ::cuMemAllocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cuMemFreeAsync (default enabled).
+ * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t)
+ *                    Amount of backing memory currently allocated for the mempool
+ * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t)
+ *                    High watermark of backing memory allocated for the mempool since the
+ *                    last time it was reset.
+ * - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t)
+ *                    Amount of memory from the pool that is currently in use by the application.
+ * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t)
+ *                    High watermark of the amount of memory from the pool that was in use by the application.
+ *
+ * \param[in] pool   - The memory pool to get attributes of
+ * \param[in] attr   - The attribute to get 
+ * \param[out] value - Retrieved value
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+
+/**
+ * \brief Controls visibility of pools between devices
+ *
+ * \param[in] pool  - The pool being modified
+ * \param[in] map   - Array of access descriptors. Each descriptor specifies the access to enable for a single GPU.
+ * \param[in] count - Number of descriptors in the map array.
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count);
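+
+/*
+ * Editor's sketch (not part of the original header): grant a peer device
+ * read-write access to a pool's allocations. The pool handle and the peer
+ * ordinal (1) are assumptions; the peer must be capable of such access.
+ * \code
+     CUmemAccessDesc desc;
+     desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+     desc.location.id   = 1;                               // peer device ordinal
+     desc.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+     cuMemPoolSetAccess(pool, &desc, 1);                   // one descriptor
+ * \endcode
+ */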
+
+/**
+ * \brief Returns the accessibility of a pool from a device
+ *
+ * Returns the accessibility of the pool's memory from the specified location. 
+ *
+ * \param[out] flags   - the accessibility of the pool from the specified location
+ * \param[in] memPool  - the pool being queried
+ * \param[in] location - the location accessing the pool
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location);
+
+/**
+ * \brief Creates a memory pool
+ *
+ * Creates a CUDA memory pool and returns the handle in \p pool.  The \p poolProps determines
+ * the properties of the pool such as the backing device and IPC capabilities. 
+ *
+ * By default, the pool's memory will be accessible from the device it is allocated on.
+ *
+ * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC.
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ *
+ * \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool,
+ *     ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle
+ */
+CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps);
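+
+/*
+ * Editor's sketch (not part of the original header): create an explicit,
+ * non-IPC pool backed by device 0. The zeroed properties and device ordinal
+ * are illustrative assumptions.
+ * \code
+     CUmemPoolProps props;
+     memset(&props, 0, sizeof(props));
+     props.allocType     = CU_MEM_ALLOCATION_TYPE_PINNED;
+     props.handleTypes   = CU_MEM_HANDLE_TYPE_NONE;        // no IPC support
+     props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+     props.location.id   = 0;                              // backing device
+     CUmemoryPool pool;
+     cuMemPoolCreate(&pool, &props);
+ * \endcode
+ */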
+
+/**
+ * \brief Destroys the specified memory pool
+ *
+ * If any pointers obtained from this pool haven't been freed or
+ * the pool has free operations that haven't completed
+ * when ::cuMemPoolDestroy is invoked, the function will return immediately and the
+ * resources associated with the pool will be released automatically
+ * once there are no more outstanding allocations. 
+ *
+ * Destroying the current mempool of a device sets the default mempool of
+ * that device as the current mempool for that device.
+ *
+ * \note A device's default memory pool cannot be destroyed.
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool,
+ *     ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate
+ */
+CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool);
+
+/**
+ * \brief Allocates memory from a specified pool with stream ordered semantics.
+ *
+ * Inserts an allocation operation into \p hStream.
+ * A pointer to the allocated memory is returned immediately in *dptr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the specified memory pool.
+ *
+ * \note
+ *    -  The specified memory pool may be from a device different than that of the specified \p hStream. 
+ * 
+ *    -  Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs. 
+ *
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] dptr    - Returned device pointer
+ * \param[in] bytesize - Number of bytes to allocate
+ * \param[in] pool     - The pool to allocate from 
+ * \param[in] hStream  - The stream establishing the stream ordering semantic
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool,
+ *     ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess,
+ *     ::cuMemPoolSetAttribute
+ */
+CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
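+
+/*
+ * Editor's sketch (not part of the original header): stream-ordered
+ * allocation from an explicit pool, followed by a stream-ordered free.
+ * \p pool and \p hStream are assumed to exist; the size is illustrative.
+ * \code
+     CUdeviceptr d_buf;
+     cuMemAllocFromPoolAsync(&d_buf, 1 << 20, pool, hStream);  // 1 MiB from pool
+     // ... launch work into hStream that uses d_buf ...
+     cuMemFreeAsync(d_buf, hStream);                           // freed in stream order
+     cuStreamSynchronize(hStream);
+ * \endcode
+ */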
+
+/**
+ * \brief Exports a memory pool to the requested handle type.
+ *
+ * Given an IPC capable mempool, create an OS handle to share the pool with another process.
+ * A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle.
+ * Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs.
+ * The implementation of what the shareable handle is and how it can be transferred is defined by the requested
+ * handle type.
+ *
+ * \note To create an IPC-capable mempool, create a mempool with a ::CUmemAllocationHandleType other than ::CU_MEM_HANDLE_TYPE_NONE.
+ *
+ * \param[out] handle_out  - Returned OS handle 
+ * \param[in] pool         - pool to export 
+ * \param[in] handleType   - the type of handle to create 
+ * \param[in] flags        - must be 0 
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer,
+ *     ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync,
+ *     ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate,
+ *     ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute
+ */
+CUresult CUDAAPI cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
+
+/**
+ * \brief Imports a memory pool from a shareable handle.
+ *
+ * Specific allocations can be imported from the imported pool with cuMemPoolImportPointer.
+ *
+ * \note Imported memory pools do not support creating new allocations.
+ *       As such, imported memory pools may not be used in ::cuDeviceSetMemPool
+ *       or ::cuMemAllocFromPoolAsync calls.
+ *
+ * \param[out] pool_out    - Returned memory pool
+ * \param[in] handle       - OS handle of the pool to open 
+ * \param[in] handleType   - The type of handle being imported 
+ * \param[in] flags        - must be 0 
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer
+ */
+CUresult CUDAAPI cuMemPoolImportFromShareableHandle(
+        CUmemoryPool *pool_out,
+        void *handle,
+        CUmemAllocationHandleType handleType,
+        unsigned long long flags);
+
+/**
+ * \brief Export data to share a memory pool allocation between processes.
+ *
+ * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool.
+ * The recipient process can import the allocation with the ::cuMemPoolImportPointer api.
+ * The data is not a handle and may be shared through any IPC mechanism.
+ *
+ * \param[out] shareData_out - Returned export data  
+ * \param[in] ptr            - pointer to memory being exported
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer
+ */
+CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr);
+
+/**
+ * \brief Import a memory pool allocation from another process.
+ *
+ * Returns in \p ptr_out a pointer to the imported memory.
+ * The imported memory must not be accessed before the allocation operation completes
+ * in the exporting process. The imported memory must be freed from all importing processes before
+ * being freed in the exporting process. The pointer may be freed with cuMemFree
+ * or cuMemFreeAsync.  If cuMemFreeAsync is used, the free must be completed
+ * on the importing process before the free operation on the exporting process.
+ *
+ * \note The cuMemFreeAsync api may be used in the exporting process before
+ *       the cuMemFreeAsync operation completes in its stream as long as the
+ *       cuMemFreeAsync in the exporting process specifies a stream with
+ *       a stream dependency on the importing process's cuMemFreeAsync.
+ *
+ * \param[out] ptr_out  - pointer to imported memory
+ * \param[in] pool      - pool from which to import
+ * \param[in] shareData - data specifying the memory to import
+ *
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer
+ */
+CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData);
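+
+/*
+ * Editor's sketch (not part of the original header): the per-allocation half
+ * of pool IPC, assuming the pool itself has already been shared with
+ * ::cuMemPoolExportToShareableHandle / ::cuMemPoolImportFromShareableHandle
+ * and that \p d_buf, \p importedPool and the IPC transport are provided by
+ * the application.
+ * \code
+     // Exporting process:
+     CUmemPoolPtrExportData exportData;
+     cuMemPoolExportPointer(&exportData, d_buf);    // d_buf comes from the shared pool
+     // ... send exportData to the importing process via any IPC mechanism ...
+
+     // Importing process:
+     CUdeviceptr d_imported;
+     cuMemPoolImportPointer(&d_imported, importedPool, &exportData);
+     // ... use d_imported, then free it here before the exporter frees d_buf ...
+     cuMemFree(d_imported);
+ * \endcode
+ */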
+
+/** @} */ /* END CUDA_MALLOC_ASYNC */
+
+/**
+ * \defgroup CUDA_UNIFIED Unified Addressing
+ *
+ * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the unified addressing functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ *
+ * \section CUDA_UNIFIED_overview Overview
+ *
+ * CUDA devices can share a unified address space with the host.
+ * For these devices there is no distinction between a device
+ * pointer and a host pointer -- the same pointer value may be
+ * used to access memory from the host program and from a kernel
+ * running on the device (with exceptions enumerated below).
+ *
+ * \section CUDA_UNIFIED_support Supported Platforms
+ *
+ * Whether or not a device supports unified addressing may be
+ * queried by calling ::cuDeviceGetAttribute() with the device
+ * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.
+ *
+ * Unified addressing is automatically enabled in 64-bit processes.
+ *
+ * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values
+ *
+ * It is possible to look up information about the memory which backs a
+ * pointer value.  For instance, one may want to know if a pointer points
+ * to host or device memory.  As another example, in the case of device
+ * memory, one may want to know on which CUDA device the memory
+ * resides.  These properties may be queried using the function
+ * ::cuPointerGetAttribute()
+ *
+ * Since pointers are unique, it is not necessary to specify the memory type
+ * of the pointers passed to the various copy functions in the
+ * CUDA API.  The function ::cuMemcpy() may be used to perform a copy
+ * between two pointers, ignoring whether they point to host or device
+ * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH()
+ * unnecessary for devices supporting unified addressing).  For
+ * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be
+ * used to specify that the CUDA driver should infer the location of the
+ * pointer from its value.
+ *
+ * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory
+ *
+ * All host memory allocated in all contexts using ::cuMemAllocHost() and
+ * ::cuMemHostAlloc() is always directly accessible from all contexts on
+ * all devices that support unified addressing.  This is the case regardless
+ * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and
+ * ::CU_MEMHOSTALLOC_DEVICEMAP are specified.
+ *
+ * The pointer value through which allocated host memory may be accessed
+ * in kernels on all devices that support unified addressing is the same
+ * as the pointer value through which that memory is accessed on the host,
+ * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device
+ * pointer for these allocations.
+ *
+ * Note that this is not the case for memory allocated using the flag
+ * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below.
+ *
+ * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory
+ *
+ * Upon enabling direct access from a context that supports unified addressing
+ * to another peer context that supports unified addressing using
+ * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using
+ * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible
+ * by the current context.  The device pointer value through
+ * which any peer memory may be accessed in the current context
+ * is the same pointer value through which that memory may be
+ * accessed in the peer context.
+ *
+ * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing
+ *
+ * Not all memory may be accessed on devices through the same pointer
+ * value through which they are accessed on the host.  These exceptions
+ * are host memory registered using ::cuMemHostRegister() and host memory
+ * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED.  For these
+ * exceptions, there exists a distinct host and device address for the
+ * memory.  The device address is guaranteed to not overlap any valid host
+ * pointer range and is guaranteed to have the same value across all
+ * contexts that support unified addressing.
+ *
+ * This device address may be queried using ::cuMemHostGetDevicePointer()
+ * when a context using unified addressing is current.  Either the host
+ * or the unified device pointer value may be used to refer to this memory
+ * through ::cuMemcpy() and similar functions using the
+ * ::CU_MEMORYTYPE_UNIFIED memory type.
+ *
+ */
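+
+/*
+ * Editor's sketch (not part of the original header): with unified addressing,
+ * ::cuMemcpy copies between any two pointers without the caller specifying
+ * whether each is host or device memory. Sizes and the uintptr_t cast are
+ * illustrative; the host pointer is simply reinterpreted as a ::CUdeviceptr.
+ * \code
+     CUdeviceptr d_dst;
+     void *h_src;
+     cuMemAlloc(&d_dst, 4096);
+     cuMemAllocHost(&h_src, 4096);
+     // The driver infers the direction from the pointer values themselves.
+     cuMemcpy(d_dst, (CUdeviceptr)(uintptr_t)h_src, 4096);
+ * \endcode
+ */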
+
+/**
+ * \brief Returns information about a pointer
+ *
+ * The supported attributes are:
+ *
+ * - ::CU_POINTER_ATTRIBUTE_CONTEXT:
+ *
+ *      Returns in \p *data the ::CUcontext in which \p ptr was allocated or
+ *      registered.
+ *      The type of \p data must be ::CUcontext *.
+ *
+ *      If \p ptr was not allocated by, mapped by, or registered with
+ *      a ::CUcontext which uses unified virtual addressing then
+ *      ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE:
+ *
+ *      Returns in \p *data the physical memory type of the memory that
+ *      \p ptr addresses as a ::CUmemorytype enumerated value.
+ *      The type of \p data must be unsigned int.
+ *
+ *      If \p ptr addresses device memory then \p *data is set to
+ *      ::CU_MEMORYTYPE_DEVICE.  The particular ::CUdevice on which the
+ *      memory resides is the ::CUdevice of the ::CUcontext returned by the
+ *      ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr.
+ *
+ *      If \p ptr addresses host memory then \p *data is set to
+ *      ::CU_MEMORYTYPE_HOST.
+ *
+ *      If \p ptr was not allocated by, mapped by, or registered with
+ *      a ::CUcontext which uses unified virtual addressing then
+ *      ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ *      If the current ::CUcontext does not support unified virtual
+ *      addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER:
+ *
+ *      Returns in \p *data the device pointer value through which
+ *      \p ptr may be accessed by kernels running in the current
+ *      ::CUcontext.
+ *      The type of \p data must be CUdeviceptr *.
+ *
+ *      If there exists no device pointer value through which
+ *      kernels running in the current ::CUcontext may access
+ *      \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ *      If there is no current ::CUcontext then
+ *      ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ *      Except in the exceptional disjoint addressing cases discussed
+ *      below, the value returned in \p *data will equal the input
+ *      value \p ptr.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER:
+ *
+ *      Returns in \p *data the host pointer value through which
+ *      \p ptr may be accessed by the host program.
+ *      The type of \p data must be void **.
+ *      If there exists no host pointer value through which
+ *      the host program may directly access \p ptr then
+ *      ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ *      Except in the exceptional disjoint addressing cases discussed
+ *      below, the value returned in \p *data will equal the input
+ *      value \p ptr.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS:
+ *
+ *      Returns in \p *data two tokens for use with the nv-p2p.h Linux
+ *      kernel interface. \p data must be a struct of type
+ *      CUDA_POINTER_ATTRIBUTE_P2P_TOKENS.
+ *
+ *      \p ptr must be a pointer to memory obtained from ::cuMemAlloc().
+ *      Note that p2pToken and vaSpaceToken are only valid for the
+ *      lifetime of the source allocation. A subsequent allocation at
+ *      the same address may return completely different tokens.
+ *      Querying this attribute has a side effect of setting the attribute
+ *      ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that
+ *      \p ptr points to.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
+ *
+ *      A boolean attribute which when set, ensures that synchronous memory operations
+ *      initiated on the region of memory that \p ptr points to will always synchronize.
+ *      See further documentation in the section titled "API synchronization behavior"
+ *      to learn more about cases when synchronous memory operations can
+ *      exhibit asynchronous behavior.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID:
+ *
+ *      Returns in \p *data a buffer ID which is guaranteed to be unique within the process.
+ *      \p data must point to an unsigned long long.
+ *
+ *      \p ptr must be a pointer to memory obtained from a CUDA memory allocation API.
+ *      Every memory allocation from any of the CUDA memory allocation APIs will
+ *      have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs
+ *      from previous freed allocations. IDs are only unique within a single process.
+ *
+ *
+ * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED:
+ *
+ *      Returns in \p *data a boolean that indicates whether the pointer points to
+ *      managed memory or not.
+ *
+ *      If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL:
+ *
+ *      Returns in \p *data an integer representing a device ordinal of a device against
+ *      which the memory was allocated or registered.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE:
+ *
+ *      Returns in \p *data a boolean that indicates if this pointer maps to
+ *      an allocation that is suitable for ::cudaIpcGetMemHandle.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR:
+ *
+ *      Returns in \p *data the starting address for the allocation referenced
+ *      by the device pointer \p ptr.  Note that this is not necessarily the
+ *      address of the mapped region, but the address of the mappable address
+ *      range \p ptr references (e.g. from ::cuMemAddressReserve).
+ *
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE:
+ *
+ *      Returns in \p *data the size for the allocation referenced by the device
+ *      pointer \p ptr.  Note that this is not necessarily the size of the mapped
+ *      region, but the size of the mappable address range \p ptr references
+ *      (e.g. from ::cuMemAddressReserve).  To retrieve the size of the mapped
+ *      region, see ::cuMemGetAddressRange
+ *
+ * - ::CU_POINTER_ATTRIBUTE_MAPPED:
+ *
+ *      Returns in \p *data a boolean that indicates if this pointer is in a
+ *      valid address range that is mapped to a backing allocation.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES:
+ *
+ *      Returns a bitmask of the allowed handle types for an allocation that may
+ *      be passed to ::cuMemExportToShareableHandle.
+ * 
+ * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE:
+ * 
+ *      Returns in \p *data the handle to the mempool that the allocation was obtained from.
+ *
+ * \par
+ *
+ * Note that for most allocations in the unified virtual address space
+ * the host and device pointer for accessing the allocation will be the
+ * same.  The exceptions to this are
+ *  - user memory registered using ::cuMemHostRegister
+ *  - host memory allocated using ::cuMemHostAlloc with the
+ *    ::CU_MEMHOSTALLOC_WRITECOMBINED flag
+ * For these types of allocation there will exist separate, disjoint host
+ * and device addresses for accessing the allocation.  In particular
+ *  - The host address will correspond to an invalid unmapped device address
+ *    (which will result in an exception if accessed from the device)
+ *  - The device address will correspond to an invalid unmapped host address
+ *    (which will result in an exception if accessed from the host).
+ * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER
+ * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host
+ * and device addresses from either address.
+ *
+ * \param data      - Returned pointer attribute value
+ * \param attribute - Pointer attribute to query
+ * \param ptr       - Pointer
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuPointerSetAttribute,
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuMemAllocHost,
+ * ::cuMemFreeHost,
+ * ::cuMemHostAlloc,
+ * ::cuMemHostRegister,
+ * ::cuMemHostUnregister,
+ * ::cudaPointerGetAttributes
+ */
+CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr);
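+
+/*
+ * Editor's sketch (not part of the original header): classify an arbitrary
+ * pointer as host or device memory. \p ptr is assumed to come from a CUDA
+ * allocation, mapping or registration in a UVA-enabled context.
+ * \code
+     unsigned int memType = 0;
+     CUresult res = cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, ptr);
+     if (res == CUDA_SUCCESS && memType == CU_MEMORYTYPE_DEVICE) {
+         // ptr addresses device memory
+     } else if (res == CUDA_SUCCESS && memType == CU_MEMORYTYPE_HOST) {
+         // ptr addresses host memory
+     }
+ * \endcode
+ */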
+
+/**
+ * \brief Prefetches memory to the specified destination device
+ *
+ * Prefetches memory to the specified destination device.  \p devPtr is the
+ * base device pointer of the memory to be prefetched and \p dstDevice is the
+ * destination device. \p count specifies the number of bytes to copy. \p hStream
+ * is the stream in which the operation is enqueued. The memory range must refer
+ * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables.
+ *
+ * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If
+ * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+ * must be non-zero. Additionally, \p hStream must be associated with a device that has a
+ * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ *
+ * The start address and end address of the memory range will be rounded down and rounded up
+ * respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ * in the stream.
+ *
+ * If no physical memory has been allocated for this region, then this memory region
+ * will be populated and mapped on the destination device. If there's insufficient
+ * memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory
+ * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted.
+ *
+ * By default, any mappings to the previous location of the migrated pages are removed and
+ * mappings for the new location are only set up on \p dstDevice. The exact behavior however
+ * also depends on the settings applied to this memory range via ::cuMemAdvise as described
+ * below:
+ *
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range,
+ * then that subset will create a read-only copy of the pages on \p dstDevice.
+ *
+ * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory
+ * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the
+ * preferred location of any pages in the memory range.
+ *
+ * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range,
+ * then mappings to those pages from all the appropriate processors are updated to
+ * refer to the new location if establishing such a mapping is possible. Otherwise,
+ * those mappings are cleared.
+ *
+ * Note that this API is not required for functionality and only serves to improve performance
+ * by allowing the application to migrate data to a suitable location before it is accessed.
+ * Memory accesses to this range are always coherent and are allowed even when the data is
+ * actively being migrated.
+ *
+ * Note that this function is asynchronous with respect to the host and all work
+ * on other devices.
+ *
+ * \param devPtr    - Pointer to be prefetched
+ * \param count     - Size in bytes
+ * \param dstDevice - Destination device to prefetch to
+ * \param hStream    - Stream to enqueue prefetch operation
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ * ::cuMemcpy3DPeerAsync, ::cuMemAdvise,
+ * ::cudaMemPrefetchAsync
+ */
+CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream);
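+
+/*
+ * Editor's sketch (not part of the original header): prefetch a managed range
+ * to the GPU before launching work on it, then back to the CPU before host
+ * access. The size, device ordinal and stream are illustrative assumptions.
+ * \code
+     CUdevice dev;
+     cuDeviceGet(&dev, 0);
+     CUdeviceptr managed;
+     cuMemAllocManaged(&managed, 1 << 20, CU_MEM_ATTACH_GLOBAL);
+     cuMemPrefetchAsync(managed, 1 << 20, dev, hStream);
+     // ... launch kernels into hStream that read/write the range ...
+     cuMemPrefetchAsync(managed, 1 << 20, CU_DEVICE_CPU, hStream);
+     cuStreamSynchronize(hStream);      // host may now touch the data locally
+ * \endcode
+ */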
+
+/**
+ * \brief Advise about the usage of a given memory range
+ *
+ * Advise the Unified Memory subsystem about the usage pattern for the memory range
+ * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ * range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged
+ * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ * memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ * memory range results in an error being returned.
+ *
+ * The \p advice parameter can take the following values:
+ * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
+ * from and only occasionally written to. Any read accesses from any processor to this region will create a
+ * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync
+ * is called on this region, it will create a read-only copy of the data on the destination processor.
+ * If any processor writes to this region, all copies of the corresponding page will be invalidated
+ * except for the one where the write occurred. The \p device argument is ignored for this advice.
+ * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * Also, if a context is created on a device that does not have the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until
+ * all such contexts are destroyed.
+ * If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only
+ * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice
+ * will not create a read-only copy when that device accesses this memory region.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY:  Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the
+ * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ * copies of the data will be collapsed into a single copy. The location for the collapsed
+ * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ * copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ *
+ * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
+ * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
+ * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location
+ * does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ * when a fault occurs on that memory region. If the data is already in its preferred location and the
+ * faulting processor can establish a mapping without requiring the data to be migrated, then
+ * data migration will be avoided. On the other hand, if the data is not in its preferred location
+ * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ * it. It is important to note that setting the preferred location does not prevent data prefetching
+ * done using ::cuMemPrefetchAsync.
+ * Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ * Memory driver. Normally, if a page is detected to be constantly thrashing between, for example, host and device
+ * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice, unless read accesses from
+ * \p device will not result in a read-only copy being created on that device as outlined in description for
+ * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has
+ * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ * then this call has no effect. Note however that this behavior may change in the future.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
+ * and changes the preferred location to none.
+ *
+ * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
+ * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then
+ * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero.
+ * This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ * it causes the data to always be mapped in the specified processor's page tables, as long as the
+ * location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ * the mappings are updated accordingly.
+ * This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ * over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ * migration may be too high. But preventing faults can still help improve performance, and so having
+ * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
+ * page in host memory.
+ * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice. Additionally, if the
+ * preferred location of this memory region or any subset of it is also \p device, then the policies
+ * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has
+ * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ * then this call has no effect.
+ *
+ * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to
+ * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has
+ * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES,
+ * then this call has no effect.
+ *
+ * \param devPtr - Pointer to memory to set the advice for
+ * \param count  - Size in bytes of the memory range
+ * \param advice - Advice to be applied for the specified memory range
+ * \param device - Device to apply the advice for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync,
+ * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync,
+ * ::cudaMemAdvise
+ */
+CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device);
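+
+/*
+ * Editor's sketch (not part of the original header): typical advice for a
+ * mostly-read lookup table in managed memory. \p managed, the size and
+ * \p dev are assumptions carried over from a prior ::cuMemAllocManaged call.
+ * \code
+     cuMemAdvise(managed, 1 << 20, CU_MEM_ADVISE_SET_READ_MOSTLY, dev);          // dev is ignored here
+     cuMemAdvise(managed, 1 << 20, CU_MEM_ADVISE_SET_PREFERRED_LOCATION, CU_DEVICE_CPU);
+     cuMemAdvise(managed, 1 << 20, CU_MEM_ADVISE_SET_ACCESSED_BY, dev);          // keep dev's mappings up to date
+ * \endcode
+ */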
+
+/**
+ * \brief Query an attribute of a given memory range
+ *
+ * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via
+ * __managed__ variables.
+ *
+ * The \p attribute parameter can take the following values:
+ * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted
+ * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given
+ * memory range have read-duplication enabled, or 0 otherwise.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device
+ * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU
+ * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID
+ * if either all the pages don't have the same preferred location or some of the pages don't have a
+ * preferred location at all. Note that the actual location of the pages in the memory range at the time of
+ * the query may be different from the preferred location.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted
+ * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned
+ * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range.
+ * If any device does not have that advice set for the entire memory range, that device will not be included.
+ * If \p data is larger than the number of devices that have that advice set for that memory range,
+ * CU_DEVICE_INVALID will be returned in all the extra space provided. For example, if \p dataSize is 12
+ * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be
+ * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have
+ * that advice set, then only as many devices will be returned as can fit in the array. There is no
+ * guarantee on which specific devices will be returned, however.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location
+ * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be
+ * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU
+ * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not
+ * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the
+ * last location that the application requested to prefetch the memory range to. It gives no indication as to
+ * whether the prefetch operation to that location has completed or even begun.
+ *
+ * \param data      - A pointer to a memory location where the result
+ *                    of the attribute query will be written to.
+ * \param dataSize  - The size, in bytes, of the memory pointed to by \p data
+ * \param attribute - The attribute to query
+ * \param devPtr    - Start of the range to query
+ * \param count     - Size of the range to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync,
+ * ::cuMemAdvise,
+ * ::cudaMemRangeGetAttribute
+ */
+CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count);
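+
+/*
+ * Editor's sketch (not part of the original header): check whether an entire
+ * managed range currently has read-duplication enabled. \p managed and the
+ * size are assumptions from a prior ::cuMemAllocManaged call.
+ * \code
+     int readMostly = 0;
+     cuMemRangeGetAttribute(&readMostly, sizeof(readMostly),
+                            CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY, managed, 1 << 20);
+     // readMostly is 1 only if every page in the range is read-duplicated
+ * \endcode
+ */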
+
+/**
+ * \brief Query attributes of a given memory range.
+ *
+ * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via
+ * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes
+ * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries.
+ * The results of the query will be stored in \p data.
+ *
+ * The list of supported attributes are given below. Please refer to ::cuMemRangeGetAttribute for
+ * attribute descriptions and restrictions.
+ *
+ * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY
+ * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION
+ * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION
+ *
+ * \param data          - A two-dimensional array containing pointers to memory
+ *                        locations where the result of each attribute query will be written to.
+ * \param dataSizes     - Array containing the sizes of each result
+ * \param attributes    - An array of attributes to query
+ *                        (numAttributes and the number of attributes in this array should match)
+ * \param numAttributes - Number of attributes to query
+ * \param devPtr        - Start of the range to query
+ * \param count         - Size of the range to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise,
+ * ::cuMemPrefetchAsync,
+ * ::cudaMemRangeGetAttributes
+ */
+CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count);
+
+/**
+ * \brief Set attributes on a previously allocated memory region
+ *
+ * The supported attributes are:
+ *
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
+ *
+ *      A boolean attribute that can either be set (1) or unset (0). When set,
+ *      the region of memory that \p ptr points to is guaranteed to always synchronize
+ *      memory operations that are synchronous. If there are some previously initiated
+ *      synchronous memory operations that are pending when this attribute is set, the
+ *      function does not return until those memory operations are complete.
+ *      See further documentation in the section titled "API synchronization behavior"
+ *      to learn more about cases when synchronous memory operations can
+ *      exhibit asynchronous behavior.
+ *      \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set.
+ *
+ * \param value     - Pointer to memory containing the value to be set
+ * \param attribute - Pointer attribute to set
+ * \param ptr       - Pointer to a memory region allocated using CUDA memory allocation APIs
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa ::cuPointerGetAttribute,
+ * ::cuPointerGetAttributes,
+ * ::cuMemAlloc,
+ * ::cuMemFree,
+ * ::cuMemAllocHost,
+ * ::cuMemFreeHost,
+ * ::cuMemHostAlloc,
+ * ::cuMemHostRegister,
+ * ::cuMemHostUnregister
+ */
+CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr);
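+
+/*
+ * Editor's sketch (not part of the original header): force synchronous memory
+ * operations on one allocation to always synchronize, e.g. before handing the
+ * pointer to a third-party DMA engine. \p d_buf is an assumed device allocation.
+ * \code
+     unsigned int one = 1;
+     cuPointerSetAttribute(&one, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, d_buf);
+ * \endcode
+ */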
+
+/**
+ * \brief Returns information about a pointer.
+ *
+ * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions):
+ *
+ * - ::CU_POINTER_ATTRIBUTE_CONTEXT
+ * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER
+ * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS
+ * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID
+ * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR
+ * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE
+ * - ::CU_POINTER_ATTRIBUTE_MAPPED
+ * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE
+ * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES
+ * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE
+ *
+ * \param numAttributes - Number of attributes to query
+ * \param attributes    - An array of attributes to query
+ *                      (numAttributes and the number of attributes in this array should match)
+ * \param data          - A two-dimensional array containing pointers to memory
+ *                      locations where the result of each attribute query will be written to.
+ * \param ptr           - Pointer to query
+ *
+ * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr
+ * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values
+ * and CUDA_SUCCESS is returned.
+ *
+ * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA
+ * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuPointerGetAttribute,
+ * ::cuPointerSetAttribute,
+ * ::cudaPointerGetAttributes
+ */
+CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr);
+
+/** @} */ /* END CUDA_UNIFIED */
+
+/**
+ * \defgroup CUDA_STREAM Stream Management
+ *
+ * ___MANBRIEF___ stream management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the stream management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Create a stream
+ *
+ * Creates a stream and returns a handle in \p phStream.  The \p Flags argument
+ * determines the behavior of the stream.
+ *
+ * Valid values for \p Flags are:
+ * - ::CU_STREAM_DEFAULT: Default stream creation flag.
+ * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created
+ *   stream may run concurrently with work in stream 0 (the NULL stream), and that
+ *   the created stream should perform no implicit synchronization with stream 0.
+ *
+ * \param phStream - Returned newly created stream
+ * \param Flags    - Parameters for stream creation
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreateWithPriority,
+ * ::cuStreamGetPriority,
+ * ::cuStreamGetFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags
+ */
+CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags);
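+
+/*
+ * Editor's sketch (not part of the original header): create a stream that does
+ * not implicitly synchronize with the NULL stream, use it, and destroy it.
+ * \code
+     CUstream hStream;
+     cuStreamCreate(&hStream, CU_STREAM_NON_BLOCKING);
+     // ... enqueue work into hStream ...
+     cuStreamSynchronize(hStream);
+     cuStreamDestroy(hStream);
+ * \endcode
+ */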
+
+/**
+ * \brief Create a stream with the given priority
+ *
+ * Creates a stream with the specified priority and returns a handle in \p phStream.
+ * This API alters the scheduler priority of work in the stream. Work in a higher
+ * priority stream may preempt work already executing in a low priority stream.
+ *
+ * \p priority follows a convention where lower numbers represent higher priorities.
+ * '0' represents default priority. The range of meaningful numerical priorities can
+ * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is
+ * outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
+ * it will automatically be clamped to the lowest or the highest number in the range.
+ *
+ * \param phStream    - Returned newly created stream
+ * \param flags       - Flags for stream creation. See ::cuStreamCreate for a list of
+ *                      valid flags
+ * \param priority    - Stream priority. Lower numbers represent higher priorities.
+ *                      See ::cuCtxGetStreamPriorityRange for more information about
+ *                      meaningful stream priorities that can be passed.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \note Stream priorities are supported only on GPUs
+ * with compute capability 3.5 or higher.
+ *
+ * \note In the current implementation, only compute kernels launched in
+ * priority streams are affected by the stream's priority. Stream priorities have
+ * no effect on host-to-device and device-to-host memory operations.
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cuStreamGetFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreateWithPriority
+ */
+CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority);
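+
+/*
+ * Editor's sketch (not part of the original header): query the valid priority
+ * range for the current context and create a stream at the highest (numerically
+ * lowest) priority.
+ * \code
+     int leastPriority, greatestPriority;
+     cuCtxGetStreamPriorityRange(&leastPriority, &greatestPriority);
+     CUstream highPrio;
+     cuStreamCreateWithPriority(&highPrio, CU_STREAM_NON_BLOCKING, greatestPriority);
+ * \endcode
+ */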
+
+
+/**
+ * \brief Query the priority of a given stream
+ *
+ * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority
+ * and return the priority in \p priority. Note that if the stream was created with a
+ * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange,
+ * this function returns the clamped priority.
+ * See ::cuStreamCreateWithPriority for details about priority clamping.
+ *
+ * \param hStream    - Handle to the stream to be queried
+ * \param priority   - Pointer to a signed integer in which the stream's priority is returned
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuStreamCreateWithPriority,
+ * ::cuCtxGetStreamPriorityRange,
+ * ::cuStreamGetFlags,
+ * ::cudaStreamGetPriority
+ */
+CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
+
+/**
+ * \brief Query the flags of a given stream
+ *
+ * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority
+ * and return the flags in \p flags.
+ *
+ * \param hStream    - Handle to the stream to be queried
+ * \param flags      - Pointer to an unsigned integer in which the stream's flags are returned.
+ *                     The value returned in \p flags is a logical 'OR' of all flags that
+ *                     were used while creating this stream. See ::cuStreamCreate for the list
+ *                     of valid flags
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreate,
+ * ::cuStreamGetPriority,
+ * ::cudaStreamGetFlags
+ */
+CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
+
+/**
+ * \brief Query the context associated with a stream
+ *
+ * Returns the CUDA context that the stream is associated with.
+ *
+ * The stream handle \p hStream can refer to any of the following:
+ * <ul>
+ *   <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate
+ *   and ::cuStreamCreateWithPriority, or their runtime API equivalents such as
+ *   ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority.
+ *   The returned context is the context that was active in the calling thread when the
+ *   stream was created. Passing an invalid handle will result in undefined behavior.</li>
+ *   <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and
+ *   ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted,
+ *   which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively.
+ *   Specifying any of the special handles will return the context current to the
+ *   calling thread. If no context is current to the calling thread,
+ *   ::CUDA_ERROR_INVALID_CONTEXT is returned.</li>
+ * </ul>
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param pctx    - Returned context associated with the stream
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * \notefnerr
+ *
+ * \sa ::cuStreamDestroy,
+ * ::cuStreamCreateWithPriority,
+ * ::cuStreamGetPriority,
+ * ::cuStreamGetFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags
+ */
+CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
+
+/**
+ * \brief Make a compute stream wait on an event
+ *
+ * Makes all future work submitted to \p hStream wait for all work captured in
+ * \p hEvent.  See ::cuEventRecord() for details on what is captured by an event.
+ * The synchronization will be performed efficiently on the device when applicable.
+ * \p hEvent may be from a different context or device than \p hStream.
+ *
+ * Valid values for \p Flags include:
+ * - ::CU_EVENT_WAIT_DEFAULT: Default event wait flag.
+ * - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external
+ *   event node when performing stream capture. This flag is invalid outside
+ *   of stream capture.
+ *
+ * \param hStream - Stream to wait
+ * \param hEvent  - Event to wait on (may not be NULL)
+ * \param Flags   - See ::CUevent_capture_flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuEventRecord,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cuStreamDestroy,
+ * ::cudaStreamWaitEvent
+ */
+CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
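+
+/*
+ * Editor's sketch (not part of the original header): make \p streamB wait for
+ * work already submitted to \p streamA without blocking the host. The two
+ * streams and the producer/consumer work are assumptions.
+ * \code
+     CUevent ev;
+     cuEventCreate(&ev, CU_EVENT_DISABLE_TIMING);
+     // ... enqueue producer work into streamA ...
+     cuEventRecord(ev, streamA);
+     cuStreamWaitEvent(streamB, ev, 0);    // 0 = default wait behavior
+     // ... consumer work enqueued into streamB now runs after the event ...
+ * \endcode
+ */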
+
+/**
+ * \brief Add a callback to a compute stream
+ *
+ * \note This function is slated for eventual deprecation and removal. If
+ * you do not require the callback to execute in case of a device error,
+ * consider using ::cuLaunchHostFunc. Additionally, this function is not
+ * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike
+ * ::cuLaunchHostFunc.
+ *
+ * Adds a callback to be called on the host after all currently enqueued
+ * items in the stream have completed.  For each
+ * cuStreamAddCallback call, the callback will be executed exactly once.
+ * The callback will block later work in the stream until it is finished.
+ *
+ * The callback may be passed ::CUDA_SUCCESS or an error code.  In the event
+ * of a device error, all subsequently executed callbacks will receive an
+ * appropriate ::CUresult.
+ *
+ * Callbacks must not make any CUDA API calls.  Attempting to use a CUDA API
+ * will result in ::CUDA_ERROR_NOT_PERMITTED.  Callbacks must not perform any
+ * synchronization that may depend on outstanding device work or other callbacks
+ * that are not mandated to run earlier.  Callbacks without a mandated order
+ * (in independent streams) execute in undefined order and may be serialized.
+ *
+ * For the purposes of Unified Memory, callback execution makes a number of
+ * guarantees:
+ * <ul>
+ *   <li>The callback stream is considered idle for the duration of the
+ *   callback.  Thus, for example, a callback may always use memory attached
+ *   to the callback stream.</li>
+ *   <li>The start of execution of a callback has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the callback.  It thus synchronizes streams which have been "joined"
+ *   prior to the callback.</li>
+ *   <li>Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed.  Thus, for
+ *   example, a callback might use global attached memory even if work has
+ *   been added to another stream, if the work has been ordered behind the
+ *   callback with an event.</li>
+ *   <li>Completion of a callback does not cause a stream to become
+ *   active except as described above.  The callback stream will remain idle
+ *   if no device work follows the callback, and will remain idle across
+ *   consecutive callbacks without device work in between.  Thus, for example,
+ *   stream synchronization can be done by signaling from a callback at the
+ *   end of the stream.</li>
+ * </ul>
+ *
+ * \param hStream  - Stream to add callback to
+ * \param callback - The function to call once preceding stream operations are complete
+ * \param userData - User specified data to be passed to the callback function
+ * \param flags    - Reserved for future use, must be 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamDestroy,
+ * ::cuMemAllocManaged,
+ * ::cuStreamAttachMemAsync,
+ * ::cuLaunchHostFunc,
+ * ::cudaStreamAddCallback
+ */
+CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
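+
+/*
+ * Editor's sketch (not part of the original header): a host callback that runs
+ * once everything previously enqueued in \p hStream has completed. The flag
+ * variable is an assumption; no CUDA API calls are allowed inside the callback.
+ * \code
+     static void CUDA_CB myCallback(CUstream hStream, CUresult status, void *userData) {
+         // status carries any earlier device error for this stream
+         *(int *)userData = 1;
+     }
+
+     int done = 0;
+     cuStreamAddCallback(hStream, myCallback, &done, 0);
+ * \endcode
+ */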
+
+/**
+ * \brief Begins graph capture on a stream
+ *
+ * Begin graph capture on \p hStream. When a stream is in capture mode, all operations
+ * pushed into the stream will not be executed, but will instead be captured into
+ * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated
+ * if \p hStream is ::CU_STREAM_LEGACY. Capture must be ended on the same stream in which
+ * it was initiated, and it may only be initiated if the stream is not already in capture
+ * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id
+ * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo.
+ *
+ * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be
+ * called on this stream from the same thread.
+ *
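+ * For illustration (assuming an existing non-default stream \p hStream),
+ * a typical capture sequence looks like:
+ * \code
+     CUgraph graph;
+     cuStreamBeginCapture(hStream, CU_STREAM_CAPTURE_MODE_GLOBAL);
+     // enqueue stream-ordered work, e.g. via cuMemcpyAsync or cuLaunchKernel
+     cuStreamEndCapture(hStream, &graph);
+ * \endcode
+ *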
+ * \param hStream - Stream in which to initiate capture
+ * \param mode    - Controls the interaction of this capture sequence with other API
+ *                  calls that are potentially unsafe. For more details see
+ *                  ::cuThreadExchangeStreamCaptureMode.
+ *
+ * \note Kernels captured using this API must not use texture and surface references.
+ *       Reading or writing through any texture or surface reference is undefined
+ *       behavior. This restriction does not apply to texture and surface objects.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamCreate,
+ * ::cuStreamIsCapturing,
+ * ::cuStreamEndCapture,
+ * ::cuThreadExchangeStreamCaptureMode
+ */
+CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode);
+
+/**
+ * \brief Swaps the stream capture interaction mode for a thread
+ *
+ * Sets the calling thread's stream capture interaction mode to the value contained
+ * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To
+ * facilitate deterministic behavior across function or module boundaries, callers
+ * are encouraged to use this API in a push-pop fashion: \code
+     CUstreamCaptureMode mode = desiredMode;
+     cuThreadExchangeStreamCaptureMode(&mode);
+     ...
+     cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode
+ * \endcode
+ *
+ * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call
+ * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is
+ * not enqueued asynchronously to a stream, and is not observed by stream capture.
+ * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture
+ * depended on the allocation being replayed whenever the graph is launched, the
+ * captured graph would be invalid.
+ *
+ * Therefore, stream capture places restrictions on API calls that can be made within
+ * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This
+ * behavior can be controlled via this API and flags to ::cuStreamBeginCapture.
+ *
+ * A thread's mode is one of the following:
+ * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has
+ *   an ongoing capture sequence that was not initiated with
+ *   \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread
+ *   has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL,
+ *   this thread is prohibited from potentially unsafe API calls.
+ * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture
+ *   sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited
+ *   from potentially unsafe API calls. Concurrent capture sequences in other threads
+ *   are ignored.
+ * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially
+ *   unsafe API calls. Note that the thread is still prohibited from API calls which
+ *   necessarily conflict with stream capture, for example, attempting ::cuEventQuery
+ *   on an event that was last recorded inside a capture sequence.
+ *
+ * \param mode - Pointer to mode value to swap with the current mode
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamBeginCapture
+ */
+CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode);
+
+/**
+ * \brief Ends capture on a stream, returning the captured graph
+ *
+ * End capture on \p hStream, returning the captured graph via \p phGraph.
+ * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture.
+ * If capture was invalidated, due to a violation of the rules of stream capture, then
+ * a NULL graph will be returned.
+ *
+ * If the \p mode argument to ::cuStreamBeginCapture was not
+ * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as
+ * ::cuStreamBeginCapture.
+ *
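+ * For illustration (assuming capture was begun on \p hStream and the CUDA 11.x
+ * signature of ::cuGraphInstantiate), the captured graph can be instantiated
+ * and launched as follows:
+ * \code
+     CUgraph graph;
+     CUgraphExec graphExec;
+     cuStreamEndCapture(hStream, &graph);
+     // in real code, check that graph is non-NULL before instantiating
+     cuGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
+     cuGraphLaunch(graphExec, hStream);
+ * \endcode
+ *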
+ * \param hStream - Stream to query
+ * \param phGraph - The captured graph
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamCreate,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamIsCapturing
+ */
+CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph);
+
+/**
+ * \brief Returns a stream's capture status
+ *
+ * Return the capture status of \p hStream via \p captureStatus. After a successful
+ * call, \p *captureStatus will contain one of the following:
+ * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing.
+ * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing.
+ * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error
+ *   has invalidated the capture sequence. The capture sequence must be terminated
+ *   with ::cuStreamEndCapture on the stream where it was initiated in order to
+ *   continue using \p hStream.
+ *
+ * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while
+ * a blocking stream in the same context is capturing, it will return
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified
+ * after the call. The blocking stream capture is not invalidated.
+ *
+ * When a blocking stream is capturing, the legacy stream is in an
+ * unusable state until the blocking stream capture is terminated. The legacy
+ * stream is not supported for stream capture, but attempted use would have an
+ * implicit dependency on the capturing stream(s).
+ *
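+ * A minimal query sketch (assuming an existing stream \p hStream):
+ * \code
+     CUstreamCaptureStatus status;
+     cuStreamIsCapturing(hStream, &status);
+     if (status == CU_STREAM_CAPTURE_STATUS_ACTIVE) {
+         // avoid operations that are unsafe during capture
+     }
+ * \endcode
+ *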
+ * \param hStream       - Stream to query
+ * \param captureStatus - Returns the stream's capture status
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamCreate,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamEndCapture
+ */
+CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+
+/**
+ * \brief Query capture status of a stream
+ *
+ * Note that there is a later version of this API, ::cuStreamGetCaptureInfo_v2,
+ * which will supplant this version in CUDA 12.0. This version is retained for
+ * minor version compatibility.
+ *
+ * Query the capture status of a stream and get an id for the capture
+ * sequence, which is unique over the lifetime of the process.
+ *
+ * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created 
+ * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
+ *
+ * A valid id is returned only if both of the following are true:
+ * - the call returns CUDA_SUCCESS
+ * - captureStatus is set to ::CU_STREAM_CAPTURE_STATUS_ACTIVE
+ *
+ * \param hStream           - Stream to query
+ * \param captureStatus_out - Returns the stream's capture status
+ * \param id_out            - Returns the id of the capture sequence
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamGetCaptureInfo_v2,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamIsCapturing
+ */
+CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
+
+/**
+ * \brief Query a stream's capture state (11.3+)
+ *
+ * Query stream state related to stream capture.
+ *
+ * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created 
+ * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
+ *
+ * Valid data (other than capture status) is returned only if both of the following are true:
+ * - the call returns CUDA_SUCCESS
+ * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE
+ *
+ * This version of cuStreamGetCaptureInfo is introduced in CUDA 11.3 and will supplant the
+ * previous version in 12.0. Developers requiring compatibility across minor versions to
+ * CUDA 11.0 (driver version 445) should use ::cuStreamGetCaptureInfo or include a fallback
+ * path.
+ *
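+ * A minimal query sketch (assuming an existing capturing stream \p hStream):
+ * \code
+     CUstreamCaptureStatus status;
+     cuuint64_t id;
+     CUgraph graph;
+     const CUgraphNode *deps;
+     size_t numDeps;
+     cuStreamGetCaptureInfo_v2(hStream, &status, &id, &graph, &deps, &numDeps);
+ * \endcode
+ *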
+ * \param hStream - The stream to query
+ * \param captureStatus_out - Location to return the capture status of the stream; required
+ * \param id_out - Optional location to return an id for the capture sequence, which is
+ *           unique over the lifetime of the process
+ * \param graph_out - Optional location to return the graph being captured into. All
+ *           operations other than destroy and node removal are permitted on the graph
+ *           while the capture sequence is in progress. This API does not transfer
+ *           ownership of the graph, which is transferred or destroyed at
+ *           ::cuStreamEndCapture. Note that the graph handle may be invalidated before
+ *           end of capture for certain errors. Nodes that are or become
+ *           unreachable from the original stream at ::cuStreamEndCapture due to direct
+ *           actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED.
+ * \param dependencies_out - Optional location to store a pointer to an array of nodes.
+ *           The next node to be captured in the stream will depend on this set of nodes,
+ *           absent operations such as event wait which modify this set. The array pointer
+ *           is valid until the next API call which operates on the stream or until end of
+ *           capture. The node handles may be copied out and are valid until they or the
+ *           graph is destroyed. The driver-owned array may also be passed directly to
+ *           APIs that operate on the graph (not the stream) without copying.
+ * \param numDependencies_out - Optional location to store the size of the array
+ *           returned in dependencies_out.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamGetCaptureInfo,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamIsCapturing,
+ * ::cuStreamUpdateCaptureDependencies
+ */
+CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out,
+        cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
+
+/**
+ * \brief Update the set of dependencies in a capturing stream (11.3+)
+ *
+ * Modifies the dependency set of a capturing stream. The dependency set is the set
+ * of nodes that the next captured node in the stream will depend on.
+ *
+ * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and
+ * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to
+ * the API is added to the existing set or replaces it. A flags value of 0 defaults
+ * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES.
+ *
+ * Nodes that are removed from the dependency set via this API do not result in
+ * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at
+ * ::cuStreamEndCapture.
+ *
+ * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing.
+ *
+ * This API is new in CUDA 11.3. Developers requiring compatibility across minor
+ * versions to CUDA 11.0 should not use this API, or should provide a fallback path.
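+ *
+ * For illustration (assuming \p hStream is currently capturing and \c hNode is
+ * a node already present in the graph being captured), the dependency set can
+ * be replaced as follows:
+ * \code
+     cuStreamUpdateCaptureDependencies(hStream, &hNode, 1,
+                                       CU_STREAM_SET_CAPTURE_DEPENDENCIES);
+ * \endcode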
+ *
+ * \param hStream         - The capturing stream to update
+ * \param dependencies    - The set of dependency nodes to add or to set
+ * \param numDependencies - The number of nodes in \p dependencies
+ * \param flags           - ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES or
+ *                          ::CU_STREAM_SET_CAPTURE_DEPENDENCIES
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_ILLEGAL_STATE
+ *
+ * \sa
+ * ::cuStreamBeginCapture,
+ * ::cuStreamGetCaptureInfo,
+ * ::cuStreamGetCaptureInfo_v2
+ */
+CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+
+/**
+ * \brief Attach memory to a stream asynchronously
+ *
+ * Enqueues an operation in \p hStream to specify stream association of
+ * \p length bytes of memory starting from \p dptr. This function is a
+ * stream-ordered operation, meaning that it is dependent on, and will
+ * only take effect when, previous work in the stream has completed. Any
+ * previous association is automatically replaced.
+ *
+ * \p dptr must point to one of the following types of memories:
+ * - managed memory declared using the __managed__ keyword or allocated with
+ *   ::cuMemAllocManaged.
+ * - a valid host-accessible region of system-allocated pageable memory. This
+ *   type of memory may only be specified if the device associated with the
+ *   stream reports a non-zero value for the device attribute
+ *   ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS.
+ *
+ * For managed allocations, \p length must be either zero or the entire
+ * allocation's size. Both indicate that the entire allocation's stream
+ * association is being changed. Currently, it is not possible to change stream
+ * association for a portion of a managed allocation.
+ *
+ * For pageable host allocations, \p length must be non-zero.
+ *
+ * The stream association is specified using \p flags which must be
+ * one of ::CUmemAttach_flags.
+ * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed
+ * by any stream on any device.
+ * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee
+ * that it won't access the memory on the device from any stream on a device that
+ * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS.
+ * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with
+ * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS,
+ * the program makes a guarantee that it will only access the memory on the device
+ * from \p hStream. It is illegal to attach singly to the NULL stream, because the
+ * NULL stream is a virtual global stream and not a specific stream. An error will
+ * be returned in this case.
+ *
+ * When memory is associated with a single stream, the Unified Memory system will
+ * allow CPU access to this memory region so long as all operations in \p hStream
+ * have completed, regardless of whether other streams are active. In effect,
+ * this constrains exclusive ownership of the managed memory region by
+ * an active GPU to per-stream activity instead of whole-GPU activity.
+ *
+ * Accessing memory on the device from streams that are not associated with
+ * it will produce undefined results. No error checking is performed by the
+ * Unified Memory system to ensure that kernels launched into other streams
+ * do not access this region.
+ *
+ * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync
+ * via events, synchronization or other means to ensure legal access to memory
+ * at all times. Data visibility and coherency will be changed appropriately
+ * for all kernels which follow a stream-association change.
+ *
+ * If \p hStream is destroyed while data is associated with it, the association is
+ * removed and the association reverts to the default visibility of the allocation
+ * as specified at ::cuMemAllocManaged. For __managed__ variables, the default
+ * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an
+ * asynchronous operation, and as a result, the change to default association won't
+ * happen until all work in the stream has completed.
+ *
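+ * A minimal sketch (assuming an existing stream \p hStream whose device
+ * reports zero for ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, and an
+ * allocation size \c bytes):
+ * \code
+     CUdeviceptr dptr;
+     cuMemAllocManaged(&dptr, bytes, CU_MEM_ATTACH_GLOBAL);
+     cuStreamAttachMemAsync(hStream, dptr, 0, CU_MEM_ATTACH_SINGLE);
+     cuStreamSynchronize(hStream);
+     // the CPU may now access the allocation while hStream is idle
+ * \endcode
+ *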
+ * \param hStream - Stream in which to enqueue the attach operation
+ * \param dptr    - Pointer to memory (must be a pointer to managed memory or
+ *                  to a valid host-accessible region of system-allocated
+ *                  pageable memory)
+ * \param length  - Length of memory
+ * \param flags   - Must be one of ::CUmemAttach_flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamDestroy,
+ * ::cuMemAllocManaged,
+ * ::cudaStreamAttachMemAsync
+ */
+CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
+
+/**
+ * \brief Determine status of a compute stream
+ *
+ * Returns ::CUDA_SUCCESS if all operations in the stream specified by
+ * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not.
+ *
+ * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS
+ * is equivalent to having called ::cuStreamSynchronize().
+ *
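+ * A minimal polling sketch (assuming an existing stream \p hStream):
+ * \code
+     CUresult state = cuStreamQuery(hStream);
+     if (state == CUDA_ERROR_NOT_READY) {
+         // work is still pending in the stream
+     }
+ * \endcode
+ *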
+ * \param hStream - Stream to query status of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_READY
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamDestroy,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamQuery
+ */
+CUresult CUDAAPI cuStreamQuery(CUstream hStream);
+
+/**
+ * \brief Wait until a stream's tasks are completed
+ *
+ * Waits until the device has completed all operations in the stream specified
+ * by \p hStream. If the context was created with the
+ * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the
+ * stream is finished with all of its tasks.
+ *
+ * \param hStream - Stream to wait for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ *
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamDestroy,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamSynchronize
+ */
+CUresult CUDAAPI cuStreamSynchronize(CUstream hStream);
+
+/**
+ * \brief Destroys a stream
+ *
+ * Destroys the stream specified by \p hStream.
+ *
+ * In case the device is still doing work in the stream \p hStream
+ * when ::cuStreamDestroy() is called, the function will return immediately
+ * and the resources associated with \p hStream will be released automatically
+ * once the device has completed all work in \p hStream.
+ *
+ * \param hStream - Stream to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamAddCallback,
+ * ::cudaStreamDestroy
+ */
+CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
+
+/**
+ * \brief Copies attributes from source stream to destination stream.
+ *
+ * Copies attributes from source stream \p src to destination stream \p dst.
+ * Both streams must have the same context.
+ *
+ * \param[out] dst Destination stream
+ * \param[in] src Source stream
+ * For a list of attributes, see ::CUstreamAttrID
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src);
+
+/**
+ * \brief Queries stream attribute.
+ *
+ * Queries attribute \p attr from \p hStream and stores it in corresponding
+ * member of \p value_out.
+ *
+ * \param[in]  hStream   - Stream to query the attribute from
+ * \param[in]  attr      - Attribute to query
+ * \param[out] value_out - Location in which to return the attribute value
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr,
+                                      CUstreamAttrValue *value_out);
+
+/**
+ * \brief Sets stream attribute.
+ *
+ * Sets attribute \p attr on \p hStream from corresponding attribute of
+ * \p value. The updated attribute will be applied to subsequent work
+ * submitted to the stream. It will not affect previously submitted work.
+ *
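+ * For illustration (assuming an existing stream \p hStream, a device pointer
+ * \c dptr and a window size \c windowBytes), an access policy window can be
+ * set as follows:
+ * \code
+     CUstreamAttrValue v;
+     memset(&v, 0, sizeof(v));
+     v.accessPolicyWindow.base_ptr  = (void *)dptr;
+     v.accessPolicyWindow.num_bytes = windowBytes;
+     v.accessPolicyWindow.hitRatio  = 0.6f;
+     v.accessPolicyWindow.hitProp   = CU_ACCESS_PROPERTY_PERSISTING;
+     v.accessPolicyWindow.missProp  = CU_ACCESS_PROPERTY_STREAMING;
+     cuStreamSetAttribute(hStream, CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW, &v);
+ * \endcode
+ *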
+ * \param[out] hStream - Stream on which to set the attribute
+ * \param[in]  attr    - Attribute to set
+ * \param[in]  value   - Value of the attribute to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr,
+                                      const CUstreamAttrValue *value);
+
+/** @} */ /* END CUDA_STREAM */
+
+
+/**
+ * \defgroup CUDA_EVENT Event Management
+ *
+ * ___MANBRIEF___ event management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the event management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates an event
+ *
+ * Creates an event \p *phEvent for the current context with the flags specified via
+ * \p Flags. Valid flags include:
+ * - ::CU_EVENT_DEFAULT: Default event creation flag.
+ * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking
+ *   synchronization.  A CPU thread that uses ::cuEventSynchronize() to wait on
+ *   an event created with this flag will block until the event has actually
+ *   been recorded.
+ * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need
+ *   to record timing data.  Events created with this flag specified and
+ *   the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best
+ *   performance when used with ::cuStreamWaitEvent() and ::cuEventQuery().
+ * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an
+ *   interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must
+ *   be specified along with ::CU_EVENT_DISABLE_TIMING.
+ *
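+ * For illustration, a synchronization-only event (the lowest-overhead kind)
+ * can be created as follows:
+ * \code
+     CUevent hEvent;
+     cuEventCreate(&hEvent, CU_EVENT_DISABLE_TIMING);
+ * \endcode
+ *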
+ * \param phEvent - Returns newly created event
+ * \param Flags   - Event creation flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cudaEventCreate,
+ * ::cudaEventCreateWithFlags
+ */
+CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags);
+
+/**
+ * \brief Records an event
+ *
+ * Captures in \p hEvent the contents of \p hStream at the time of this call.
+ * \p hEvent and \p hStream must be from the same context.
+ * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
+ * examine or wait for completion of the work that was captured. Uses of
+ * \p hStream after this call do not modify \p hEvent. See note on default
+ * stream behavior for what is captured in the default case.
+ *
+ * ::cuEventRecord() can be called multiple times on the same event and
+ * will overwrite the previously captured state. Other APIs such as
+ * ::cuStreamWaitEvent() use the most recently captured state at the time
+ * of the API call, and are not affected by later calls to
+ * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an
+ * event represents an empty set of work, so for example ::cuEventQuery()
+ * would return ::CUDA_SUCCESS.
+ *
+ * \param hEvent  - Event to record
+ * \param hStream - Stream to record event for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cudaEventRecord,
+ * ::cuEventRecordWithFlags
+ */
+CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
+
+/**
+ * \brief Records an event
+ *
+ * Captures in \p hEvent the contents of \p hStream at the time of this call.
+ * \p hEvent and \p hStream must be from the same context.
+ * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then
+ * examine or wait for completion of the work that was captured. Uses of
+ * \p hStream after this call do not modify \p hEvent. See note on default
+ * stream behavior for what is captured in the default case.
+ *
+ * ::cuEventRecordWithFlags() can be called multiple times on the same event and
+ * will overwrite the previously captured state. Other APIs such as
+ * ::cuStreamWaitEvent() use the most recently captured state at the time
+ * of the API call, and are not affected by later calls to
+ * ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an
+ * event represents an empty set of work, so for example ::cuEventQuery()
+ * would return ::CUDA_SUCCESS.
+ *
+ * \p flags include:
+ * - ::CU_EVENT_RECORD_DEFAULT: Default event record flag.
+ * - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external
+ *   event node when performing stream capture. This flag is invalid outside
+ *   of stream capture.
+ *
+ * \param hEvent  - Event to record
+ * \param hStream - Stream to record event for
+ * \param flags   - See ::CUevent_capture_flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cuEventRecord,
+ * ::cudaEventRecord
+ */
+CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags);
+
+/**
+ * \brief Queries an event's status
+ *
+ * Queries the status of all work currently captured by \p hEvent. See
+ * ::cuEventRecord() for details on what is captured by an event.
+ *
+ * Returns ::CUDA_SUCCESS if all captured work has been completed, or
+ * ::CUDA_ERROR_NOT_READY if any captured work is incomplete.
+ *
+ * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS
+ * is equivalent to having called ::cuEventSynchronize().
+ *
+ * \param hEvent - Event to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_READY
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cudaEventQuery
+ */
+CUresult CUDAAPI cuEventQuery(CUevent hEvent);
+
+/**
+ * \brief Waits for an event to complete
+ *
+ * Waits until the completion of all work currently captured in \p hEvent.
+ * See ::cuEventRecord() for details on what is captured by an event.
+ *
+ * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC
+ * flag will cause the calling CPU thread to block until the event has
+ * been completed by the device.  If the ::CU_EVENT_BLOCKING_SYNC flag has
+ * not been set, then the CPU thread will busy-wait until the event has
+ * been completed by the device.
+ *
+ * \param hEvent - Event to wait for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventDestroy,
+ * ::cuEventElapsedTime,
+ * ::cudaEventSynchronize
+ */
+CUresult CUDAAPI cuEventSynchronize(CUevent hEvent);
+
+/**
+ * \brief Destroys an event
+ *
+ * Destroys the event specified by \p hEvent.
+ *
+ * An event may be destroyed before it is complete (i.e., while
+ * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the
+ * call does not block on completion of the event, and any associated
+ * resources will automatically be released asynchronously at completion.
+ *
+ * \param hEvent - Event to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventElapsedTime,
+ * ::cudaEventDestroy
+ */
+CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
+
+/**
+ * \brief Computes the elapsed time between two events
+ *
+ * Computes the elapsed time between two events (in milliseconds with a
+ * resolution of around 0.5 microseconds).
+ *
+ * If either event was last recorded in a non-NULL stream, the resulting time
+ * may be greater than expected (even if both used the same stream handle). This
+ * happens because the ::cuEventRecord() operation takes place asynchronously
+ * and there is no guarantee that the measured latency is actually just between
+ * the two events. Any number of other different stream operations could execute
+ * in between the two measured events, thus altering the timing in a significant
+ * way.
+ *
+ * If ::cuEventRecord() has not been called on either event then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called
+ * on both events but one or both of them has not yet been completed (that is,
+ * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the
+ * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with
+ * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return
+ * ::CUDA_ERROR_INVALID_HANDLE.
+ *
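+ * A minimal timing sketch (assuming an existing stream \p hStream with work
+ * enqueued between the two records):
+ * \code
+     CUevent start, stop;
+     float ms;
+     cuEventCreate(&start, CU_EVENT_DEFAULT);
+     cuEventCreate(&stop, CU_EVENT_DEFAULT);
+     cuEventRecord(start, hStream);
+     ...
+     cuEventRecord(stop, hStream);
+     cuEventSynchronize(stop);
+     cuEventElapsedTime(&ms, start, stop);
+ * \endcode
+ *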
+ * \param pMilliseconds - Time between \p hStart and \p hEnd in ms
+ * \param hStart        - Starting event
+ * \param hEnd          - Ending event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_READY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuEventCreate,
+ * ::cuEventRecord,
+ * ::cuEventQuery,
+ * ::cuEventSynchronize,
+ * ::cuEventDestroy,
+ * ::cudaEventElapsedTime
+ */
+CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+
+/** @} */ /* END CUDA_EVENT */
+
+/**
+ * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability
+ *
+ * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the external resource interoperability functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+ /**
+ * \brief Imports an external memory object
+ *
+ * Imports an externally allocated memory object and returns
+ * a handle to that in \p extMem_out.
+ *
+ * The properties of the handle being imported must be described in
+ * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure
+ * is defined as follows:
+ *
+ * \code
+        typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st {
+            CUexternalMemoryHandleType type;
+            union {
+                int fd;
+                struct {
+                    void *handle;
+                    const void *name;
+                } win32;
+                const void *nvSciBufObject;
+            } handle;
+            unsigned long long size;
+            unsigned int flags;
+        } CUDA_EXTERNAL_MEMORY_HANDLE_DESC;
+ * \endcode
+ *
+ * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type
+ * of handle being imported. ::CUexternalMemoryHandleType is
+ * defined as:
+ *
+ * \code
+        typedef enum CUexternalMemoryHandleType_enum {
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD          = 1,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32       = 2,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT   = 3,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP         = 4,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE     = 5,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE     = 6,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7,
+            CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF           = 8
+        } CUexternalMemoryHandleType;
+ * \endcode
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid
+ * file descriptor referencing a memory object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one
+ * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a memory object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a memory object.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ * be non-NULL and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * must be NULL. The handle specified must be a globally shared KMT
+ * handle. This handle does not hold a reference to the underlying
+ * object, and thus will be invalid when all references to the
+ * memory object are destroyed.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one
+ * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to an
+ * ID3D12Heap object. This handle holds a reference to the underlying
+ * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to an ID3D12Heap object.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one
+ * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to an
+ * ID3D12Resource object. This handle holds a reference to the
+ * underlying object. If
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to an ID3D12Resource object.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ * represent a valid shared NT handle that is returned by
+ * IDXGIResource1::CreateSharedHandle when referring to an
+ * ID3D11Resource object. If
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to an ID3D11Resource object.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must
+ * represent a valid shared KMT handle that is returned by
+ * IDXGIResource::GetSharedHandle when referring to an
+ * ID3D11Resource object and
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name
+ * must be NULL.
+ *
+ * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL
+ * and reference a valid NvSciBuf object.
+ * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the
+ * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync
+ * as appropriate barriers to maintain coherence between CUDA and the other drivers.
+ * See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC
+ * for memory synchronization.
+ *
+ *
+ * The size of the memory object must be specified in
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size.
+ *
+ * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in
+ * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the
+ * resource is a dedicated resource. The definition of a dedicated
+ * resource is outside the scope of this extension.
+ * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type
+ * is one of the following:
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT
+ *
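+ * For illustration (assuming \c fd is a file descriptor exported by another
+ * API, e.g. Vulkan, for an allocation of \c allocSize bytes), an opaque file
+ * descriptor can be imported as follows:
+ * \code
+     CUDA_EXTERNAL_MEMORY_HANDLE_DESC memDesc;
+     memset(&memDesc, 0, sizeof(memDesc));
+     memDesc.type      = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
+     memDesc.handle.fd = fd;
+     memDesc.size      = allocSize;
+     CUexternalMemory extMem;
+     cuImportExternalMemory(&extMem, &memDesc);
+ * \endcode
+ *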
+ * \param extMem_out    - Returned handle to an external memory object
+ * \param memHandleDesc - Memory import handle descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the
+ * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges
+ * as well as appropriate Vulkan pipeline barriers to maintain coherence between
+ * CPU and GPU. For more information on these APIs, please refer to "Synchronization
+ * and Cache Control" chapter from Vulkan specification.
+ *
+ * \sa ::cuDestroyExternalMemory,
+ * ::cuExternalMemoryGetMappedBuffer,
+ * ::cuExternalMemoryGetMappedMipmappedArray
+ */
+CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc);
+
+/**
+ * \brief Maps a buffer onto an imported memory object
+ *
+ * Maps a buffer onto an imported memory object and returns a device
+ * pointer in \p devPtr.
+ *
+ * The properties of the buffer being mapped must be described in
+ * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is
+ * defined as follows:
+ *
+ * \code
+        typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st {
+            unsigned long long offset;
+            unsigned long long size;
+            unsigned int flags;
+        } CUDA_EXTERNAL_MEMORY_BUFFER_DESC;
+ * \endcode
+ *
+ * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in
+ * the memory object where the buffer's base address is.
+ * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer.
+ * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero.
+ *
+ * The offset and size have to be suitably aligned to match the
+ * requirements of the external API. Mapping two buffers whose ranges
+ * overlap may or may not result in the same virtual address being
+ * returned for the overlapped portion. In such cases, the application
+ * must ensure that all accesses to that region from the GPU are
+ * volatile. Otherwise writes made via one address are not guaranteed
+ * to be visible via the other address, even if they're issued by the
+ * same thread. It is recommended that applications map the combined
+ * range instead of mapping separate buffers and then apply the
+ * appropriate offsets to the returned pointer to derive the
+ * individual buffers.
+ *
+ * The returned pointer \p devPtr must be freed using ::cuMemFree.
+ *
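+ * For illustration (assuming \c extMem was imported with a total size of
+ * \c allocSize bytes), the whole range can be mapped as follows:
+ * \code
+     CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufDesc;
+     memset(&bufDesc, 0, sizeof(bufDesc));
+     bufDesc.offset = 0;
+     bufDesc.size   = allocSize;
+     CUdeviceptr devPtr;
+     cuExternalMemoryGetMappedBuffer(&devPtr, extMem, &bufDesc);
+     ...
+     cuMemFree(devPtr);
+ * \endcode
+ *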
+ * \param devPtr     - Returned device pointer to buffer
+ * \param extMem     - Handle to external memory object
+ * \param bufferDesc - Buffer descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalMemory,
+ * ::cuDestroyExternalMemory,
+ * ::cuExternalMemoryGetMappedMipmappedArray
+ */
+CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc);
+
+/**
+ * \brief Maps a CUDA mipmapped array onto an external memory object
+ *
+ * Maps a CUDA mipmapped array onto an external object and returns a
+ * handle to it in \p mipmap.
+ *
+ * The properties of the CUDA mipmapped array being mapped must be
+ * described in \p mipmapDesc. The structure
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows:
+ *
+ * \code
+        typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st {
+            unsigned long long offset;
+            CUDA_ARRAY3D_DESCRIPTOR arrayDesc;
+            unsigned int numLevels;
+        } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC;
+ * \endcode
+ *
+ * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the
+ * offset in the memory object where the base level of the mipmap
+ * chain is.
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes
+ * the format, dimensions and type of the base level of the mipmap
+ * chain. For further details on these parameters, please refer to the
+ * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped
+ * array is bound as a color target in the graphics API, then the flag
+ * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags.
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies
+ * the total number of levels in the mipmap chain.
+ *
+ * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then
+ * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1.
+ *
+ * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy.
+ *
+ * \param mipmap     - Returned CUDA mipmapped array
+ * \param extMem     - Handle to external memory object
+ * \param mipmapDesc - CUDA array descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \note On Tegra devices, this API will always attempt to do a compressed mapping when the \p extMem is
+ * imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD
+ *
+ * \sa ::cuImportExternalMemory,
+ * ::cuDestroyExternalMemory,
+ * ::cuExternalMemoryGetMappedBuffer
+ */
+CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc);
+
+/**
+ * \brief Destroys an external memory object.
+ *
+ * Destroys the specified external memory object. Any existing buffers
+ * and CUDA mipmapped arrays mapped onto this object must no longer be
+ * used and must be explicitly freed using ::cuMemFree and
+ * ::cuMipmappedArrayDestroy respectively.
+ *
+ * \param extMem - External memory object to be destroyed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalMemory,
+ * ::cuExternalMemoryGetMappedBuffer,
+ * ::cuExternalMemoryGetMappedMipmappedArray
+ */
+CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem);
+
+/**
+ * \brief Imports an external semaphore
+ *
+ * Imports an externally allocated synchronization object and returns
+ * a handle to that in \p extSem_out.
+ *
+ * The properties of the handle being imported must be described in
+ * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is
+ * defined as follows:
+ *
+ * \code
+        typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st {
+            CUexternalSemaphoreHandleType type;
+            union {
+                int fd;
+                struct {
+                    void *handle;
+                    const void *name;
+                } win32;
+                const void* NvSciSyncObj;
+            } handle;
+            unsigned int flags;
+        } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC;
+ * \endcode
+ *
+ * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of
+ * handle being imported. ::CUexternalSemaphoreHandleType is defined
+ * as:
+ *
+ * \code
+        typedef enum CUexternalSemaphoreHandleType_enum {
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD                = 1,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32             = 2,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT         = 3,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE              = 4,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE              = 5,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC                = 6,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX        = 7,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT    = 8,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD    = 9,
+            CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10
+        } CUexternalSemaphoreHandleType;
+ * \endcode
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid
+ * file descriptor referencing a synchronization object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one
+ * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must
+ * be non-NULL and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * must be NULL. The handle specified must be a globally shared KMT
+ * handle. This handle does not hold a reference to the underlying
+ * object, and thus will be invalid when all references to the
+ * synchronization object are destroyed.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one
+ * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to an
+ * ID3D12Fence object. This handle holds a reference to the underlying
+ * object. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D12Fence object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared NT handle that is returned by
+ * ID3D11Fence::CreateSharedHandle. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D11Fence object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj
+ * represents a valid NvSciSyncObj.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared NT handle that
+ * is returned by IDXGIResource1::CreateSharedHandle when referring to
+ * an IDXGIKeyedMutex object. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid IDXGIKeyedMutex object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared KMT handle that
+ * is returned by IDXGIResource::GetSharedHandle when referring to
+ * an IDXGIKeyedMutex object and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL.
+ * 
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid
+ * file descriptor referencing a synchronization object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ * 
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one
+ * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object.
+ *
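+ * For illustration (assuming \c semFd is a file descriptor exported by
+ * another API for a binary semaphore), an opaque file descriptor can be
+ * imported as follows:
+ * \code
+     CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC semDesc;
+     memset(&semDesc, 0, sizeof(semDesc));
+     semDesc.type      = CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD;
+     semDesc.handle.fd = semFd;
+     CUexternalSemaphore extSem;
+     cuImportExternalSemaphore(&extSem, &semDesc);
+ * \endcode
+ *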
+ * \param extSem_out    - Returned handle to an external semaphore
+ * \param semHandleDesc - Semaphore import handle descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuDestroyExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc);
+
+/**
+ * \brief Signals a set of external semaphore objects
+ *
+ * Enqueues a signal operation on a set of externally allocated
+ * semaphore object in the specified stream. The operations will be
+ * executed when all prior operations in the stream complete.
+ *
+ * The exact semantics of signaling a semaphore depends on the type of
+ * the object.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+ * then signaling the semaphore will set it to the signaled state.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+ * then the semaphore will be set to the value specified in
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value.
+ *
+ * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC
+ * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence
+ * to a value that can be used by subsequent waiters of the same NvSciSync object
+ * to order operations with those currently submitted in \p stream. Such an update
+ * will overwrite previous contents of
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default,
+ * signaling such an external semaphore object causes appropriate memory synchronization
+ * operations to be performed over all external memory objects that are imported as
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses
+ * made by other importers of the same set of NvSciBuf memory object(s) are coherent.
+ * These operations can be skipped by specifying the flag
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return
+ * CUDA_ERROR_NOT_SUPPORTED.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+ * then the keyed mutex will be released with the key specified in
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::keyedmutex::key.
+ *
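+ * For illustration (assuming \c extSem was imported as an opaque, binary
+ * semaphore and \p stream is an existing stream), a single semaphore can be
+ * signaled as follows:
+ * \code
+     CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sigParams;
+     memset(&sigParams, 0, sizeof(sigParams));
+     cuSignalExternalSemaphoresAsync(&extSem, &sigParams, 1, stream);
+ * \endcode
+ *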
+ * \param extSemArray - Set of external semaphores to be signaled
+ * \param paramsArray - Array of semaphore parameters
+ * \param numExtSems  - Number of semaphores to signal
+ * \param stream      - Stream to enqueue the signal operations in
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalSemaphore,
+ * ::cuDestroyExternalSemaphore,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
+
+/**
+ * \brief Waits on a set of external semaphore objects
+ *
+ * Enqueues a wait operation on a set of externally allocated
+ * semaphore object in the specified stream. The operations will be
+ * executed when all prior operations in the stream complete.
+ *
+ * The exact semantics of waiting on a semaphore depends on the type
+ * of the object.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+ * then waiting on the semaphore will wait until the semaphore reaches
+ * the signaled state. The semaphore will then be reset to the
+ * unsignaled state. Therefore for every signal operation, there can
+ * only be one wait operation.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+ * then waiting on the semaphore will wait until the value of the
+ * semaphore is greater than or equal to
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value.
+ *
+ * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC
+ * then, waiting on the semaphore will wait until the
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::nvSciSync::fence is signaled by the
+ * signaler of the NvSciSyncObj that was associated with this semaphore object.
+ * By default, waiting on such an external semaphore object causes appropriate
+ * memory synchronization operations to be performed over all external memory objects
+ * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that
+ * any subsequent accesses made by other importers of the same set of NvSciBuf memory
+ * object(s) are coherent. These operations can be skipped by specifying the flag
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return
+ * CUDA_ERROR_NOT_SUPPORTED.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+ * then waiting on the semaphore will acquire the keyed mutex once it is
+ * released with the key specified in
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key, or fail
+ * with ::CUDA_ERROR_TIMEOUT once the timeout specified by
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs
+ * has elapsed. The timeout interval can either be a finite value specified
+ * in milliseconds or an infinite value, in which case it never elapses.
+ * The Windows INFINITE macro must be used to specify an infinite timeout.
+ *
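+ * For illustration, a minimal sketch of waiting on a single imported
+ * fence-type semaphore until it reaches a given value (\c extSem and
+ * \c stream are assumed to be a valid semaphore handle and stream;
+ * error checking omitted):
+ * \code
+    CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS waitParams;
+    memset(&waitParams, 0, sizeof(waitParams));
+    // Wait until the imported fence reaches the value 2.
+    waitParams.params.fence.value = 2;
+
+    cuWaitExternalSemaphoresAsync(&extSem, &waitParams, 1, stream);
+ * \endcode
+ *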
+ * \param extSemArray - External semaphores to be waited on
+ * \param paramsArray - Array of semaphore parameters
+ * \param numExtSems  - Number of semaphores to wait on
+ * \param stream      - Stream to enqueue the wait operations in
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_TIMEOUT
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalSemaphore,
+ * ::cuDestroyExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
+
+/**
+ * \brief Destroys an external semaphore
+ *
+ * Destroys an external semaphore object and releases any references
+ * to the underlying resource. Any outstanding signals or waits must
+ * have completed before the semaphore is destroyed.
+ *
+ * \param extSem - External semaphore to be destroyed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem);
+
+/** @} */ /* END CUDA_EXTRES_INTEROP */
+
+/**
+ * \defgroup CUDA_MEMOP Stream Memory Operations
+ *
+ * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the stream memory operations of the low-level CUDA
+ * driver application programming interface.
+ *
+ * There are two versions of these APIs, a legacy version and a newer V2 version.
+ *
+ * V1:
+ *
+ * The V1 API is disabled by default. Users are required
+ * to explicitly enable it, e.g. on Linux by passing the kernel module
+ * parameter shown below:
+ *     modprobe nvidia NVreg_EnableStreamMemOPs=1
+ * There is currently no way to enable these operations on other operating
+ * systems.
+ *
+ * Users can programmatically query whether the device supports these
+ * operations with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS.
+ *
+ * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.
+ *
+ * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64()
+ * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and
+ * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
+ *
+ * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and
+ * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform
+ * hardware features and can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES.
+ *
+ * V2:
+ *
+ * The V2 APIs are available by default on all platforms.
+ *
+ * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2.
+ *
+ * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64()
+ * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and
+ * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2.
+ *
+ * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and
+ * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform
+ * hardware features and can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES.
+ *
+ * V1 & V2:
+ *
+ * Note that all memory pointers passed as parameters to these operations
+ * are device pointers. Where necessary a device pointer should be
+ * obtained, for example with ::cuMemHostGetDevicePointer().
+ *
+ * None of the operations accepts pointers to managed memory buffers
+ * (::cuMemAllocManaged).
+ *
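+ * For illustration, a minimal sketch of the support query (\c dev is
+ * assumed to be a valid ::CUdevice; error checking omitted):
+ * \code
+    int memOpsSupported = 0;
+    cuDeviceGetAttribute(&memOpsSupported,
+                         CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev);
+    if (memOpsSupported) {
+        // The V1 wait/write APIs below may be used on this device.
+    }
+ * \endcode
+ *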
+ * \note
+ * Warning:
+ * Improper use of these APIs may deadlock the application. Synchronization 
+ * ordering established through these APIs is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by these APIs should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order.
+ *
+ * @{
+ */
+
+/**
+ * \brief Wait on a memory location
+ *
+ * Enqueues a synchronization of the stream on the given memory location. Work
+ * ordered after the operation will block until the given condition on the
+ * memory is satisfied. By default, the condition is to wait for
+ * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal.
+ * Other condition types can be specified via \p flags.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot
+ * be used with managed memory (::cuMemAllocManaged).
+ *
+ * Support for this can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS.
+ *
+ * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order. For more 
+ * information, see the Stream Memory Operations section in the programming 
+ * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+ *
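+ * For illustration, a minimal sketch of waiting on a registered host flag
+ * (\c hostFlag is assumed to be host memory registered with
+ * ::cuMemHostRegister and \c stream a valid stream; error checking omitted):
+ * \code
+    CUdeviceptr d_flag;
+    cuMemHostGetDevicePointer(&d_flag, hostFlag, 0);
+    // Work enqueued in 'stream' after this call will not run until
+    // another agent writes a value >= 1 to the flag.
+    cuStreamWaitValue32(stream, d_flag, 1, CU_STREAM_WAIT_VALUE_GEQ);
+ * \endcode
+ *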
+ * \param stream The stream to synchronize on the memory location.
+ * \param addr The memory location to wait on.
+ * \param value The value to compare with the memory location.
+ * \param flags See ::CUstreamWaitValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue64,
+ * ::cuStreamWriteValue32,
+ * ::cuStreamWriteValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+
+/**
+ * \brief Wait on a memory location
+ *
+ * Enqueues a synchronization of the stream on the given memory location. Work
+ * ordered after the operation will block until the given condition on the
+ * memory is satisfied. By default, the condition is to wait for
+ * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal.
+ * Other condition types can be specified via \p flags.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer().
+ *
+ * Support for this can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order. For more 
+ * information, see the Stream Memory Operations section in the programming 
+ * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+ *
+ * \param stream The stream to synchronize on the memory location.
+ * \param addr The memory location to wait on.
+ * \param value The value to compare with the memory location.
+ * \param flags See ::CUstreamWaitValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue32,
+ * ::cuStreamWriteValue32,
+ * ::cuStreamWriteValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+
+/**
+ * \brief Write a value to memory
+ *
+ * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER
+ * flag is passed, the write is preceded by a system-wide memory fence,
+ * equivalent to a __threadfence_system() but scoped to the stream
+ * rather than a CUDA thread.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot
+ * be used with managed memory (::cuMemAllocManaged).
+ *
+ * Support for this can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS.
+ *
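+ * For illustration, a minimal sketch of signaling a peer through a
+ * registered host flag (\c d_flag is assumed to be the device pointer
+ * obtained with ::cuMemHostGetDevicePointer; error checking omitted):
+ * \code
+    // Ordered after all prior work in 'stream'; the preceding memory
+    // fence makes earlier writes visible before the flag is set.
+    cuStreamWriteValue32(stream, d_flag, 1, CU_STREAM_WRITE_VALUE_DEFAULT);
+ * \endcode
+ *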
+ * \param stream The stream to do the write in.
+ * \param addr The device address to write to.
+ * \param value The value to write.
+ * \param flags See ::CUstreamWriteValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWriteValue64,
+ * ::cuStreamWaitValue32,
+ * ::cuStreamWaitValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuEventRecord
+ */
+CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+
+/**
+ * \brief Write a value to memory
+ *
+ * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER
+ * flag is passed, the write is preceded by a system-wide memory fence,
+ * equivalent to a __threadfence_system() but scoped to the stream
+ * rather than a CUDA thread.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer().
+ *
+ * Support for this can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
+ *
+ * \param stream The stream to do the write in.
+ * \param addr The device address to write to.
+ * \param value The value to write.
+ * \param flags See ::CUstreamWriteValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWriteValue32,
+ * ::cuStreamWaitValue32,
+ * ::cuStreamWaitValue64,
+ * ::cuStreamBatchMemOp,
+ * ::cuMemHostRegister,
+ * ::cuEventRecord
+ */
+CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+
+/**
+ * \brief Batch operations to synchronize the stream via memory operations
+ *
+ * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32().
+ * Batching operations may avoid some performance overhead in both the API call
+ * and the device execution versus adding them to the stream in separate API
+ * calls. The operations are enqueued in the order they appear in the array.
+ *
+ * See ::CUstreamBatchMemOpType for the full set of supported operations, and
+ * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(),
+ * and ::cuStreamWriteValue64() for details of specific operations.
+ *
+ * Basic support for this can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. See related APIs for details
+ * on querying support for specific operations.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order. For more 
+ * information, see the Stream Memory Operations section in the programming 
+ * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+ *
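+ * For illustration, a minimal sketch batching a wait followed by a write
+ * (\c d_ready and \c d_done are assumed to be device pointers obtained as
+ * described for the individual APIs; error checking omitted):
+ * \code
+    CUstreamBatchMemOpParams ops[2];
+    memset(ops, 0, sizeof(ops));
+    ops[0].operation          = CU_STREAM_MEM_OP_WAIT_VALUE_32;
+    ops[0].waitValue.address  = d_ready;
+    ops[0].waitValue.value    = 1;
+    ops[0].waitValue.flags    = CU_STREAM_WAIT_VALUE_GEQ;
+    ops[1].operation          = CU_STREAM_MEM_OP_WRITE_VALUE_32;
+    ops[1].writeValue.address = d_done;
+    ops[1].writeValue.value   = 1;
+    ops[1].writeValue.flags   = CU_STREAM_WRITE_VALUE_DEFAULT;
+    cuStreamBatchMemOp(stream, 2, ops, 0);
+ * \endcode
+ *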
+ * \param stream The stream to enqueue the operations in.
+ * \param count The number of operations in the array. Must be less than 256.
+ * \param paramArray The types and parameters of the individual operations.
+ * \param flags Reserved for future expansion; must be 0.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue32,
+ * ::cuStreamWaitValue64,
+ * ::cuStreamWriteValue32,
+ * ::cuStreamWriteValue64,
+ * ::cuMemHostRegister
+ */
+CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+
+/**
+ * \brief Wait on a memory location
+ *
+ * Enqueues a synchronization of the stream on the given memory location. Work
+ * ordered after the operation will block until the given condition on the
+ * memory is satisfied. By default, the condition is to wait for
+ * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal.
+ * Other condition types can be specified via \p flags.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot
+ * be used with managed memory (::cuMemAllocManaged).
+ *
+ * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order. For more 
+ * information, see the Stream Memory Operations section in the programming 
+ * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+ *
+ * \param stream The stream to synchronize on the memory location.
+ * \param addr The memory location to wait on.
+ * \param value The value to compare with the memory location.
+ * \param flags See ::CUstreamWaitValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue64_v2,
+ * ::cuStreamWriteValue32_v2,
+ * ::cuStreamWriteValue64_v2,
+ * ::cuStreamBatchMemOp_v2,
+ * ::cuMemHostRegister,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+
+/**
+ * \brief Wait on a memory location
+ *
+ * Enqueues a synchronization of the stream on the given memory location. Work
+ * ordered after the operation will block until the given condition on the
+ * memory is satisfied. By default, the condition is to wait for
+ * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal.
+ * Other condition types can be specified via \p flags.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer().
+ *
+ * Support for this can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order. For more 
+ * information, see the Stream Memory Operations section in the programming 
+ * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+ *
+ * \param stream The stream to synchronize on the memory location.
+ * \param addr The memory location to wait on.
+ * \param value The value to compare with the memory location.
+ * \param flags See ::CUstreamWaitValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue32_v2,
+ * ::cuStreamWriteValue32_v2,
+ * ::cuStreamWriteValue64_v2,
+ * ::cuStreamBatchMemOp_v2,
+ * ::cuMemHostRegister,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+
+/**
+ * \brief Write a value to memory
+ *
+ * Write a value to memory.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot
+ * be used with managed memory (::cuMemAllocManaged).
+ *
+ * \param stream The stream to do the write in.
+ * \param addr The device address to write to.
+ * \param value The value to write.
+ * \param flags See ::CUstreamWriteValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWriteValue64_v2,
+ * ::cuStreamWaitValue32_v2,
+ * ::cuStreamWaitValue64_v2,
+ * ::cuStreamBatchMemOp_v2,
+ * ::cuMemHostRegister,
+ * ::cuEventRecord
+ */
+CUresult CUDAAPI cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+
+/**
+ * \brief Write a value to memory
+ *
+ * Write a value to memory.
+ *
+ * If the memory was registered via ::cuMemHostRegister(), the device pointer
+ * should be obtained with ::cuMemHostGetDevicePointer().
+ *
+ * Support for this can be queried with ::cuDeviceGetAttribute() and
+ * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2.
+ *
+ * \param stream The stream to do the write in.
+ * \param addr The device address to write to.
+ * \param value The value to write.
+ * \param flags See ::CUstreamWriteValue_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWriteValue32_v2,
+ * ::cuStreamWaitValue32_v2,
+ * ::cuStreamWaitValue64_v2,
+ * ::cuStreamBatchMemOp_v2,
+ * ::cuMemHostRegister,
+ * ::cuEventRecord
+ */
+CUresult CUDAAPI cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+
+/**
+ * \brief Batch operations to synchronize the stream via memory operations
+ *
+ * This is a batch version of ::cuStreamWaitValue32_v2() and ::cuStreamWriteValue32_v2().
+ * Batching operations may avoid some performance overhead in both the API call
+ * and the device execution versus adding them to the stream in separate API
+ * calls. The operations are enqueued in the order they appear in the array.
+ *
+ * See ::CUstreamBatchMemOpType for the full set of supported operations, and
+ * ::cuStreamWaitValue32_v2(), ::cuStreamWaitValue64_v2(), ::cuStreamWriteValue32_v2(),
+ * and ::cuStreamWriteValue64_v2() for details of specific operations.
+ *
+ * See related APIs for details on querying support for specific operations.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order. For more 
+ * information, see the Stream Memory Operations section in the programming 
+ * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+ *
+ * \param stream The stream to enqueue the operations in.
+ * \param count The number of operations in the array. Must be less than 256.
+ * \param paramArray The types and parameters of the individual operations.
+ * \param flags Reserved for future expansion; must be 0.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuStreamWaitValue32_v2,
+ * ::cuStreamWaitValue64_v2,
+ * ::cuStreamWriteValue32_v2,
+ * ::cuStreamWriteValue64_v2,
+ * ::cuMemHostRegister
+ */
+CUresult CUDAAPI cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+
+/** @} */ /* END CUDA_MEMOP */
+
+/**
+ * \defgroup CUDA_EXEC Execution Control
+ *
+ * ___MANBRIEF___ execution control functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the execution control functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns information about a function
+ *
+ * Returns in \p *pi the integer value of the attribute \p attrib on the kernel
+ * given by \p hfunc. The supported attributes are:
+ * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads
+ *   per block, beyond which a launch of the function would fail. This number
+ *   depends on both the function and the device on which the function is
+ *   currently loaded.
+ * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of
+ *   statically-allocated shared memory per block required by this function.
+ *   This does not include dynamically-allocated shared memory requested by
+ *   the user at runtime.
+ * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated
+ *   constant memory required by this function.
+ * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory
+ *   used by each thread of this function.
+ * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread
+ *   of this function.
+ * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for
+ *   which the function was compiled. This value is the major PTX version * 10
+ *   + the minor PTX version, so a PTX version 1.3 function would return the
+ *   value 13. Note that this may return the undefined value of 0 for cubins
+ *   compiled prior to CUDA 3.0.
+ * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for
+ *   which the function was compiled. This value is the major binary
+ *   version * 10 + the minor binary version, so a binary version 1.3 function
+ *   would return the value 13. Note that this will return a value of 10 for
+ *   legacy cubins that do not have a properly-encoded binary architecture
+ *   version.
+ * - ::CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: The attribute to indicate whether the
+ *   function has been compiled with the user-specified option "-Xptxas --dlcm=ca" set.
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
+ *   dynamically-allocated shared memory.
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1
+ *   cache split ratio in percent of total shared memory.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: If this attribute is set, the
+ *   kernel must launch with a valid cluster size specified.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+ *   blocks.
+ * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether
+ *   the function can be launched with non-portable cluster size. 1 is allowed,
+ *   0 is disallowed. A non-portable cluster size may only function on the
+ *   specific SKUs the program is tested on. The launch might fail if the
+ *   program is run on a different hardware platform. CUDA API provides
+ *   cudaOccupancyMaxActiveClusters to assist with checking whether the desired
+ *   size can be launched on the current device. A portable cluster size is
+ *   guaranteed to be functional on all compute capabilities higher than the
+ *   target compute capability. The portable cluster size for sm_90 is 8 blocks
+ *   per cluster. This value may increase for future compute capabilities. The
+ *   specific hardware unit may support higher cluster sizes that are not
+ *   guaranteed to be portable.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+ *   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+ *
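+ * For illustration, a minimal sketch querying the maximum block size of a
+ * loaded kernel (\c kernel is assumed to be a valid ::CUfunction; error
+ * checking omitted):
+ * \code
+    int maxThreads = 0;
+    cuFuncGetAttribute(&maxThreads,
+                       CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
+ * \endcode
+ *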
+ * \param pi     - Returned attribute value
+ * \param attrib - Attribute requested
+ * \param hfunc  - Function to query attribute of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuLaunchKernel,
+ * ::cudaFuncGetAttributes,
+ * ::cudaFuncSetAttribute
+ */
+CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+
+/**
+ * \brief Sets information about a function
+ *
+ * This call sets the value of a specified attribute \p attrib on the kernel given
+ * by \p hfunc to the integer value specified by \p value.
+ * This function returns CUDA_SUCCESS if the new value of the attribute could be
+ * successfully set. If the set fails, this call will return an error.
+ * Not all attributes can have values set. Attempting to set a value on a read-only
+ * attribute will result in an error (::CUDA_ERROR_INVALID_VALUE).
+ *
+ * Supported attributes for the cuFuncSetAttribute call are:
+ * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of
+ *   dynamically-allocated shared memory. The value should contain the requested
+ *   maximum size of dynamically-allocated shared memory. The sum of this value and
+ *   the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the
+ *   device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN.
+ *   The maximal size of requestable dynamic shared memory may differ by GPU
+ *   architecture.
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1
+ *   cache and shared memory use the same hardware resources, this sets the shared memory
+ *   carveout preference, in percent of the total shared memory; see
+ *   ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR.
+ *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED.
+ * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block
+ *   scheduling policy of a function. The value type is CUclusterSchedulingPolicy.
+ *
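+ * For illustration, a minimal sketch opting a kernel into a larger dynamic
+ * shared memory allocation (64 KB is a hypothetical value; \c kernel is
+ * assumed to be a valid ::CUfunction):
+ * \code
+    cuFuncSetAttribute(kernel,
+                       CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+                       64 * 1024);
+ * \endcode
+ *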
+ * \param hfunc  - Function to set the attribute of
+ * \param attrib - Attribute to set
+ * \param value  - The value to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuLaunchKernel,
+ * ::cudaFuncGetAttributes,
+ * ::cudaFuncSetAttribute
+ */
+CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value);
+
+/**
+ * \brief Sets the preferred cache configuration for a device function
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p config the preferred cache configuration for
+ * the device function \p hfunc. This is only a preference. The driver will use
+ * the requested configuration if possible, but it is free to choose a different
+ * configuration if required to execute \p hfunc.  Any context-wide preference
+ * set via ::cuCtxSetCacheConfig() will be overridden by this per-function
+ * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In
+ * that case, the current context-wide setting will be used.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default)
+ * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache
+ * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory
+ * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory
+ *
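+ * For illustration, a minimal sketch preferring shared memory over L1 for
+ * a kernel (\c kernel is assumed to be a valid ::CUfunction; this is only
+ * a preference):
+ * \code
+    cuFuncSetCacheConfig(kernel, CU_FUNC_CACHE_PREFER_SHARED);
+ * \endcode
+ *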
+ * \param hfunc  - Kernel to configure cache for
+ * \param config - Requested cache configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchKernel,
+ * ::cudaFuncSetCacheConfig
+ */
+CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
+
+/**
+ * \brief Sets the shared memory configuration for a device function.
+ *
+ * On devices with configurable shared memory banks, this function will
+ * force all subsequent launches of the specified device function to have
+ * the given shared memory bank size configuration. On any given launch of the
+ * function, the shared memory configuration of the device will be temporarily
+ * changed if needed to suit the function's preferred configuration. Changes in
+ * shared memory configuration between subsequent launches of functions
+ * may introduce a device-side synchronization point.
+ *
+ * Any per-function setting of shared memory bank size set via
+ * ::cuFuncSetSharedMemConfig will override the context-wide setting set with
+ * ::cuCtxSetSharedMemConfig.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance.
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * The supported bank configurations are:
+ * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory
+ *   configuration when launching this function.
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively four bytes when launching this function.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
+ *   be natively eight bytes when launching this function.
+ *
+ * \param hfunc  - kernel to be given a shared memory config
+ * \param config - requested shared memory configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuCtxSetSharedMemConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchKernel,
+ * ::cudaFuncSetSharedMemConfig
+ */
+CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config);
+
+/**
+ * \brief Returns a module handle
+ *
+ * Returns in \p *hmod the handle of the module that function \p hfunc
+ * is located in. The lifetime of the module corresponds to the lifetime of
+ * the context it was loaded in or until the module is explicitly unloaded.
+ *
+ * The CUDA runtime manages its own modules loaded into the primary context.
+ * If the handle returned by this API refers to a module loaded by the CUDA runtime,
+ * calling ::cuModuleUnload() on that module will result in undefined behavior.
+ *
+ * \param hmod - Returned module handle
+ * \param hfunc   - Function to retrieve module for
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc);
+
+/**
+ * \brief Launches a CUDA function
+ *
+ * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ
+ * grid of blocks. Each block contains \p blockDimX x \p blockDimY x
+ * \p blockDimZ threads.
+ *
+ * \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ * available to each thread block.
+ *
+ * Kernel parameters to \p f can be specified in one of two ways:
+ *
+ * 1) Kernel parameters can be specified via \p kernelParams.  If \p f
+ * has N parameters, then \p kernelParams needs to be an array of N
+ * pointers.  Each of \p kernelParams[0] through \p kernelParams[N-1]
+ * must point to a region of memory from which the actual kernel
+ * parameter will be copied.  The number of kernel parameters and their
+ * offsets and sizes do not need to be specified as that information is
+ * retrieved directly from the kernel's image.
+ *
+ * 2) Kernel parameters can also be packaged by the application into
+ * a single buffer that is passed in via the \p extra parameter.
+ * This places the burden on the application of knowing each kernel
+ * parameter's size and alignment/padding within the buffer.  Here is
+ * an example of using the \p extra parameter in this manner:
+ * \code
+    size_t argBufferSize;
+    char argBuffer[256];
+
+    // populate argBuffer and argBufferSize
+
+    void *config[] = {
+        CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
+        CU_LAUNCH_PARAM_BUFFER_SIZE,    &argBufferSize,
+        CU_LAUNCH_PARAM_END
+    };
+    status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config);
+ * \endcode
+ *
+ * The \p extra parameter exists to allow ::cuLaunchKernel to take
+ * additional less commonly used arguments.  \p extra specifies a list of
+ * names of extra settings and their corresponding values.  Each extra
+ * setting name is immediately followed by the corresponding value.  The
+ * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END.
+ *
+ * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
+ *   array;
+ * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
+ *   value in \p extra will be a pointer to a buffer containing all
+ *   the kernel parameters for launching kernel \p f;
+ * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
+ *   value in \p extra will be a pointer to a size_t containing the
+ *   size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER;
+ *
+ * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel
+ * parameters are specified with both \p kernelParams and \p extra
+ * (i.e. both \p kernelParams and \p extra are non-NULL).
+ *
+ * Calling ::cuLaunchKernel() invalidates the persistent function state
+ * set through the following deprecated APIs:
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(),
+ *  ::cuParamSetv().
+ *
+ * Note that to use ::cuLaunchKernel(), the kernel \p f must either have
+ * been compiled with toolchain version 3.2 or later so that it will
+ * contain kernel parameter information, or have no kernel parameters.
+ * If either of these conditions is not met, then ::cuLaunchKernel() will
+ * return ::CUDA_ERROR_INVALID_IMAGE.
+ *
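+ * For illustration, a minimal sketch of method 1) above for a hypothetical
+ * kernel \c f with parameters \c (float a, float *x, float *y), where \c dX
+ * and \c dY are device allocations and \c hStream is a valid stream (error
+ * checking omitted):
+ * \code
+    float a = 2.0f;
+    void *params[] = { &a, &dX, &dY };
+    cuLaunchKernel(f,
+                   256, 1, 1,   // grid dimensions
+                   128, 1, 1,   // block dimensions
+                   0,           // dynamic shared memory bytes
+                   hStream,
+                   params,      // kernelParams
+                   NULL);       // extra
+ * \endcode
+ *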
+ * \param f              - Kernel to launch
+ * \param gridDimX       - Width of grid in blocks
+ * \param gridDimY       - Height of grid in blocks
+ * \param gridDimZ       - Depth of grid in blocks
+ * \param blockDimX      - X dimension of each thread block
+ * \param blockDimY      - Y dimension of each thread block
+ * \param blockDimZ      - Z dimension of each thread block
+ * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes
+ * \param hStream        - Stream identifier
+ * \param kernelParams   - Array of pointers to kernel parameters
+ * \param extra          - Extra options
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cudaLaunchKernel
+ */
+CUresult CUDAAPI cuLaunchKernel(CUfunction f,
+                                unsigned int gridDimX,
+                                unsigned int gridDimY,
+                                unsigned int gridDimZ,
+                                unsigned int blockDimX,
+                                unsigned int blockDimY,
+                                unsigned int blockDimZ,
+                                unsigned int sharedMemBytes,
+                                CUstream hStream,
+                                void **kernelParams,
+                                void **extra);
+
+/**
+ * \brief Launches a CUDA function with launch-time configuration
+ *
+ * Invokes the kernel \p f with the specified launch-time configuration
+ * \p config.
+ *
+ * The ::CUlaunchConfig structure is defined as:
+ * \code
+        typedef struct CUlaunchConfig_st {
+            unsigned int gridDimX;
+            unsigned int gridDimY;
+            unsigned int gridDimZ;
+            unsigned int blockDimX;
+            unsigned int blockDimY;
+            unsigned int blockDimZ;
+            unsigned int sharedMemBytes;
+            CUstream hStream;
+            CUlaunchAttribute *attrs;
+            unsigned int numAttrs;
+        } CUlaunchConfig;
+ * \endcode
+ * where:
+ * - ::CUlaunchConfig::gridDimX is the width of the grid in blocks.
+ * - ::CUlaunchConfig::gridDimY is the height of the grid in blocks.
+ * - ::CUlaunchConfig::gridDimZ is the depth of the grid in blocks.
+ * - ::CUlaunchConfig::blockDimX is the X dimension of each thread block.
+ * - ::CUlaunchConfig::blockDimY is the Y dimension of each thread block.
+ * - ::CUlaunchConfig::blockDimZ is the Z dimension of each thread block.
+ * - ::CUlaunchConfig::sharedMemBytes is the dynamic shared-memory size per
+ *   thread block in bytes.
+ * - ::CUlaunchConfig::hStream is the handle to the stream to perform the launch
+ *   in. The CUDA context associated with this stream must match that associated
+ *   with function f.
+ * - ::CUlaunchConfig::attrs is an array of ::CUlaunchConfig::numAttrs
+ *   contiguous ::CUlaunchAttribute elements. The value of this pointer is not
+ *   considered if ::CUlaunchConfig::numAttrs is zero. However, in that case, it
+ *   is recommended to set the pointer to NULL.
+ * - ::CUlaunchConfig::numAttrs is the number of attributes populating the
+ *   first ::CUlaunchConfig::numAttrs positions of the ::CUlaunchConfig::attrs
+ *   array.
+ *
+ * Launch-time configuration is specified by adding entries to
+ * ::CUlaunchConfig::attrs. Each entry is an attribute ID and a corresponding
+ * attribute value.
+ *
+ * The ::CUlaunchAttribute structure is defined as:
+ * \code
+        typedef struct CUlaunchAttribute_st {
+            CUlaunchAttributeID id;
+            CUlaunchAttributeValue value;
+        } CUlaunchAttribute;
+ * \endcode
+ * where:
+ * - ::CUlaunchAttribute::id is a unique enum identifying the attribute.
+ * - ::CUlaunchAttribute::value is a union that holds the attribute value.
+ *
+ * An example of using the \p config parameter:
+ * \code
+        CUlaunchAttribute coopAttr = {.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE,
+                                      .value = 1};
+        CUlaunchConfig config = {... // set block and grid dimensions
+                                 .attrs = &coopAttr,
+                                 .numAttrs = 1};
+
+        cuLaunchKernelEx(&config, kernel, NULL, NULL);
+ * \endcode
+ *
+ * The ::CUlaunchAttributeID enum is defined as:
+ * \code
+        typedef enum CUlaunchAttributeID_enum {
+            CU_LAUNCH_ATTRIBUTE_IGNORE = 0,
+            CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW   = 1,
+            CU_LAUNCH_ATTRIBUTE_COOPERATIVE            = 2,
+            CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3,
+            CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION                    = 4,
+            CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5,
+            CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION    = 6,
+            CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT                   = 7,
+        } CUlaunchAttributeID;
+ * \endcode
+ *
+ * and the corresponding ::CUlaunchAttributeValue union as:
+ * \code
+        typedef union CUlaunchAttributeValue_union {
+            cuuint64_t pad[8];
+            CUaccessPolicyWindow accessPolicyWindow;
+            int cooperative;
+            CUsynchronizationPolicy syncPolicy;
+            struct {
+                unsigned int x;
+                unsigned int y;
+                unsigned int z;
+            } clusterDim;
+            CUclusterSchedulingPolicy clusterSchedulingPolicyPreference;
+            int programmaticStreamSerializationAllowed;
+            struct {
+                CUevent event;
+                int flags;
+                int triggerAtBlockStart;
+            } programmaticEvent;
+        } CUlaunchAttributeValue;
+ * \endcode
+ *
+ * Setting ::CU_LAUNCH_ATTRIBUTE_COOPERATIVE to a non-zero value causes the
+ * kernel launch to be a cooperative launch, with exactly the same usage and
+ * semantics of ::cuLaunchCooperativeKernel.
+ *
+ * Setting ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION to a non-zero
+ * value causes the kernel to use programmatic means to resolve its stream
+ * dependency -- enabling the CUDA runtime to opportunistically allow the grid's
+ * execution to overlap with the previous kernel in the stream, if that kernel
+ * requests the overlap.
+ *
+ * ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT records an event along with the
+ * kernel launch. The event recorded through this launch attribute is guaranteed
+ * to only trigger after all blocks in the associated kernel trigger the event. A
+ * block can trigger the event through PTX launchdep.release or CUDA builtin
+ * function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be
+ * inserted at the beginning of each block's execution if triggerAtBlockStart is
+ * set to a non-zero value. Note that dependents (including the CPU thread calling
+ * cuEventSynchronize()) are not guaranteed to observe the release precisely
+ * when it is released. For example, cuEventSynchronize() may only observe the
+ * event trigger long after the associated kernel has completed. This recording
+ * type is primarily meant for establishing programmatic dependency between
+ * device tasks. The event supplied must not be an interprocess or interop
+ * event. The event must disable timing (i.e. created with
+ * ::CU_EVENT_DISABLE_TIMING flag set).
+ *
+ * The effect of other attributes is consistent with their effect when set via
+ * persistent APIs.
+ *
+ * See ::cuStreamSetAttribute for
+ * - ::CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW
+ * - ::CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY
+ *
+ * See ::cuFuncSetAttribute for
+ * - ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
+ * - ::CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE
+ *
+ * Kernel parameters to \p f can be specified in the same ways that they can be
+ * using ::cuLaunchKernel.
+ *
+ * \param config         - Launch configuration for the kernel
+ * \param f              - Kernel to launch
+ * \param kernelParams   - Array of pointers to kernel parameters
+ * \param extra          - Extra options
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cudaLaunchKernel,
+ * ::cudaLaunchKernelEx
+ */
+CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config,
+                                  CUfunction f,
+                                  void **kernelParams,
+                                  void **extra);
+
+/**
+ * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute
+ *
+ * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ
+ * grid of blocks. Each block contains \p blockDimX x \p blockDimY x
+ * \p blockDimZ threads.
+ *
+ * \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ * available to each thread block.
+ *
+ * The device on which this kernel is invoked must have a non-zero value for
+ * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH.
+ *
+ * The total number of blocks launched cannot exceed the maximum number of blocks per
+ * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * Kernel parameters must be specified via \p kernelParams.  If \p f
+ * has N parameters, then \p kernelParams needs to be an array of N
+ * pointers.  Each of \p kernelParams[0] through \p kernelParams[N-1]
+ * must point to a region of memory from which the actual kernel
+ * parameter will be copied.  The number of kernel parameters and their
+ * offsets and sizes do not need to be specified as that information is
+ * retrieved directly from the kernel's image.
+ *
+ * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is
+ * the same as function state set through the ::cuLaunchKernel API.
+ *
+ * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous
+ * block shape, shared size and parameter info associated with \p f
+ * is overwritten.
+ *
+ * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have
+ * been compiled with toolchain version 3.2 or later so that it will
+ * contain kernel parameter information, or have no kernel parameters.
+ * If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will
+ * return ::CUDA_ERROR_INVALID_IMAGE.
+ *
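+ * For illustration, a minimal sketch that sizes a cooperative launch from
+ * occupancy so the block-count limit above is respected (\c coopKernel,
+ * \c dev, \c stream and \c dData are assumptions; error checking omitted):
+ * \code
+    int numSms = 0, blocksPerSm = 0;
+    cuDeviceGetAttribute(&numSms,
+                         CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);
+    cuOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm,
+                                                coopKernel, 256, 0);
+    void *params[] = { &dData };
+    cuLaunchCooperativeKernel(coopKernel,
+                              numSms * blocksPerSm, 1, 1,  // grid
+                              256, 1, 1,                   // block
+                              0, stream, params);
+ * \endcode
+ *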
+ * \param f              - Kernel to launch
+ * \param gridDimX       - Width of grid in blocks
+ * \param gridDimY       - Height of grid in blocks
+ * \param gridDimZ       - Depth of grid in blocks
+ * \param blockDimX      - X dimension of each thread block
+ * \param blockDimY      - Y dimension of each thread block
+ * \param blockDimZ      - Z dimension of each thread block
+ * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes
+ * \param hStream        - Stream identifier
+ * \param kernelParams   - Array of pointers to kernel parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchCooperativeKernelMultiDevice,
+ * ::cudaLaunchCooperativeKernel
+ */
+CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f,
+                                unsigned int gridDimX,
+                                unsigned int gridDimY,
+                                unsigned int gridDimZ,
+                                unsigned int blockDimX,
+                                unsigned int blockDimY,
+                                unsigned int blockDimZ,
+                                unsigned int sharedMemBytes,
+                                CUstream hStream,
+                                void **kernelParams);
+
+/**
+ * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute
+ *
+ * \deprecated This function is deprecated as of CUDA 11.3.
+ *
+ * Invokes kernels as specified in the \p launchParamsList array where each element
+ * of the array specifies all the parameters required to perform a single kernel launch.
+ * These kernels can cooperate and synchronize as they execute. The size of the array is
+ * specified by \p numDevices.
+ *
+ * No two kernels can be launched on the same device. All the devices targeted by this
+ * multi-device launch must be identical. All devices must have a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH.
+ *
+ * All kernels launched must be identical with respect to the compiled code. Note that
+ * any __device__, __constant__ or __managed__ variables present in the module that owns
+ * the kernel launched on each device, are independently instantiated on every device.
+ * It is the application's responsibility to ensure these variables are initialized and
+ * used appropriately.
+ *
+ * The size of the grids as specified in blocks, the size of the blocks themselves
+ * and the amount of shared memory used by each thread block must also match across
+ * all launched kernels.
+ *
+ * The streams used to launch these kernels must have been created via either ::cuStreamCreate
+ * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD
+ * cannot be used.
+ *
+ * The total number of blocks launched per kernel cannot exceed the maximum number of blocks
+ * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the
+ * total number of blocks launched per device has to match across all devices, the maximum
+ * number of blocks that can be launched per device will be limited by the device with the
+ * least number of multiprocessors.
+ *
+ * The kernels cannot make use of CUDA dynamic parallelism.
+ *
+ * The ::CUDA_LAUNCH_PARAMS structure is defined as:
+ * \code
+        typedef struct CUDA_LAUNCH_PARAMS_st
+        {
+            CUfunction function;
+            unsigned int gridDimX;
+            unsigned int gridDimY;
+            unsigned int gridDimZ;
+            unsigned int blockDimX;
+            unsigned int blockDimY;
+            unsigned int blockDimZ;
+            unsigned int sharedMemBytes;
+            CUstream hStream;
+            void **kernelParams;
+        } CUDA_LAUNCH_PARAMS;
+ * \endcode
+ * where:
+ * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must
+ *   be identical with respect to the compiled code.
+ * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::blockDimY is the Y dimension of each thread block. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across
+ *   all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes.
+ *   This must match across all kernels launched.
+ * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot
+ *   be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated
+ *   with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function.
+ * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If
+ *   ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams
+ *   needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through
+ *   ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual
+ *   kernel parameter will be copied. The number of kernel parameters and their offsets and sizes
+ *   do not need to be specified as that information is retrieved directly from the kernel's image.
+ *
+ * By default, the kernel won't begin execution on any GPU until all prior work in all the specified
+ * streams has completed. This behavior can be overridden by specifying the flag
+ * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. When this flag is specified, each kernel
+ * will only wait for prior work in the stream corresponding to that GPU to complete before it begins
+ * execution.
+ *
+ * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin
+ * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying
+ * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified,
+ * any subsequent work pushed in any of the specified streams will only wait for the kernel launched
+ * on the GPU corresponding to that stream to complete before it begins execution.
+ *
+ * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is
+ * the same as function state set through ::cuLaunchKernel API when called individually for each
+ * element in \p launchParamsList.
+ *
+ * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous
+ * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function
+ * in \p launchParamsList is overwritten.
+ *
+ * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have
+ * been compiled with toolchain version 3.2 or later so that they contain
+ * kernel parameter information, or have no kernel parameters.
+ * If either of these conditions is not met, then ::cuLaunchCooperativeKernelMultiDevice() will
+ * return ::CUDA_ERROR_INVALID_IMAGE.
+ *
+ * \param launchParamsList - List of launch parameters, one per device
+ * \param numDevices       - Size of the \p launchParamsList array
+ * \param flags            - Flags to control launch behavior
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_IMAGE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuCtxGetCacheConfig,
+ * ::cuCtxSetCacheConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuLaunchCooperativeKernel,
+ * ::cudaLaunchCooperativeKernelMultiDevice
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags);
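+
+/*
+ * Illustrative sketch (editor's addition, not part of the official header):
+ * one ::CUDA_LAUNCH_PARAMS entry per device for a two-GPU cooperative launch
+ * via this (deprecated) API. The kernel handles, per-device streams and the
+ * argument arrays are assumptions for the example; grid/block shape and
+ * shared memory must match across all entries, as described above.
+ * \code
+        CUDA_LAUNCH_PARAMS params[2];
+        for (int i = 0; i < 2; ++i) {
+            params[i].function       = coopKernel[i];        // same code, loaded per device (assumed)
+            params[i].gridDimX       = 32;
+            params[i].gridDimY       = 1;
+            params[i].gridDimZ       = 1;
+            params[i].blockDimX      = 256;
+            params[i].blockDimY      = 1;
+            params[i].blockDimZ      = 1;
+            params[i].sharedMemBytes = 0;
+            params[i].hStream        = perDeviceStream[i];   // created with cuStreamCreate (assumed)
+            params[i].kernelParams   = kernelArgs[i];        // one void* per kernel parameter
+        }
+        cuLaunchCooperativeKernelMultiDevice(params, 2, 0);  // default pre/post launch sync
+ * \endcode
+ */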
+
+/**
+ * \brief Enqueues a host function call in a stream
+ *
+ * Enqueues a host function to run in a stream.  The function will be called
+ * after currently enqueued work and will block work added after it.
+ *
+ * The host function must not make any CUDA API calls.  Attempting to use a
+ * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required.
+ * The host function must not perform any synchronization that may depend on
+ * outstanding CUDA work not mandated to run earlier.  Host functions without a
+ * mandated order (such as in independent streams) execute in undefined order
+ * and may be serialized.
+ *
+ * For the purposes of Unified Memory, execution makes a number of guarantees:
+ * <ul>
+ *   <li>The stream is considered idle for the duration of the function's
+ *   execution.  Thus, for example, the function may always use memory attached
+ *   to the stream it was enqueued in.</li>
+ *   <li>The start of execution of the function has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the function.  It thus synchronizes streams which have been "joined"
+ *   prior to the function.</li>
+ *   <li>Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed.  Thus, for
+ *   example, a function might use global attached memory even if work has
+ *   been added to another stream, if the work has been ordered behind the
+ *   function call with an event.</li>
+ *   <li>Completion of the function does not cause a stream to become
+ *   active except as described above.  The stream will remain idle
+ *   if no device work follows the function, and will remain idle across
+ *   consecutive host functions or stream callbacks without device work in
+ *   between.  Thus, for example,
+ *   stream synchronization can be done by signaling from a host function at the
+ *   end of the stream.</li>
+ * </ul>
+ *
+ * Note that, in contrast to ::cuStreamAddCallback, the function will not be
+ * called in the event of an error in the CUDA context.
+ *
+ * \param hStream  - Stream to enqueue function call in
+ * \param fn       - The function to call once preceding stream operations are complete
+ * \param userData - User-specified data to be passed to the function
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuStreamCreate,
+ * ::cuStreamQuery,
+ * ::cuStreamSynchronize,
+ * ::cuStreamWaitEvent,
+ * ::cuStreamDestroy,
+ * ::cuMemAllocManaged,
+ * ::cuStreamAttachMemAsync,
+ * ::cuStreamAddCallback
+ */
+CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData);
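+
+/*
+ * Illustrative sketch (editor's addition, not part of the official header):
+ * enqueue a host callback that flips a flag once all previously enqueued work
+ * in `stream` has completed. `stream` and the names below are assumptions for
+ * the example; the callback performs host work only and makes no CUDA calls.
+ * \code
+        static void CUDA_CB markDone(void *userData)
+        {
+            *(int *)userData = 1;                 // plain host work only
+        }
+
+        int done = 0;
+        cuLaunchHostFunc(stream, markDone, &done);
+        // synchronize the stream (or order later work behind it) before reading `done`
+ * \endcode
+ */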
+
+/** @} */ /* END CUDA_EXEC */
+
+/**
+ * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated execution control functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Sets the block-dimensions for the function
+ *
+ * \deprecated
+ *
+ * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are
+ * created when the kernel given by \p hfunc is launched.
+ *
+ * \param hfunc - Kernel to specify dimensions of
+ * \param x     - X dimension
+ * \param y     - Y dimension
+ * \param z     - Z dimension
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetSharedSize,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSeti,
+ * ::cuParamSetf,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
+
+/**
+ * \brief Sets the dynamic shared-memory size for the function
+ *
+ * \deprecated
+ *
+ * Sets through \p bytes the amount of dynamic shared memory that will be
+ * available to each thread block when the kernel given by \p hfunc is launched.
+ *
+ * \param hfunc - Kernel to specify dynamic shared-memory size for
+ * \param bytes - Dynamic shared-memory size per thread block in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetCacheConfig,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSeti,
+ * ::cuParamSetf,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes);
+
+/**
+ * \brief Sets the parameter size for the function
+ *
+ * \deprecated
+ *
+ * Sets through \p numbytes the total size in bytes needed by the function
+ * parameters of the kernel corresponding to \p hfunc.
+ *
+ * \param hfunc    - Kernel to set parameter size for
+ * \param numbytes - Size of parameter list in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes);
+
+/**
+ * \brief Adds an integer parameter to the function's argument list
+ *
+ * \deprecated
+ *
+ * Sets an integer parameter that will be passed the next time the
+ * kernel corresponding to \p hfunc is invoked. \p offset is a byte offset.
+ *
+ * \param hfunc  - Kernel to add parameter to
+ * \param offset - Offset to add parameter to argument list
+ * \param value  - Value of parameter
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value);
+
+/**
+ * \brief Adds a floating-point parameter to the function's argument list
+ *
+ * \deprecated
+ *
+ * Sets a floating-point parameter that will be passed the next time the
+ * kernel corresponding to \p hfunc is invoked. \p offset is a byte offset.
+ *
+ * \param hfunc  - Kernel to add parameter to
+ * \param offset - Offset to add parameter to argument list
+ * \param value  - Value of parameter
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value);
+
+/**
+ * \brief Adds arbitrary data to the function's argument list
+ *
+ * \deprecated
+ *
+ * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr
+ * into the parameter space of the kernel corresponding to \p hfunc. \p offset
+ * is a byte offset.
+ *
+ * \param hfunc    - Kernel to add data to
+ * \param offset   - Offset to add data to argument list
+ * \param ptr      - Pointer to arbitrary data
+ * \param numbytes - Size of data to copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
+
+/**
+ * \brief Launches a CUDA function
+ *
+ * \deprecated
+ *
+ * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block
+ * contains the number of threads specified by a previous call to
+ * ::cuFuncSetBlockShape().
+ *
+ * The block shape, dynamic shared memory size, and parameter information
+ * must be set using
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(), and
+ *  ::cuParamSetv()
+ * prior to calling this function.
+ *
+ * Launching a function via ::cuLaunchKernel() invalidates the function's
+ * block shape, dynamic shared memory size, and parameter information. After
+ * launching via cuLaunchKernel, this state must be re-initialized prior to
+ * calling this function. Failure to do so results in undefined behavior.
+ *
+ * \param f - Kernel to launch
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunchGrid,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f);
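+
+/*
+ * Illustrative sketch (editor's addition, not part of the official header):
+ * the ordering this deprecated launch path requires. `hfunc`, `dptr` and `n`
+ * are assumptions; real code must also respect each parameter's alignment
+ * when advancing `offset`.
+ * \code
+        cuFuncSetBlockShape(hfunc, 256, 1, 1);
+        cuFuncSetSharedSize(hfunc, 0);
+
+        int offset = 0;
+        cuParamSetv(hfunc, offset, &dptr, sizeof(dptr));   // device pointer argument
+        offset += (int)sizeof(dptr);
+        cuParamSeti(hfunc, offset, n);                     // integer argument
+        offset += (int)sizeof(unsigned int);
+        cuParamSetSize(hfunc, offset);
+
+        cuLaunch(hfunc);           // or cuLaunchGrid(hfunc, gridW, gridH)
+ * \endcode
+ */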
+
+/**
+ * \brief Launches a CUDA function
+ *
+ * \deprecated
+ *
+ * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
+ * blocks. Each block contains the number of threads specified by a previous
+ * call to ::cuFuncSetBlockShape().
+ *
+ * The block shape, dynamic shared memory size, and parameter information
+ * must be set using
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(), and
+ *  ::cuParamSetv()
+ * prior to calling this function.
+ *
+ * Launching a function via ::cuLaunchKernel() invalidates the function's
+ * block shape, dynamic shared memory size, and parameter information. After
+ * launching via cuLaunchKernel, this state must be re-initialized prior to
+ * calling this function. Failure to do so results in undefined behavior.
+ *
+ * \param f           - Kernel to launch
+ * \param grid_width  - Width of grid in blocks
+ * \param grid_height - Height of grid in blocks
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGridAsync,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height);
+
+/**
+ * \brief Launches a CUDA function
+ *
+ * \deprecated
+ *
+ * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of
+ * blocks. Each block contains the number of threads specified by a previous
+ * call to ::cuFuncSetBlockShape().
+ *
+ * The block shape, dynamic shared memory size, and parameter information
+ * must be set using
+ *  ::cuFuncSetBlockShape(),
+ *  ::cuFuncSetSharedSize(),
+ *  ::cuParamSetSize(),
+ *  ::cuParamSeti(),
+ *  ::cuParamSetf(), and
+ *  ::cuParamSetv()
+ * prior to calling this function.
+ *
+ * Launching a function via ::cuLaunchKernel() invalidates the function's
+ * block shape, dynamic shared memory size, and parameter information. After
+ * launching via cuLaunchKernel, this state must be re-initialized prior to
+ * calling this function. Failure to do so results in undefined behavior.
+ *
+ * \param f           - Kernel to launch
+ * \param grid_width  - Width of grid in blocks
+ * \param grid_height - Height of grid in blocks
+ * \param hStream     - Stream identifier
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_LAUNCH_FAILED,
+ * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
+ * ::CUDA_ERROR_LAUNCH_TIMEOUT,
+ * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING,
+ * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED
+ *
+ * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no),
+ *       this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by
+ *       growing the per-thread stack as needed per launch and not shrinking it afterwards.
+ *
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa ::cuFuncSetBlockShape,
+ * ::cuFuncSetSharedSize,
+ * ::cuFuncGetAttribute,
+ * ::cuParamSetSize,
+ * ::cuParamSetf,
+ * ::cuParamSeti,
+ * ::cuParamSetv,
+ * ::cuLaunch,
+ * ::cuLaunchGrid,
+ * ::cuLaunchKernel
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
+
+
+/**
+ * \brief Adds a texture-reference to the function's argument list
+ *
+ * \deprecated
+ *
+ * Makes the CUDA array or linear memory bound to the texture reference
+ * \p hTexRef available to a device program as a texture. In this version of
+ * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and
+ * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT.
+ *
+ * \param hfunc   - Kernel to add texture-reference to
+ * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT)
+ * \param hTexRef - Texture-reference to add to argument list
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
+/** @} */ /* END CUDA_EXEC_DEPRECATED */
+
+/**
+ * \defgroup CUDA_GRAPH Graph Management
+ *
+ * ___MANBRIEF___ graph management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graph management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a graph
+ *
+ * Creates an empty graph, which is returned via \p phGraph.
+ *
+ * \param phGraph - Returns newly created graph
+ * \param flags   - Graph creation flags, must be 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphInstantiate,
+ * ::cuGraphDestroy,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphClone
+ */
+CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags);
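+
+/*
+ * Illustrative sketch (editor's addition, not part of the official header):
+ * minimal graph lifecycle around ::cuGraphCreate.
+ * \code
+        CUgraph graph;
+        cuGraphCreate(&graph, 0);          // flags must be 0
+        // ... add nodes, instantiate and launch the graph ...
+        cuGraphDestroy(graph);
+ * \endcode
+ */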
+
+/**
+ * \brief Creates a kernel execution node and adds it to a graph
+ *
+ * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * The CUDA_KERNEL_NODE_PARAMS structure is defined as:
+ *
+ * \code
+ *  typedef struct CUDA_KERNEL_NODE_PARAMS_st {
+ *      CUfunction func;
+ *      unsigned int gridDimX;
+ *      unsigned int gridDimY;
+ *      unsigned int gridDimZ;
+ *      unsigned int blockDimX;
+ *      unsigned int blockDimY;
+ *      unsigned int blockDimZ;
+ *      unsigned int sharedMemBytes;
+ *      void **kernelParams;
+ *      void **extra;
+ *  } CUDA_KERNEL_NODE_PARAMS;
+ * \endcode
+ *
+ * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x
+ * \p gridDimY x \p gridDimZ) grid of blocks. Each block contains
+ * (\p blockDimX x \p blockDimY x \p blockDimZ) threads.
+ *
+ * \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ * available to each thread block.
+ *
+ * Kernel parameters to \p func can be specified in one of two ways:
+ *
+ * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N
+ * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer,
+ * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual
+ * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need
+ * to be specified as that information is retrieved directly from the kernel's image.
+ *
+ * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single
+ * buffer that is passed in via \p extra. This places the burden on the application of knowing each
+ * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists
+ * to allow this function to take additional less commonly used arguments. \p extra specifies
+ * a list of names of extra settings and their corresponding values. Each extra setting name is
+ * immediately followed by the corresponding value. The list must be terminated with either NULL or
+ * CU_LAUNCH_PARAM_END.
+ *
+ * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
+ *   array;
+ * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
+ *   value in \p extra will be a pointer to a buffer
+ *   containing all the kernel parameters for launching kernel
+ *   \p func;
+ * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
+ *   value in \p extra will be a pointer to a size_t
+ *   containing the size of the buffer specified with
+ *   ::CU_LAUNCH_PARAM_BUFFER_POINTER;
+ *
+ * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both
+ * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL).
+ * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel.
+ *
+ * The \p kernelParams or \p extra array, as well as the argument values it points to,
+ * are copied during this call.
+ *
+ * \note Kernels launched using graphs must not use texture and surface references. Reading or
+ *       writing through any texture or surface reference is undefined behavior.
+ *       This restriction does not apply to texture and surface objects.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the GPU execution node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuLaunchCooperativeKernel,
+ * ::cuGraphKernelNodeGetParams,
+ * ::cuGraphKernelNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
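+
+/*
+ * Illustrative sketch (editor's addition, not part of the official header):
+ * adding a kernel node as a root of `graph`. `graph`, `kernel`, `dptr` and `n`
+ * are assumptions for the example.
+ * \code
+        CUDA_KERNEL_NODE_PARAMS kp;
+        memset(&kp, 0, sizeof(kp));
+        kp.func           = kernel;         // CUfunction, e.g. from cuModuleGetFunction (assumed)
+        kp.gridDimX       = 32;  kp.gridDimY  = 1; kp.gridDimZ  = 1;
+        kp.blockDimX      = 256; kp.blockDimY = 1; kp.blockDimZ = 1;
+        kp.sharedMemBytes = 0;
+        void *args[]      = { &dptr, &n };  // one pointer per kernel parameter
+        kp.kernelParams   = args;
+        kp.extra          = NULL;           // kernelParams and extra are mutually exclusive
+
+        CUgraphNode kNode;
+        cuGraphAddKernelNode(&kNode, graph, NULL, 0, &kp);   // no dependencies: root node
+ * \endcode
+ */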
+
+/**
+ * \brief Returns a kernel node's parameters
+ *
+ * Returns the parameters of kernel node \p hNode in \p nodeParams.
+ * The \p kernelParams or \p extra array returned in \p nodeParams,
+ * as well as the argument values it points to, are owned by the node.
+ * This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphKernelNodeSetParams to update the
+ * parameters of this node.
+ *
+ * The params will contain either \p kernelParams or \p extra,
+ * according to which of these was most recently set on the node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param nodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphKernelNodeSetParams
+ */
+CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets a kernel node's parameters
+ *
+ * Sets the parameters of kernel node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphKernelNodeGetParams
+ */
+CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates a memcpy node and adds it to a graph
+ *
+ * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * When the graph is launched, the node will perform the memcpy described by \p copyParams.
+ * See ::cuMemcpy3D() for a description of the structure and its restrictions.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. If one or more of the operands refer
+ * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed
+ * for those operand(s). The managed memory will be treated as residing on either the
+ * host or the device, depending on which memory type is specified.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param copyParams      - Parameters for the memory copy
+ * \param ctx             - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemcpy3D,
+ * ::cuGraphMemcpyNodeGetParams,
+ * ::cuGraphMemcpyNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx);
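+
+/*
+ * Illustrative sketch (editor's addition, not part of the official header):
+ * a 1-D host-to-device copy expressed as the 3-D copy a memcpy node expects.
+ * `graph`, `ctx`, `hostBuf`, `dptr` and `nBytes` are assumptions.
+ * \code
+        CUDA_MEMCPY3D cp;
+        memset(&cp, 0, sizeof(cp));
+        cp.srcMemoryType = CU_MEMORYTYPE_HOST;
+        cp.srcHost       = hostBuf;
+        cp.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+        cp.dstDevice     = dptr;
+        cp.WidthInBytes  = nBytes;
+        cp.Height        = 1;
+        cp.Depth         = 1;
+
+        CUgraphNode cpNode;
+        cuGraphAddMemcpyNode(&cpNode, graph, NULL, 0, &cp, ctx);
+ * \endcode
+ */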
+
+/**
+ * \brief Returns a memcpy node's parameters
+ *
+ * Returns the parameters of memcpy node \p hNode in \p nodeParams.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param nodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemcpy3D,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphMemcpyNodeSetParams
+ */
+CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams);
+
+/**
+ * \brief Sets a memcpy node's parameters
+ *
+ * Sets the parameters of memcpy node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemcpy3D,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphMemcpyNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams);
+
+/**
+ * \brief Creates a memset node and adds it to a graph
+ *
+ * Creates a new memset node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * The element size must be 1, 2, or 4 bytes.
+ * When the graph is launched, the node will perform the memset described by \p memsetParams.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param memsetParams    - Parameters for the memory set
+ * \param ctx             - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemsetD2D32,
+ * ::cuGraphMemsetNodeGetParams,
+ * ::cuGraphMemsetNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode
+ */
+CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx);
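+
+/*
+ * Illustrative sketch (editor's addition, not part of the official header):
+ * zero-filling `nElems` 32-bit words through a memset node. `graph`, `ctx`,
+ * `dptr` and `nElems` are assumptions.
+ * \code
+        CUDA_MEMSET_NODE_PARAMS ms;
+        memset(&ms, 0, sizeof(ms));
+        ms.dst         = dptr;          // CUdeviceptr destination
+        ms.value       = 0;
+        ms.elementSize = 4;             // must be 1, 2 or 4 bytes
+        ms.width       = nElems;        // elements per row
+        ms.height      = 1;             // single row; pitch is not used in that case
+
+        CUgraphNode msNode;
+        cuGraphAddMemsetNode(&msNode, graph, NULL, 0, &ms, ctx);
+ * \endcode
+ */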
+
+/**
+ * \brief Returns a memset node's parameters
+ *
+ * Returns the parameters of memset node \p hNode in \p nodeParams.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param nodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemsetD2D32,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphMemsetNodeSetParams
+ */
+CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets a memset node's parameters
+ *
+ * Sets the parameters of memset node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemsetD2D32,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphMemsetNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates a host execution node and adds it to a graph
+ *
+ * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * When the graph is launched, the node will invoke the specified CPU function.
+ * Host nodes are not supported under MPS with pre-Volta GPUs.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the host node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchHostFunc,
+ * ::cuGraphHostNodeGetParams,
+ * ::cuGraphHostNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams);
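+
+/*
+ * Illustrative sketch (editor's addition, not part of the official header):
+ * a host node that runs a CPU-side step when its dependencies finish. `graph`
+ * and `appState` are assumptions; the callback must not call into CUDA.
+ * \code
+        static void CUDA_CB hostStep(void *userData)
+        {
+            /* CPU-side work on userData only */
+        }
+
+        CUDA_HOST_NODE_PARAMS hp;
+        hp.fn       = hostStep;
+        hp.userData = &appState;
+
+        CUgraphNode hNode;
+        cuGraphAddHostNode(&hNode, graph, NULL, 0, &hp);
+ * \endcode
+ */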
+
+/**
+ * \brief Returns a host node's parameters
+ *
+ * Returns the parameters of host node \p hNode in \p nodeParams.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param nodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchHostFunc,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphHostNodeSetParams
+ */
+CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets a host node's parameters
+ *
+ * Sets the parameters of host node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchHostFunc,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphHostNodeGetParams
+ */
+CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates a child graph node and adds it to a graph
+ *
+ * Creates a new node which executes an embedded graph, and adds it to \p hGraph with
+ * \p numDependencies dependencies specified via \p dependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * If \p childGraph contains allocation or free nodes, this call will return an error.
+ *
+ * The node executes an embedded child graph. The child graph is cloned in this call.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param childGraph      - The graph to clone into this node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphChildGraphNodeGetGraph,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphClone
+ */
+CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
+
+/**
+ * \brief Gets a handle to the embedded graph of a child graph node
+ *
+ * Gets a handle to the embedded graph in a child graph node. This call
+ * does not clone the graph. Changes to the graph will be reflected in
+ * the node, and the node retains ownership of the graph.
+ *
+ * Allocation and free nodes cannot be added to the returned graph.
+ * Attempting to do so will return an error.
+ *
+ * \param hNode   - Node to get the embedded graph for
+ * \param phGraph - Location to store a handle to the graph
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphNodeFindInClone
+ */
+CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph);
+
+/**
+ * \brief Creates an empty node and adds it to a graph
+ *
+ * Creates a new node which performs no operation, and adds it to \p hGraph with
+ * \p numDependencies dependencies specified via \p dependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * An empty node performs no operation during execution, but can be used for
+ * transitive ordering. For example, a phased execution graph with 2 groups of n
+ * nodes with a barrier between them can be represented using an empty node and
+ * 2*n dependency edges, rather than no empty node and n^2 dependency edges.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies);
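+
+/*
+ * Illustrative sketch (editor's addition, not part of the official header):
+ * the barrier pattern described above, using 2*n edges instead of n^2.
+ * `graph`, `groupA`, `groupB`, `kpB` and `n` are assumptions.
+ * \code
+        CUgraphNode barrier;
+        cuGraphAddEmptyNode(&barrier, graph, groupA, n);     // barrier depends on all of group A
+
+        for (size_t i = 0; i < n; ++i)                       // each group-B node depends only on the barrier
+            cuGraphAddKernelNode(&groupB[i], graph, &barrier, 1, &kpB[i]);
+ * \endcode
+ */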
+
+/**
+ * \brief Creates an event record node and adds it to a graph
+ *
+ * Creates a new event record node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * Each launch of the graph will record \p event to capture execution of the
+ * node's dependencies.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventWaitNode,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+ 
+/**
+ * \brief Returns the event associated with an event record node
+ *
+ * Returns the event of event record node \p hNode in \p event_out.
+ *
+ * \param hNode     - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphEventRecordNodeSetEvent,
+ * ::cuGraphEventWaitNodeGetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out);
+
+/**
+ * \brief Sets an event record node's event
+ *
+ * Sets the event of event record node \p hNode to \p event.
+ *
+ * \param hNode - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphEventRecordNodeGetEvent,
+ * ::cuGraphEventWaitNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Creates an event wait node and adds it to a graph
+ *
+ * Creates a new event wait node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * The graph node will wait for all work captured in \p event.  See ::cuEventRecord()
+ * for details on what is captured by an event. \p event may be from a different context
+ * or device than the launch stream.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventRecordNode,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
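+
+/*
+ * Illustrative sketch (editor's addition, not part of the official header):
+ * pairing an event record node with an event wait node so one graph's work
+ * gates another's. `producerGraph` and `consumerGraph` are assumptions.
+ * \code
+        CUevent ev;
+        cuEventCreate(&ev, CU_EVENT_DEFAULT);
+
+        CUgraphNode recNode, waitNode;
+        cuGraphAddEventRecordNode(&recNode, producerGraph, NULL, 0, ev);  // records ev after its dependencies
+        cuGraphAddEventWaitNode(&waitNode, consumerGraph, NULL, 0, ev);   // downstream nodes wait on ev
+ * \endcode
+ */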
+
+/**
+ * \brief Returns the event associated with an event wait node
+ *
+ * Returns the event of event wait node \p hNode in \p event_out.
+ *
+ * \param hNode     - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphEventWaitNodeSetEvent,
+ * ::cuGraphEventRecordNodeGetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out);
+
+/**
+ * \brief Sets an event wait node's event
+ *
+ * Sets the event of event wait node \p hNode to \p event.
+ *
+ * \param hNode - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphEventWaitNodeGetEvent,
+ * ::cuGraphEventRecordNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent
+ */
+CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Creates an external semaphore signal node and adds it to a graph
+ *
+ * Creates a new external semaphore signal node and adds it to \p hGraph with \p
+ * numDependencies dependencies specified via \p dependencies and arguments specified
+ * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the
+ * node will be placed at the root of the graph. \p dependencies may not have any
+ * duplicate entries. A handle to the new node will be returned in \p phGraphNode.
+ *
+ * Performs a signal operation on a set of externally allocated semaphore objects
+ * when the node is launched.  The operation(s) will occur after all of the node's
+ * dependencies have completed.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExternalSemaphoresSignalNodeGetParams,
+ * ::cuGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
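+
+/*
+ * Illustrative sketch (editor's addition, not part of the official header):
+ * signalling one imported semaphore from a graph node. `graph` and `extSem`
+ * (from ::cuImportExternalSemaphore) are assumptions; the fence value only
+ * applies to fence/timeline-style semaphore types.
+ * \code
+        CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sig;
+        memset(&sig, 0, sizeof(sig));
+        sig.params.fence.value = 1;
+
+        CUDA_EXT_SEM_SIGNAL_NODE_PARAMS sp;
+        memset(&sp, 0, sizeof(sp));
+        sp.extSemArray = &extSem;
+        sp.paramsArray = &sig;
+        sp.numExtSems  = 1;
+
+        CUgraphNode sigNode;
+        cuGraphAddExternalSemaphoresSignalNode(&sigNode, graph, NULL, 0, &sp);
+ * \endcode
+ */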
+
+/**
+ * \brief Returns an external semaphore signal node's parameters
+ *
+ * Returns the parameters of an external semaphore signal node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out,
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out);
+
+/**
+ * \brief Sets an external semaphore signal node's parameters
+ *
+ * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphExternalSemaphoresSignalNodeGetParams,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates an external semaphore wait node and adds it to a graph
+ *
+ * Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p phGraphNode.
+ *
+ * Performs a wait operation on a set of externally allocated semaphore objects
+ * when the node is launched.  Nodes that depend on this node will not be launched
+ * until the wait operation has completed.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cuGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns an external semaphore wait node's parameters
+ *
+ * Returns the parameters of an external semaphore wait node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out,
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphExternalSemaphoresWaitNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out);
+
+/**
+ * \brief Sets an external semaphore wait node's parameters
+ *
+ * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates a batch memory operation node and adds it to a graph
+ *
+ * Creates a new batch memory operation node and adds it to \p hGraph with \p
+ * numDependencies dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p phGraphNode.
+ *
+ * When the node is added, the paramArray inside \p nodeParams is copied and therefore it can be
+ * freed after the call returns.
+ *
+ * \note
+ * Warning:
+ * Improper use of this API may deadlock the application. Synchronization 
+ * ordering established through this API is not visible to CUDA. CUDA tasks 
+ * that are (even indirectly) ordered by this API should also have that order
+ * expressed with CUDA-visible dependencies such as events. This ensures that 
+ * the scheduler does not serialize them in an improper order. For more 
+ * information, see the Stream Memory Operations section in the programming 
+ * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html).
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamBatchMemOp_v2,
+ * ::cuStreamWaitValue32_v2,
+ * ::cuStreamWriteValue32_v2,
+ * ::cuStreamWaitValue64_v2,
+ * ::cuStreamWriteValue64_v2,
+ * ::cuGraphBatchMemOpNodeGetParams,
+ * ::cuGraphBatchMemOpNodeSetParams,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddBatchMemOpNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
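+
+/*
+ * Illustrative sketch (editor's addition, not part of the official header):
+ * a batch mem op node containing a single 32-bit write. `graph`, `ctx` and
+ * `flagPtr` (a device address of a 32-bit flag) are assumptions.
+ * \code
+        CUstreamBatchMemOpParams ops[1];
+        memset(ops, 0, sizeof(ops));
+        ops[0].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
+        ops[0].writeValue.address   = flagPtr;
+        ops[0].writeValue.value     = 1;
+
+        CUDA_BATCH_MEM_OP_NODE_PARAMS bp;
+        memset(&bp, 0, sizeof(bp));
+        bp.ctx        = ctx;
+        bp.count      = 1;
+        bp.paramArray = ops;            // copied by the call, may be freed afterwards
+        bp.flags      = 0;
+
+        CUgraphNode bNode;
+        cuGraphAddBatchMemOpNode(&bNode, graph, NULL, 0, &bp);
+ * \endcode
+ */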
+
+/**
+ * \brief Returns a batch mem op node's parameters
+ *
+ * Returns the parameters of batch mem op node \p hNode in \p nodeParams_out.
+ * The \p paramArray returned in \p nodeParams_out is owned by the node.
+ * This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphBatchMemOpNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode          - Node to get the parameters for
+ * \param nodeParams_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamBatchMemOp_v2,
+ * ::cuGraphAddBatchMemOpNode,
+ * ::cuGraphBatchMemOpNodeSetParams
+ */
+CUresult CUDAAPI cuGraphBatchMemOpNodeGetParams(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out);
+
+/**
+ * \brief Sets a batch mem op node's parameters
+ *
+ * Sets the parameters of batch mem op node \p hNode to \p nodeParams.
+ *
+ * The paramArray inside \p nodeParams is copied and therefore it can be
+ * freed after the call returns.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamBatchMemOp_v2,
+ * ::cuGraphAddBatchMemOpNode,
+ * ::cuGraphBatchMemOpNodeGetParams
+ */
+CUresult CUDAAPI cuGraphBatchMemOpNodeSetParams(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets the parameters for a batch mem op node in the given graphExec
+ *
+ * Sets the parameters of a batch mem op node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The following fields on operations may be modified on an executable graph:
+ *
+ *  op.waitValue.address
+ *  op.waitValue.value[64]
+ *  op.waitValue.flags bits corresponding to wait type (i.e. CU_STREAM_WAIT_VALUE_FLUSH bit cannot be modified)
+ *  op.writeValue.address
+ *  op.writeValue.value[64]
+ *
+ * Other fields, such as the context, the count or type of operations, and other operation
+ * types such as memory barriers (membars), may not be modified.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * The paramArray inside \p nodeParams is copied and therefore it can be
+ * freed after the call returns.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Batch mem op node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamBatchMemOp_v2,
+ * ::cuGraphAddBatchMemOpNode,
+ * ::cuGraphBatchMemOpNodeGetParams,
+ * ::cuGraphBatchMemOpNodeSetParams,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecBatchMemOpNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Creates an allocation node and adds it to a graph
+ *
+ * Creates a new allocation node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p phGraphNode.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in
+ * \p nodeParams.dptr.  The allocation's address remains fixed across instantiations and launches.
+ *
+ * If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode,
+ * the allocation can be accessed by nodes ordered after the allocation node but before the free node.
+ * These allocations cannot be freed outside the owning graph, and they can only be freed once in the
+ * owning graph.
+ *
+ * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the
+ * graph which are ordered after the allocation node, but also by stream operations ordered after the
+ * graph's execution but before the allocation is freed.
+ *
+ * Allocations which are not freed in the same graph can be freed by:
+ * - passing the allocation to ::cuMemFreeAsync or ::cuMemFree;
+ * - launching a graph with a free node for that allocation; or
+ * - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes
+ * each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation.
+ * 
+ * It is not possible to free an allocation in both the owning graph and another graph.  If the allocation
+ * is freed in the same graph, a free node cannot be added to another graph.  If the allocation is freed
+ * in another graph, a free node can no longer be added to the owning graph.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemFreeNode,
+ * ::cuGraphMemAllocNodeGetParams,
+ * ::cuDeviceGraphMemTrim,
+ * ::cuDeviceGetGraphMemAttribute,
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuMemAllocAsync,
+ * ::cuMemFreeAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns a memory alloc node's parameters
+ *
+ * Returns the parameters of a memory alloc node \p hNode in \p params_out.
+ * The \p poolProps and \p accessDescs returned in \p params_out are owned by the
+ * node.  This memory remains valid until the node is destroyed.  The returned
+ * parameters must not be modified.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphMemFreeNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
+
+/**
+ * \brief Creates a memory free node and adds it to a graph
+ *
+ * Creates a new memory free node and adds it to \p hGraph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p phGraphNode.
+ *
+ * \param phGraphNode     - Returns newly created node
+ * \param hGraph          - Graph to which to add the node
+ * \param dependencies    - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dptr            - Address of memory to free
+ *
+ * ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free:
+ * - an allocation twice in the same graph.
+ * - an address that was not returned by an allocation node.
+ * - an invalid address.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphMemFreeNodeGetParams,
+ * ::cuDeviceGraphMemTrim,
+ * ::cuDeviceGetGraphMemAttribute,
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuMemAllocAsync,
+ * ::cuMemFreeAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
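+
+/*
+ * Usage sketch (editorial illustration, not part of the upstream header): pairs an
+ * allocation node with a free node ordered after it, so the memory lives only inside the
+ * graph. The ::CUDA_MEM_ALLOC_NODE_PARAMS members used below (poolProps, bytesize, dptr)
+ * are assumptions; error checking is omitted.
+ *
+ * \code
+ *     CUDA_MEM_ALLOC_NODE_PARAMS allocParams;
+ *     memset(&allocParams, 0, sizeof(allocParams));
+ *     allocParams.poolProps.allocType     = CU_MEM_ALLOCATION_TYPE_PINNED;
+ *     allocParams.poolProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ *     allocParams.poolProps.location.id   = 0;           // device ordinal
+ *     allocParams.bytesize                = 1 << 20;      // 1 MiB
+ *
+ *     CUgraphNode allocNode, freeNode;
+ *     cuGraphAddMemAllocNode(&allocNode, hGraph, NULL, 0, &allocParams);
+ *     // allocParams.dptr now holds the allocation's fixed device address
+ *     cuGraphAddMemFreeNode(&freeNode, hGraph, &allocNode, 1, allocParams.dptr);
+ * \endcode
+ */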
+
+/**
+ * \brief Returns a memory free node's parameters
+ *
+ * Returns in \p dptr_out the address that memory free node \p hNode frees.
+ *
+ * \param hNode    - Node to get the parameters for
+ * \param dptr_out - Pointer to return the device address
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemFreeNode,
+ * ::cuGraphMemAllocNodeGetParams
+ */
+CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out);
+
+/**
+ * \brief Frees back to the OS unused memory that was cached on the specified device for use with graphs.
+ *
+ * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are
+ * freed back to the operating system.
+ *
+ * \param device - The device for which cached memory should be freed.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphAddMemFreeNode,
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuDeviceGetGraphMemAttribute
+ */
+CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device);
+
+/**
+ * \brief Query asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs
+ * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the scope of the query
+ * \param attr - attribute to get
+ * \param value - retrieved value
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuDeviceSetGraphMemAttribute,
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphAddMemFreeNode
+ */
+CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value);
+
+/**
+ * \brief Set asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the scope of the operation
+ * \param attr - attribute to set
+ * \param value - pointer to the value to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ *
+ * \sa
+ * ::cuDeviceGetGraphMemAttribute,
+ * ::cuGraphAddMemAllocNode,
+ * ::cuGraphAddMemFreeNode
+ */
+CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value);
+
+/**
+ * \brief Clones a graph
+ *
+ * This function creates a copy of \p originalGraph and returns it in \p phGraphClone.
+ * All parameters are copied into the cloned graph. The original graph may be modified
+ * after this call without affecting the clone.
+ *
+ * Child graph nodes in the original graph are recursively copied into the clone.
+ *
+ * \param phGraphClone  - Returns newly created cloned graph
+ * \param originalGraph - Graph to clone
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphNodeFindInClone
+ */
+CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph);
+
+/**
+ * \brief Finds a cloned version of a node
+ *
+ * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode
+ * in the original graph.
+ *
+ * \p hClonedGraph must have been cloned from \p hOriginalGraph via ::cuGraphClone.
+ * \p hOriginalNode must have been in \p hOriginalGraph at the time of the call to
+ * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have
+ * been removed. The cloned node is then returned via \p phNode.
+ *
+ * \param phNode  - Returns handle to the cloned node
+ * \param hOriginalNode - Handle to the original node
+ * \param hClonedGraph - Cloned graph to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphClone
+ */
+CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph);
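+
+/*
+ * Usage sketch (editorial illustration, not part of the upstream header): clones a graph
+ * and locates the clone's counterpart of a node from the original graph. The variable
+ * names are placeholders; error checking is omitted.
+ *
+ * \code
+ *     CUgraph hClone;
+ *     cuGraphClone(&hClone, hGraph);
+ *
+ *     CUgraphNode clonedNode;
+ *     cuGraphNodeFindInClone(&clonedNode, originalNode, hClone);
+ * \endcode
+ */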
+
+/**
+ * \brief Returns a node's type
+ *
+ * Returns the node type of \p hNode in \p type.
+ *
+ * \param hNode - Node to query
+ * \param type  - Pointer to return the node type
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphChildGraphNodeGetGraph,
+ * ::cuGraphKernelNodeGetParams,
+ * ::cuGraphKernelNodeSetParams,
+ * ::cuGraphHostNodeGetParams,
+ * ::cuGraphHostNodeSetParams,
+ * ::cuGraphMemcpyNodeGetParams,
+ * ::cuGraphMemcpyNodeSetParams,
+ * ::cuGraphMemsetNodeGetParams,
+ * ::cuGraphMemsetNodeSetParams
+ */
+CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type);
+
+/**
+ * \brief Returns a graph's nodes
+ *
+ * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this
+ * function will return the number of nodes in \p numNodes. Otherwise,
+ * \p numNodes entries will be filled in. If \p numNodes is higher than the actual
+ * number of nodes, the remaining entries in \p nodes will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numNodes.
+ *
+ * \param hGraph   - Graph to query
+ * \param nodes    - Pointer to return the nodes
+ * \param numNodes - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetType,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes);
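+
+/*
+ * Usage sketch (editorial illustration, not part of the upstream header): the usual
+ * two-call pattern -- query the count first, then fetch the node list. Error checking
+ * and the <stdlib.h> include are omitted.
+ *
+ * \code
+ *     size_t numNodes = 0;
+ *     cuGraphGetNodes(hGraph, NULL, &numNodes);            // first call: count only
+ *
+ *     CUgraphNode *nodes = (CUgraphNode *)malloc(numNodes * sizeof(CUgraphNode));
+ *     cuGraphGetNodes(hGraph, nodes, &numNodes);           // second call: fill the array
+ *     // ... inspect nodes[0 .. numNodes-1] ...
+ *     free(nodes);
+ * \endcode
+ */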
+
+/**
+ * \brief Returns a graph's root nodes
+ *
+ * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this
+ * function will return the number of root nodes in \p numRootNodes. Otherwise,
+ * \p numRootNodes entries will be filled in. If \p numRootNodes is higher than the actual
+ * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numRootNodes.
+ *
+ * \param hGraph       - Graph to query
+ * \param rootNodes    - Pointer to return the root nodes
+ * \param numRootNodes - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetType,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes);
+
+/**
+ * \brief Returns a graph's dependency edges
+ *
+ * Returns a list of \p hGraph's dependency edges. Edges are returned via corresponding
+ * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the
+ * node in \p from[i]. \p from and \p to may both be NULL, in which
+ * case this function only returns the number of edges in \p numEdges. Otherwise,
+ * \p numEdges entries will be filled in. If \p numEdges is higher than the actual
+ * number of edges, the remaining entries in \p from and \p to will be set to NULL, and
+ * the number of edges actually returned will be written to \p numEdges.
+ *
+ * \param hGraph   - Graph to get the edges from
+ * \param from     - Location to return edge endpoints
+ * \param to       - Location to return edge endpoints
+ * \param numEdges - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges);
+
+/**
+ * \brief Returns a node's dependencies
+ *
+ * Returns a list of \p node's dependencies. \p dependencies may be NULL, in which case this
+ * function will return the number of dependencies in \p numDependencies. Otherwise,
+ * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual
+ * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numDependencies.
+ *
+ * \param hNode           - Node to query
+ * \param dependencies    - Pointer to return the dependencies
+ * \param numDependencies - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeGetDependentNodes,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies
+ */
+CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies);
+
+/**
+ * \brief Returns a node's dependent nodes
+ *
+ * Returns a list of \p node's dependent nodes. \p dependentNodes may be NULL, in which
+ * case this function will return the number of dependent nodes in \p numDependentNodes.
+ * Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is
+ * higher than the actual number of dependent nodes, the remaining entries in
+ * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will
+ * be returned in \p numDependentNodes.
+ *
+ * \param hNode             - Node to query
+ * \param dependentNodes    - Pointer to return the dependent nodes
+ * \param numDependentNodes - See description
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphGetNodes,
+ * ::cuGraphGetRootNodes,
+ * ::cuGraphGetEdges,
+ * ::cuGraphAddDependencies,
+ * ::cuGraphRemoveDependencies
+ */
+CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
+
+/**
+ * \brief Adds dependency edges to a graph
+ *
+ * The number of dependencies to be added is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p hGraph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an existing dependency will return an error.
+ *
+ * \param hGraph - Graph to which dependencies are added
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param numDependencies - Number of dependencies to be added
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphRemoveDependencies,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
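+
+/*
+ * Usage sketch (editorial illustration, not part of the upstream header): makes nodeB
+ * depend on nodeA, i.e. adds the edge nodeA -> nodeB. nodeA and nodeB are placeholders
+ * for nodes previously added to hGraph; error checking is omitted.
+ *
+ * \code
+ *     CUgraphNode from[] = { nodeA };
+ *     CUgraphNode to[]   = { nodeB };
+ *     cuGraphAddDependencies(hGraph, from, to, 1);
+ * \endcode
+ */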
+
+/**
+ * \brief Removes dependency edges from a graph
+ *
+ * The number of dependencies to be removed is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p hGraph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying a non-existing dependency will return an error.
+ *
+ * Dependencies cannot be removed from graphs which contain allocation or free nodes.
+ * Any attempt to do so will return an error.
+ *
+ * \param hGraph - Graph from which to remove dependencies
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param numDependencies - Number of dependencies to be removed
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddDependencies,
+ * ::cuGraphGetEdges,
+ * ::cuGraphNodeGetDependencies,
+ * ::cuGraphNodeGetDependentNodes
+ */
+CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
+
+/**
+ * \brief Remove a node from the graph
+ *
+ * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes
+ * on \p hNode and vice versa.
+ *
+ * Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed.
+ * Any attempt to do so will return an error.
+ *
+ * \param hNode  - Node to remove
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddHostNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode
+ */
+CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode);
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p hGraph as an executable graph. The graph is validated for any
+ * structural constraints or intra-node constraints which were not previously
+ * validated. If instantiation is successful, a handle to the instantiated graph
+ * is returned in \p phGraphExec.
+ *
+ * If there are any errors, diagnostic information may be returned in \p phErrorNode and
+ * \p logBuffer. This is the primary way to inspect instantiation errors. The output
+ * will be null terminated unless the diagnostics overflow
+ * the buffer. In this case, they will be truncated, and the last byte can be
+ * inspected to determine if truncation occurred.
+ *
+ * \param phGraphExec - Returns instantiated graph
+ * \param hGraph      - Graph to instantiate
+ * \param phErrorNode - In case of an instantiation error, this may be modified to
+ *                      indicate a node contributing to the error
+ * \param logBuffer   - A character buffer to store diagnostic messages
+ * \param bufferSize  - Size of the log buffer in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiateWithFlags,
+ * ::cuGraphCreate,
+ * ::cuGraphUpload,
+ * ::cuGraphLaunch,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
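+
+/*
+ * Usage sketch (editorial illustration, not part of the upstream header): instantiates a
+ * graph and prints the diagnostic log on failure. The <stdio.h> include is omitted.
+ *
+ * \code
+ *     char        logBuffer[1024] = { 0 };
+ *     CUgraphNode errorNode       = NULL;
+ *     CUgraphExec hGraphExec;
+ *
+ *     CUresult res = cuGraphInstantiate(&hGraphExec, hGraph, &errorNode, logBuffer, sizeof(logBuffer));
+ *     if (res != CUDA_SUCCESS) {
+ *         // logBuffer holds a (possibly truncated) diagnostic message
+ *         fprintf(stderr, "graph instantiation failed: %s\n", logBuffer);
+ *     }
+ * \endcode
+ */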
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p hGraph as an executable graph. The graph is validated for any
+ * structural constraints or intra-node constraints which were not previously
+ * validated. If instantiation is successful, a handle to the instantiated graph
+ * is returned in \p phGraphExec.
+ *
+ * The \p flags parameter controls the behavior of instantiation and subsequent
+ * graph launches.  Valid flags are:
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a
+ * graph containing memory allocation nodes to automatically free any
+ * unfreed memory allocations before the graph is relaunched.
+ *
+ * - ::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, which causes the graph
+ * to use the priorities from the per-node attributes rather than the priority
+ * of the launch stream during execution. Note that priorities are only available
+ * on kernel nodes, and are copied from stream priority during stream capture.
+ *
+ * If \p hGraph contains any allocation or free nodes, there can be at most one
+ * executable graph in existence for that graph at a time.
+ *
+ * An attempt to instantiate a second executable graph before destroying the first
+ * with ::cuGraphExecDestroy will result in an error.
+ *
+ * \param phGraphExec - Returns instantiated graph
+ * \param hGraph      - Graph to instantiate
+ * \param flags       - Flags to control instantiation.  See ::CUgraphInstantiate_flags.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphCreate,
+ * ::cuGraphUpload,
+ * ::cuGraphLaunch,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphInstantiateWithFlags(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags);
+
+/**
+ * \brief Sets the parameters for a kernel node in the given graphExec
+ *
+ * Sets the parameters of a kernel node in an executable graph \p hGraphExec. 
+ * The node is identified by the corresponding node \p hNode in the 
+ * non-executable graph, from which the executable graph was instantiated. 
+ *
+ * \p hNode must not have been removed from the original graph. All \p nodeParams 
+ * fields may change, but the following restrictions apply to \p func updates: 
+ *
+ *   - The owning context of the function cannot change.
+ *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ *     to a function which uses CDP
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already 
+ * enqueued or running launches of \p hGraphExec are not affected by this call. 
+ * \p hNode is also not modified by this call.
+ * 
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param hNode       - kernel node from the graph from which graphExec was instantiated
+ * \param nodeParams  - Updated Parameters to set
+ * 
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams);
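+
+/*
+ * Usage sketch (editorial illustration, not part of the upstream header): tweaks the grid
+ * size and argument array of a kernel node between launches, without re-instantiating the
+ * graph. kernelNodeParams is assumed to be the ::CUDA_KERNEL_NODE_PARAMS struct used when
+ * the node was added, and the gridDimX/kernelParams member names are assumptions; error
+ * checking is omitted.
+ *
+ * \code
+ *     kernelNodeParams.gridDimX     = newGridDimX;   // new launch geometry
+ *     kernelNodeParams.kernelParams = newArgs;       // array of pointers to kernel arguments
+ *     cuGraphExecKernelNodeSetParams(hGraphExec, kernelNode, &kernelNodeParams);
+ * \endcode
+ */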
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
+ * contained \p copyParams at instantiation.  hNode must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from hNode are ignored.
+ *
+ * The source and destination memory in \p copyParams must be allocated from the same 
+ * contexts as the original source and destination memory.  Both the instantiation-time 
+ * memory operands and the memory operands in \p copyParams must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  hNode is also 
+ * not modified by this call.
+ *
+ * Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or
+ * either the original or new memory operands are multidimensional.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Memcpy node from the graph which was used to instantiate graphExec
+ * \param copyParams - The updated parameters to set
+ * \param ctx        - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphMemcpyNodeSetParams,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx);
+
+/**
+ * \brief Sets the parameters for a memset node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
+ * contained \p memsetParams at instantiation.  hNode must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from hNode are ignored.
+ *
+ * The destination memory in \p memsetParams must be allocated from the same 
+ * contexts as the original destination memory.  Both the instantiation-time 
+ * memory operand and the memory operand in \p memsetParams must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  hNode is also 
+ * not modified by this call.
+ *
+ * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or
+ * either the original or new memory operand are multidimensional.
+ *
+ * \param hGraphExec   - The executable graph in which to set the specified node
+ * \param hNode        - Memset node from the graph which was used to instantiate graphExec
+ * \param memsetParams - The updated parameters to set
+ * \param ctx          - Context on which to run the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddMemsetNode,
+ * ::cuGraphMemsetNodeSetParams,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx);
+
+/**
+ * \brief Sets the parameters for a host node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had 
+ * contained \p nodeParams at instantiation.  hNode must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from hNode are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  hNode is also 
+ * not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Host node from the graph which was used to instantiate graphExec
+ * \param nodeParams - The updated parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddHostNode,
+ * ::cuGraphHostNodeSetParams,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Updates node parameters in the child graph node in the given graphExec.
+ *
+ * Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained
+ * in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation.
+ * \p hNode must remain in the graph which was used to instantiate \p hGraphExec.
+ * Changed edges to and from \p hNode are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p hNode is also 
+ * not modified by this call.
+ *
+ * The topology of \p childGraph, as well as the node insertion order,  must match that
+ * of the graph contained in \p hNode.  See ::cuGraphExecUpdate() for a list of restrictions
+ * on what can be updated in an instantiated graph.  The update is recursive, so child graph
+ * nodes contained within the top level child graph will also be updated.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Child graph node from the graph which was used to instantiate graphExec
+ * \param childGraph - The graph supplying the updated parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphChildGraphNodeGetGraph,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
+
+/**
+ * \brief Sets the event for an event record node in the given graphExec
+ *
+ * Sets the event of an event record node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - event record node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphEventRecordNodeGetEvent,
+ * ::cuGraphEventWaitNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Sets the event for an event wait node in the given graphExec
+ *
+ * Sets the event of an event wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - event wait node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphEventWaitNodeGetEvent,
+ * ::cuGraphEventRecordNodeSetEvent,
+ * ::cuEventRecordWithFlags,
+ * ::cuStreamWaitEvent,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+
+/**
+ * \brief Sets the parameters for an external semaphore signal node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore signal node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Sets the parameters for an external semaphore wait node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore wait node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphExecKernelNodeSetParams,
+ * ::cuGraphExecMemcpyNodeSetParams,
+ * ::cuGraphExecMemsetNodeSetParams,
+ * ::cuGraphExecHostNodeSetParams,
+ * ::cuGraphExecChildGraphNodeSetParams,
+ * ::cuGraphExecEventRecordNodeSetEvent,
+ * ::cuGraphExecEventWaitNodeSetEvent,
+ * ::cuGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Enables or disables the specified node in the given graphExec
+ *
+ * Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent 
+ * to empty nodes until they are reenabled. Existing node parameters are not affected by 
+ * disabling/enabling the node.
+ *  
+ * The node is identified by the corresponding node \p hNode in the non-executable 
+ * graph, from which the executable graph was instantiated.   
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \note Currently only kernel, memset and memcpy nodes are supported. 
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Node from the graph from which graphExec was instantiated
+ * \param isEnabled  - Node is enabled if != 0, otherwise the node is disabled
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeGetEnabled,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate
+ * ::cuGraphLaunch
+ */
+CUresult CUDAAPI cuGraphNodeSetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled);
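+
+/*
+ * Usage sketch (editorial illustration, not part of the upstream header): skips a node for
+ * one launch, then re-enables it with its original parameters intact. Error checking is
+ * omitted.
+ *
+ * \code
+ *     cuGraphNodeSetEnabled(hGraphExec, hNode, 0);   // behaves like an empty node
+ *     cuGraphLaunch(hGraphExec, hStream);
+ *     cuGraphNodeSetEnabled(hGraphExec, hNode, 1);   // back to normal for later launches
+ * \endcode
+ */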
+
+/**
+ * \brief Query whether a node in the given graphExec is enabled
+ *
+ * Sets \p isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled.
+ *
+ * The node is identified by the corresponding node \p hNode in the non-executable 
+ * graph, from which the executable graph was instantiated.   
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * \note Currently only kernel, memset and memcpy nodes are supported. 
+ *
+ * \param hGraphExec - The executable graph in which to query the specified node
+ * \param hNode      - Node from the graph from which graphExec was instantiated
+ * \param isEnabled  - Location to return the enabled status of the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphNodeSetEnabled,
+ * ::cuGraphExecUpdate,
+ * ::cuGraphInstantiate,
+ * ::cuGraphLaunch
+ */
+CUresult CUDAAPI cuGraphNodeGetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
+
+/**
+ * \brief Uploads an executable graph in a stream
+ *
+ * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of
+ * the same \p hGraphExec will be serialized. Each upload is ordered behind both any
+ * previous work in \p hStream and any previous launches of \p hGraphExec.
+ * Uses memory cached by \p hStream to back the allocations owned by \p hGraphExec.
+ *
+ * \param hGraphExec - Executable graph to upload
+ * \param hStream    - Stream in which to upload the graph
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphLaunch,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream);
+
+/**
+ * \brief Launches an executable graph in a stream
+ *
+ * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing
+ * at a time. Each launch is ordered behind both any previous work in \p hStream
+ * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be
+ * instantiated multiple times into multiple executable graphs.
+ *
+ * If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and
+ * \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH,
+ * the launch will fail with ::CUDA_ERROR_INVALID_VALUE.
+ *
+ * \param hGraphExec - Executable graph to launch
+ * \param hStream    - Stream in which to launch the graph
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphUpload,
+ * ::cuGraphExecDestroy
+ */
+CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream);
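+
+/*
+ * Usage sketch (editorial illustration, not part of the upstream header): optionally
+ * uploads the executable graph ahead of time, launches it, and waits for completion.
+ * Error checking is omitted.
+ *
+ * \code
+ *     cuGraphUpload(hGraphExec, hStream);    // optional: pay the upload cost up front
+ *     cuGraphLaunch(hGraphExec, hStream);
+ *     cuStreamSynchronize(hStream);          // wait for the graph to finish
+ * \endcode
+ */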
+
+/**
+ * \brief Destroys an executable graph
+ *
+ * Destroys the executable graph specified by \p hGraphExec, as well
+ * as all of its executable nodes. If the executable graph is
+ * in-flight, it will not be terminated, but rather freed
+ * asynchronously on completion.
+ *
+ * \param hGraphExec - Executable graph to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ * ::cuGraphUpload,
+ * ::cuGraphLaunch
+ */
+CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec);
+
+/**
+ * \brief Destroys a graph
+ *
+ * Destroys the graph specified by \p hGraph, as well as all of its nodes.
+ *
+ * \param hGraph - Graph to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph);
+
+/**
+ * \brief Check whether an executable graph can be updated with a graph and perform the update if possible
+ *
+ * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the
+ * node parameters in a topologically identical graph specified by \p hGraph.
+ *
+ * Limitations:
+ *
+ * - Kernel nodes:
+ *   - The owning context of the function cannot change.
+ *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ *     to a function which uses CDP.
+ *   - A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
+ *   - If the graph was instantiated with CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, the
+ *     priority attribute cannot change. Equality is checked on the originally requested
+ *     priority values, before they are clamped to the device's supported range.
+ * - Memset and memcpy nodes:
+ *   - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
+ *   - The source/destination memory must be allocated from the same contexts as the original
+ *     source/destination memory.
+ *   - Only 1D memsets can be changed.
+ * - Additional memcpy node restrictions:
+ *   - Changing either the source or destination memory type (i.e. CU_MEMORYTYPE_DEVICE,
+ *     CU_MEMORYTYPE_ARRAY, etc.) is not supported.
+ * - External semaphore wait nodes and record nodes:
+ *   - Changing the number of semaphores is not supported.
+ *
+ * Note:  The API may add further restrictions in future releases.  The return code should always be checked.
+ *
+ * cuGraphExecUpdate sets \p updateResult_out to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under
+ * the following conditions:
+ *
+ * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case \p hErrorNode_out
+ *   is NULL.
+ * - A node is deleted in \p hGraph but not its pair from \p hGraphExec, in which case \p hErrorNode_out
+ *   is NULL.
+ * - A node is deleted in \p hGraphExec but not its pair from \p hGraph, in which case \p hErrorNode_out is
+ *   the pairless node from \p hGraph.
+ * - The dependent nodes of a pair differ, in which case \p hErrorNode_out is the node from \p hGraph.
+ *
+ * cuGraphExecUpdate sets \p updateResult_out to:
+ * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case
+ *   \p hErrorNode_out is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported
+ *   way (see the note above), in which case \p hErrorNode_out is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way 
+ *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a node changed in a way
+ *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, such as
+ *   the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph.
+ *
+ * If \p updateResult_out isn't set in one of the situations described above, the update check passes
+ * and cuGraphExecUpdate updates \p hGraphExec to match the contents of \p hGraph.  If an error happens
+ * during the update, \p updateResult_out will be set to CU_GRAPH_EXEC_UPDATE_ERROR; otherwise,
+ * \p updateResult_out is set to CU_GRAPH_EXEC_UPDATE_SUCCESS.
+ *
+ * cuGraphExecUpdate returns CUDA_SUCCESS when the update was performed successfully.  It returns
+ * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included 
+ * changes which violated constraints specific to instantiated graph update.
+ *
+ * \param hGraphExec The instantiated graph to be updated
+ * \param hGraph The graph containing the updated parameters
+ * \param hErrorNode_out The node which caused the permissibility check to forbid the update, if any
+ * \param updateResult_out Whether the graph update was permitted.  If it was forbidden, the reason why
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate
+ */
+CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
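+
+/*
+ * Usage sketch (editorial illustration, not part of the upstream header): tries an
+ * in-place update of an instantiated graph and falls back to re-instantiation when the
+ * update is not permitted. Error checking is abbreviated.
+ *
+ * \code
+ *     CUgraphNode             errNode = NULL;
+ *     CUgraphExecUpdateResult updateResult;
+ *
+ *     CUresult res = cuGraphExecUpdate(hGraphExec, hGraph, &errNode, &updateResult);
+ *     if (res != CUDA_SUCCESS || updateResult != CU_GRAPH_EXEC_UPDATE_SUCCESS) {
+ *         char logBuffer[256] = { 0 };
+ *         cuGraphExecDestroy(hGraphExec);
+ *         cuGraphInstantiate(&hGraphExec, hGraph, &errNode, logBuffer, sizeof(logBuffer));
+ *     }
+ * \endcode
+ */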
+
+/**
+ * \brief Copies attributes from source node to destination node.
+ *
+ * Copies attributes from source node \p src to destination node \p dst.
+ * Both nodes must have the same context.
+ *
+ * \param[out] dst Destination node
+ * \param[in] src Source node
+ * For a list of attributes, see ::CUkernelNodeAttrID.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src);
+
+/**
+ * \brief Queries node attribute.
+ * 
+ * Queries attribute \p attr from node \p hNode and stores it in the corresponding
+ * member of \p value_out.
+ *
+ * \param[in] hNode      - Node to query
+ * \param[in] attr       - Attribute to query
+ * \param[out] value_out - Location to return the attribute value
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *  
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
+                                      CUkernelNodeAttrValue *value_out);
+ 
+/**
+ * \brief Sets node attribute.
+ * 
+ * Sets attribute \p attr on node \p hNode from the corresponding member of
+ * \p value.
+ *
+ * \param[in] hNode - Node to set the attribute on
+ * \param[in] attr  - Attribute to set
+ * \param[in] value - Attribute value to copy from
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr,
+                                      const CUkernelNodeAttrValue *value);
+
+/**
+ * \brief Write a DOT file describing graph structure
+ *
+ * Using the provided \p hGraph, write to \p path a DOT formatted description of the graph.
+ * By default this includes the graph topology, node types, node id, kernel names and memcpy direction.
+ * \p flags can be specified to write more detailed information about each node type such as
+ * parameter values, kernel attributes, node and function handles.
+ *
+ * \param hGraph - The graph to create a DOT file from
+ * \param path   - The path to write the DOT file to
+ * \param flags  - Flags from CUgraphDebugDot_flags for specifying which additional node information to write
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OPERATING_SYSTEM
+ */
+CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags);
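+
+/*
+ * Usage sketch (editorial illustration, not part of the upstream header): dumps the graph
+ * topology to a DOT file; the output path is a placeholder and flags = 0 requests only the
+ * default level of detail.
+ *
+ * \code
+ *     cuGraphDebugDotPrint(hGraph, "graph.dot", 0);
+ * \endcode
+ */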
+
+/**
+ * \brief Create a user object
+ *
+ * Create a user object with the specified destructor callback and initial reference count. The
+ * initial references are owned by the caller.
+ *
+ * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they
+ * are executed by a shared internal thread. Another thread may be signaled to perform such
+ * actions, if it does not block forward progress of tasks scheduled through CUDA.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object_out      - Location to return the user object handle
+ * \param ptr             - The pointer to pass to the destroy function
+ * \param destroy         - Callback to free the user object when it is no longer in use
+ * \param initialRefcount - The initial refcount to create the object with, typically 1. The
+ *                          initial references are owned by the calling thread.
+ * \param flags           - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC,
+ *                          which is the only defined flag. This indicates that the destroy
+ *                          callback cannot be waited on by any CUDA API. Users requiring
+ *                          synchronization of the callback should signal its completion
+ *                          manually.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectRetain,
+ * ::cuUserObjectRelease,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy,
+                                    unsigned int initialRefcount, unsigned int flags);
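+
+/*
+ * Usage sketch (editorial illustration, not part of the upstream header): ties a
+ * heap-allocated payload to a graph so it is released by the destructor callback once the
+ * graph no longer needs it. The <stdlib.h> include and error checking are omitted.
+ *
+ * \code
+ *     static void CUDA_CB destroyPayload(void *userData)
+ *     {
+ *         free(userData);                  // no CUDA API calls are allowed here
+ *     }
+ *
+ *     void *payload = malloc(256);
+ *     CUuserObject obj;
+ *     cuUserObjectCreate(&obj, payload, destroyPayload, 1, CU_USER_OBJECT_NO_DESTRUCTOR_SYNC);
+ *
+ *     // Move the caller's reference into the graph; the graph now keeps the payload alive.
+ *     cuGraphRetainUserObject(hGraph, obj, 1, CU_GRAPH_USER_OBJECT_MOVE);
+ * \endcode
+ */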
+
+/**
+ * \brief Retain a reference to a user object
+ *
+ * Retains new references to a user object. The new references are owned by the caller.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to retain
+ * \param count  - The number of references to retain, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRelease,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count);
+
+/**
+ * \brief Release a reference to a user object
+ *
+ * Releases user object references owned by the caller. The object's destructor is invoked if
+ * the reference count reaches zero.
+ *
+ * It is undefined behavior to release references not owned by the caller, or to use a user
+ * object handle after all references are released.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to release
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRetain,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count);
+
+/**
+ * \brief Retain a reference to a user object from a graph
+ *
+ * Creates or moves user object references that will be owned by a CUDA graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph to associate the reference with
+ * \param object - The user object to retain a reference for
+ * \param count  - The number of references to add to the graph, typically 1. Must be
+ *                 nonzero and not larger than INT_MAX.
+ * \param flags  - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references
+ *                 from the calling thread, rather than creating new references. Pass 0
+ *                 to create new references.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRetain,
+ * ::cuUserObjectRelease,
+ * ::cuGraphReleaseUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
+
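+/*
+ * A minimal usage sketch (not normative), continuing the cuUserObjectCreate
+ * example above; \p hGraph is an assumed, already-created graph:
+ *
+ * \code
+   // Move the caller's reference into the graph; the user object now lives
+   // at least as long as hGraph and any graphs cloned or instantiated from it.
+   cuGraphRetainUserObject(hGraph, obj, 1, CU_GRAPH_USER_OBJECT_MOVE);
+
+   // Alternatively, pass 0 to add a new reference while the caller keeps its own.
+ * \endcode
+ */
+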
+/**
+ * \brief Release a user object reference from a graph
+ *
+ * Releases user object references owned by a graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph that will release the reference
+ * \param object - The user object to release a reference for
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuUserObjectCreate,
+ * ::cuUserObjectRetain,
+ * ::cuUserObjectRelease,
+ * ::cuGraphRetainUserObject,
+ * ::cuGraphCreate
+ */
+CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count);
+
+/** @} */ /* END CUDA_GRAPH */
+
+/**
+ * \defgroup CUDA_OCCUPANCY Occupancy
+ *
+ * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the occupancy calculation functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns occupancy of a function
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ */
+CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
+
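+/*
+ * A minimal usage sketch (not normative); \p hKernel is an assumed CUfunction
+ * obtained from cuModuleGetFunction():
+ *
+ * \code
+   int numBlocks = 0;
+   int blockSize = 256;                          // intended launch block size
+   size_t dynSmem = blockSize * sizeof(float);   // intended per-block dynamic shared memory
+   cuOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, hKernel, blockSize, dynSmem);
+   // Active warps per SM would be numBlocks * blockSize / warpSize.
+ * \endcode
+ */
+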
+/**
+ * \brief Returns occupancy of a function
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor.
+ *
+ * The \p Flags parameter controls how special cases are handled. The
+ * valid flags are:
+ *
+ * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
+ *   ::cuOccupancyMaxActiveBlocksPerMultiprocessor;
+ *
+ * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
+ *   default behavior on platforms where global caching affects
+ *   occupancy. On such platforms, if caching is enabled, but
+ *   per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching
+ *   is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes
+ *   the occupancy calculator return 0 in such cases. More information
+ *   about this feature can be found in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param flags           - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ */
+CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
+
+/**
+ * \brief Suggest a launch configuration with reasonable occupancy
+ *
+ * Returns in \p *blockSize a reasonable block size that can achieve
+ * the maximum occupancy (or, the maximum number of active warps with
+ * the fewest blocks per multiprocessor), and in \p *minGridSize the
+ * minimum grid size to achieve the maximum occupancy.
+ *
+ * If \p blockSizeLimit is 0, the configurator will use the maximum
+ * block size permitted by the device / function instead.
+ *
+ * If per-block dynamic shared memory allocation is not needed, the
+ * user should leave both \p blockSizeToDynamicSMemSize and \p
+ * dynamicSMemSize as 0.
+ *
+ * If per-block dynamic shared memory allocation is needed, then if
+ * the dynamic shared memory size is constant regardless of block
+ * size, the size should be passed through \p dynamicSMemSize, and \p
+ * blockSizeToDynamicSMemSize should be NULL.
+ *
+ * Otherwise, if the per-block dynamic shared memory size varies with
+ * different block sizes, the user needs to provide a unary function
+ * through \p blockSizeToDynamicSMemSize that computes the dynamic
+ * shared memory needed by \p func for any given block size. \p
+ * dynamicSMemSize is ignored. An example signature is:
+ *
+ * \code
+ *    // Take block size, returns dynamic shared memory needed
+ *    size_t blockToSmem(int blockSize);
+ * \endcode
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
+ * \param blockSize   - Returned maximum block size that can achieve the maximum occupancy
+ * \param func        - Kernel for which launch configuration is calculated
+ * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
+ * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxPotentialBlockSize
+ */
+CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
+
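+/*
+ * A minimal usage sketch (not normative); \p hKernel is an assumed CUfunction
+ * whose dynamic shared memory requirement grows with the block size:
+ *
+ * \code
+   size_t blockToSmem(int blockSize) {
+       return (size_t)blockSize * sizeof(float);   // one float of dynamic shared memory per thread
+   }
+
+   int minGridSize = 0, blockSize = 0;
+   cuOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, hKernel,
+                                    blockToSmem, 0, 0);   // no fixed size, no block size limit
+ * \endcode
+ */
+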
+/**
+ * \brief Suggest a launch configuration with reasonable occupancy
+ *
+ * An extended version of ::cuOccupancyMaxPotentialBlockSize. In
+ * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize,
+ * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p Flags
+ * parameter.
+ *
+ * The \p Flags parameter controls how special cases are handled. The
+ * valid flags are:
+ *
+ * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
+ *   ::cuOccupancyMaxPotentialBlockSize;
+ *
+ * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
+ *   default behavior on platforms where global caching affects
+ *   occupancy. On such platforms, the launch configuration that
+ *   produces maximal occupancy might not support global
+ *   caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE
+ *   guarantees that the produced launch configuration is global
+ *   caching compatible at a potential cost of occupancy. More information
+ *   about this feature can be found in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
+ * \param blockSize   - Returned maximum block size that can achieve the maximum occupancy
+ * \param func        - Kernel for which launch configuration is calculated
+ * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
+ * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to handle
+ * \param flags       - Options
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ */
+CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
+
+/**
+ * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on an SM
+ *
+ * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory that allows \p numBlocks blocks per SM.
+ *
+ * \param dynamicSmemSize - Returned maximum dynamic shared memory
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param numBlocks       - Number of blocks to fit on the SM
+ * \param blockSize       - Size of the blocks
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ */
+CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
+
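+/*
+ * A minimal usage sketch (not normative); \p hKernel is an assumed CUfunction:
+ *
+ * \code
+   size_t dynSmemPerBlock = 0;
+   // Largest per-block dynamic shared memory allocation that still allows
+   // 4 blocks of 128 threads each to be resident on one SM.
+   cuOccupancyAvailableDynamicSMemPerBlock(&dynSmemPerBlock, hKernel, 4, 128);
+ * \endcode
+ */
+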
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum cluster size in \p *clusterSize.
+ *
+ * The cluster dimensions in \p config are ignored. If \p func has a required
+ * cluster size set (see ::cudaFuncGetAttributes / ::cuFuncGetAttribute),
+ * \p *clusterSize will reflect the required cluster size.
+ *
+ * By default this function will always return a value that's portable on
+ * future hardware. A higher value may be returned if the kernel function
+ * allows non-portable cluster sizes.
+ *
+ * This function will respect the compile time launch bounds.
+ *
+ * \param clusterSize - Returned maximum cluster size that can be launched
+ *                      for the given kernel function and launch configuration
+ * \param func        - Kernel function for which maximum cluster
+ *                      size is calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaFuncGetAttributes,
+ * ::cuFuncGetAttribute
+ */
+CUresult CUDAAPI cuOccupancyMaxPotentialClusterSize(int *clusterSize, CUfunction func, const CUlaunchConfig *config);
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum number of clusters that could co-exist
+ * on the target device in \p *numClusters.
+ *
+ * If the function has a required cluster size already set (see
+ * ::cudaFuncGetAttributes / ::cuFuncGetAttribute), the cluster size
+ * in \p config must either be unspecified or match the required size.
+ * Without a required size, the cluster size must be specified in \p config,
+ * else the function will return an error.
+ *
+ * Note that various attributes of the kernel function may affect the occupancy
+ * calculation. The runtime environment may affect how the hardware schedules
+ * the clusters, so the calculated occupancy is not guaranteed to be achievable.
+ *
+ * \param numClusters - Returned maximum number of clusters that
+ *                      could co-exist on the target device
+ * \param func        - Kernel function for which maximum number
+ *                      of clusters are calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CLUSTER_SIZE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaFuncGetAttributes,
+ * ::cuFuncGetAttribute
+ */
+CUresult CUDAAPI cuOccupancyMaxActiveClusters(int *numClusters, CUfunction func, const CUlaunchConfig *config);
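+
+/*
+ * A minimal usage sketch (not normative); \p hKernel is an assumed CUfunction
+ * built for a device that supports thread block clusters, and the grid and
+ * cluster dimensions below are illustrative:
+ *
+ * \code
+   CUlaunchAttribute attr;
+   attr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+   attr.value.clusterDim.x = 2;    // 2 blocks per cluster in x
+   attr.value.clusterDim.y = 1;
+   attr.value.clusterDim.z = 1;
+
+   CUlaunchConfig config = {0};
+   config.gridDimX  = 64;  config.gridDimY  = 1; config.gridDimZ  = 1;
+   config.blockDimX = 128; config.blockDimY = 1; config.blockDimZ = 1;
+   config.attrs     = &attr;
+   config.numAttrs  = 1;
+
+   int numClusters = 0;
+   cuOccupancyMaxActiveClusters(&numClusters, hKernel, &config);
+ * \endcode
+ */
+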
+/** @} */ /* END CUDA_OCCUPANCY */
+
+/**
+ * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated texture reference management functions of the
+ * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the deprecated texture reference management
+ * functions of the low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Binds an array as a texture reference
+ *
+ * \deprecated
+ *
+ * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any
+ * previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. \p Flags must be set to
+ * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is
+ * unbound.
+ *
+ * \param hTexRef - Texture reference to bind
+ * \param hArray  - Array to bind
+ * \param Flags   - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaBindTextureToArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
+
+/**
+ * \brief Binds a mipmapped array to a texture reference
+ *
+ * \deprecated
+ *
+ * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef.
+ * Any previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT.
+ * Any CUDA array previously bound to \p hTexRef is unbound.
+ *
+ * \param hTexRef         - Texture reference to bind
+ * \param hMipmappedArray - Mipmapped array to bind
+ * \param Flags           - Options (must be ::CU_TRSA_OVERRIDE_FORMAT)
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaBindTextureToMipmappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
+
+/**
+ * \brief Binds an address as a texture reference
+ *
+ * \deprecated
+ *
+ * Binds a linear address range to the texture reference \p hTexRef. Any
+ * previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. Any memory previously bound to \p hTexRef
+ * is unbound.
+ *
+ * Since the hardware enforces an alignment requirement on texture base
+ * addresses, ::cuTexRefSetAddress() passes back a byte offset in
+ * \p *ByteOffset that must be applied to texture fetches in order to read from
+ * the desired memory. This offset must be divided by the texel size and
+ * passed to kernels that read from the texture so it can be applied to the
+ * ::tex1Dfetch() function.
+ *
+ * If the device memory pointer was returned from ::cuMemAlloc(), the offset
+ * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter.
+ *
+ * The total number of elements (or texels) in the linear address range
+ * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH.
+ * The number of elements is computed as (\p bytes / bytesPerElement),
+ * where bytesPerElement is determined from the data format and number of
+ * components set using ::cuTexRefSetFormat().
+ *
+ * \param ByteOffset - Returned byte offset
+ * \param hTexRef    - Texture reference to bind
+ * \param dptr       - Device pointer to bind
+ * \param bytes      - Size of memory to bind in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaBindTexture
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes);
+
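+/*
+ * A minimal usage sketch (not normative); \p hTexRef is an assumed texture
+ * reference from cuModuleGetTexRef(), \p dptr and \p numBytes describe linear
+ * device memory, and a 4-component float texel format is assumed:
+ *
+ * \code
+   size_t byteOffset = 0;
+   cuTexRefSetAddress(&byteOffset, hTexRef, dptr, numBytes);
+   size_t texelOffset = byteOffset / (4 * sizeof(float));   // divide by the texel size
+   // Pass texelOffset to the kernel and add it to the index given to tex1Dfetch().
+ * \endcode
+ */
+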
+/**
+ * \brief Binds an address as a 2D texture reference
+ *
+ * \deprecated
+ *
+ * Binds a linear address range to the texture reference \p hTexRef. Any
+ * previous address or CUDA array state associated with the texture reference
+ * is superseded by this function. Any memory previously bound to \p hTexRef
+ * is unbound.
+ *
+ * Using a ::tex2D() function inside a kernel requires a call to either
+ * ::cuTexRefSetArray() to bind the corresponding texture reference to an
+ * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear
+ * memory.
+ *
+ * Function calls to ::cuTexRefSetFormat() cannot follow calls to
+ * ::cuTexRefSetAddress2D() for the same texture reference.
+ *
+ * It is required that \p dptr be aligned to the appropriate hardware-specific
+ * texture alignment. You can query this value using the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is
+ * supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \p Pitch has to be aligned to the hardware-specific texture pitch alignment.
+ * This value can be queried using the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is
+ * supplied, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * Width and Height, which are specified in elements (or texels), cannot exceed
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively.
+ * \p Pitch, which is specified in bytes, cannot exceed
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH.
+ *
+ * \param hTexRef - Texture reference to bind
+ * \param desc    - Descriptor of CUDA array
+ * \param dptr    - Device pointer to bind
+ * \param Pitch   - Line pitch in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaBindTexture2D
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+
+/**
+ * \brief Sets the format for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the format of the data to be read by the texture reference
+ * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the
+ * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure:
+ * They specify the format of each component and the number of components per
+ * array element.
+ *
+ * \param hTexRef             - Texture reference
+ * \param fmt                 - Format to set
+ * \param NumPackedComponents - Number of components per array element
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaCreateChannelDesc,
+ * ::cudaBindTexture,
+ * ::cudaBindTexture2D,
+ * ::cudaBindTextureToArray,
+ * ::cudaBindTextureToMipmappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
+
+/**
+ * \brief Sets the addressing mode for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the addressing mode \p am for the given dimension \p dim of the
+ * texture reference \p hTexRef. If \p dim is zero, the addressing mode is
+ * applied to the first parameter of the functions used to fetch from the
+ * texture; if \p dim is 1, the second, and so on. ::CUaddress_mode is defined
+ * as:
+ * \code
+   typedef enum CUaddress_mode_enum {
+      CU_TR_ADDRESS_MODE_WRAP = 0,
+      CU_TR_ADDRESS_MODE_CLAMP = 1,
+      CU_TR_ADDRESS_MODE_MIRROR = 2,
+      CU_TR_ADDRESS_MODE_BORDER = 3
+   } CUaddress_mode;
+ * \endcode
+ *
+ * Note that this call has no effect if \p hTexRef is bound to linear memory.
+ * Also, if the ::CU_TRSF_NORMALIZED_COORDINATES flag is not set, the only
+ * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP.
+ *
+ * \param hTexRef - Texture reference
+ * \param dim     - Dimension
+ * \param am      - Addressing mode to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaBindTexture,
+ * ::cudaBindTexture2D,
+ * ::cudaBindTextureToArray,
+ * ::cudaBindTextureToMipmappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am);
+
+/**
+ * \brief Sets the filtering mode for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the filtering mode \p fm to be used when reading memory through
+ * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as:
+ *
+ * \code
+   typedef enum CUfilter_mode_enum {
+      CU_TR_FILTER_MODE_POINT = 0,
+      CU_TR_FILTER_MODE_LINEAR = 1
+   } CUfilter_mode;
+ * \endcode
+ *
+ * Note that this call has no effect if \p hTexRef is bound to linear memory.
+ *
+ * \param hTexRef - Texture reference
+ * \param fm      - Filtering mode to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaBindTextureToArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm);
+
+/**
+ * \brief Sets the mipmap filtering mode for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the mipmap filtering mode \p fm to be used when reading memory through
+ * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as:
+ *
+ * \code
+   typedef enum CUfilter_mode_enum {
+      CU_TR_FILTER_MODE_POINT = 0,
+      CU_TR_FILTER_MODE_LINEAR = 1
+   } CUfilter_mode;
+ * \endcode
+ *
+ * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
+ *
+ * \param hTexRef - Texture reference
+ * \param fm      - Filtering mode to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaBindTextureToMipmappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm);
+
+/**
+ * \brief Sets the mipmap level bias for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when
+ * reading memory through the texture reference \p hTexRef.
+ *
+ * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
+ *
+ * \param hTexRef - Texture reference
+ * \param bias    - Mipmap level bias
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaBindTextureToMipmappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias);
+
+/**
+ * \brief Sets the mipmap min/max mipmap level clamps for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp
+ * respectively, to be used when reading memory through the texture reference
+ * \p hTexRef.
+ *
+ * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array.
+ *
+ * \param hTexRef        - Texture reference
+ * \param minMipmapLevelClamp - Mipmap min level clamp
+ * \param maxMipmapLevelClamp - Mipmap max level clamp
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaBindTextureToMipmappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
+
+/**
+ * \brief Sets the maximum anisotropy for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through
+ * the texture reference \p hTexRef.
+ *
+ * Note that this call has no effect if \p hTexRef is bound to linear memory.
+ *
+ * \param hTexRef  - Texture reference
+ * \param maxAniso - Maximum anisotropy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaBindTextureToArray,
+ * ::cudaBindTextureToMipmappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso);
+
+/**
+ * \brief Sets the border color for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies the RGBA border color via \p pBorderColor for the texture reference
+ * \p hTexRef. The color values must be of type float and are stored in the
+ * following order:
+ * pBorderColor[0] holds the 'R' component
+ * pBorderColor[1] holds the 'G' component
+ * pBorderColor[2] holds the 'B' component
+ * pBorderColor[3] holds the 'A' component
+ *
+ * Note that the border color is applied only when the address mode is set to
+ * ::CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode.
+ * Applications using integer border color values have to "reinterpret_cast" their values to float.
+ *
+ * \param hTexRef       - Texture reference
+ * \param pBorderColor  - RGBA color
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddressMode,
+ * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor,
+ * ::cudaBindTexture,
+ * ::cudaBindTexture2D,
+ * ::cudaBindTextureToArray,
+ * ::cudaBindTextureToMipmappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor);
+
+/**
+ * \brief Sets the flags for a texture reference
+ *
+ * \deprecated
+ *
+ * Specifies optional flags via \p Flags to specify the behavior of data
+ * returned through the texture reference \p hTexRef. The valid flags are:
+ *
+ * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
+ *   having the texture promote integer data to floating point data in the
+ *   range [0, 1]. Note that textures with a 32-bit integer format
+ *   are not promoted, regardless of whether or not this
+ *   flag is specified;
+ * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the
+ *   default behavior of having the texture coordinates range
+ *   from [0, Dim) where Dim is the width or height of the CUDA
+ *   array. Instead, the texture coordinates [0, 1.0) reference
+ *   the entire breadth of the array dimension;
+ * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear
+ *   filtering optimizations. Trilinear optimizations improve texture filtering
+ *   performance by allowing bilinear filtering on textures in scenarios where
+ *   it can closely approximate the expected results.
+ *
+ * \param hTexRef - Texture reference
+ * \param Flags   - Optional flags to set
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat,
+ * ::cudaBindTexture,
+ * ::cudaBindTexture2D,
+ * ::cudaBindTextureToArray,
+ * ::cudaBindTextureToMipmappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags);
+
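+/*
+ * A minimal configuration sketch (not normative) for a deprecated texture
+ * reference; \p hTexRef is assumed to come from cuModuleGetTexRef() and
+ * \p hArray is an assumed, previously created CUDA array:
+ *
+ * \code
+   cuTexRefSetArray(hTexRef, hArray, CU_TRSA_OVERRIDE_FORMAT);
+   cuTexRefSetFormat(hTexRef, CU_AD_FORMAT_FLOAT, 4);
+   cuTexRefSetAddressMode(hTexRef, 0, CU_TR_ADDRESS_MODE_CLAMP);
+   cuTexRefSetAddressMode(hTexRef, 1, CU_TR_ADDRESS_MODE_CLAMP);
+   cuTexRefSetFilterMode(hTexRef, CU_TR_FILTER_MODE_LINEAR);
+   cuTexRefSetFlags(hTexRef, CU_TRSF_NORMALIZED_COORDINATES);
+ * \endcode
+ */
+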
+/**
+ * \brief Gets the address associated with a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pdptr the base address bound to the texture reference
+ * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
+ * is not bound to any device memory range.
+ *
+ * \param pdptr   - Returned device address
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef);
+
+/**
+ * \brief Gets the array bound to a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *phArray the CUDA array bound to the texture reference
+ * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
+ * is not bound to any CUDA array.
+ *
+ * \param phArray - Returned array
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef);
+
+/**
+ * \brief Gets the mipmapped array bound to a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture
+ * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference
+ * is not bound to any CUDA mipmapped array.
+ *
+ * \param phMipmappedArray - Returned mipmapped array
+ * \param hTexRef          - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
+
+/**
+ * \brief Gets the addressing mode used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pam the addressing mode corresponding to the
+ * dimension \p dim of the texture reference \p hTexRef. Currently, the only
+ * valid values for \p dim are 0 and 1.
+ *
+ * \param pam     - Returned addressing mode
+ * \param hTexRef - Texture reference
+ * \param dim     - Dimension
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+
+/**
+ * \brief Gets the filter-mode used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pfm the filtering mode of the texture reference
+ * \p hTexRef.
+ *
+ * \param pfm     - Returned filtering mode
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+
+/**
+ * \brief Gets the format used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pFormat and \p *pNumChannels the format and number
+ * of components of the CUDA array bound to the texture reference \p hTexRef.
+ * If \p pFormat or \p pNumChannels is NULL, it will be ignored.
+ *
+ * \param pFormat      - Returned format
+ * \param pNumChannels - Returned number of components
+ * \param hTexRef      - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
+
+/**
+ * \brief Gets the mipmap filtering mode for a texture reference
+ *
+ * \deprecated
+ *
+ * Returns the mipmap filtering mode in \p pfm that's used when reading memory through
+ * the texture reference \p hTexRef.
+ *
+ * \param pfm     - Returned mipmap filtering mode
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef);
+
+/**
+ * \brief Gets the mipmap level bias for a texture reference
+ *
+ * \deprecated
+ *
+ * Returns the mipmap level bias in \p pbias that's added to the specified mipmap
+ * level when reading memory through the texture reference \p hTexRef.
+ *
+ * \param pbias   - Returned mipmap level bias
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef);
+
+/**
+ * \brief Gets the min/max mipmap level clamps for a texture reference
+ *
+ * \deprecated
+ *
+ * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp
+ * that are used when reading memory through the texture reference \p hTexRef.
+ *
+ * \param pminMipmapLevelClamp - Returned mipmap min level clamp
+ * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp
+ * \param hTexRef              - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
+
+/**
+ * \brief Gets the maximum anisotropy for a texture reference
+ *
+ * \deprecated
+ *
+ * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through
+ * the texture reference \p hTexRef.
+ *
+ * \param pmaxAniso - Returned maximum anisotropy
+ * \param hTexRef   - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef);
+
+/**
+ * \brief Gets the border color used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p pBorderColor the values of the RGBA border color used by
+ * the texture reference \p hTexRef.
+ * The color values are of type float and are stored in the following order:
+ * pBorderColor[0] holds the 'R' component
+ * pBorderColor[1] holds the 'G' component
+ * pBorderColor[2] holds the 'B' component
+ * pBorderColor[3] holds the 'A' component
+ *
+ * \param hTexRef      - Texture reference
+ * \param pBorderColor - Returned RGBA color values
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef);
+
+/**
+ * \brief Gets the flags used by a texture reference
+ *
+ * \deprecated
+ *
+ * Returns in \p *pFlags the flags of the texture reference \p hTexRef.
+ *
+ * \param pFlags  - Returned flags
+ * \param hTexRef - Texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray,
+ * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat,
+ * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray,
+ * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef);
+
+/**
+ * \brief Creates a texture reference
+ *
+ * \deprecated
+ *
+ * Creates a texture reference and returns its handle in \p *pTexRef. Once
+ * created, the application must call ::cuTexRefSetArray() or
+ * ::cuTexRefSetAddress() to associate the reference with allocated memory.
+ * Other texture reference functions are used to specify the format and
+ * interpretation (addressing, filtering, etc.) to be used when the memory is
+ * read through this texture reference.
+ *
+ * \param pTexRef - Returned texture reference
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefDestroy
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef);
+
+/**
+ * \brief Destroys a texture reference
+ *
+ * \deprecated
+ *
+ * Destroys the texture reference specified by \p hTexRef.
+ *
+ * \param hTexRef - Texture reference to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuTexRefCreate
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef);
+
+/** @} */ /* END CUDA_TEXREF_DEPRECATED */
+
+
+/**
+ * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ surface reference management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the surface reference management functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Sets the CUDA array for a surface reference.
+ *
+ * \deprecated
+ *
+ * Sets the CUDA array \p hArray to be read and written by the surface reference
+ * \p hSurfRef. Any previous CUDA array state associated with the surface
+ * reference is superseded by this function. \p Flags must be set to 0.
+ * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array.
+ * Any CUDA array previously bound to \p hSurfRef is unbound.
+ *
+ * \param hSurfRef - Surface reference handle
+ * \param hArray   - CUDA array handle
+ * \param Flags    - Must be set to 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuModuleGetSurfRef,
+ * ::cuSurfRefGetArray,
+ * ::cudaBindSurfaceToArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
+
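+/*
+ * A minimal usage sketch (not normative); \p hModule is an assumed loaded
+ * module, "outputSurf" is a hypothetical module-level surface, and \p hArray
+ * is an assumed CUDA array created with ::CUDA_ARRAY3D_SURFACE_LDST:
+ *
+ * \code
+   CUsurfref hSurfRef;
+   cuModuleGetSurfRef(&hSurfRef, hModule, "outputSurf");
+   cuSurfRefSetArray(hSurfRef, hArray, 0);
+ * \endcode
+ */
+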
+/**
+ * \brief Passes back the CUDA array bound to a surface reference.
+ *
+ * \deprecated
+ *
+ * Returns in \p *phArray the CUDA array bound to the surface reference
+ * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference
+ * is not bound to any CUDA array.
+ *
+ * \param phArray  - Returned CUDA array
+ * \param hSurfRef - Surface reference handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef);
+
+/** @} */ /* END CUDA_SURFREF_DEPRECATED */
+
+/**
+ * \defgroup CUDA_TEXOBJECT Texture Object Management
+ *
+ * ___MANBRIEF___ texture object management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the texture object management functions of the
+ * low-level CUDA driver application programming interface. The texture
+ * object API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a texture object
+ *
+ * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes
+ * the data to texture from. \p pTexDesc describes how the data should be sampled.
+ * \p pResViewDesc is an optional argument that specifies an alternate format for
+ * the data described by \p pResDesc, and also describes the subresource region
+ * to restrict access to when texturing. \p pResViewDesc can only be specified if
+ * the type of resource is a CUDA array or a CUDA mipmapped array.
+ *
+ * Texture objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a texture object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * The ::CUDA_RESOURCE_DESC structure is defined as:
+ * \code
+        typedef struct CUDA_RESOURCE_DESC_st
+        {
+            CUresourcetype resType;
+
+            union {
+                struct {
+                    CUarray hArray;
+                } array;
+                struct {
+                    CUmipmappedArray hMipmappedArray;
+                } mipmap;
+                struct {
+                    CUdeviceptr devPtr;
+                    CUarray_format format;
+                    unsigned int numChannels;
+                    size_t sizeInBytes;
+                } linear;
+                struct {
+                    CUdeviceptr devPtr;
+                    CUarray_format format;
+                    unsigned int numChannels;
+                    size_t width;
+                    size_t height;
+                    size_t pitchInBytes;
+                } pitch2D;
+            } res;
+
+            unsigned int flags;
+        } CUDA_RESOURCE_DESC;
+
+ * \endcode
+ * where:
+ * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from.
+ * CUresourceType is defined as:
+ * \code
+        typedef enum CUresourcetype_enum {
+            CU_RESOURCE_TYPE_ARRAY           = 0x00,
+            CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01,
+            CU_RESOURCE_TYPE_LINEAR          = 0x02,
+            CU_RESOURCE_TYPE_PITCH2D         = 0x03
+        } CUresourcetype;
+ * \endcode
+ *
+ * \par
+ * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray
+ * must be set to a valid CUDA array handle.
+ *
+ * \par
+ * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray
+ * must be set to a valid CUDA mipmapped array handle.
+ *
+ * \par
+ * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr
+ * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT.
+ * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels
+ * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes
+ * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)).
+ *
+ * \par
+ * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr
+ * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT.
+ * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels
+ * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width
+ * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
+ * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively.
+ * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to
+ * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH.
+ *
+ * - ::CUDA_RESOURCE_DESC::flags must be set to zero.
+ *
+ *
+ * The ::CUDA_TEXTURE_DESC struct is defined as
+ * \code
+        typedef struct CUDA_TEXTURE_DESC_st {
+            CUaddress_mode addressMode[3];
+            CUfilter_mode filterMode;
+            unsigned int flags;
+            unsigned int maxAnisotropy;
+            CUfilter_mode mipmapFilterMode;
+            float mipmapLevelBias;
+            float minMipmapLevelClamp;
+            float maxMipmapLevelClamp;
+        } CUDA_TEXTURE_DESC;
+ * \endcode
+ * where
+ * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as:
+ *   \code
+        typedef enum CUaddress_mode_enum {
+            CU_TR_ADDRESS_MODE_WRAP = 0,
+            CU_TR_ADDRESS_MODE_CLAMP = 1,
+            CU_TR_ADDRESS_MODE_MIRROR = 2,
+            CU_TR_ADDRESS_MODE_BORDER = 3
+        } CUaddress_mode;
+ *   \endcode
+ *   This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the ::CU_TRSF_NORMALIZED_COORDINATES flag
+ *   is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP.
+ *
+ * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. CUfilter_mode is defined as:
+ *   \code
+        typedef enum CUfilter_mode_enum {
+            CU_TR_FILTER_MODE_POINT = 0,
+            CU_TR_FILTER_MODE_LINEAR = 1
+        } CUfilter_mode;
+ *   \endcode
+ *   This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR.
+ *
+ * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following:
+ *   - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of
+ *   having the texture promote integer data to floating point data in the
+ *   range [0, 1]. Note that textures with a 32-bit integer format are not
+ *   promoted, regardless of whether or not this flag is specified.
+ *   - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior
+ *   of having the texture coordinates range from [0, Dim) where Dim is the 
+ *   width or height of the CUDA array. Instead, the texture coordinates 
+ *   [0, 1.0) reference the entire breadth of the array dimension. Note that
+ *   for CUDA mipmapped arrays, this flag has to be set.
+ *   - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear
+ *   filtering optimizations. Trilinear optimizations improve texture filtering
+ *   performance by allowing bilinear filtering on textures in scenarios where
+ *   it can closely approximate the expected results.
+ *   - ::CU_TRSF_SEAMLESS_CUBEMAP, which enables seamless cube map filtering. 
+ *   This flag can only be specified if the underlying resource is a CUDA array 
+ *   or a CUDA mipmapped array that was created with the flag ::CUDA_ARRAY3D_CUBEMAP.
+ *   When seamless cube map filtering is enabled, texture address modes specified 
+ *   by ::CUDA_TEXTURE_DESC::addressMode are ignored. Instead, if the ::CUDA_TEXTURE_DESC::filterMode 
+ *   is set to ::CU_TR_FILTER_MODE_POINT the address mode ::CU_TR_ADDRESS_MODE_CLAMP 
+ *   will be applied for all dimensions. If the ::CUDA_TEXTURE_DESC::filterMode is 
+ *   set to ::CU_TR_FILTER_MODE_LINEAR seamless cube map filtering will be performed
+ *   when sampling along the cube face borders.
+ *
+ * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
+ *   clamped to the range [1,16].
+ *
+ * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
+ *
+ * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level.
+ *
+ * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to.
+ *
+ * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to.
+ *
+ *
+ * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as
+ * \code
+        typedef struct CUDA_RESOURCE_VIEW_DESC_st
+        {
+            CUresourceViewFormat format;
+            size_t width;
+            size_t height;
+            size_t depth;
+            unsigned int firstMipmapLevel;
+            unsigned int lastMipmapLevel;
+            unsigned int firstLayer;
+            unsigned int lastLayer;
+        } CUDA_RESOURCE_VIEW_DESC;
+ * \endcode
+ * where:
+ * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should
+ *   be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block
+ *   compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base format of ::CU_AD_FORMAT_UNSIGNED_INT32
+ *   with 2 or 4 channels, depending on the block compressed format. For example, BC1 and BC4 require the underlying CUDA array to have
+ *   a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base
+ *   format but with 4 channels.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the
+ *   original resource.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
+ *   For non-mipmapped resources, this value has to be zero. ::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp
+ *   will be relative to this value. For example, if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
+ *   then the actual minimum mipmap level clamp will be 3.2.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value
+ *   has to be zero.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero.
+ *   For non-layered resources, this value has to be zero.
+ *
+ * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources,
+ *   this value has to be zero.
+ *
+ *
+ * \param pTexObject   - Texture object to create
+ * \param pResDesc     - Resource descriptor
+ * \param pTexDesc     - Texture descriptor
+ * \param pResViewDesc - Resource view descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectDestroy,
+ * ::cudaCreateTextureObject
+ */
+CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc);
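+
+/*
+ * Illustrative usage sketch (not part of the original NVIDIA header): creating
+ * and destroying a texture object backed by an existing CUDA array. `hArray`
+ * is assumed to have been created beforehand with ::cuArrayCreate; error
+ * checking is omitted for brevity.
+ * \code
+        CUDA_RESOURCE_DESC resDesc = {0};
+        resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+        resDesc.res.array.hArray = hArray;
+
+        CUDA_TEXTURE_DESC texDesc = {0};
+        texDesc.addressMode[0] = CU_TR_ADDRESS_MODE_CLAMP;
+        texDesc.filterMode     = CU_TR_FILTER_MODE_LINEAR;
+        texDesc.flags          = CU_TRSF_NORMALIZED_COORDINATES;
+
+        CUtexObject texObj;
+        cuTexObjectCreate(&texObj, &resDesc, &texDesc, NULL);
+        // ... pass texObj to kernels as a kernel parameter ...
+        cuTexObjectDestroy(texObj);
+ * \endcode
+ */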
+
+/**
+ * \brief Destroys a texture object
+ *
+ * Destroys the texture object specified by \p texObject.
+ *
+ * \param texObject - Texture object to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectCreate,
+ * ::cudaDestroyTextureObject
+ */
+CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject);
+
+/**
+ * \brief Returns a texture object's resource descriptor
+ *
+ * Returns the resource descriptor for the texture object specified by \p texObject.
+ *
+ * \param pResDesc  - Resource descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectCreate,
+ * ::cudaGetTextureObjectResourceDesc
+ */
+CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject);
+
+/**
+ * \brief Returns a texture object's texture descriptor
+ *
+ * Returns the texture descriptor for the texture object specified by \p texObject.
+ *
+ * \param pTexDesc  - Texture descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectCreate,
+ * ::cudaGetTextureObjectTextureDesc
+ */
+CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject);
+
+/**
+ * \brief Returns a texture object's resource view descriptor
+ *
+ * Returns the resource view descriptor for the texture object specified by \p texObject.
+ * If no resource view was set for \p texObject, ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param pResViewDesc - Resource view descriptor
+ * \param texObject    - Texture object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuTexObjectCreate,
+ * ::cudaGetTextureObjectResourceViewDesc
+ */
+CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject);
+
+/** @} */ /* END CUDA_TEXOBJECT */
+
+/**
+ * \defgroup CUDA_SURFOBJECT Surface Object Management
+ *
+ * ___MANBRIEF___ surface object management functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the surface object management functions of the
+ * low-level CUDA driver application programming interface. The surface
+ * object API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a surface object
+ *
+ * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes
+ * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be
+ * ::CU_RESOURCE_TYPE_ARRAY and  ::CUDA_RESOURCE_DESC::res::array::hArray
+ * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero.
+ *
+ * Surface objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a surface object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * \param pSurfObject - Surface object to create
+ * \param pResDesc    - Resource descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuSurfObjectDestroy,
+ * ::cudaCreateSurfaceObject
+ */
+CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc);
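+
+/*
+ * Illustrative usage sketch (not part of the original NVIDIA header): creating
+ * a surface object. The CUDA array `hArray` is assumed to have been created
+ * with the ::CUDA_ARRAY3D_SURFACE_LDST flag so that surface load/store is
+ * permitted; error checking is omitted.
+ * \code
+        CUDA_RESOURCE_DESC resDesc = {0};
+        resDesc.resType = CU_RESOURCE_TYPE_ARRAY;
+        resDesc.res.array.hArray = hArray;
+        resDesc.flags = 0;                      // must be zero for surface objects
+
+        CUsurfObject surfObj;
+        cuSurfObjectCreate(&surfObj, &resDesc);
+        // ... kernels read/write the array through surfObj ...
+        cuSurfObjectDestroy(surfObj);
+ * \endcode
+ */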
+
+/**
+ * \brief Destroys a surface object
+ *
+ * Destroys the surface object specified by \p surfObject.
+ *
+ * \param surfObject - Surface object to destroy
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuSurfObjectCreate,
+ * ::cudaDestroySurfaceObject
+ */
+CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject);
+
+/**
+ * \brief Returns a surface object's resource descriptor
+ *
+ * Returns the resource descriptor for the surface object specified by \p surfObject.
+ *
+ * \param pResDesc   - Resource descriptor
+ * \param surfObject - Surface object
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ *
+ * \sa
+ * ::cuSurfObjectCreate,
+ * ::cudaGetSurfaceObjectResourceDesc
+ */
+CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject);
+
+/** @} */ /* END CUDA_SURFOBJECT */
+
+/**
+ * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access
+ *
+ * ___MANBRIEF___ direct peer context memory access functions of the low-level
+ * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the direct peer context memory access functions
+ * of the low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Queries if a device may directly access a peer device's memory.
+ *
+ * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of
+ * directly accessing memory from contexts on \p peerDev and 0 otherwise.
+ * If direct access of \p peerDev from \p dev is possible, then access may be
+ * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess().
+ *
+ * \param canAccessPeer - Returned access capability
+ * \param dev           - Device from which allocations on \p peerDev are to
+ *                        be directly accessed.
+ * \param peerDev       - Device on which the allocations to be directly accessed
+ *                        by \p dev reside.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuCtxEnablePeerAccess,
+ * ::cuCtxDisablePeerAccess,
+ * ::cudaDeviceCanAccessPeer
+ */
+CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev);
+
+/**
+ * \brief Enables direct access to memory allocations in a peer context.
+ *
+ * If both the current context and \p peerContext are on devices which support unified
+ * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same
+ * major compute capability, then on success all allocations from \p peerContext will
+ * immediately be accessible by the current context.  See \ref CUDA_UNIFIED for additional
+ * details.
+ *
+ * Note that access granted by this call is unidirectional and that in order to access
+ * memory from the current context in \p peerContext, a separate symmetric call
+ * to ::cuCtxEnablePeerAccess() is required.
+ *
+ * Note that there are both device-wide and system-wide limitations per system
+ * configuration, as noted in the CUDA Programming Guide under the section
+ * "Peer-to-Peer Memory Access".
+ *
+ * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates
+ * that the ::CUdevice of the current context cannot directly access memory
+ * from the ::CUdevice of \p peerContext.
+ *
+ * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of
+ * \p peerContext from the current context has already been enabled.
+ *
+ * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible
+ * because hardware resources required for peer access have been exhausted.
+ *
+ * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext
+ * is not a valid context, or if the current context is \p peerContext.
+ *
+ * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0.
+ *
+ * \param peerContext - Peer context to enable direct access to from the current context
+ * \param Flags       - Reserved for future use and must be set to 0
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED,
+ * ::CUDA_ERROR_TOO_MANY_PEERS,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceCanAccessPeer,
+ * ::cuCtxDisablePeerAccess,
+ * ::cudaDeviceEnablePeerAccess
+ */
+CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags);
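+
+/*
+ * Illustrative usage sketch (not part of the original NVIDIA header): querying
+ * peer capability and enabling one-way access. `dev` and `peerDev` are assumed
+ * to be valid ::CUdevice handles and `peerCtx` a context created on `peerDev`;
+ * the context of `dev` is assumed to be current when access is enabled.
+ * \code
+        int canAccessPeer = 0;
+        cuDeviceCanAccessPeer(&canAccessPeer, dev, peerDev);
+        if (canAccessPeer) {
+            // grants the current context access to allocations in peerCtx;
+            // the reverse direction needs a symmetric call from peerCtx
+            cuCtxEnablePeerAccess(peerCtx, 0);
+        }
+ * \endcode
+ */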
+
+/**
+ * \brief Disables direct access to memory allocations in a peer context and
+ * unregisters any registered allocations.
+ *
+ * Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has
+ * not yet been enabled from \p peerContext to the current context.
+ *
+ * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if
+ * \p peerContext is not a valid context.
+ *
+ * \param peerContext - Peer context to disable direct access to
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuDeviceCanAccessPeer,
+ * ::cuCtxEnablePeerAccess,
+ * ::cudaDeviceDisablePeerAccess
+ */
+CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext);
+
+/**
+ * \brief Queries attributes of the link between two devices.
+ *
+ * Returns in \p *value the value of the requested attribute \p attrib of the
+ * link between \p srcDevice and \p dstDevice. The supported attributes are:
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the
+ *   performance of the link between two devices.
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED: 1 if P2P access is enabled.
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over
+ *   the link are supported.
+ * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can
+ *   be accessed over the link.
+ *
+ * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid
+ * or if they represent the same device.
+ *
+ * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is
+ * a null pointer.
+ *
+ * \param value         - Returned value of the requested attribute
+ * \param attrib        - The requested attribute of the link between \p srcDevice and \p dstDevice.
+ * \param srcDevice     - The source device of the target link.
+ * \param dstDevice     - The destination device of the target link.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::cuCtxEnablePeerAccess,
+ * ::cuCtxDisablePeerAccess,
+ * ::cuDeviceCanAccessPeer,
+ * ::cudaDeviceGetP2PAttribute
+ */
+CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice);
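+
+/*
+ * Illustrative usage sketch (not part of the original NVIDIA header): checking
+ * whether native atomics are supported over the link between two devices
+ * `srcDev` and `dstDev`, which are assumed to be valid and distinct.
+ * \code
+        int nativeAtomics = 0;
+        cuDeviceGetP2PAttribute(&nativeAtomics,
+                                CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED,
+                                srcDev, dstDev);
+ * \endcode
+ */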
+
+/** @} */ /* END CUDA_PEER_ACCESS */
+
+/**
+ * \defgroup CUDA_GRAPHICS Graphics Interoperability
+ *
+ * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graphics interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Unregisters a graphics resource for access by CUDA
+ *
+ * Unregisters the graphics resource \p resource so it is not accessible by
+ * CUDA unless registered again.
+ *
+ * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned.
+ *
+ * \param resource - Resource to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsD3D9RegisterResource,
+ * ::cuGraphicsD3D10RegisterResource,
+ * ::cuGraphicsD3D11RegisterResource,
+ * ::cuGraphicsGLRegisterBuffer,
+ * ::cuGraphicsGLRegisterImage,
+ * ::cudaGraphicsUnregisterResource
+ */
+CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource);
+
+/**
+ * \brief Get an array through which to access a subresource of a mapped graphics resource.
+ *
+ * Returns in \p *pArray an array through which the subresource of the mapped
+ * graphics resource \p resource which corresponds to array index \p arrayIndex
+ * and mipmap level \p mipLevel may be accessed.  The value set in \p *pArray may
+ * change every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via an array and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
+ * If \p arrayIndex is not a valid array index for \p resource then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ * If \p mipLevel is not a valid mipmap level for \p resource then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param pArray      - Returned array through which a subresource of \p resource may be accessed
+ * \param resource    - Mapped resource to access
+ * \param arrayIndex  - Array index for array textures or cubemap face
+ *                      index as defined by ::CUarray_cubemap_face for
+ *                      cubemap textures for the subresource to access
+ * \param mipLevel    - Mipmap level for the subresource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsSubResourceGetMappedArray
+ */
+CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+
+/**
+ * \brief Get a mipmapped array through which to access a mapped graphics resource.
+ *
+ * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics
+ * resource \p resource may be accessed. The value set in \p *pMipmappedArray may change every time
+ * that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via a mipmapped array and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed
+ * \param resource        - Mapped resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsResourceGetMappedMipmappedArray
+ */
+CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
+
+/**
+ * \brief Get a device pointer through which to access a mapped graphics resource.
+ *
+ * Returns in \p *pDevPtr a pointer through which the mapped graphics resource
+ * \p resource may be accessed.
+ * Returns in \p pSize the size of the memory in bytes which may be accessed from that pointer.
+ * The value set in \p *pDevPtr may change every time that \p resource is mapped.
+ *
+ * If \p resource is not a buffer then it cannot be accessed via a pointer and
+ * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned.
+ * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param pDevPtr    - Returned pointer through which \p resource may be accessed
+ * \param pSize      - Returned size of the buffer accessible starting at \p *pDevPtr
+ * \param resource   - Mapped resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsResourceGetMappedPointer
+ */
+CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource);
+
+/**
+ * \brief Set usage flags for mapping a graphics resource
+ *
+ * Set \p flags for mapping the graphics resource \p resource.
+ *
+ * Changes to \p flags will take effect the next time \p resource is mapped.
+ * The \p flags argument may be any of the following:
+ *
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA kernels.  This is the default value.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which
+ *   access this resource will not write to this resource.
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels
+ *   which access this resource will not read from this resource and will
+ *   write over the entire contents of the resource, so none of the data
+ *   previously stored in the resource will be preserved.
+ *
+ * If \p resource is presently mapped for access by CUDA then
+ * ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param resource - Registered resource to set flags for
+ * \param flags    - Parameters for resource mapping
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cudaGraphicsResourceSetMapFlags
+ */
+CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+
+/**
+ * \brief Map graphics resources for access by CUDA
+ *
+ * Maps the \p count graphics resources in \p resources for access by CUDA.
+ *
+ * The resources in \p resources may be accessed by CUDA until they
+ * are unmapped. The graphics API from which \p resources were registered
+ * should not access any resources while they are mapped by CUDA. If an
+ * application does so, the results are undefined.
+ *
+ * This function provides the synchronization guarantee that any graphics calls
+ * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA
+ * work issued in \p stream begins.
+ *
+ * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ *
+ * \param count      - Number of resources to map
+ * \param resources  - Resources to map for CUDA usage
+ * \param hStream    - Stream with which to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_UNKNOWN
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsUnmapResources,
+ * ::cudaGraphicsMapResources
+ */
+CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+
+/**
+ * \brief Unmap graphics resources.
+ *
+ * Unmaps the \p count graphics resources in \p resources.
+ *
+ * Once unmapped, the resources in \p resources may not be accessed by CUDA
+ * until they are mapped again.
+ *
+ * This function provides the synchronization guarantee that any CUDA work issued
+ * in \p stream before ::cuGraphicsUnmapResources() will complete before any
+ * subsequently issued graphics work begins.
+ *
+ * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * \param count      - Number of resources to unmap
+ * \param resources  - Resources to unmap
+ * \param hStream    - Stream with which to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_UNKNOWN
+ * \note_null_stream
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources,
+ * ::cudaGraphicsUnmapResources
+ */
+CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
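+
+/*
+ * Illustrative usage sketch (not part of the original NVIDIA header): a typical
+ * map / access / unmap cycle for a buffer-type resource. `resource` is assumed
+ * to have been registered earlier (for example with ::cuGraphicsGLRegisterBuffer)
+ * and `hStream` to be a valid stream; error checking is omitted.
+ * \code
+        CUdeviceptr devPtr;
+        size_t size;
+
+        cuGraphicsMapResources(1, &resource, hStream);
+        cuGraphicsResourceGetMappedPointer(&devPtr, &size, resource);
+        // ... CUDA work on devPtr issued into hStream ...
+        cuGraphicsUnmapResources(1, &resource, hStream);
+ * \endcode
+ */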
+
+/** @} */ /* END CUDA_GRAPHICS */
+
+/**
+ * \defgroup CUDA_DRIVER_ENTRY_POINT Driver Entry Point Access 
+ *
+ * ___MANBRIEF___ driver entry point access functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the driver entry point access functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the requested driver API function pointer
+ *
+ * Returns in \p **pfn the address of the CUDA driver function for the requested
+ * CUDA version and flags.
+ *
+ * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2
+ * should be specified as 11020. For a requested driver symbol, if the specified
+ * CUDA version is greater than or equal to the CUDA version in which the driver symbol
+ * was introduced, this API will return the function pointer to the corresponding
+ * versioned function.
+ *
+ * The pointer returned by the API should be cast to a function pointer matching the
+ * requested driver function's definition in the API header file. The function pointer
+ * typedef can be picked up from the corresponding typedefs header file. For example,
+ * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h.
+ *
+ * The API will return ::CUDA_ERROR_NOT_FOUND if the requested driver function is not
+ * supported on the platform, no ABI compatible driver function exists for the specified
+ * \p cudaVersion or if the driver symbol is invalid.
+ *
+ * The requested flags can be:
+ * - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. This is equivalent to
+ *   ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with
+ *   --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM
+ *   is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise.
+ * - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols
+ *   that match the requested driver symbol name except the corresponding per-thread versions.
+ * - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all
+ *   driver symbols that match the requested driver symbol name including the per-thread
+ *   versions. If a per-thread version is not found, the API will return the legacy version
+ *   of the driver function.
+ *
+ * \param symbol - The base name of the driver API function to look for. As an example,
+ *                 for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and
+ *                 \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant. 
+ * \param pfn - Location to return the function pointer to the requested driver function
+ * \param cudaVersion - The CUDA version to look for the requested driver symbol 
+ * \param flags -  Flags to specify search options.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_NOT_FOUND
+ * \note_version_mixing
+ *
+ * \sa
+ * ::cudaGetDriverEntryPoint
+ */
+CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags);
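+
+/*
+ * Illustrative usage sketch (not part of the original NVIDIA header): fetching
+ * the cuMemAlloc entry point for CUDA 11.2 at run time. The local function
+ * pointer typedef below simply mirrors the cuMemAlloc_v2 signature and is an
+ * assumption of this sketch, not an official typedef.
+ * \code
+        typedef CUresult (CUDAAPI *pfnMemAlloc_t)(CUdeviceptr *dptr, size_t bytesize);
+
+        void *fn = NULL;
+        if (cuGetProcAddress("cuMemAlloc", &fn, 11020,
+                             CU_GET_PROC_ADDRESS_DEFAULT) == CUDA_SUCCESS && fn) {
+            pfnMemAlloc_t pfnMemAlloc = (pfnMemAlloc_t)fn;
+            // ... call the driver function through pfnMemAlloc ...
+        }
+ * \endcode
+ */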
+
+/** @} */ /* END CUDA_DRIVER_ENTRY_POINT */
+
+CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId);
+
+/**
+ * CUDA API versioning support
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cuMemHostRegister
+    #undef cuGraphicsResourceSetMapFlags
+    #undef cuLinkCreate
+    #undef cuLinkAddData
+    #undef cuLinkAddFile
+    #undef cuDeviceTotalMem
+    #undef cuCtxCreate
+    #undef cuModuleGetGlobal
+    #undef cuMemGetInfo
+    #undef cuMemAlloc
+    #undef cuMemAllocPitch
+    #undef cuMemFree
+    #undef cuMemGetAddressRange
+    #undef cuMemAllocHost
+    #undef cuMemHostGetDevicePointer
+    #undef cuMemcpyHtoD
+    #undef cuMemcpyDtoH
+    #undef cuMemcpyDtoD
+    #undef cuMemcpyDtoA
+    #undef cuMemcpyAtoD
+    #undef cuMemcpyHtoA
+    #undef cuMemcpyAtoH
+    #undef cuMemcpyAtoA
+    #undef cuMemcpyHtoAAsync
+    #undef cuMemcpyAtoHAsync
+    #undef cuMemcpy2D
+    #undef cuMemcpy2DUnaligned
+    #undef cuMemcpy3D
+    #undef cuMemcpyHtoDAsync
+    #undef cuMemcpyDtoHAsync
+    #undef cuMemcpyDtoDAsync
+    #undef cuMemcpy2DAsync
+    #undef cuMemcpy3DAsync
+    #undef cuMemsetD8
+    #undef cuMemsetD16
+    #undef cuMemsetD32
+    #undef cuMemsetD2D8
+    #undef cuMemsetD2D16
+    #undef cuMemsetD2D32
+    #undef cuArrayCreate
+    #undef cuArrayGetDescriptor
+    #undef cuArray3DCreate
+    #undef cuArray3DGetDescriptor
+    #undef cuTexRefSetAddress
+    #undef cuTexRefSetAddress2D
+    #undef cuTexRefGetAddress
+    #undef cuGraphicsResourceGetMappedPointer
+    #undef cuCtxDestroy
+    #undef cuCtxPopCurrent
+    #undef cuCtxPushCurrent
+    #undef cuStreamDestroy
+    #undef cuEventDestroy
+    #undef cuMemcpy
+    #undef cuMemcpyAsync
+    #undef cuMemcpyPeer
+    #undef cuMemcpyPeerAsync
+    #undef cuMemcpy3DPeer
+    #undef cuMemcpy3DPeerAsync
+    #undef cuMemsetD8Async
+    #undef cuMemsetD16Async
+    #undef cuMemsetD32Async
+    #undef cuMemsetD2D8Async
+    #undef cuMemsetD2D16Async
+    #undef cuMemsetD2D32Async
+    #undef cuStreamGetPriority
+    #undef cuStreamGetFlags
+    #undef cuStreamGetCtx
+    #undef cuStreamWaitEvent
+    #undef cuStreamAddCallback
+    #undef cuStreamAttachMemAsync
+    #undef cuStreamQuery
+    #undef cuStreamSynchronize
+    #undef cuEventRecord
+    #undef cuEventRecordWithFlags
+    #undef cuLaunchKernel
+    #undef cuLaunchKernelEx
+    #undef cuLaunchHostFunc
+    #undef cuGraphicsMapResources
+    #undef cuGraphicsUnmapResources
+    #undef cuStreamWriteValue32
+    #undef cuStreamWaitValue32
+    #undef cuStreamWriteValue64
+    #undef cuStreamWaitValue64
+    #undef cuStreamBatchMemOp
+    #undef cuStreamWriteValue32_v2
+    #undef cuStreamWaitValue32_v2
+    #undef cuStreamWriteValue64_v2
+    #undef cuStreamWaitValue64_v2
+    #undef cuStreamBatchMemOp_v2
+    #undef cuMemPrefetchAsync
+    #undef cuLaunchCooperativeKernel
+    #undef cuSignalExternalSemaphoresAsync
+    #undef cuWaitExternalSemaphoresAsync
+    #undef cuStreamBeginCapture
+    #undef cuStreamEndCapture
+    #undef cuStreamIsCapturing
+    #undef cuStreamGetCaptureInfo
+    #undef cuStreamGetCaptureInfo_v2
+    #undef cuGraphUpload
+    #undef cuGraphLaunch
+    #undef cuDevicePrimaryCtxRelease
+    #undef cuDevicePrimaryCtxReset
+    #undef cuDevicePrimaryCtxSetFlags
+    #undef cuIpcOpenMemHandle
+    #undef cuStreamCopyAttributes
+    #undef cuStreamSetAttribute
+    #undef cuStreamGetAttribute
+    #undef cuGraphInstantiate
+    #undef cuMemMapArrayAsync
+    #undef cuMemFreeAsync 
+    #undef cuMemAllocAsync 
+    #undef cuMemAllocFromPoolAsync 
+    #undef cuStreamUpdateCaptureDependencies
+
+    CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
+    CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags);
+    CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+    CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name,
+        unsigned int numOptions, CUjit_option *options, void **optionValues);
+    CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path,
+        unsigned int numOptions, CUjit_option *options, void **optionValues);
+    CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch);
+
+    typedef unsigned int CUdeviceptr_v1;
+
+    typedef struct CUDA_MEMCPY2D_v1_st
+    {
+        unsigned int srcXInBytes;   /**< Source X in bytes */
+        unsigned int srcY;          /**< Source Y */
+        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+        const void *srcHost;        /**< Source host pointer */
+        CUdeviceptr_v1 srcDevice;   /**< Source device pointer */
+        CUarray srcArray;           /**< Source array reference */
+        unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
+
+        unsigned int dstXInBytes;   /**< Destination X in bytes */
+        unsigned int dstY;          /**< Destination Y */
+        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+        void *dstHost;              /**< Destination host pointer */
+        CUdeviceptr_v1 dstDevice;   /**< Destination device pointer */
+        CUarray dstArray;           /**< Destination array reference */
+        unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
+
+        unsigned int WidthInBytes;  /**< Width of 2D memory copy in bytes */
+        unsigned int Height;        /**< Height of 2D memory copy */
+    } CUDA_MEMCPY2D_v1;
+
+    typedef struct CUDA_MEMCPY3D_v1_st
+    {
+        unsigned int srcXInBytes;   /**< Source X in bytes */
+        unsigned int srcY;          /**< Source Y */
+        unsigned int srcZ;          /**< Source Z */
+        unsigned int srcLOD;        /**< Source LOD */
+        CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */
+        const void *srcHost;        /**< Source host pointer */
+        CUdeviceptr_v1 srcDevice;   /**< Source device pointer */
+        CUarray srcArray;           /**< Source array reference */
+        void *reserved0;            /**< Must be NULL */
+        unsigned int srcPitch;      /**< Source pitch (ignored when src is array) */
+        unsigned int srcHeight;     /**< Source height (ignored when src is array; may be 0 if Depth==1) */
+
+        unsigned int dstXInBytes;   /**< Destination X in bytes */
+        unsigned int dstY;          /**< Destination Y */
+        unsigned int dstZ;          /**< Destination Z */
+        unsigned int dstLOD;        /**< Destination LOD */
+        CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */
+        void *dstHost;              /**< Destination host pointer */
+        CUdeviceptr_v1 dstDevice;   /**< Destination device pointer */
+        CUarray dstArray;           /**< Destination array reference */
+        void *reserved1;            /**< Must be NULL */
+        unsigned int dstPitch;      /**< Destination pitch (ignored when dst is array) */
+        unsigned int dstHeight;     /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */
+
+        unsigned int WidthInBytes;  /**< Width of 3D memory copy in bytes */
+        unsigned int Height;        /**< Height of 3D memory copy */
+        unsigned int Depth;         /**< Depth of 3D memory copy */
+    } CUDA_MEMCPY3D_v1;
+
+    typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st
+    {
+        unsigned int Width;         /**< Width of array */
+        unsigned int Height;        /**< Height of array */
+
+        CUarray_format Format;      /**< Array format */
+        unsigned int NumChannels;   /**< Channels per array element */
+    } CUDA_ARRAY_DESCRIPTOR_v1;
+
+    typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st
+    {
+        unsigned int Width;         /**< Width of 3D array */
+        unsigned int Height;        /**< Height of 3D array */
+        unsigned int Depth;         /**< Depth of 3D array */
+
+        CUarray_format Format;      /**< Array format */
+        unsigned int NumChannels;   /**< Channels per array element */
+        unsigned int Flags;         /**< Flags */
+    } CUDA_ARRAY3D_DESCRIPTOR_v1;
+
+    CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev);
+    CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
+    CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
+    CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total);
+    CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize);
+    CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
+    CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr);
+    CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
+    CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize);
+    CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
+    CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy);
+    CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy);
+    CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy);
+    CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
+    CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
+    CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
+    CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+    CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
+    CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+    CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
+    CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
+    CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
+    CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
+    CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
+    CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
+
+    CUresult CUDAAPI cuCtxDestroy(CUcontext ctx);
+    CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx);
+    CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx);
+    CUresult CUDAAPI cuStreamDestroy(CUstream hStream);
+    CUresult CUDAAPI cuEventDestroy(CUevent hEvent);
+    CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev);
+    CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev);
+    CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags);
+
+    CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy);
+    CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy);
+    CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy);
+    CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N);
+    CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N);
+    CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N);
+    CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
+    CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+    CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+    CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+    CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+    CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy);
+    CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream);
+
+    CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+    CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+
+    CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority);
+    CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags);
+    CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
+    CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags);
+    CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+    CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags);
+    CUresult CUDAAPI cuStreamQuery(CUstream hStream);
+    CUresult CUDAAPI cuStreamSynchronize(CUstream hStream);
+    CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream);
+    CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags);
+    CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+    CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
+    CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData);
+    CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+    CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+    CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+    CUresult CUDAAPI cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags);
+    CUresult CUDAAPI cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+    CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream);
+    CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
+    CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
+    CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
+    CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream);
+    CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream);
+    CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode);
+    CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph);
+    CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+    CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
+    CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
+    
+    CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream);
+    CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream);
+    CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream);
+    CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value);
+    CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param);
+
+    CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags);
+    CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
+    CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream);
+
+    CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream);
+    CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream);
+    CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
+
+    CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM)
+static inline CUresult cuGetProcAddress_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags) {
+    const int procAddressMask = (CU_GET_PROC_ADDRESS_LEGACY_STREAM|
+                                 CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM);
+    if ((flags & procAddressMask) == 0) {
+        flags |= CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM;
+    }
+    return cuGetProcAddress(symbol, funcPtr, driverVersion, flags); 
+}
+#define cuGetProcAddress cuGetProcAddress_ptsz
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#if defined(__GNUC__)
+  #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT)
+    #pragma GCC visibility pop
+  #endif
+#endif
+
+#undef __CUDA_DEPRECATED
+
+#endif /* __cuda_cuda_h__ */
diff --git a/ext/cudart/include/cudaD3D10.h b/ext/cudart/include/cudaD3D10.h
new file mode 100644
index 0000000000000000000000000000000000000000..9342cd7832a167958d4591c5afa346f43fa65d2d
--- /dev/null
+++ b/ext/cudart/include/cudaD3D10.h
@@ -0,0 +1,805 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAD3D10_H
+#define CUDAD3D10_H
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+
+#define cuD3D10CtxCreate                    cuD3D10CtxCreate_v2
+#define cuD3D10ResourceGetSurfaceDimensions cuD3D10ResourceGetSurfaceDimensions_v2
+#define cuD3D10ResourceGetMappedPointer     cuD3D10ResourceGetMappedPointer_v2
+#define cuD3D10ResourceGetMappedSize        cuD3D10ResourceGetMappedSize_v2
+#define cuD3D10ResourceGetMappedPitch       cuD3D10ResourceGetMappedPitch_v2
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \defgroup CUDA_D3D10 Direct3D 10 Interoperability
+ * \ingroup CUDA_DRIVER
+ *
+ * ___MANBRIEF___ Direct3D 10 interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the Direct3D 10 interoperability functions of the
+ * low-level CUDA driver application programming interface. Note that mapping 
+ * of Direct3D 10 resources is performed with the graphics-API-agnostic resource
+ * mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
+ *
+ * @{
+ */
+
+/**
+ * CUDA devices corresponding to a D3D10 device
+ */
+typedef enum CUd3d10DeviceList_enum {
+    CU_D3D10_DEVICE_LIST_ALL            = 0x01, /**< The CUDA devices for all GPUs used by a D3D10 device */
+    CU_D3D10_DEVICE_LIST_CURRENT_FRAME  = 0x02, /**< The CUDA devices for the GPUs used by a D3D10 device in its currently rendering frame */
+    CU_D3D10_DEVICE_LIST_NEXT_FRAME     = 0x03, /**< The CUDA devices for the GPUs to be used by a D3D10 device in the next frame */
+} CUd3d10DeviceList;
+
+/**
+ * \brief Gets the CUDA device corresponding to a display adapter.
+ *
+ * Returns in \p *pCudaDevice the CUDA-compatible device corresponding to the
+ * adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters.
+ *
+ * If no device on \p pAdapter is CUDA-compatible then the call will fail.
+ *
+ * \param pCudaDevice - Returned CUDA device corresponding to \p pAdapter
+ * \param pAdapter    - Adapter to query for CUDA device
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D10GetDevices,
+ * ::cudaD3D10GetDevice
+ */
+CUresult CUDAAPI cuD3D10GetDevice(CUdevice *pCudaDevice, IDXGIAdapter *pAdapter);
+
+/**
+ * \brief Gets the CUDA devices corresponding to a Direct3D 10 device
+ *
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
+ * to the Direct3D 10 device \p pD3D10Device.
+ * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
+ * corresponding to the Direct3D 10 device \p pD3D10Device.
+ *
+ * If any of the GPUs being used to render \p pD3D10Device are not CUDA-capable then the
+ * call will return ::CUDA_ERROR_NO_DEVICE.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D10Device
+ * \param pCudaDevices     - Returned CUDA devices corresponding to \p pD3D10Device
+ * \param cudaDeviceCount  - The size of the output device array \p pCudaDevices
+ * \param pD3D10Device     - Direct3D 10 device to query for CUDA devices
+ * \param deviceList       - The set of devices to return.  This set may be
+ *                           ::CU_D3D10_DEVICE_LIST_ALL for all devices,
+ *                           ::CU_D3D10_DEVICE_LIST_CURRENT_FRAME for the devices used to
+ *                           render the current frame (in SLI), or
+ *                           ::CU_D3D10_DEVICE_LIST_NEXT_FRAME for the devices used to
+ *                           render the next frame (in SLI).
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NO_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D10GetDevice,
+ * ::cudaD3D10GetDevices
+ */
+CUresult CUDAAPI cuD3D10GetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, ID3D10Device *pD3D10Device, CUd3d10DeviceList deviceList);
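+
+/*
+ * Usage sketch (illustrative only; error checking omitted). Assuming a valid
+ * ID3D10Device* device and that the driver API has been initialized with
+ * cuInit(0), the CUDA devices backing the D3D10 device can be enumerated as:
+ *
+ *   CUdevice devices[8];
+ *   unsigned int total = 0;
+ *   cuD3D10GetDevices(&total, devices, 8, device, CU_D3D10_DEVICE_LIST_ALL);
+ *   // 'total' receives the number of CUDA devices used by 'device';
+ *   // at most 8 entries are written into 'devices'.
+ */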
+
+/**
+ * \brief Register a Direct3D 10 resource for access by CUDA
+ *
+ * Registers the Direct3D 10 resource \p pD3DResource for access by CUDA and
+ * returns a CUDA handle to \p pD3DResource in \p pCudaResource.
+ * The handle returned in \p pCudaResource may be used to map and unmap this
+ * resource until it is unregistered.
+ * On success this call will increase the internal reference count on
+ * \p pD3DResource. This reference count will be decremented when this
+ * resource is unregistered through ::cuGraphicsUnregisterResource().
+ *
+ * This call is potentially high-overhead and should not be called every frame
+ * in interactive applications.
+ *
+ * The type of \p pD3DResource must be one of the following.
+ * - ::ID3D10Buffer: may be accessed through a device pointer.
+ * - ::ID3D10Texture1D: individual subresources of the texture may be accessed via arrays
+ * - ::ID3D10Texture2D: individual subresources of the texture may be accessed via arrays
+ * - ::ID3D10Texture3D: individual subresources of the texture may be accessed via arrays
+ *
+ * The \p Flags argument may be used to specify additional parameters at register
+ * time.  The valid values for this parameter are
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * Not all Direct3D resources of the above types may be used for
+ * interoperability with CUDA.  The following are some limitations.
+ * - The primary rendertarget may not be registered with CUDA.
+ * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
+ *   or 32-bit integer or floating-point data cannot be shared.
+ * - Surfaces of depth or stencil formats cannot be shared.
+ *
+ * A complete list of supported DXGI formats is as follows. For compactness the
+ * notation A_{B,C,D} represents A_B, A_C, and A_D.
+ * - DXGI_FORMAT_A8_UNORM
+ * - DXGI_FORMAT_B8G8R8A8_UNORM
+ * - DXGI_FORMAT_B8G8R8X8_UNORM
+ * - DXGI_FORMAT_R16_FLOAT
+ * - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R32_FLOAT
+ * - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT}
+ * - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT}
+ * - DXGI_FORMAT_R32_{SINT,UINT}
+ * - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB}
+ * - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM}
+ *
+ * If \p pD3DResource is of incorrect type or is already registered then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * If \p pD3DResource cannot be registered then ::CUDA_ERROR_UNKNOWN is returned.
+ * If \p Flags is not one of the above-specified values then ::CUDA_ERROR_INVALID_VALUE
+ * is returned.
+ *
+ * \param pCudaResource - Returned graphics resource handle
+ * \param pD3DResource  - Direct3D resource to register
+ * \param Flags         - Parameters for resource registration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsD3D10RegisterResource
+ */
+CUresult CUDAAPI cuGraphicsD3D10RegisterResource(CUgraphicsResource *pCudaResource, ID3D10Resource *pD3DResource, unsigned int Flags);
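+
+/*
+ * Usage sketch (illustrative only; error checking omitted). Assuming an
+ * ID3D10Texture2D* tex created on the interop device and a current CUDA
+ * context, registration, mapping, and cleanup follow the generic graphics
+ * interop flow:
+ *
+ *   CUgraphicsResource res;
+ *   cuGraphicsD3D10RegisterResource(&res, tex, CU_GRAPHICS_REGISTER_FLAGS_NONE);
+ *
+ *   cuGraphicsMapResources(1, &res, 0);             // hand the texture to CUDA
+ *   CUarray level0;
+ *   cuGraphicsSubResourceGetMappedArray(&level0, res, 0, 0);
+ *   // ... access 'level0' through a texture/surface reference in kernels ...
+ *   cuGraphicsUnmapResources(1, &res, 0);           // hand it back to Direct3D
+ *
+ *   cuGraphicsUnregisterResource(res);              // once interop is finished
+ */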
+
+/**
+ * \defgroup CUDA_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated Direct3D 10 interoperability functions of the 
+ * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated Direct3D 10 interoperability functionality.
+ * @{
+ */
+
+/** Flags to register a resource */
+typedef enum CUD3D10register_flags_enum {
+    CU_D3D10_REGISTER_FLAGS_NONE  = 0x00,
+    CU_D3D10_REGISTER_FLAGS_ARRAY = 0x01,
+} CUD3D10register_flags;
+
+/** Flags to map or unmap a resource */
+typedef enum CUD3D10map_flags_enum {
+    CU_D3D10_MAPRESOURCE_FLAGS_NONE         = 0x00,
+    CU_D3D10_MAPRESOURCE_FLAGS_READONLY     = 0x01,
+    CU_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD = 0x02,
+} CUD3D10map_flags;
+
+
+/**
+ * \brief Create a CUDA context for interoperability with Direct3D 10
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0.
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA context with a D3D10
+ * device in order to achieve maximum interoperability performance.
+ *
+ * \param pCtx        - Returned newly created CUDA context
+ * \param pCudaDevice - Returned pointer to the device on which the context was created
+ * \param Flags       - Context creation flags (see ::cuCtxCreate() for details)
+ * \param pD3DDevice  - Direct3D device to create interoperability context with
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D10GetDevice,
+ * ::cuGraphicsD3D10RegisterResource
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice);
+
+/**
+ * \brief Create a CUDA context for interoperability with Direct3D 10
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0.
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA context with a D3D10
+ * device in order to achieve maximum interoperability performance.
+ *
+ * \param pCtx        - Returned newly created CUDA context
+ * \param flags       - Context creation flags (see ::cuCtxCreate() for details)
+ * \param pD3DDevice  - Direct3D device to create interoperability context with
+ * \param cudaDevice  - The CUDA device on which to create the context.  This device
+ *                      must be among the devices returned when querying
+ *                      ::CU_D3D10_DEVICE_LIST_ALL from ::cuD3D10GetDevices.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D10GetDevices,
+ * ::cuGraphicsD3D10RegisterResource
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10CtxCreateOnDevice(CUcontext *pCtx, unsigned int flags, ID3D10Device *pD3DDevice, CUdevice cudaDevice);
+
+/**
+ * \brief Get the Direct3D 10 device against which the current CUDA context was
+ * created
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0.
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA context with a D3D10
+ * device in order to achieve maximum interoperability performance.
+ *
+ * \param ppD3DDevice - Returned Direct3D device corresponding to CUDA context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D10GetDevice
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10GetDirect3DDevice(ID3D10Device **ppD3DDevice);
+
+/**
+ * \brief Register a Direct3D resource for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Registers the Direct3D resource \p pResource for access by CUDA.
+ *
+ * If this call is successful, then the application will be able to map and
+ * unmap this resource until it is unregistered through
+ * ::cuD3D10UnregisterResource(). Also on success, this call will increase the
+ * internal reference count on \p pResource. This reference count will be
+ * decremented when this resource is unregistered through
+ * ::cuD3D10UnregisterResource().
+ *
+ * This call is potentially high-overhead and should not be called every frame
+ * in interactive applications.
+ *
+ * The type of \p pResource must be one of the following.
+ *
+ * - ::ID3D10Buffer: Cannot be used with \p Flags set to
+ *   ::CU_D3D10_REGISTER_FLAGS_ARRAY.
+ * - ::ID3D10Texture1D: No restrictions.
+ * - ::ID3D10Texture2D: No restrictions.
+ * - ::ID3D10Texture3D: No restrictions.
+ *
+ * The \p Flags argument specifies the mechanism through which CUDA will
+ * access the Direct3D resource.  The following values are allowed.
+ *
+ * - ::CU_D3D10_REGISTER_FLAGS_NONE: Specifies that CUDA will access this
+ *   resource through a ::CUdeviceptr. The pointer, size, and (for textures),
+ *   pitch for each subresource of this allocation may be queried through
+ *   ::cuD3D10ResourceGetMappedPointer(), ::cuD3D10ResourceGetMappedSize(),
+ *   and ::cuD3D10ResourceGetMappedPitch() respectively. This option is valid
+ *   for all resource types.
+ * - ::CU_D3D10_REGISTER_FLAGS_ARRAY: Specifies that CUDA will access this
+ *   resource through a ::CUarray queried on a sub-resource basis through
+ *   ::cuD3D10ResourceGetMappedArray(). This option is only valid for
+ *   resources of type ::ID3D10Texture1D, ::ID3D10Texture2D, and
+ *   ::ID3D10Texture3D.
+ *
+ * Not all Direct3D resources of the above types may be used for
+ * interoperability with CUDA.  The following are some limitations.
+ *
+ * - The primary rendertarget may not be registered with CUDA.
+ * - Resources allocated as shared may not be registered with CUDA.
+ * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
+ *   or 32-bit integer or floating-point data cannot be shared.
+ * - Surfaces of depth or stencil formats cannot be shared.
+ *
+ * If Direct3D interoperability is not initialized on this context then
+ * ::CUDA_ERROR_INVALID_CONTEXT is returned. If \p pResource is of incorrect
+ * type or is already registered, then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned. If \p pResource cannot be registered, then ::CUDA_ERROR_UNKNOWN
+ * is returned.
+ *
+ * \param pResource - Resource to register
+ * \param Flags     - Parameters for resource registration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsD3D10RegisterResource
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10RegisterResource(ID3D10Resource *pResource, unsigned int Flags);
+
+/**
+ * \brief Unregister a Direct3D resource
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Unregisters the Direct3D resource \p pResource so it is not accessible by
+ * CUDA unless registered again.
+ *
+ * If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned.
+ *
+ * \param pResource - Resources to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnregisterResource
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10UnregisterResource(ID3D10Resource *pResource);
+
+/**
+ * \brief Map Direct3D resources for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Maps the \p count Direct3D resources in \p ppResources for access by CUDA.
+ *
+ * The resources in \p ppResources may be accessed in CUDA kernels until they
+ * are unmapped. Direct3D should not access any resources while they are mapped
+ * by CUDA. If an application does so, the results are undefined.
+ *
+ * This function provides the synchronization guarantee that any Direct3D calls
+ * issued before ::cuD3D10MapResources() will complete before any CUDA kernels
+ * issued after ::cuD3D10MapResources() begin.
+ *
+ * If any of \p ppResources have not been registered for use with CUDA or if
+ * \p ppResources contains any duplicate entries, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResources are
+ * presently mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is
+ * returned.
+ *
+ * \param count       - Number of resources to map for CUDA
+ * \param ppResources - Resources to map for CUDA
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsMapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10MapResources(unsigned int count, ID3D10Resource **ppResources);
+
+/**
+ * \brief Unmap Direct3D resources
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Unmaps the \p count Direct3D resources in \p ppResources.
+ *
+ * This function provides the synchronization guarantee that any CUDA kernels
+ * issued before ::cuD3D10UnmapResources() will complete before any Direct3D
+ * calls issued after ::cuD3D10UnmapResources() begin.
+ *
+ * If any of \p ppResources have not been registered for use with CUDA or if
+ * \p ppResources contains any duplicate entries, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResources are not
+ * presently mapped for access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is
+ * returned.
+ *
+ * \param count       - Number of resources to unmap for CUDA
+ * \param ppResources - Resources to unmap for CUDA
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnmapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10UnmapResources(unsigned int count, ID3D10Resource **ppResources);
+
+/**
+ * \brief Set usage flags for mapping a Direct3D resource
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Set flags for mapping the Direct3D resource \p pResource.
+ *
+ * Changes to flags will take effect the next time \p pResource is mapped. The
+ * \p Flags argument may be any of the following.
+ *
+ * - ::CU_D3D10_MAPRESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA kernels. This is the default value.
+ * - ::CU_D3D10_MAPRESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which
+ *   access this resource will not write to this resource.
+ * - ::CU_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels
+ *   which access this resource will not read from this resource and will
+ *   write over the entire contents of the resource, so none of the data
+ *   previously stored in the resource will be preserved.
+ *
+ * If \p pResource has not been registered for use with CUDA, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is presently
+ * mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ *
+ * \param pResource - Registered resource to set flags for
+ * \param Flags     - Parameters for resource mapping
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsResourceSetMapFlags
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceSetMapFlags(ID3D10Resource *pResource, unsigned int Flags);
+
+/**
+ * \brief Get an array through which to access a subresource of a Direct3D
+ * resource which has been mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pArray an array through which the subresource of the mapped
+ * Direct3D resource \p pResource, which corresponds to \p SubResource may be
+ * accessed. The value set in \p pArray may change every time that \p pResource
+ * is mapped.
+ *
+ * If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned. If \p pResource was not registered with usage flags
+ * ::CU_D3D10_REGISTER_FLAGS_ARRAY, then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned. If \p pResource is not mapped, then ::CUDA_ERROR_NOT_MAPPED is
+ * returned.
+ *
+ * For usage requirements of the \p SubResource parameter, see
+ * ::cuD3D10ResourceGetMappedPointer().
+ *
+ * \param pArray       - Returned array corresponding to subresource
+ * \param pResource    - Mapped resource to access
+ * \param SubResource  - Subresource of pResource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsSubResourceGetMappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedArray(CUarray *pArray, ID3D10Resource *pResource, unsigned int SubResource);
+
+/**
+ * \brief Get a pointer through which to access a subresource of a Direct3D
+ * resource which has been mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pDevPtr the base pointer of the subresource of the mapped
+ * Direct3D resource \p pResource, which corresponds to \p SubResource. The
+ * value set in \p pDevPtr may change every time that \p pResource is mapped.
+ *
+ * If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned. If \p pResource was not registered with usage flags
+ * ::CU_D3D10_REGISTER_FLAGS_NONE, then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned. If \p pResource is not mapped, then ::CUDA_ERROR_NOT_MAPPED is
+ * returned.
+ *
+ * If \p pResource is of type ::ID3D10Buffer, then \p SubResource must be 0.
+ * If \p pResource is of any other type, then the value of \p SubResource must
+ * come from the subresource calculation in ::D3D10CalcSubResource().
+ *
+ * \param pDevPtr      - Returned pointer corresponding to subresource
+ * \param pResource    - Mapped resource to access
+ * \param SubResource  - Subresource of pResource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsResourceGetMappedPointer
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedPointer(CUdeviceptr *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource);
+
+/**
+ * \brief Get the size of a subresource of a Direct3D resource which has been
+ * mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pSize the size of the subresource of the mapped Direct3D
+ * resource \p pResource, which corresponds to \p SubResource. The value set
+ * in \p pSize may change every time that \p pResource is mapped.
+ *
+ * If \p pResource has not been registered for use with CUDA, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not registered
+ * with usage flags ::CU_D3D10_REGISTER_FLAGS_NONE, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped for
+ * access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * For usage requirements of the \p SubResource parameter, see
+ * ::cuD3D10ResourceGetMappedPointer().
+ *
+ * \param pSize        - Returned size of subresource
+ * \param pResource    - Mapped resource to access
+ * \param SubResource  - Subresource of pResource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsResourceGetMappedPointer
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedSize(size_t *pSize, ID3D10Resource *pResource, unsigned int SubResource);
+
+/**
+ * \brief Get the pitch of a subresource of a Direct3D resource which has been
+ * mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of the
+ * subresource of the mapped Direct3D resource \p pResource, which corresponds
+ * to \p SubResource. The values set in \p pPitch and \p pPitchSlice may
+ * change every time that \p pResource is mapped.
+ *
+ * The pitch and Z-slice pitch values may be used to compute the location of a
+ * sample on a surface as follows.
+ *
+ * For a 2D surface, the byte offset of the sample at position \b x, \b y from
+ * the base pointer of the surface is:
+ *
+ * \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
+ *
+ * For a 3D surface, the byte offset of the sample at position \b x, \b y,
+ * \b z from the base pointer of the surface is:
+ *
+ * \b z * \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
+ *
+ * Both parameters \p pPitch and \p pPitchSlice are optional and may be set to
+ * NULL.
+ *
+ * If \p pResource is not of type ::IDirect3DBaseTexture10 or one of its
+ * sub-types or if \p pResource has not been registered for use with CUDA, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not registered
+ * with usage flags ::CU_D3D10_REGISTER_FLAGS_NONE, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped for
+ * access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * For usage requirements of the \p SubResource parameter, see
+ * ::cuD3D10ResourceGetMappedPointer().
+ *
+ * \param pPitch       - Returned pitch of subresource
+ * \param pPitchSlice  - Returned Z-slice pitch of subresource
+ * \param pResource    - Mapped resource to access
+ * \param SubResource  - Subresource of pResource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsSubResourceGetMappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource);
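+
+/*
+ * Offset arithmetic sketch (illustrative only). Given the pitch values returned
+ * above for a mapped 3D texture subresource, and the texel size implied by its
+ * DXGI format (here assumed to be in 'bytesPerPixel'):
+ *
+ *   size_t pitch, slicePitch;
+ *   cuD3D10ResourceGetMappedPitch(&pitch, &slicePitch, pResource, 0);
+ *
+ *   size_t offset = (size_t)z * slicePitch          // select the Z slice
+ *                 + (size_t)y * pitch               // select the row
+ *                 + (size_t)x * bytesPerPixel;      // select the texel
+ */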
+
+/**
+ * \brief Get the dimensions of a registered surface
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the
+ * subresource of the mapped Direct3D resource \p pResource, which corresponds
+ * to \p SubResource.
+ *
+ * Because anti-aliased surfaces may have multiple samples per pixel, it is
+ * possible that the dimensions of a resource will be an integer factor larger
+ * than the dimensions reported by the Direct3D runtime.
+ *
+ * The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D
+ * surfaces, the value returned in \p *pDepth will be 0.
+ *
+ * If \p pResource is not of type ::IDirect3DBaseTexture10 or
+ * ::IDirect3DSurface10 or if \p pResource has not been registered for use
+ * with CUDA, then ::CUDA_ERROR_INVALID_HANDLE is returned.
+ *
+ * For usage requirements of the \p SubResource parameter, see
+ * ::cuD3D10ResourceGetMappedPointer().
+ *
+ * \param pWidth       - Returned width of surface
+ * \param pHeight      - Returned height of surface
+ * \param pDepth       - Returned depth of surface
+ * \param pResource    - Registered resource to access
+ * \param SubResource  - Subresource of pResource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsSubResourceGetMappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, ID3D10Resource *pResource, unsigned int SubResource);
+
+/** @} */ /* END CUDA_D3D10_DEPRECATED */
+/** @} */ /* END CUDA_D3D10 */
+
+
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cuD3D10CtxCreate
+    #undef cuD3D10ResourceGetSurfaceDimensions
+    #undef cuD3D10ResourceGetMappedPointer
+    #undef cuD3D10ResourceGetMappedSize
+    #undef cuD3D10ResourceGetMappedPitch
+
+    CUresult CUDAAPI cuD3D10CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice);
+    CUresult CUDAAPI cuD3D10ResourceGetMappedPitch(unsigned int *pPitch, unsigned int *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource);
+    CUresult CUDAAPI cuD3D10ResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource);
+    CUresult CUDAAPI cuD3D10ResourceGetMappedSize(unsigned int *pSize, ID3D10Resource *pResource, unsigned int SubResource);
+    CUresult CUDAAPI cuD3D10ResourceGetSurfaceDimensions(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, ID3D10Resource *pResource, unsigned int SubResource);
+#endif /* __CUDA_API_VERSION_INTERNAL */
+
+#ifdef __cplusplus
+};
+#endif
+
+#undef __CUDA_DEPRECATED
+
+#endif
+
diff --git a/ext/cudart/include/cudaD3D10Typedefs.h b/ext/cudart/include/cudaD3D10Typedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..d84885dc51366b629166244a6ff398305ac3a47f
--- /dev/null
+++ b/ext/cudart/include/cudaD3D10Typedefs.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAD3D10TYPEDEFS_H
+#define CUDAD3D10TYPEDEFS_H
+
+// Dependent includes for cudaD3D10.h
+#include <rpcsal.h>
+#include <D3D10_1.h>
+
+#include <cudaD3D10.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cudaD3D10.h
+ */
+#define PFN_cuD3D10GetDevice  PFN_cuD3D10GetDevice_v2010
+#define PFN_cuD3D10GetDevices  PFN_cuD3D10GetDevices_v3020
+#define PFN_cuGraphicsD3D10RegisterResource  PFN_cuGraphicsD3D10RegisterResource_v3000
+#define PFN_cuD3D10CtxCreate  PFN_cuD3D10CtxCreate_v3020
+#define PFN_cuD3D10CtxCreateOnDevice  PFN_cuD3D10CtxCreateOnDevice_v3020
+#define PFN_cuD3D10GetDirect3DDevice  PFN_cuD3D10GetDirect3DDevice_v3020
+#define PFN_cuD3D10RegisterResource  PFN_cuD3D10RegisterResource_v2010
+#define PFN_cuD3D10UnregisterResource  PFN_cuD3D10UnregisterResource_v2010
+#define PFN_cuD3D10MapResources  PFN_cuD3D10MapResources_v2010
+#define PFN_cuD3D10UnmapResources  PFN_cuD3D10UnmapResources_v2010
+#define PFN_cuD3D10ResourceSetMapFlags  PFN_cuD3D10ResourceSetMapFlags_v2010
+#define PFN_cuD3D10ResourceGetMappedArray  PFN_cuD3D10ResourceGetMappedArray_v2010
+#define PFN_cuD3D10ResourceGetMappedPointer  PFN_cuD3D10ResourceGetMappedPointer_v3020
+#define PFN_cuD3D10ResourceGetMappedSize  PFN_cuD3D10ResourceGetMappedSize_v3020
+#define PFN_cuD3D10ResourceGetMappedPitch  PFN_cuD3D10ResourceGetMappedPitch_v3020
+#define PFN_cuD3D10ResourceGetSurfaceDimensions  PFN_cuD3D10ResourceGetSurfaceDimensions_v3020
+
+
+/**
+ * Type definitions for functions defined in cudaD3D10.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuD3D10GetDevice_v2010)(CUdevice_v1 *pCudaDevice, IDXGIAdapter *pAdapter);
+typedef CUresult (CUDAAPI *PFN_cuD3D10GetDevices_v3020)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, ID3D10Device *pD3D10Device, CUd3d10DeviceList deviceList);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsD3D10RegisterResource_v3000)(CUgraphicsResource *pCudaResource, ID3D10Resource *pD3DResource, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuD3D10CtxCreate_v3020)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice);
+typedef CUresult (CUDAAPI *PFN_cuD3D10CtxCreateOnDevice_v3020)(CUcontext *pCtx, unsigned int flags, ID3D10Device *pD3DDevice, CUdevice_v1 cudaDevice);
+typedef CUresult (CUDAAPI *PFN_cuD3D10GetDirect3DDevice_v3020)(ID3D10Device **ppD3DDevice);
+typedef CUresult (CUDAAPI *PFN_cuD3D10RegisterResource_v2010)(ID3D10Resource *pResource, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuD3D10UnregisterResource_v2010)(ID3D10Resource *pResource);
+typedef CUresult (CUDAAPI *PFN_cuD3D10MapResources_v2010)(unsigned int count, ID3D10Resource **ppResources);
+typedef CUresult (CUDAAPI *PFN_cuD3D10UnmapResources_v2010)(unsigned int count, ID3D10Resource **ppResources);
+typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceSetMapFlags_v2010)(ID3D10Resource *pResource, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedArray_v2010)(CUarray *pArray, ID3D10Resource *pResource, unsigned int SubResource);
+typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource);
+typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedSize_v3020)(size_t *pSize, ID3D10Resource *pResource, unsigned int SubResource);
+typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPitch_v3020)(size_t *pPitch, size_t *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource);
+typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetSurfaceDimensions_v3020)(size_t *pWidth, size_t *pHeight, size_t *pDepth, ID3D10Resource *pResource, unsigned int SubResource);
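+
+/*
+ * Usage sketch (illustrative only). The PFN_* types above describe driver entry
+ * points resolved at run time; on Windows this could, for example, be done with
+ * the Win32 loader (module and symbol names are assumptions of this sketch):
+ *
+ *   HMODULE drv = LoadLibraryA("nvcuda.dll");
+ *   PFN_cuD3D10GetDevice getDevice =
+ *       (PFN_cuD3D10GetDevice)GetProcAddress(drv, "cuD3D10GetDevice");
+ *   CUdevice dev;
+ *   if (getDevice)
+ *       getDevice(&dev, pAdapter);   // pAdapter: an IDXGIAdapter* held by the app
+ */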
+
+/*
+ * Type definitions for older versioned functions in cudaD3D10.h
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    typedef CUresult (CUDAAPI *PFN_cuD3D10CtxCreate_v2010)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice);
+    typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPitch_v2010)(unsigned int *pPitch, unsigned int *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource);
+    typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPointer_v2010)(CUdeviceptr_v1 *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource);
+    typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedSize_v2010)(unsigned int *pSize, ID3D10Resource *pResource, unsigned int SubResource);
+    typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetSurfaceDimensions_v2010)(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, ID3D10Resource *pResource, unsigned int SubResource);
+#endif
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/ext/cudart/include/cudaD3D11.h b/ext/cudart/include/cudaD3D11.h
new file mode 100644
index 0000000000000000000000000000000000000000..302d297b0985540c64f09a575b982422170b8634
--- /dev/null
+++ b/ext/cudart/include/cudaD3D11.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAD3D11_H
+#define CUDAD3D11_H
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+
+#define cuD3D11CtxCreate cuD3D11CtxCreate_v2
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \defgroup CUDA_D3D11 Direct3D 11 Interoperability
+ * \ingroup CUDA_DRIVER
+ *
+ * ___MANBRIEF___ Direct3D 11 interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the Direct3D 11 interoperability functions of the
+ * low-level CUDA driver application programming interface. Note that mapping
+ * of Direct3D 11 resources is performed with the graphics-API-agnostic resource
+ * mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
+ *
+ * @{
+ */
+
+/**
+ * CUDA devices corresponding to a D3D11 device
+ */
+typedef enum CUd3d11DeviceList_enum {
+    CU_D3D11_DEVICE_LIST_ALL            = 0x01, /**< The CUDA devices for all GPUs used by a D3D11 device */
+    CU_D3D11_DEVICE_LIST_CURRENT_FRAME  = 0x02, /**< The CUDA devices for the GPUs used by a D3D11 device in its currently rendering frame */
+    CU_D3D11_DEVICE_LIST_NEXT_FRAME     = 0x03, /**< The CUDA devices for the GPUs to be used by a D3D11 device in the next frame */
+} CUd3d11DeviceList;
+
+/**
+ * \brief Gets the CUDA device corresponding to a display adapter.
+ *
+ * Returns in \p *pCudaDevice the CUDA-compatible device corresponding to the
+ * adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters.
+ *
+ * If no device on \p pAdapter is CUDA-compatible the call will return
+ * ::CUDA_ERROR_NO_DEVICE.
+ *
+ * \param pCudaDevice - Returned CUDA device corresponding to \p pAdapter
+ * \param pAdapter    - Adapter to query for CUDA device
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NO_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D11GetDevices,
+ * ::cudaD3D11GetDevice
+ */
+CUresult CUDAAPI cuD3D11GetDevice(CUdevice *pCudaDevice, IDXGIAdapter *pAdapter);
+
+/**
+ * \brief Gets the CUDA devices corresponding to a Direct3D 11 device
+ *
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
+ * to the Direct3D 11 device \p pD3D11Device.
+ * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
+ * corresponding to the Direct3D 11 device \p pD3D11Device.
+ *
+ * If any of the GPUs being used to render \p pD3D11Device are not CUDA-capable then the
+ * call will return ::CUDA_ERROR_NO_DEVICE.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D11Device
+ * \param pCudaDevices     - Returned CUDA devices corresponding to \p pD3D11Device
+ * \param cudaDeviceCount  - The size of the output device array \p pCudaDevices
+ * \param pD3D11Device     - Direct3D 11 device to query for CUDA devices
+ * \param deviceList       - The set of devices to return.  This set may be
+ *                           ::CU_D3D11_DEVICE_LIST_ALL for all devices,
+ *                           ::CU_D3D11_DEVICE_LIST_CURRENT_FRAME for the devices used to
+ *                           render the current frame (in SLI), or
+ *                           ::CU_D3D11_DEVICE_LIST_NEXT_FRAME for the devices used to
+ *                           render the next frame (in SLI).
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NO_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D11GetDevice,
+ * ::cudaD3D11GetDevices
+ */
+CUresult CUDAAPI cuD3D11GetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, ID3D11Device *pD3D11Device, CUd3d11DeviceList deviceList);
+
+/**
+ * \brief Register a Direct3D 11 resource for access by CUDA
+ *
+ * Registers the Direct3D 11 resource \p pD3DResource for access by CUDA and
+ * returns a CUDA handle to \p pD3DResource in \p pCudaResource.
+ * The handle returned in \p pCudaResource may be used to map and unmap this
+ * resource until it is unregistered.
+ * On success this call will increase the internal reference count on
+ * \p pD3DResource. This reference count will be decremented when this
+ * resource is unregistered through ::cuGraphicsUnregisterResource().
+ *
+ * This call is potentially high-overhead and should not be called every frame
+ * in interactive applications.
+ *
+ * The type of \p pD3DResource must be one of the following.
+ * - ::ID3D11Buffer: may be accessed through a device pointer.
+ * - ::ID3D11Texture1D: individual subresources of the texture may be accessed via arrays
+ * - ::ID3D11Texture2D: individual subresources of the texture may be accessed via arrays
+ * - ::ID3D11Texture3D: individual subresources of the texture may be accessed via arrays
+ *
+ * The \p Flags argument may be used to specify additional parameters at register
+ * time.  The valid values for this parameter are
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * Not all Direct3D resources of the above types may be used for
+ * interoperability with CUDA.  The following are some limitations.
+ * - The primary rendertarget may not be registered with CUDA.
+ * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
+ *   or 32-bit integer or floating-point data cannot be shared.
+ * - Surfaces of depth or stencil formats cannot be shared.
+ *
+ * A complete list of supported DXGI formats is as follows. For compactness the
+ * notation A_{B,C,D} represents A_B, A_C, and A_D.
+ * - DXGI_FORMAT_A8_UNORM
+ * - DXGI_FORMAT_B8G8R8A8_UNORM
+ * - DXGI_FORMAT_B8G8R8X8_UNORM
+ * - DXGI_FORMAT_R16_FLOAT
+ * - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R32_FLOAT
+ * - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT}
+ * - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT}
+ * - DXGI_FORMAT_R32_{SINT,UINT}
+ * - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB}
+ * - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM}
+ *
+ * If \p pD3DResource is of incorrect type or is already registered then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * If \p pD3DResource cannot be registered then ::CUDA_ERROR_UNKNOWN is returned.
+ * If \p Flags is not one of the above-specified values then ::CUDA_ERROR_INVALID_VALUE
+ * is returned.
+ *
+ * \param pCudaResource - Returned graphics resource handle
+ * \param pD3DResource  - Direct3D resource to register
+ * \param Flags         - Parameters for resource registration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsD3D11RegisterResource
+ */
+CUresult CUDAAPI cuGraphicsD3D11RegisterResource(CUgraphicsResource *pCudaResource, ID3D11Resource *pD3DResource, unsigned int Flags);
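+
+/*
+ * Usage sketch (illustrative only; error checking omitted). Assuming an
+ * ID3D11Buffer* buf created on the interop device and a current CUDA context,
+ * a registered buffer is exposed to CUDA as a plain device pointer:
+ *
+ *   CUgraphicsResource res;
+ *   cuGraphicsD3D11RegisterResource(&res, buf, CU_GRAPHICS_REGISTER_FLAGS_NONE);
+ *
+ *   cuGraphicsMapResources(1, &res, 0);
+ *   CUdeviceptr ptr;
+ *   size_t size;
+ *   cuGraphicsResourceGetMappedPointer(&ptr, &size, res);
+ *   // ... kernels may read/write 'size' bytes starting at 'ptr' ...
+ *   cuGraphicsUnmapResources(1, &res, 0);
+ */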
+
+/**
+ * \defgroup CUDA_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated Direct3D 11 interoperability functions of the
+ * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated Direct3D 11 interoperability functionality.
+ * @{
+ */
+
+/**
+ * \brief Create a CUDA context for interoperability with Direct3D 11
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0.
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA context with a D3D11
+ * device in order to achieve maximum interoperability performance.
+ *
+ * \param pCtx        - Returned newly created CUDA context
+ * \param pCudaDevice - Returned pointer to the device on which the context was created
+ * \param Flags       - Context creation flags (see ::cuCtxCreate() for details)
+ * \param pD3DDevice  - Direct3D device to create interoperability context with
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D11GetDevice,
+ * ::cuGraphicsD3D11RegisterResource
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D11CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice);
+
+/**
+ * \brief Create a CUDA context for interoperability with Direct3D 11
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0.
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA context with a D3D11
+ * device in order to achieve maximum interoperability performance.
+ *
+ * \param pCtx        - Returned newly created CUDA context
+ * \param flags       - Context creation flags (see ::cuCtxCreate() for details)
+ * \param pD3DDevice  - Direct3D device to create interoperability context with
+ * \param cudaDevice  - The CUDA device on which to create the context.  This device
+ *                      must be among the devices returned when querying
+ *                      ::CU_D3D11_DEVICE_LIST_ALL from ::cuD3D11GetDevices.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D11GetDevices,
+ * ::cuGraphicsD3D11RegisterResource
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D11CtxCreateOnDevice(CUcontext *pCtx, unsigned int flags, ID3D11Device *pD3DDevice, CUdevice cudaDevice);
+
+/**
+ * \brief Get the Direct3D 11 device against which the current CUDA context was
+ * created
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0.
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA context with a D3D11
+ * device in order to achieve maximum interoperability performance.
+ *
+ * \param ppD3DDevice - Returned Direct3D device corresponding to CUDA context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D11GetDevice
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D11GetDirect3DDevice(ID3D11Device **ppD3DDevice);
+
+/** @} */ /* END CUDA_D3D11_DEPRECATED */
+/** @} */ /* END CUDA_D3D11 */
+
+
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cuD3D11CtxCreate
+
+    CUresult CUDAAPI cuD3D11CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice);
+#endif /* __CUDA_API_VERSION_INTERNAL */
+
+#ifdef __cplusplus
+};
+#endif
+
+#undef __CUDA_DEPRECATED
+
+#endif
+
diff --git a/ext/cudart/include/cudaD3D11Typedefs.h b/ext/cudart/include/cudaD3D11Typedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..f517c3d48391eee5e5c48d046876c6fc9fdd3a71
--- /dev/null
+++ b/ext/cudart/include/cudaD3D11Typedefs.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAD3D11TYPEDEFS_H
+#define CUDAD3D11TYPEDEFS_H
+
+// Dependent includes for cudaD3D11.h
+#include <rpcsal.h>
+#include <D3D11_1.h>
+
+#include <cudaD3D11.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cudaD3D11.h
+ */
+#define PFN_cuD3D11GetDevice  PFN_cuD3D11GetDevice_v3000
+#define PFN_cuD3D11GetDevices  PFN_cuD3D11GetDevices_v3020
+#define PFN_cuGraphicsD3D11RegisterResource  PFN_cuGraphicsD3D11RegisterResource_v3000
+#define PFN_cuD3D11CtxCreate  PFN_cuD3D11CtxCreate_v3020
+#define PFN_cuD3D11CtxCreateOnDevice  PFN_cuD3D11CtxCreateOnDevice_v3020
+#define PFN_cuD3D11GetDirect3DDevice  PFN_cuD3D11GetDirect3DDevice_v3020
+
+
+/**
+ * Type definitions for functions defined in cudaD3D11.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuD3D11GetDevice_v3000)(CUdevice_v1 *pCudaDevice, IDXGIAdapter *pAdapter);
+typedef CUresult (CUDAAPI *PFN_cuD3D11GetDevices_v3020)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, ID3D11Device *pD3D11Device, CUd3d11DeviceList deviceList);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsD3D11RegisterResource_v3000)(CUgraphicsResource *pCudaResource, ID3D11Resource *pD3DResource, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuD3D11CtxCreate_v3020)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice);
+typedef CUresult (CUDAAPI *PFN_cuD3D11CtxCreateOnDevice_v3020)(CUcontext *pCtx, unsigned int flags, ID3D11Device *pD3DDevice, CUdevice_v1 cudaDevice);
+typedef CUresult (CUDAAPI *PFN_cuD3D11GetDirect3DDevice_v3020)(ID3D11Device **ppD3DDevice);
+
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    typedef CUresult (CUDAAPI *PFN_cuD3D11CtxCreate_v3000)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice);
+#endif
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/ext/cudart/include/cudaD3D9.h b/ext/cudart/include/cudaD3D9.h
new file mode 100644
index 0000000000000000000000000000000000000000..d615e35316740b83446462041df8645d3af22ca7
--- /dev/null
+++ b/ext/cudart/include/cudaD3D9.h
@@ -0,0 +1,886 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAD3D9_H
+#define CUDAD3D9_H
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+
+#define cuD3D9CtxCreate                    cuD3D9CtxCreate_v2
+#define cuD3D9ResourceGetSurfaceDimensions cuD3D9ResourceGetSurfaceDimensions_v2
+#define cuD3D9ResourceGetMappedPointer     cuD3D9ResourceGetMappedPointer_v2
+#define cuD3D9ResourceGetMappedSize        cuD3D9ResourceGetMappedSize_v2
+#define cuD3D9ResourceGetMappedPitch       cuD3D9ResourceGetMappedPitch_v2
+#define cuD3D9MapVertexBuffer              cuD3D9MapVertexBuffer_v2
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \file cudaD3D9.h
+ * \brief Header file for the Direct3D 9 interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ */
+
+/**
+ * \defgroup CUDA_D3D9 Direct3D 9 Interoperability
+ * \ingroup CUDA_DRIVER
+ *
+ * ___MANBRIEF___ Direct3D 9 interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the Direct3D 9 interoperability functions of the
+ * low-level CUDA driver application programming interface. Note that mapping
+ * of Direct3D 9 resources is performed with the graphics-API-agnostic resource
+ * mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
+ *
+ * @{
+ */
+
+/**
+ * CUDA devices corresponding to a D3D9 device
+ */
+typedef enum CUd3d9DeviceList_enum {
+    CU_D3D9_DEVICE_LIST_ALL            = 0x01, /**< The CUDA devices for all GPUs used by a D3D9 device */
+    CU_D3D9_DEVICE_LIST_CURRENT_FRAME  = 0x02, /**< The CUDA devices for the GPUs used by a D3D9 device in its currently rendering frame */
+    CU_D3D9_DEVICE_LIST_NEXT_FRAME     = 0x03, /**< The CUDA devices for the GPUs to be used by a D3D9 device in the next frame */
+} CUd3d9DeviceList;
+
+/**
+ * \brief Gets the CUDA device corresponding to a display adapter.
+ *
+ * Returns in \p *pCudaDevice the CUDA-compatible device corresponding to the
+ * adapter name \p pszAdapterName obtained from ::EnumDisplayDevices() or
+ * ::IDirect3D9::GetAdapterIdentifier().
+ *
+ * If no device on the adapter with name \p pszAdapterName is CUDA-compatible,
+ * then the call will fail.
+ *
+ * \param pCudaDevice    - Returned CUDA device corresponding to pszAdapterName
+ * \param pszAdapterName - Adapter name to query for device
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D9CtxCreate,
+ * ::cudaD3D9GetDevice
+ */
+CUresult CUDAAPI cuD3D9GetDevice(CUdevice *pCudaDevice, const char *pszAdapterName);
+
+/**
+ * \brief Gets the CUDA devices corresponding to a Direct3D 9 device
+ *
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding
+ * to the Direct3D 9 device \p pD3D9Device.
+ * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
+ * corresponding to the Direct3D 9 device \p pD3D9Device.
+ *
+ * If any of the GPUs being used to render \p pD3D9Device are not CUDA capable then the
+ * call will return ::CUDA_ERROR_NO_DEVICE.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D9Device
+ * \param pCudaDevices     - Returned CUDA devices corresponding to \p pD3D9Device
+ * \param cudaDeviceCount  - The size of the output device array \p pCudaDevices
+ * \param pD3D9Device      - Direct3D 9 device to query for CUDA devices
+ * \param deviceList       - The set of devices to return.  This set may be
+ *                           ::CU_D3D9_DEVICE_LIST_ALL for all devices,
+ *                           ::CU_D3D9_DEVICE_LIST_CURRENT_FRAME for the devices used to
+ *                           render the current frame (in SLI), or
+ *                           ::CU_D3D9_DEVICE_LIST_NEXT_FRAME for the devices used to
+ *                           render the next frame (in SLI).
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NO_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_FOUND,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D9CtxCreate,
+ * ::cudaD3D9GetDevices
+ */
+CUresult CUDAAPI cuD3D9GetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, IDirect3DDevice9 *pD3D9Device, CUd3d9DeviceList deviceList);
+
+/**
+ * \brief Create a CUDA context for interoperability with Direct3D 9
+ *
+ * Creates a new CUDA context, enables interoperability for that context with
+ * the Direct3D device \p pD3DDevice, and associates the created CUDA context
+ * with the calling thread.
+ * The created ::CUcontext will be returned in \p *pCtx.
+ * Direct3D resources from this device may be registered and mapped through the
+ * lifetime of this CUDA context.
+ * If \p pCudaDevice is non-NULL then the ::CUdevice on which this CUDA context was
+ * created will be returned in \p *pCudaDevice.
+ *
+ * On success, this call will increase the internal reference count on
+ * \p pD3DDevice. This reference count will be decremented upon destruction of
+ * this context through ::cuCtxDestroy().
+ * This context will cease to function if \p pD3DDevice is destroyed or encounters
+ * an error.
+ *
+ * Note that this function is never required for correct functionality.  Use of 
+ * this function will result in accelerated interoperability only when the
+ * operating system is Windows Vista or Windows 7, and the device \p pD3DDevice
+ * is not an IDirect3DDevice9Ex.  In all other circumstances, this function is 
+ * not necessary.
+ *
+ * \param pCtx        - Returned newly created CUDA context
+ * \param pCudaDevice - Returned pointer to the device on which the context was created
+ * \param Flags       - Context creation flags (see ::cuCtxCreate() for details)
+ * \param pD3DDevice  - Direct3D device to create interoperability context with
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D9GetDevice,
+ * ::cuGraphicsD3D9RegisterResource
+ */
+CUresult CUDAAPI cuD3D9CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice);
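+
+/*
+ * A minimal usage sketch, assuming cuInit(0) has already succeeded and
+ * pD3D9Device is a valid ::IDirect3DDevice9 created by the application
+ * (illustrative only; error checks omitted):
+ *
+ * \code
+ *   // Enumerate the CUDA devices backing the D3D9 device.
+ *   unsigned int count = 0;
+ *   CUdevice devices[8];
+ *   cuD3D9GetDevices(&count, devices, 8, pD3D9Device, CU_D3D9_DEVICE_LIST_ALL);
+ *
+ *   // Create an interop-capable context and retrieve the device it uses.
+ *   CUcontext ctx = NULL;
+ *   CUdevice  dev = 0;
+ *   cuD3D9CtxCreate(&ctx, &dev, 0, pD3D9Device);
+ * \endcode
+ */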
+
+/**
+ * \brief Create a CUDA context for interoperability with Direct3D 9
+ *
+ * Creates a new CUDA context, enables interoperability for that context with
+ * the Direct3D device \p pD3DDevice, and associates the created CUDA context
+ * with the calling thread.
+ * The created ::CUcontext will be returned in \p *pCtx.
+ * Direct3D resources from this device may be registered and mapped through the
+ * lifetime of this CUDA context.
+ *
+ * On success, this call will increase the internal reference count on
+ * \p pD3DDevice. This reference count will be decremented upon destruction of
+ * this context through ::cuCtxDestroy().
+ * This context will cease to function if \p pD3DDevice is destroyed or encounters
+ * an error.
+ *
+ * Note that this function is never required for correct functionality.  Use of 
+ * this function will result in accelerated interoperability only when the
+ * operating system is Windows Vista or Windows 7, and the device \p pD3DDevice
+ * is not an IDirect3DDevice9Ex.  In all other circumstances, this function is 
+ * not necessary.
+ *
+ * \param pCtx        - Returned newly created CUDA context
+ * \param flags       - Context creation flags (see ::cuCtxCreate() for details)
+ * \param pD3DDevice  - Direct3D device to create interoperability context with
+ * \param cudaDevice  - The CUDA device on which to create the context.  This device
+ *                      must be among the devices returned when querying
+ *                      ::CU_D3D9_DEVICE_LIST_ALL from ::cuD3D9GetDevices.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D9GetDevices,
+ * ::cuGraphicsD3D9RegisterResource
+ */
+CUresult CUDAAPI cuD3D9CtxCreateOnDevice(CUcontext *pCtx, unsigned int flags, IDirect3DDevice9 *pD3DDevice, CUdevice cudaDevice);
+
+/**
+ * \brief Get the Direct3D 9 device against which the current CUDA context was
+ * created
+ *
+ * Returns in \p *ppD3DDevice the Direct3D device against which this CUDA context
+ * was created in ::cuD3D9CtxCreate().
+ *
+ * \param ppD3DDevice - Returned Direct3D device corresponding to CUDA context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D9GetDevice,
+ * ::cudaD3D9GetDirect3DDevice
+ */
+CUresult CUDAAPI cuD3D9GetDirect3DDevice(IDirect3DDevice9 **ppD3DDevice);
+
+/**
+ * \brief Register a Direct3D 9 resource for access by CUDA
+ *
+ * Registers the Direct3D 9 resource \p pD3DResource for access by CUDA and
+ * returns a CUDA handle to \p pD3DResource in \p pCudaResource.
+ * The handle returned in \p pCudaResource may be used to map and unmap this
+ * resource until it is unregistered.
+ * On success this call will increase the internal reference count on
+ * \p pD3DResource. This reference count will be decremented when this
+ * resource is unregistered through ::cuGraphicsUnregisterResource().
+ *
+ * This call is potentially high-overhead and should not be called every frame
+ * in interactive applications.
+ *
+ * The type of \p pD3DResource must be one of the following.
+ * - ::IDirect3DVertexBuffer9: may be accessed through a device pointer
+ * - ::IDirect3DIndexBuffer9: may be accessed through a device pointer
+ * - ::IDirect3DSurface9: may be accessed through an array.
+ *     Only stand-alone objects of type ::IDirect3DSurface9
+ *     may be explicitly shared. In particular, individual mipmap levels and faces
+ *     of cube maps may not be registered directly. To access individual surfaces
+ *     associated with a texture, one must register the base texture object.
+ * - ::IDirect3DBaseTexture9: individual surfaces on this texture may be accessed
+ *     through an array.
+ *
+ * The \p Flags argument may be used to specify additional parameters at register
+ * time.  The valid values for this parameter are
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * Not all Direct3D resources of the above types may be used for
+ * interoperability with CUDA.  The following are some limitations.
+ * - The primary rendertarget may not be registered with CUDA.
+ * - Resources allocated as shared may not be registered with CUDA.
+ * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
+ *   or 32-bit integer or floating-point data cannot be shared.
+ * - Surfaces of depth or stencil formats cannot be shared.
+ *
+ * A complete list of supported formats is as follows:
+ * - D3DFMT_L8
+ * - D3DFMT_L16
+ * - D3DFMT_A8R8G8B8
+ * - D3DFMT_X8R8G8B8
+ * - D3DFMT_G16R16
+ * - D3DFMT_A8B8G8R8
+ * - D3DFMT_A8
+ * - D3DFMT_A8L8
+ * - D3DFMT_Q8W8V8U8
+ * - D3DFMT_V16U16
+ * - D3DFMT_A16B16G16R16F
+ * - D3DFMT_A16B16G16R16
+ * - D3DFMT_R32F
+ * - D3DFMT_G16R16F
+ * - D3DFMT_A32B32G32R32F
+ * - D3DFMT_G32R32F
+ * - D3DFMT_R16F
+ *
+ * If Direct3D interoperability is not initialized for this context using
+ * ::cuD3D9CtxCreate then ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ * If \p pD3DResource is of incorrect type or is already registered then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * If \p pD3DResource cannot be registered then ::CUDA_ERROR_UNKNOWN is returned.
+ * If \p Flags is not one of the above specified values then ::CUDA_ERROR_INVALID_VALUE
+ * is returned.
+ *
+ * \param pCudaResource - Returned graphics resource handle
+ * \param pD3DResource  - Direct3D resource to register
+ * \param Flags         - Parameters for resource registration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuD3D9CtxCreate,
+ * ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsD3D9RegisterResource
+ */
+CUresult CUDAAPI cuGraphicsD3D9RegisterResource(CUgraphicsResource *pCudaResource, IDirect3DResource9 *pD3DResource, unsigned int Flags);
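+
+/*
+ * A minimal register/map/unmap sketch, assuming a D3D9 interop context is
+ * current and pTexture is an application-created ::IDirect3DBaseTexture9
+ * with a supported format (illustrative only; error checks omitted):
+ *
+ * \code
+ *   CUgraphicsResource res = NULL;
+ *   cuGraphicsD3D9RegisterResource(&res, pTexture, CU_GRAPHICS_REGISTER_FLAGS_NONE);
+ *
+ *   cuGraphicsMapResources(1, &res, 0);                   // 0 = default stream
+ *   CUarray level0 = NULL;
+ *   cuGraphicsSubResourceGetMappedArray(&level0, res, 0, 0);
+ *   // ... access level0 from CUDA (e.g. via cuMemcpy2D) while mapped ...
+ *   cuGraphicsUnmapResources(1, &res, 0);
+ *
+ *   cuGraphicsUnregisterResource(res);                    // once, at teardown
+ * \endcode
+ */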
+
+/**
+ * \defgroup CUDA_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated Direct3D 9 interoperability functions of the
+ * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated Direct3D 9 interoperability functionality.
+ * @{
+ */
+
+/** Flags to register a resource */
+typedef enum CUd3d9register_flags_enum {
+    CU_D3D9_REGISTER_FLAGS_NONE  = 0x00,
+    CU_D3D9_REGISTER_FLAGS_ARRAY = 0x01,
+} CUd3d9register_flags;
+
+/** Flags to map or unmap a resource */
+typedef enum CUd3d9map_flags_enum {
+    CU_D3D9_MAPRESOURCE_FLAGS_NONE         = 0x00,
+    CU_D3D9_MAPRESOURCE_FLAGS_READONLY     = 0x01,
+    CU_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD = 0x02,
+} CUd3d9map_flags;
+
+/**
+ * \brief Register a Direct3D resource for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Registers the Direct3D resource \p pResource for access by CUDA.
+ *
+ * If this call is successful, then the application will be able to map and
+ * unmap this resource until it is unregistered through
+ * ::cuD3D9UnregisterResource(). Also on success, this call will increase the
+ * internal reference count on \p pResource. This reference count will be
+ * decremented when this resource is unregistered through
+ * ::cuD3D9UnregisterResource().
+ *
+ * This call is potentially high-overhead and should not be called every frame
+ * in interactive applications.
+ *
+ * The type of \p pResource must be one of the following.
+ *
+ * - ::IDirect3DVertexBuffer9: Cannot be used with \p Flags set to
+ *   ::CU_D3D9_REGISTER_FLAGS_ARRAY.
+ * - ::IDirect3DIndexBuffer9: Cannot be used with \p Flags set to
+ *   ::CU_D3D9_REGISTER_FLAGS_ARRAY.
+ * - ::IDirect3DSurface9: Only stand-alone objects of type ::IDirect3DSurface9
+ *   may be explicitly shared. In particular, individual mipmap levels and
+ *   faces of cube maps may not be registered directly. To access individual
+ *   surfaces associated with a texture, one must register the base texture
+ *   object. For restrictions on the \p Flags parameter, see type
+ *   ::IDirect3DBaseTexture9.
+ * - ::IDirect3DBaseTexture9: When a texture is registered, all surfaces
+ *   associated with the all mipmap levels of all faces of the texture will be
+ *   accessible to CUDA.
+ *
+ * The \p Flags argument specifies the mechanism through which CUDA will access
+ * the Direct3D resource. The following values are allowed.
+ *
+ * - ::CU_D3D9_REGISTER_FLAGS_NONE: Specifies that CUDA will access this resource
+ *   through a ::CUdeviceptr. The pointer, size, and (for textures) pitch for
+ *   each subresource of this allocation may be queried through
+ *   ::cuD3D9ResourceGetMappedPointer(), ::cuD3D9ResourceGetMappedSize(), and
+ *   ::cuD3D9ResourceGetMappedPitch() respectively. This option is valid for
+ *   all resource types.
+ * - ::CU_D3D9_REGISTER_FLAGS_ARRAY: Specifies that CUDA will access this
+ *   resource through a ::CUarray queried on a sub-resource basis through
+ *   ::cuD3D9ResourceGetMappedArray(). This option is only valid for resources
+ *   of type ::IDirect3DSurface9 and subtypes of ::IDirect3DBaseTexture9.
+ *
+ * Not all Direct3D resources of the above types may be used for
+ * interoperability with CUDA. The following are some limitations.
+ *
+ * - The primary rendertarget may not be registered with CUDA.
+ * - Resources allocated as shared may not be registered with CUDA.
+ * - Any resources allocated in ::D3DPOOL_SYSTEMMEM or ::D3DPOOL_MANAGED may
+ *   not be registered with CUDA.
+ * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
+ *   or 32-bit integer or floating-point data cannot be shared.
+ * - Surfaces of depth or stencil formats cannot be shared.
+ *
+ * If Direct3D interoperability is not initialized on this context, then
+ * ::CUDA_ERROR_INVALID_CONTEXT is returned. If \p pResource is of incorrect
+ * type (e.g. is a non-stand-alone ::IDirect3DSurface9) or is already
+ * registered, then ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource
+ * cannot be registered then ::CUDA_ERROR_UNKNOWN is returned.
+ *
+ * \param pResource - Resource to register for CUDA access
+ * \param Flags     - Flags for resource registration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsD3D9RegisterResource
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9RegisterResource(IDirect3DResource9 *pResource, unsigned int Flags);
+
+/**
+ * \brief Unregister a Direct3D resource
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Unregisters the Direct3D resource \p pResource so it is not accessible by
+ * CUDA unless registered again.
+ *
+ * If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned.
+ *
+ * \param pResource - Resource to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsUnregisterResource
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnregisterResource(IDirect3DResource9 *pResource);
+
+/**
+ * \brief Map Direct3D resources for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Maps the \p count Direct3D resources in \p ppResource for access by CUDA.
+ *
+ * The resources in \p ppResource may be accessed in CUDA kernels until they
+ * are unmapped. Direct3D should not access any resources while they are mapped
+ * by CUDA. If an application does so the results are undefined.
+ *
+ * This function provides the synchronization guarantee that any Direct3D calls
+ * issued before ::cuD3D9MapResources() will complete before any CUDA kernels
+ * issued after ::cuD3D9MapResources() begin.
+ *
+ * If any of \p ppResource have not been registered for use with CUDA or if
+ * \p ppResource contains any duplicate entries, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned.  If any of \p ppResource are
+ * presently mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is
+ * returned.
+ *
+ * \param count      - Number of resources in ppResource
+ * \param ppResource - Resources to map for CUDA usage
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsMapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9MapResources(unsigned int count, IDirect3DResource9 **ppResource);
+
+/**
+ * \brief Unmaps Direct3D resources
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Unmaps the \p count Direct3D resources in \p ppResource.
+ *
+ * This function provides the synchronization guarantee that any CUDA kernels
+ * issued before ::cuD3D9UnmapResources() will complete before any Direct3D
+ * calls issued after ::cuD3D9UnmapResources() begin.
+ *
+ * If any of \p ppResource have not been registered for use with CUDA or if
+ * \p ppResource contains any duplicate entries, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResource are not
+ * presently mapped for access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is
+ * returned.
+ *
+ * \param count      - Number of resources to unmap for CUDA
+ * \param ppResource - Resources to unmap for CUDA
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphicsUnmapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnmapResources(unsigned int count, IDirect3DResource9 **ppResource);
+
+/**
+ * \brief Set usage flags for mapping a Direct3D resource
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Set \p Flags for mapping the Direct3D resource \p pResource.
+ *
+ * Changes to \p Flags will take effect the next time \p pResource is mapped.
+ * The \p Flags argument may be any of the following:
+ * - ::CU_D3D9_MAPRESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA kernels. This is the default value.
+ * - ::CU_D3D9_MAPRESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which
+ *   access this resource will not write to this resource.
+ * - ::CU_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels
+ *   which access this resource will not read from this resource and will
+ *   write over the entire contents of the resource, so none of the data
+ *   previously stored in the resource will be preserved.
+ *
+ * If \p pResource has not been registered for use with CUDA, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is presently
+ * mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ *
+ * \param pResource - Registered resource to set flags for
+ * \param Flags     - Parameters for resource mapping
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsResourceSetMapFlags
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceSetMapFlags(IDirect3DResource9 *pResource, unsigned int Flags);
+
+/**
+ * \brief Get the dimensions of a registered surface
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the
+ * subresource of the mapped Direct3D resource \p pResource, which corresponds
+ * to \p Face and \p Level.
+ *
+ * Because anti-aliased surfaces may have multiple samples per pixel, it is
+ * possible that the dimensions of a resource will be an integer factor larger
+ * than the dimensions reported by the Direct3D runtime.
+ *
+ * The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D
+ * surfaces, the value returned in \p *pDepth will be 0.
+ *
+ * If \p pResource is not of type ::IDirect3DBaseTexture9 or
+ * ::IDirect3DSurface9 or if \p pResource has not been registered for use with
+ * CUDA, then ::CUDA_ERROR_INVALID_HANDLE is returned.
+ *
+ * For usage requirements of \p Face and \p Level parameters, see
+ * ::cuD3D9ResourceGetMappedPointer().
+ *
+ * \param pWidth    - Returned width of surface
+ * \param pHeight   - Returned height of surface
+ * \param pDepth    - Returned depth of surface
+ * \param pResource - Registered resource to access
+ * \param Face      - Face of resource to access
+ * \param Level     - Level of resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsSubResourceGetMappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+
+/**
+ * \brief Get an array through which to access a subresource of a Direct3D
+ * resource which has been mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pArray an array through which the subresource of the mapped
+ * Direct3D resource \p pResource which corresponds to \p Face and \p Level may
+ * be accessed. The value set in \p pArray may change every time that
+ * \p pResource is mapped.
+ *
+ * If \p pResource is not registered then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned. If \p pResource was not registered with usage flags
+ * ::CU_D3D9_REGISTER_FLAGS_ARRAY then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned. If \p pResource is not mapped then ::CUDA_ERROR_NOT_MAPPED is
+ * returned.
+ *
+ * For usage requirements of \p Face and \p Level parameters, see
+ * ::cuD3D9ResourceGetMappedPointer().
+ *
+ * \param pArray    - Returned array corresponding to subresource
+ * \param pResource - Mapped resource to access
+ * \param Face      - Face of resource to access
+ * \param Level     - Level of resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsSubResourceGetMappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedArray(CUarray *pArray, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+
+/**
+ * \brief Get the pointer through which to access a subresource of a Direct3D
+ * resource which has been mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pDevPtr the base pointer of the subresource of the mapped
+ * Direct3D resource \p pResource, which corresponds to \p Face and \p Level.
+ * The value set in \p pDevPtr may change every time that \p pResource is
+ * mapped.
+ *
+ * If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is
+ * returned. If \p pResource was not registered with usage flags
+ * ::CU_D3D9_REGISTER_FLAGS_NONE, then ::CUDA_ERROR_INVALID_HANDLE is returned.
+ * If \p pResource is not mapped, then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * If \p pResource is of type ::IDirect3DCubeTexture9, then \p Face must be one
+ * of the values enumerated by type ::D3DCUBEMAP_FACES.  For all other types
+ * \p Face must be 0. If \p Face is invalid, then ::CUDA_ERROR_INVALID_VALUE
+ * is returned.
+ *
+ * If \p pResource is of type ::IDirect3DBaseTexture9, then \p Level must
+ * correspond to a valid mipmap level. At present only mipmap level 0 is
+ * supported. For all other types \p Level must be 0. If \p Level is invalid,
+ * then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * \param pDevPtr     - Returned pointer corresponding to subresource
+ * \param pResource   - Mapped resource to access
+ * \param Face        - Face of resource to access
+ * \param Level       - Level of resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsResourceGetMappedPointer
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedPointer(CUdeviceptr *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+
+/**
+ * \brief Get the size of a subresource of a Direct3D resource which has been
+ * mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pSize the size of the subresource of the mapped Direct3D
+ * resource \p pResource, which corresponds to \p Face and \p Level. The value
+ * set in \p pSize may change every time that \p pResource is mapped.
+ *
+ * If \p pResource has not been registered for use with CUDA, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not registered
+ * with usage flags ::CU_D3D9_REGISTER_FLAGS_NONE, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped for
+ * access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * For usage requirements of \p Face and \p Level parameters, see
+ * ::cuD3D9ResourceGetMappedPointer.
+ *
+ * \param pSize       - Returned size of subresource
+ * \param pResource   - Mapped resource to access
+ * \param Face        - Face of resource to access
+ * \param Level       - Level of resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsResourceGetMappedPointer
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedSize(size_t *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+
+/**
+ * \brief Get the pitch of a subresource of a Direct3D resource which has been
+ * mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of
+ * the subresource of the mapped Direct3D resource \p pResource, which
+ * corresponds to \p Face and \p Level. The values set in \p pPitch and
+ * \p pPitchSlice may change every time that \p pResource is mapped.
+ *
+ * The pitch and Z-slice pitch values may be used to compute the location of a
+ * sample on a surface as follows.
+ *
+ * For a 2D surface, the byte offset of the sample at position \b x, \b y from
+ * the base pointer of the surface is:
+ *
+ * \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
+ *
+ * For a 3D surface, the byte offset of the sample at position \b x, \b y,
+ * \b z from the base pointer of the surface is:
+ *
+ * \b z * \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
+ *
+ * Both parameters \p pPitch and \p pPitchSlice are optional and may be set to
+ * NULL.
+ *
+ * If \p pResource is not of type ::IDirect3DBaseTexture9 or one of its
+ * sub-types or if \p pResource has not been registered for use with CUDA,
+ * then ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not
+ * registered with usage flags ::CU_D3D9_REGISTER_FLAGS_NONE, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped
+ * for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned.
+ *
+ * For usage requirements of \p Face and \p Level parameters, see
+ * ::cuD3D9ResourceGetMappedPointer().
+ *
+ * \param pPitch      - Returned pitch of subresource
+ * \param pPitchSlice - Returned Z-slice pitch of subresource
+ * \param pResource   - Mapped resource to access
+ * \param Face        - Face of resource to access
+ * \param Level       - Level of resource to access
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsSubResourceGetMappedArray
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
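+
+/*
+ * A short worked example of the addressing formulas above, assuming basePtr,
+ * pitch and slicePitch were obtained from ::cuD3D9ResourceGetMappedPointer and
+ * ::cuD3D9ResourceGetMappedPitch for a 4-byte-per-pixel surface (illustrative
+ * only):
+ *
+ * \code
+ *   size_t bytesPerPixel = 4;
+ *   // 2D surface: byte address of the sample at (x, y)
+ *   CUdeviceptr p2d = basePtr + y * pitch + bytesPerPixel * x;
+ *   // 3D surface: byte address of the sample at (x, y, z)
+ *   CUdeviceptr p3d = basePtr + z * slicePitch + y * pitch + bytesPerPixel * x;
+ * \endcode
+ */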
+
+/* CUDA 1.x compatibility API. These functions are deprecated, please use the ones above. */
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9Begin(IDirect3DDevice9 *pDevice);
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9End(void);
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9RegisterVertexBuffer(IDirect3DVertexBuffer9 *pVB);
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9MapVertexBuffer(CUdeviceptr *pDevPtr, size_t *pSize, IDirect3DVertexBuffer9 *pVB);
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnmapVertexBuffer(IDirect3DVertexBuffer9 *pVB);
+__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnregisterVertexBuffer(IDirect3DVertexBuffer9 *pVB);
+
+/** @} */ /* END CUDA_D3D9_DEPRECATED */
+/** @} */ /* END CUDA_D3D9 */
+
+
+/**
+ * CUDA API versioning support
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cuD3D9CtxCreate
+    #undef cuD3D9ResourceGetSurfaceDimensions
+    #undef cuD3D9ResourceGetMappedPointer
+    #undef cuD3D9ResourceGetMappedSize
+    #undef cuD3D9ResourceGetMappedPitch
+    #undef cuD3D9MapVertexBuffer
+
+    CUresult CUDAAPI cuD3D9CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice);
+    CUresult CUDAAPI cuD3D9ResourceGetSurfaceDimensions(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+    CUresult CUDAAPI cuD3D9ResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+    CUresult CUDAAPI cuD3D9ResourceGetMappedSize(unsigned int *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+    CUresult CUDAAPI cuD3D9ResourceGetMappedPitch(unsigned int *pPitch, unsigned int *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+    CUresult CUDAAPI cuD3D9MapVertexBuffer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, IDirect3DVertexBuffer9 *pVB);
+#endif /* __CUDA_API_VERSION_INTERNAL */
+
+#ifdef __cplusplus
+};
+#endif
+
+#undef __CUDA_DEPRECATED
+
+#endif
+
diff --git a/ext/cudart/include/cudaD3D9Typedefs.h b/ext/cudart/include/cudaD3D9Typedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..40bcdc6014707f64864ca69cdaf24b3ec966fbfc
--- /dev/null
+++ b/ext/cudart/include/cudaD3D9Typedefs.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAD3D9TYPEDEFS_H
+#define CUDAD3D9TYPEDEFS_H
+
+// Dependent includes for cudaD3D9.h
+#include <d3d9.h>
+
+#include <cudaD3D9.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version of each driver function in cudaD3D9.h
+ */
+#define PFN_cuD3D9GetDevice  PFN_cuD3D9GetDevice_v2000
+#define PFN_cuD3D9GetDevices  PFN_cuD3D9GetDevices_v3020
+#define PFN_cuD3D9CtxCreate  PFN_cuD3D9CtxCreate_v3020
+#define PFN_cuD3D9CtxCreateOnDevice  PFN_cuD3D9CtxCreateOnDevice_v3020
+#define PFN_cuD3D9GetDirect3DDevice  PFN_cuD3D9GetDirect3DDevice_v2000
+#define PFN_cuGraphicsD3D9RegisterResource  PFN_cuGraphicsD3D9RegisterResource_v3000
+#define PFN_cuD3D9RegisterResource  PFN_cuD3D9RegisterResource_v2000
+#define PFN_cuD3D9UnregisterResource  PFN_cuD3D9UnregisterResource_v2000
+#define PFN_cuD3D9MapResources  PFN_cuD3D9MapResources_v2000
+#define PFN_cuD3D9UnmapResources  PFN_cuD3D9UnmapResources_v2000
+#define PFN_cuD3D9ResourceSetMapFlags  PFN_cuD3D9ResourceSetMapFlags_v2000
+#define PFN_cuD3D9ResourceGetSurfaceDimensions  PFN_cuD3D9ResourceGetSurfaceDimensions_v3020
+#define PFN_cuD3D9ResourceGetMappedArray  PFN_cuD3D9ResourceGetMappedArray_v2010
+#define PFN_cuD3D9ResourceGetMappedPointer  PFN_cuD3D9ResourceGetMappedPointer_v3020
+#define PFN_cuD3D9ResourceGetMappedSize  PFN_cuD3D9ResourceGetMappedSize_v3020
+#define PFN_cuD3D9ResourceGetMappedPitch  PFN_cuD3D9ResourceGetMappedPitch_v3020
+#define PFN_cuD3D9Begin  PFN_cuD3D9Begin_v2000
+#define PFN_cuD3D9End  PFN_cuD3D9End_v2000
+#define PFN_cuD3D9RegisterVertexBuffer  PFN_cuD3D9RegisterVertexBuffer_v2000
+#define PFN_cuD3D9MapVertexBuffer  PFN_cuD3D9MapVertexBuffer_v3020
+#define PFN_cuD3D9UnmapVertexBuffer  PFN_cuD3D9UnmapVertexBuffer_v2000
+#define PFN_cuD3D9UnregisterVertexBuffer  PFN_cuD3D9UnregisterVertexBuffer_v2000
+
+
+/**
+ * Type definitions for functions defined in cudaD3D9.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuD3D9GetDevice_v2000)(CUdevice_v1 *pCudaDevice, const char *pszAdapterName);
+typedef CUresult (CUDAAPI *PFN_cuD3D9GetDevices_v3020)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, IDirect3DDevice9 *pD3D9Device, CUd3d9DeviceList deviceList);
+typedef CUresult (CUDAAPI *PFN_cuD3D9CtxCreate_v3020)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice);
+typedef CUresult (CUDAAPI *PFN_cuD3D9CtxCreateOnDevice_v3020)(CUcontext *pCtx, unsigned int flags, IDirect3DDevice9 *pD3DDevice, CUdevice_v1 cudaDevice);
+typedef CUresult (CUDAAPI *PFN_cuD3D9GetDirect3DDevice_v2000)(IDirect3DDevice9 **ppD3DDevice);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsD3D9RegisterResource_v3000)(CUgraphicsResource *pCudaResource, IDirect3DResource9 *pD3DResource, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuD3D9RegisterResource_v2000)(IDirect3DResource9 *pResource, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuD3D9UnregisterResource_v2000)(IDirect3DResource9 *pResource);
+typedef CUresult (CUDAAPI *PFN_cuD3D9MapResources_v2000)(unsigned int count, IDirect3DResource9 **ppResource);
+typedef CUresult (CUDAAPI *PFN_cuD3D9UnmapResources_v2000)(unsigned int count, IDirect3DResource9 **ppResource);
+typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceSetMapFlags_v2000)(IDirect3DResource9 *pResource, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetSurfaceDimensions_v3020)(size_t *pWidth, size_t *pHeight, size_t *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedArray_v2010)(CUarray *pArray, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedSize_v3020)(size_t *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPitch_v3020)(size_t *pPitch, size_t *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+typedef CUresult (CUDAAPI *PFN_cuD3D9Begin_v2000)(IDirect3DDevice9 *pDevice);
+typedef CUresult (CUDAAPI *PFN_cuD3D9End_v2000)(void);
+typedef CUresult (CUDAAPI *PFN_cuD3D9RegisterVertexBuffer_v2000)(IDirect3DVertexBuffer9 *pVB);
+typedef CUresult (CUDAAPI *PFN_cuD3D9MapVertexBuffer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, IDirect3DVertexBuffer9 *pVB);
+typedef CUresult (CUDAAPI *PFN_cuD3D9UnmapVertexBuffer_v2000)(IDirect3DVertexBuffer9 *pVB);
+typedef CUresult (CUDAAPI *PFN_cuD3D9UnregisterVertexBuffer_v2000)(IDirect3DVertexBuffer9 *pVB);
+
+/*
+ * Type definitions for older versioned functions in cudaD3D9.h
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    typedef CUresult (CUDAAPI *PFN_cuD3D9CtxCreate_v2000)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice);
+    typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetSurfaceDimensions_v2000)(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+    typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPointer_v2000)(CUdeviceptr_v1 *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+    typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedSize_v2000)(unsigned int *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+    typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPitch_v2000)(unsigned int *pPitch, unsigned int *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level);
+    typedef CUresult (CUDAAPI *PFN_cuD3D9MapVertexBuffer_v2000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, IDirect3DVertexBuffer9 *pVB);
+#endif
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
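+
+/*
+ * A minimal sketch of how these typedefs are typically consumed, assuming a
+ * driver new enough to export ::cuGetProcAddress (CUDA 11.3 or later); the
+ * queried version number is illustrative (error checks omitted):
+ *
+ * \code
+ *   PFN_cuD3D9GetDevice_v2000 pfnD3D9GetDevice = NULL;
+ *   cuGetProcAddress("cuD3D9GetDevice", (void **)&pfnD3D9GetDevice,
+ *                    11030, CU_GET_PROC_ADDRESS_DEFAULT);
+ *
+ *   CUdevice dev;
+ *   pfnD3D9GetDevice(&dev, adapterName);  // adapterName from EnumDisplayDevices
+ * \endcode
+ */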
diff --git a/ext/cudart/include/cudaGL.h b/ext/cudart/include/cudaGL.h
new file mode 100644
index 0000000000000000000000000000000000000000..21e972ae22658c55d6387599c9a92f3c1d50f20c
--- /dev/null
+++ b/ext/cudart/include/cudaGL.h
@@ -0,0 +1,605 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAGL_H
+#define CUDAGL_H
+
+#include <cuda.h>
+#include <GL/gl.h>
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+
+#ifdef CUDA_FORCE_API_VERSION
+#error "CUDA_FORCE_API_VERSION is no longer supported."
+#endif
+
+#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define __CUDA_API_PER_THREAD_DEFAULT_STREAM
+    #define __CUDA_API_PTDS(api) api ## _ptds
+    #define __CUDA_API_PTSZ(api) api ## _ptsz
+#else
+    #define __CUDA_API_PTDS(api) api
+    #define __CUDA_API_PTSZ(api) api
+#endif
+
+#define cuGLCtxCreate            cuGLCtxCreate_v2
+#define cuGLMapBufferObject      __CUDA_API_PTDS(cuGLMapBufferObject_v2)
+#define cuGLMapBufferObjectAsync __CUDA_API_PTSZ(cuGLMapBufferObjectAsync_v2)
+#define cuGLGetDevices           cuGLGetDevices_v2
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * \file cudaGL.h
+ * \brief Header file for the OpenGL interoperability functions of the
+ * low-level CUDA driver application programming interface.
+ */
+
+/**
+ * \defgroup CUDA_GL OpenGL Interoperability
+ * \ingroup CUDA_DRIVER
+ *
+ * ___MANBRIEF___ OpenGL interoperability functions of the low-level CUDA
+ * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the OpenGL interoperability functions of the
+ * low-level CUDA driver application programming interface. Note that mapping
+ * of OpenGL resources is performed with the graphics-API-agnostic resource
+ * mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability".
+ *
+ * @{
+ */
+
+#if defined(_WIN32)
+#if !defined(WGL_NV_gpu_affinity)
+typedef void* HGPUNV;
+#endif
+#endif /* _WIN32 */
+
+/**
+ * \brief Registers an OpenGL buffer object
+ *
+ * Registers the buffer object specified by \p buffer for access by
+ * CUDA.  A handle to the registered object is returned as \p
+ * pCudaResource.  The register flags \p Flags specify the intended usage,
+ * as follows:
+ *
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param buffer - name of buffer object to be registered
+ * \param Flags - Register flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * \notefnerr
+ *
+ * \sa 
+ * ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsGLRegisterBuffer
+ */
+CUresult CUDAAPI cuGraphicsGLRegisterBuffer(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
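+
+/*
+ * A minimal buffer-object sketch, assuming an OpenGL context is current on
+ * this thread, a CUDA context exists, and vbo is a buffer name created by the
+ * application (illustrative only; error checks omitted):
+ *
+ * \code
+ *   CUgraphicsResource res = NULL;
+ *   cuGraphicsGLRegisterBuffer(&res, vbo, CU_GRAPHICS_REGISTER_FLAGS_NONE);
+ *
+ *   cuGraphicsMapResources(1, &res, 0);
+ *   CUdeviceptr ptr  = 0;
+ *   size_t      size = 0;
+ *   cuGraphicsResourceGetMappedPointer(&ptr, &size, res);
+ *   // ... read/write the buffer from CUDA through ptr while mapped ...
+ *   cuGraphicsUnmapResources(1, &res, 0);
+ * \endcode
+ */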
+
+/**
+ * \brief Register an OpenGL texture or renderbuffer object
+ *
+ * Registers the texture or renderbuffer object specified by \p image for access by CUDA.  
+ * A handle to the registered object is returned as \p pCudaResource.  
+ *
+ * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D, 
+ * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY, 
+ * or ::GL_RENDERBUFFER.
+ *
+ * The register flags \p Flags specify the intended usage, as follows:
+ *
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * The following image formats are supported. For brevity's sake, the list is abbreviated.
+ * For example, {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats
+ * {GL_R8, GL_R16, GL_RG8, GL_RG16}:
+ * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
+ * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
+ * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
+ * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
+ *
+ * The following image classes are currently disallowed:
+ * - Textures with borders
+ * - Multisampled renderbuffers
+ *
+ * \param pCudaResource - Pointer to the returned object handle
+ * \param image - name of texture or renderbuffer object to be registered
+ * \param target - Identifies the type of object specified by \p image
+ * \param Flags - Register flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * \notefnerr
+ *
+ * \sa 
+ * ::cuGraphicsUnregisterResource,
+ * ::cuGraphicsMapResources,
+ * ::cuGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsGLRegisterImage
+ */
+CUresult CUDAAPI cuGraphicsGLRegisterImage(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
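+
+/*
+ * A minimal texture sketch, assuming tex names a GL_TEXTURE_2D allocated with
+ * a supported internal format such as GL_RGBA8 (illustrative only; error
+ * checks omitted):
+ *
+ * \code
+ *   CUgraphicsResource res = NULL;
+ *   cuGraphicsGLRegisterImage(&res, tex, GL_TEXTURE_2D,
+ *                             CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST);
+ *
+ *   cuGraphicsMapResources(1, &res, 0);
+ *   CUarray arr = NULL;
+ *   cuGraphicsSubResourceGetMappedArray(&arr, res, 0, 0);  // array index 0, mip level 0
+ *   cuGraphicsUnmapResources(1, &res, 0);
+ * \endcode
+ */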
+
+#ifdef _WIN32
+/**
+ * \brief Gets the CUDA device associated with hGpu
+ *
+ * Returns in \p *pDevice the CUDA device associated with a \p hGpu, if
+ * applicable.
+ *
+ * \param pDevice - Device associated with hGpu
+ * \param hGpu    - Handle to a GPU, as queried via ::WGL_NV_gpu_affinity()
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGLMapBufferObject,
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ * ::cuGLUnregisterBufferObject, ::cuGLUnmapBufferObjectAsync,
+ * ::cuGLSetBufferObjectMapFlags,
+ * ::cudaWGLGetDevice
+ */
+CUresult CUDAAPI cuWGLGetDevice(CUdevice *pDevice, HGPUNV hGpu);
+#endif /* _WIN32 */
+
+/**
+ * CUDA devices corresponding to an OpenGL device
+ */
+typedef enum CUGLDeviceList_enum {
+    CU_GL_DEVICE_LIST_ALL            = 0x01, /**< The CUDA devices for all GPUs used by the current OpenGL context */
+    CU_GL_DEVICE_LIST_CURRENT_FRAME  = 0x02, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
+    CU_GL_DEVICE_LIST_NEXT_FRAME     = 0x03, /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */
+} CUGLDeviceList;
+
+/**
+ * \brief Gets the CUDA devices associated with the current OpenGL context
+ *
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices 
+ * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices 
+ * at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to
+ * the current OpenGL context. If any of the GPUs being used by the current OpenGL
+ * context are not CUDA capable then the call will return ::CUDA_ERROR_NO_DEVICE.
+ *
+ * The \p deviceList argument may be any of the following:
+ * - ::CU_GL_DEVICE_LIST_ALL: Query all devices used by the current OpenGL context.
+ * - ::CU_GL_DEVICE_LIST_CURRENT_FRAME: Query the devices used by the current OpenGL context to
+ *   render the current frame (in SLI).
+ * - ::CU_GL_DEVICE_LIST_NEXT_FRAME: Query the devices used by the current OpenGL context to
+ *   render the next frame (in SLI). Note that this is a prediction and is not
+ *   guaranteed to be correct in all cases.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices.
+ * \param pCudaDevices     - Returned CUDA devices.
+ * \param cudaDeviceCount  - The size of the output device array pCudaDevices.
+ * \param deviceList       - The set of devices to return.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NO_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT
+ *
+ * \notefnerr
+ *
+ * \sa
+ * ::cuWGLGetDevice,
+ * ::cudaGLGetDevices
+ */
+CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
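+
+/*
+ * A minimal sketch, assuming both a CUDA context and an OpenGL context are
+ * current on the calling thread (illustrative only; error checks omitted):
+ *
+ * \code
+ *   unsigned int count = 0;
+ *   CUdevice     devices[4];
+ *   cuGLGetDevices(&count, devices, 4, CU_GL_DEVICE_LIST_ALL);
+ * \endcode
+ */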
+
+/**
+ * \defgroup CUDA_GL_DEPRECATED OpenGL Interoperability [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated OpenGL interoperability functions of the low-level
+ * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated OpenGL interoperability functionality.
+ *
+ * @{
+ */
+
+/** Flags to map or unmap a resource */
+typedef enum CUGLmap_flags_enum {
+    CU_GL_MAP_RESOURCE_FLAGS_NONE          = 0x00,
+    CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY     = 0x01,
+    CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02,    
+} CUGLmap_flags;
+
+/**
+ * \brief Create a CUDA context for interoperability with OpenGL
+ *
+ * \deprecated This function is deprecated as of Cuda 5.0. 
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA context with an OpenGL
+ * context in order to achieve maximum interoperability performance.
+ *
+ * \param pCtx   - Returned CUDA context
+ * \param Flags  - Options for CUDA context creation
+ * \param device - Device on which to create the context
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate, ::cuGLInit, ::cuGLMapBufferObject,
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ * ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
+ * ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
+ * ::cuWGLGetDevice
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
+
+/**
+ * \brief Initializes OpenGL interoperability
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Initializes OpenGL interoperability. This function is deprecated
+ * and calling it is no longer required. It may fail if the needed
+ * OpenGL driver facilities are not available.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuGLMapBufferObject,
+ * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject,
+ * ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync,
+ * ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags,
+ * ::cuWGLGetDevice
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLInit(void);
+
+/**
+ * \brief Registers an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Registers the buffer object specified by \p buffer for access by
+ * CUDA. This function must be called before CUDA can map the buffer
+ * object.  There must be a valid OpenGL context bound to the current
+ * thread when this function is called, and the buffer name is
+ * resolved by that context.
+ *
+ * \param buffer - The name of the buffer object to register.
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_ALREADY_MAPPED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsGLRegisterBuffer
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLRegisterBufferObject(GLuint buffer);
+
+/**
+ * \brief Maps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Maps the buffer object specified by \p buffer into the address space of the
+ * current CUDA context and returns in \p *dptr and \p *size the base pointer
+ * and size of the resulting mapping.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * All streams in the current CUDA context are synchronized with the
+ * current GL context.
+ *
+ * \param dptr   - Returned mapped base pointer
+ * \param size   - Returned size of mapping
+ * \param buffer - The name of the buffer object to map
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_MAP_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsMapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr *dptr, size_t *size,  GLuint buffer);  
+
+/**
+ * \brief Unmaps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Unmaps the buffer object specified by \p buffer for access by CUDA.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * All streams in the current CUDA context are synchronized with the
+ * current GL context.
+ *
+ * \param buffer - Buffer object to unmap
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnmapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObject(GLuint buffer);
+
+/**
+ * \brief Unregister an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Unregisters the buffer object specified by \p buffer.  This
+ * releases any resources associated with the registered buffer.
+ * After this call, the buffer may no longer be mapped for access by
+ * CUDA.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * \param buffer - Name of the buffer object to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnregisterResource
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnregisterBufferObject(GLuint buffer);
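+
+/*
+ * Sketch of the non-deprecated replacement path referenced by the \sa tags
+ * above (cuGraphicsGLRegisterBuffer plus the cuGraphics* map/unmap calls) for
+ * a GL buffer object name 'buffer'; CU_GRAPHICS_REGISTER_FLAGS_NONE is
+ * assumed from the current driver API and error handling is omitted:
+ *
+ *   CUgraphicsResource res;
+ *   cuGraphicsGLRegisterBuffer(&res, buffer, CU_GRAPHICS_REGISTER_FLAGS_NONE);
+ *
+ *   cuGraphicsMapResources(1, &res, 0);              // map on the default stream
+ *   CUdeviceptr dptr;
+ *   size_t size;
+ *   cuGraphicsResourceGetMappedPointer(&dptr, &size, res);
+ *   // ... access the buffer through dptr from CUDA ...
+ *   cuGraphicsUnmapResources(1, &res, 0);
+ *
+ *   cuGraphicsUnregisterResource(res);               // once the buffer is no longer needed
+ */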
+
+/**
+ * \brief Set the map flags for an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Sets the map flags for the buffer object specified by \p buffer.
+ *
+ * Changes to \p Flags will take effect the next time \p buffer is mapped.
+ * The \p Flags argument may be any of the following:
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA kernels. This is the default value.
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA kernels which
+ *   access this resource will not write to this resource.
+ * - ::CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that CUDA kernels
+ *   which access this resource will not read from this resource and will
+ *   write over the entire contents of the resource, so none of the data
+ *   previously stored in the resource will be preserved.
+ *
+ * If \p buffer has not been registered for use with CUDA, then
+ * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p buffer is presently
+ * mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * \param buffer - Buffer object for which to set map flags
+ * \param Flags  - Map flags
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_ALREADY_MAPPED,
+ * ::CUDA_ERROR_INVALID_CONTEXT
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsResourceSetMapFlags
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLSetBufferObjectMapFlags(GLuint buffer, unsigned int Flags);
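+
+/*
+ * On the non-deprecated path the same hint is expressed through
+ * cuGraphicsResourceSetMapFlags (see \sa above); a minimal sketch, assuming
+ * the CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY flag name from the current
+ * driver API:
+ *
+ *   // 'res' was registered earlier with cuGraphicsGLRegisterBuffer
+ *   cuGraphicsResourceSetMapFlags(res, CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
+ *   // the hint takes effect the next time 'res' is mapped
+ */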
+
+/**
+ * \brief Maps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Maps the buffer object specified by \p buffer into the address space of the
+ * current CUDA context and returns in \p *dptr and \p *size the base pointer
+ * and size of the resulting mapping.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * Stream \p hStream in the current CUDA context is synchronized with
+ * the current GL context.
+ *
+ * \param dptr    - Returned mapped base pointer
+ * \param size    - Returned size of mapping
+ * \param buffer  - The name of the buffer object to map
+ * \param hStream - Stream to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_MAP_FAILED
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsMapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr *dptr, size_t *size,  GLuint buffer, CUstream hStream);
+
+/**
+ * \brief Unmaps an OpenGL buffer object
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Unmaps the buffer object specified by \p buffer for access by CUDA.
+ *
+ * There must be a valid OpenGL context bound to the current thread
+ * when this function is called.  This must be the same context, or a
+ * member of the same shareGroup, as the context that was bound when
+ * the buffer was registered.
+ *
+ * Stream \p hStream in the current CUDA context is synchronized with
+ * the current GL context.
+ *
+ * \param buffer  - Name of the buffer object to unmap
+ * \param hStream - Stream to synchronize
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuGraphicsUnmapResources
+ */
+__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObjectAsync(GLuint buffer, CUstream hStream);
+
+/** @} */ /* END CUDA_GL_DEPRECATED */
+/** @} */ /* END CUDA_GL */
+
+
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cuGLCtxCreate
+    #undef cuGLMapBufferObject
+    #undef cuGLMapBufferObjectAsync
+    #undef cuGLGetDevices
+
+    CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+    CUresult CUDAAPI cuGLMapBufferObject_v2(CUdeviceptr *dptr, size_t *size,  GLuint buffer);
+    CUresult CUDAAPI cuGLMapBufferObjectAsync_v2(CUdeviceptr *dptr, size_t *size,  GLuint buffer, CUstream hStream);
+    CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device );
+    CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr_v1 *dptr, unsigned int *size,  GLuint buffer);
+    CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr_v1 *dptr, unsigned int *size,  GLuint buffer, CUstream hStream);
+#endif /* __CUDA_API_VERSION_INTERNAL */
+
+#ifdef __cplusplus
+};
+#endif
+
+#undef __CUDA_DEPRECATED
+
+#endif
diff --git a/ext/cudart/include/cudaGLTypedefs.h b/ext/cudart/include/cudaGLTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..81f0d5349e435159647af9af379d1e8e8441221c
--- /dev/null
+++ b/ext/cudart/include/cudaGLTypedefs.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAGLTYPEDEFS_H
+#define CUDAGLTYPEDEFS_H
+
+// Dependent includes for cudagl.h
+#include <GL/gl.h>
+
+#include <cudaGL.h>
+
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
+#else
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
+#endif
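+
+/*
+ * Expansion example: with CUDA_API_PER_THREAD_DEFAULT_STREAM defined,
+ * __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000) token-pastes to
+ * PFN_cuGLMapBufferObject_v7000_ptds (the per-thread default stream variant);
+ * without it, the same invocation selects PFN_cuGLMapBufferObject_v3020.
+ */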
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cudaGL.h
+ */
+#define PFN_cuGraphicsGLRegisterBuffer  PFN_cuGraphicsGLRegisterBuffer_v3000
+#define PFN_cuGraphicsGLRegisterImage  PFN_cuGraphicsGLRegisterImage_v3000
+#define PFN_cuWGLGetDevice  PFN_cuWGLGetDevice_v2020
+#define PFN_cuGLGetDevices  PFN_cuGLGetDevices_v6050
+#define PFN_cuGLCtxCreate  PFN_cuGLCtxCreate_v3020
+#define PFN_cuGLInit  PFN_cuGLInit_v2000
+#define PFN_cuGLRegisterBufferObject  PFN_cuGLRegisterBufferObject_v2000
+#define PFN_cuGLMapBufferObject  __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000)
+#define PFN_cuGLUnmapBufferObject  PFN_cuGLUnmapBufferObject_v2000
+#define PFN_cuGLUnregisterBufferObject  PFN_cuGLUnregisterBufferObject_v2000
+#define PFN_cuGLSetBufferObjectMapFlags  PFN_cuGLSetBufferObjectMapFlags_v2030
+#define PFN_cuGLMapBufferObjectAsync  __API_TYPEDEF_PTSZ(PFN_cuGLMapBufferObjectAsync, 3020, 7000)
+#define PFN_cuGLUnmapBufferObjectAsync  PFN_cuGLUnmapBufferObjectAsync_v2030
+
+
+/**
+ * Type definitions for functions defined in cudaGL.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterImage_v3000)(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags);
+#ifdef _WIN32
+typedef CUresult (CUDAAPI *PFN_cuWGLGetDevice_v2020)(CUdevice_v1 *pDevice, HGPUNV hGpu);
+#endif
+typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v6050)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v3020)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
+typedef CUresult (CUDAAPI *PFN_cuGLInit_v2000)(void);
+typedef CUresult (CUDAAPI *PFN_cuGLRegisterBufferObject_v2000)(GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v7000_ptds)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObject_v2000)(GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLUnregisterBufferObject_v2000)(GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLSetBufferObjectMapFlags_v2030)(GLuint buffer, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v7000_ptsz)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObjectAsync_v2030)(GLuint buffer, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream);
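+
+/*
+ * Typical consumption of these typedefs is to pair them with cuGetProcAddress
+ * from cuda.h to resolve a versioned entry point at run time; a minimal
+ * sketch, assuming the CUDA 11.x cuGetProcAddress signature and
+ * CU_GET_PROC_ADDRESS_DEFAULT, with error handling omitted:
+ *
+ *   PFN_cuGLGetDevices_v6050 pfnGLGetDevices = NULL;
+ *   cuGetProcAddress("cuGLGetDevices", (void **)&pfnGLGetDevices,
+ *                    11000, CU_GET_PROC_ADDRESS_DEFAULT);
+ *   if (pfnGLGetDevices) {
+ *       unsigned int count = 0;
+ *       CUdevice dev;
+ *       pfnGLGetDevices(&count, &dev, 1, CU_GL_DEVICE_LIST_ALL);
+ *   }
+ */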
+
+/*
+ * Type definitions for older versioned functions in cuda.h
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v4010)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v2000)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer);
+typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v2030)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v2000)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device);
+#endif
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/ext/cudart/include/cudaProfilerTypedefs.h b/ext/cudart/include/cudaProfilerTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..bea7df4573aff2fa5b0d0029ce9d40a7ebe2de46
--- /dev/null
+++ b/ext/cudart/include/cudaProfilerTypedefs.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDAPROFILERTYPEDEFS_H
+#define CUDAPROFILERTYPEDEFS_H
+
+#include <cudaProfiler.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cudaProfiler.h
+ */
+#define PFN_cuProfilerInitialize  PFN_cuProfilerInitialize_v4000
+#define PFN_cuProfilerStart  PFN_cuProfilerStart_v4000
+#define PFN_cuProfilerStop  PFN_cuProfilerStop_v4000
+
+
+/**
+ * Type definitions for functions defined in cudaProfiler.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuProfilerInitialize_v4000)(const char *configFile, const char *outputFile, CUoutput_mode outputMode);
+typedef CUresult (CUDAAPI *PFN_cuProfilerStart_v4000)(void);
+typedef CUresult (CUDAAPI *PFN_cuProfilerStop_v4000)(void);
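+
+/*
+ * Minimal sketch of bracketing a region of interest with the profiler entry
+ * points resolved through these typedefs (cuGetProcAddress and
+ * CU_GET_PROC_ADDRESS_DEFAULT from cuda.h are assumed, CUDA 11.x signature;
+ * error handling omitted):
+ *
+ *   PFN_cuProfilerStart_v4000 pfnStart = NULL;
+ *   PFN_cuProfilerStop_v4000  pfnStop  = NULL;
+ *   cuGetProcAddress("cuProfilerStart", (void **)&pfnStart, 11000, CU_GET_PROC_ADDRESS_DEFAULT);
+ *   cuGetProcAddress("cuProfilerStop",  (void **)&pfnStop,  11000, CU_GET_PROC_ADDRESS_DEFAULT);
+ *   if (pfnStart && pfnStop) {
+ *       pfnStart();
+ *       // ... region to be profiled ...
+ *       pfnStop();
+ *   }
+ */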
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/ext/cudart/include/cudaTypedefs.h b/ext/cudart/include/cudaTypedefs.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ab767efea8d520fb8f95b4485428cafe70dbdbc
--- /dev/null
+++ b/ext/cudart/include/cudaTypedefs.h
@@ -0,0 +1,959 @@
+/*
+ * Copyright 2020-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef CUDATYPEDEFS_H
+#define CUDATYPEDEFS_H
+
+#include <cuda.h>
+
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM)
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz
+#else
+    #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version
+    #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+/*
+ * Macros for the latest version for each driver function in cuda.h
+ */
+#define PFN_cuGetErrorString  PFN_cuGetErrorString_v6000
+#define PFN_cuGetErrorName  PFN_cuGetErrorName_v6000
+#define PFN_cuInit  PFN_cuInit_v2000
+#define PFN_cuDriverGetVersion  PFN_cuDriverGetVersion_v2020
+#define PFN_cuDeviceGet  PFN_cuDeviceGet_v2000
+#define PFN_cuDeviceGetCount  PFN_cuDeviceGetCount_v2000
+#define PFN_cuDeviceGetName  PFN_cuDeviceGetName_v2000
+#define PFN_cuDeviceGetUuid  PFN_cuDeviceGetUuid_v11040
+#define PFN_cuDeviceGetLuid  PFN_cuDeviceGetLuid_v10000
+#define PFN_cuDeviceTotalMem  PFN_cuDeviceTotalMem_v3020
+#define PFN_cuDeviceGetTexture1DLinearMaxWidth  PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010
+#define PFN_cuDeviceGetAttribute  PFN_cuDeviceGetAttribute_v2000
+#define PFN_cuDeviceGetNvSciSyncAttributes  PFN_cuDeviceGetNvSciSyncAttributes_v10020
+#define PFN_cuDeviceSetMemPool  PFN_cuDeviceSetMemPool_v11020
+#define PFN_cuDeviceGetMemPool  PFN_cuDeviceGetMemPool_v11020
+#define PFN_cuDeviceGetDefaultMemPool  PFN_cuDeviceGetDefaultMemPool_v11020
+#define PFN_cuDeviceGetProperties  PFN_cuDeviceGetProperties_v2000
+#define PFN_cuDeviceComputeCapability  PFN_cuDeviceComputeCapability_v2000
+#define PFN_cuDevicePrimaryCtxRetain  PFN_cuDevicePrimaryCtxRetain_v7000
+#define PFN_cuDevicePrimaryCtxRelease  PFN_cuDevicePrimaryCtxRelease_v11000
+#define PFN_cuDevicePrimaryCtxSetFlags  PFN_cuDevicePrimaryCtxSetFlags_v11000
+#define PFN_cuDevicePrimaryCtxGetState  PFN_cuDevicePrimaryCtxGetState_v7000
+#define PFN_cuDevicePrimaryCtxReset  PFN_cuDevicePrimaryCtxReset_v11000
+#define PFN_cuDeviceGetExecAffinitySupport  PFN_cuDeviceGetExecAffinitySupport_v11040
+#define PFN_cuCtxCreate  PFN_cuCtxCreate_v11040
+#define PFN_cuCtxDestroy  PFN_cuCtxDestroy_v4000
+#define PFN_cuCtxPushCurrent  PFN_cuCtxPushCurrent_v4000
+#define PFN_cuCtxPopCurrent  PFN_cuCtxPopCurrent_v4000
+#define PFN_cuCtxSetCurrent  PFN_cuCtxSetCurrent_v4000
+#define PFN_cuCtxGetCurrent  PFN_cuCtxGetCurrent_v4000
+#define PFN_cuCtxGetDevice  PFN_cuCtxGetDevice_v2000
+#define PFN_cuCtxGetFlags  PFN_cuCtxGetFlags_v7000
+#define PFN_cuCtxSynchronize  PFN_cuCtxSynchronize_v2000
+#define PFN_cuCtxSetLimit  PFN_cuCtxSetLimit_v3010
+#define PFN_cuCtxGetLimit  PFN_cuCtxGetLimit_v3010
+#define PFN_cuCtxGetCacheConfig  PFN_cuCtxGetCacheConfig_v3020
+#define PFN_cuCtxSetCacheConfig  PFN_cuCtxSetCacheConfig_v3020
+#define PFN_cuCtxGetSharedMemConfig  PFN_cuCtxGetSharedMemConfig_v4020
+#define PFN_cuCtxSetSharedMemConfig  PFN_cuCtxSetSharedMemConfig_v4020
+#define PFN_cuCtxGetApiVersion  PFN_cuCtxGetApiVersion_v3020
+#define PFN_cuCtxGetStreamPriorityRange  PFN_cuCtxGetStreamPriorityRange_v5050
+#define PFN_cuCtxResetPersistingL2Cache  PFN_cuCtxResetPersistingL2Cache_v11000
+#define PFN_cuCtxAttach  PFN_cuCtxAttach_v2000
+#define PFN_cuCtxDetach  PFN_cuCtxDetach_v2000
+#define PFN_cuCtxGetExecAffinity  PFN_cuCtxGetExecAffinity_v11040
+#define PFN_cuModuleLoad  PFN_cuModuleLoad_v2000
+#define PFN_cuModuleLoadData  PFN_cuModuleLoadData_v2000
+#define PFN_cuModuleLoadDataEx  PFN_cuModuleLoadDataEx_v2010
+#define PFN_cuModuleLoadFatBinary  PFN_cuModuleLoadFatBinary_v2000
+#define PFN_cuModuleUnload  PFN_cuModuleUnload_v2000
+#define PFN_cuModuleGetFunction  PFN_cuModuleGetFunction_v2000
+#define PFN_cuModuleGetGlobal  PFN_cuModuleGetGlobal_v3020
+#define PFN_cuModuleGetTexRef  PFN_cuModuleGetTexRef_v2000
+#define PFN_cuModuleGetSurfRef  PFN_cuModuleGetSurfRef_v3000
+#define PFN_cuLinkCreate  PFN_cuLinkCreate_v6050
+#define PFN_cuLinkAddData  PFN_cuLinkAddData_v6050
+#define PFN_cuLinkAddFile  PFN_cuLinkAddFile_v6050
+#define PFN_cuLinkComplete  PFN_cuLinkComplete_v5050
+#define PFN_cuLinkDestroy  PFN_cuLinkDestroy_v5050
+#define PFN_cuMemGetInfo  PFN_cuMemGetInfo_v3020
+#define PFN_cuMemAlloc  PFN_cuMemAlloc_v3020
+#define PFN_cuMemAllocPitch  PFN_cuMemAllocPitch_v3020
+#define PFN_cuMemFree  PFN_cuMemFree_v3020
+#define PFN_cuMemGetAddressRange  PFN_cuMemGetAddressRange_v3020
+#define PFN_cuMemAllocHost  PFN_cuMemAllocHost_v3020
+#define PFN_cuMemFreeHost  PFN_cuMemFreeHost_v2000
+#define PFN_cuMemHostAlloc  PFN_cuMemHostAlloc_v2020
+#define PFN_cuMemHostGetDevicePointer  PFN_cuMemHostGetDevicePointer_v3020
+#define PFN_cuMemHostGetFlags  PFN_cuMemHostGetFlags_v2030
+#define PFN_cuMemAllocManaged  PFN_cuMemAllocManaged_v6000
+#define PFN_cuDeviceGetByPCIBusId  PFN_cuDeviceGetByPCIBusId_v4010
+#define PFN_cuDeviceGetPCIBusId  PFN_cuDeviceGetPCIBusId_v4010
+#define PFN_cuIpcGetEventHandle  PFN_cuIpcGetEventHandle_v4010
+#define PFN_cuIpcOpenEventHandle  PFN_cuIpcOpenEventHandle_v4010
+#define PFN_cuIpcGetMemHandle  PFN_cuIpcGetMemHandle_v4010
+#define PFN_cuIpcOpenMemHandle  PFN_cuIpcOpenMemHandle_v11000
+#define PFN_cuIpcCloseMemHandle  PFN_cuIpcCloseMemHandle_v4010
+#define PFN_cuMemHostRegister  PFN_cuMemHostRegister_v6050
+#define PFN_cuMemHostUnregister  PFN_cuMemHostUnregister_v4000
+#define PFN_cuMemcpy  __API_TYPEDEF_PTDS(PFN_cuMemcpy, 4000, 7000)
+#define PFN_cuMemcpyPeer  __API_TYPEDEF_PTDS(PFN_cuMemcpyPeer, 4000, 7000)
+#define PFN_cuMemcpyHtoD  __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoD, 3020, 7000)
+#define PFN_cuMemcpyDtoH  __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoH, 3020, 7000)
+#define PFN_cuMemcpyDtoD  __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoD, 3020, 7000)
+#define PFN_cuMemcpyDtoA  __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoA, 3020, 7000)
+#define PFN_cuMemcpyAtoD  __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoD, 3020, 7000)
+#define PFN_cuMemcpyHtoA  __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoA, 3020, 7000)
+#define PFN_cuMemcpyAtoH  __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoH, 3020, 7000)
+#define PFN_cuMemcpyAtoA  __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoA, 3020, 7000)
+#define PFN_cuMemcpy2D  __API_TYPEDEF_PTDS(PFN_cuMemcpy2D, 3020, 7000)
+#define PFN_cuMemcpy2DUnaligned  __API_TYPEDEF_PTDS(PFN_cuMemcpy2DUnaligned, 3020, 7000)
+#define PFN_cuMemcpy3D  __API_TYPEDEF_PTDS(PFN_cuMemcpy3D, 3020, 7000)
+#define PFN_cuMemcpy3DPeer  __API_TYPEDEF_PTDS(PFN_cuMemcpy3DPeer, 4000, 7000)
+#define PFN_cuMemcpyAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyAsync, 4000, 7000)
+#define PFN_cuMemcpyPeerAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyPeerAsync, 4000, 7000)
+#define PFN_cuMemcpyHtoDAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoDAsync, 3020, 7000)
+#define PFN_cuMemcpyDtoHAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoHAsync, 3020, 7000)
+#define PFN_cuMemcpyDtoDAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoDAsync, 3020, 7000)
+#define PFN_cuMemcpyHtoAAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoAAsync, 3020, 7000)
+#define PFN_cuMemcpyAtoHAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpyAtoHAsync, 3020, 7000)
+#define PFN_cuMemcpy2DAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy2DAsync, 3020, 7000)
+#define PFN_cuMemcpy3DAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DAsync, 3020, 7000)
+#define PFN_cuMemcpy3DPeerAsync  __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DPeerAsync, 4000, 7000)
+#define PFN_cuMemsetD8  __API_TYPEDEF_PTDS(PFN_cuMemsetD8, 3020, 7000)
+#define PFN_cuMemsetD16  __API_TYPEDEF_PTDS(PFN_cuMemsetD16, 3020, 7000)
+#define PFN_cuMemsetD32  __API_TYPEDEF_PTDS(PFN_cuMemsetD32, 3020, 7000)
+#define PFN_cuMemsetD2D8  __API_TYPEDEF_PTDS(PFN_cuMemsetD2D8, 3020, 7000)
+#define PFN_cuMemsetD2D16  __API_TYPEDEF_PTDS(PFN_cuMemsetD2D16, 3020, 7000)
+#define PFN_cuMemsetD2D32  __API_TYPEDEF_PTDS(PFN_cuMemsetD2D32, 3020, 7000)
+#define PFN_cuMemsetD8Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD8Async, 3020, 7000)
+#define PFN_cuMemsetD16Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD16Async, 3020, 7000)
+#define PFN_cuMemsetD32Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD32Async, 3020, 7000)
+#define PFN_cuMemsetD2D8Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D8Async, 3020, 7000)
+#define PFN_cuMemsetD2D16Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D16Async, 3020, 7000)
+#define PFN_cuMemsetD2D32Async  __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D32Async, 3020, 7000)
+#define PFN_cuArrayCreate  PFN_cuArrayCreate_v3020
+#define PFN_cuArrayGetDescriptor  PFN_cuArrayGetDescriptor_v3020
+#define PFN_cuArrayGetSparseProperties  PFN_cuArrayGetSparseProperties_v11010
+#define PFN_cuMipmappedArrayGetSparseProperties  PFN_cuMipmappedArrayGetSparseProperties_v11010
+#define PFN_cuArrayGetMemoryRequirements  PFN_cuArrayGetMemoryRequirements_v11060
+#define PFN_cuMipmappedArrayGetMemoryRequirements  PFN_cuMipmappedArrayGetMemoryRequirements_v11060
+#define PFN_cuArrayGetPlane  PFN_cuArrayGetPlane_v11020
+#define PFN_cuArrayDestroy  PFN_cuArrayDestroy_v2000
+#define PFN_cuArray3DCreate  PFN_cuArray3DCreate_v3020
+#define PFN_cuArray3DGetDescriptor  PFN_cuArray3DGetDescriptor_v3020
+#define PFN_cuMipmappedArrayCreate  PFN_cuMipmappedArrayCreate_v5000
+#define PFN_cuMipmappedArrayGetLevel  PFN_cuMipmappedArrayGetLevel_v5000
+#define PFN_cuMipmappedArrayDestroy  PFN_cuMipmappedArrayDestroy_v5000
+#define PFN_cuMemAddressReserve  PFN_cuMemAddressReserve_v10020
+#define PFN_cuMemAddressFree  PFN_cuMemAddressFree_v10020
+#define PFN_cuMemCreate  PFN_cuMemCreate_v10020
+#define PFN_cuMemRelease  PFN_cuMemRelease_v10020
+#define PFN_cuMemMap  PFN_cuMemMap_v10020
+#define PFN_cuMemMapArrayAsync  __API_TYPEDEF_PTSZ(PFN_cuMemMapArrayAsync, 11010, 11010)
+#define PFN_cuMemUnmap  PFN_cuMemUnmap_v10020
+#define PFN_cuMemSetAccess  PFN_cuMemSetAccess_v10020
+#define PFN_cuMemGetAccess  PFN_cuMemGetAccess_v10020
+#define PFN_cuMemExportToShareableHandle  PFN_cuMemExportToShareableHandle_v10020
+#define PFN_cuMemImportFromShareableHandle  PFN_cuMemImportFromShareableHandle_v10020
+#define PFN_cuMemGetAllocationGranularity  PFN_cuMemGetAllocationGranularity_v10020
+#define PFN_cuMemGetAllocationPropertiesFromHandle  PFN_cuMemGetAllocationPropertiesFromHandle_v10020
+#define PFN_cuMemRetainAllocationHandle  PFN_cuMemRetainAllocationHandle_v11000
+#define PFN_cuMemFreeAsync  __API_TYPEDEF_PTSZ(PFN_cuMemFreeAsync, 11020, 11020)
+#define PFN_cuMemAllocAsync  __API_TYPEDEF_PTSZ(PFN_cuMemAllocAsync, 11020, 11020)
+#define PFN_cuMemPoolTrimTo  PFN_cuMemPoolTrimTo_v11020
+#define PFN_cuMemPoolSetAttribute  PFN_cuMemPoolSetAttribute_v11020
+#define PFN_cuMemPoolGetAttribute  PFN_cuMemPoolGetAttribute_v11020
+#define PFN_cuMemPoolSetAccess  PFN_cuMemPoolSetAccess_v11020
+#define PFN_cuMemPoolGetAccess  PFN_cuMemPoolGetAccess_v11020
+#define PFN_cuMemPoolCreate  PFN_cuMemPoolCreate_v11020
+#define PFN_cuMemPoolDestroy  PFN_cuMemPoolDestroy_v11020
+#define PFN_cuMemAllocFromPoolAsync  __API_TYPEDEF_PTSZ(PFN_cuMemAllocFromPoolAsync, 11020, 11020)
+#define PFN_cuMemPoolExportToShareableHandle  PFN_cuMemPoolExportToShareableHandle_v11020
+#define PFN_cuMemPoolImportFromShareableHandle  PFN_cuMemPoolImportFromShareableHandle_v11020
+#define PFN_cuMemPoolExportPointer  PFN_cuMemPoolExportPointer_v11020
+#define PFN_cuMemPoolImportPointer  PFN_cuMemPoolImportPointer_v11020
+#define PFN_cuPointerGetAttribute  PFN_cuPointerGetAttribute_v4000
+#define PFN_cuMemPrefetchAsync  __API_TYPEDEF_PTSZ(PFN_cuMemPrefetchAsync, 8000, 8000)
+#define PFN_cuMemAdvise  PFN_cuMemAdvise_v8000
+#define PFN_cuMemRangeGetAttribute  PFN_cuMemRangeGetAttribute_v8000
+#define PFN_cuMemRangeGetAttributes  PFN_cuMemRangeGetAttributes_v8000
+#define PFN_cuPointerSetAttribute  PFN_cuPointerSetAttribute_v6000
+#define PFN_cuPointerGetAttributes  PFN_cuPointerGetAttributes_v7000
+#define PFN_cuStreamCreate  PFN_cuStreamCreate_v2000
+#define PFN_cuStreamCreateWithPriority  PFN_cuStreamCreateWithPriority_v5050
+#define PFN_cuStreamGetPriority  __API_TYPEDEF_PTSZ(PFN_cuStreamGetPriority, 5050, 7000)
+#define PFN_cuStreamGetFlags  __API_TYPEDEF_PTSZ(PFN_cuStreamGetFlags, 5050, 7000)
+#define PFN_cuStreamGetCtx  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCtx, 9020, 9020)
+#define PFN_cuStreamWaitEvent  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitEvent, 3020, 7000)
+#define PFN_cuStreamAddCallback  __API_TYPEDEF_PTSZ(PFN_cuStreamAddCallback, 5000, 7000)
+#define PFN_cuStreamBeginCapture  __API_TYPEDEF_PTSZ(PFN_cuStreamBeginCapture, 10010, 10010)
+#define PFN_cuThreadExchangeStreamCaptureMode  PFN_cuThreadExchangeStreamCaptureMode_v10010
+#define PFN_cuStreamEndCapture  __API_TYPEDEF_PTSZ(PFN_cuStreamEndCapture, 10000, 10000)
+#define PFN_cuStreamIsCapturing  __API_TYPEDEF_PTSZ(PFN_cuStreamIsCapturing, 10000, 10000)
+#define PFN_cuStreamGetCaptureInfo  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 10010, 10010)
+#define PFN_cuStreamGetCaptureInfo_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 11030, 11030)
+#define PFN_cuStreamUpdateCaptureDependencies  __API_TYPEDEF_PTSZ(PFN_cuStreamUpdateCaptureDependencies, 11030, 11030)
+#define PFN_cuStreamAttachMemAsync  __API_TYPEDEF_PTSZ(PFN_cuStreamAttachMemAsync, 6000, 7000)
+#define PFN_cuStreamQuery  __API_TYPEDEF_PTSZ(PFN_cuStreamQuery, 2000, 7000)
+#define PFN_cuStreamSynchronize  __API_TYPEDEF_PTSZ(PFN_cuStreamSynchronize, 2000, 7000)
+#define PFN_cuStreamDestroy  PFN_cuStreamDestroy_v4000
+#define PFN_cuStreamCopyAttributes  __API_TYPEDEF_PTSZ(PFN_cuStreamCopyAttributes, 11000, 11000)
+#define PFN_cuStreamGetAttribute  __API_TYPEDEF_PTSZ(PFN_cuStreamGetAttribute, 11000, 11000)
+#define PFN_cuStreamSetAttribute  __API_TYPEDEF_PTSZ(PFN_cuStreamSetAttribute, 11000, 11000)
+#define PFN_cuEventCreate  PFN_cuEventCreate_v2000
+#define PFN_cuEventRecord  __API_TYPEDEF_PTSZ(PFN_cuEventRecord, 2000, 7000)
+#define PFN_cuEventRecordWithFlags  __API_TYPEDEF_PTSZ(PFN_cuEventRecordWithFlags, 11010, 11010)
+#define PFN_cuEventQuery  PFN_cuEventQuery_v2000
+#define PFN_cuEventSynchronize  PFN_cuEventSynchronize_v2000
+#define PFN_cuEventDestroy  PFN_cuEventDestroy_v4000
+#define PFN_cuEventElapsedTime  PFN_cuEventElapsedTime_v2000
+#define PFN_cuImportExternalMemory  PFN_cuImportExternalMemory_v10000
+#define PFN_cuExternalMemoryGetMappedBuffer  PFN_cuExternalMemoryGetMappedBuffer_v10000
+#define PFN_cuExternalMemoryGetMappedMipmappedArray  PFN_cuExternalMemoryGetMappedMipmappedArray_v10000
+#define PFN_cuDestroyExternalMemory  PFN_cuDestroyExternalMemory_v10000
+#define PFN_cuImportExternalSemaphore  PFN_cuImportExternalSemaphore_v10000
+#define PFN_cuSignalExternalSemaphoresAsync  __API_TYPEDEF_PTSZ(PFN_cuSignalExternalSemaphoresAsync, 10000, 10000)
+#define PFN_cuWaitExternalSemaphoresAsync  __API_TYPEDEF_PTSZ(PFN_cuWaitExternalSemaphoresAsync, 10000, 10000)
+#define PFN_cuDestroyExternalSemaphore  PFN_cuDestroyExternalSemaphore_v10000
+#define PFN_cuStreamWaitValue32  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 8000, 8000)
+#define PFN_cuStreamWaitValue64  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 9000, 9000)
+#define PFN_cuStreamWriteValue32  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 8000, 8000)
+#define PFN_cuStreamWriteValue64  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 9000, 9000)
+#define PFN_cuStreamBatchMemOp  __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 8000, 8000)
+#define PFN_cuStreamWaitValue32_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 11070, 11070)
+#define PFN_cuStreamWaitValue64_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 11070, 11070)
+#define PFN_cuStreamWriteValue32_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 11070, 11070)
+#define PFN_cuStreamWriteValue64_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 11070, 11070)
+#define PFN_cuStreamBatchMemOp_v2  __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 11070, 11070)
+#define PFN_cuFuncGetAttribute  PFN_cuFuncGetAttribute_v2020
+#define PFN_cuFuncSetAttribute  PFN_cuFuncSetAttribute_v9000
+#define PFN_cuFuncSetCacheConfig  PFN_cuFuncSetCacheConfig_v3000
+#define PFN_cuFuncSetSharedMemConfig  PFN_cuFuncSetSharedMemConfig_v4020
+#define PFN_cuLaunchKernel  __API_TYPEDEF_PTSZ(PFN_cuLaunchKernel, 4000, 7000)
+#define PFN_cuLaunchKernelEx __API_TYPEDEF_PTSZ(PFN_cuLaunchKernelEx, 11060, 11060)
+#define PFN_cuLaunchCooperativeKernel  __API_TYPEDEF_PTSZ(PFN_cuLaunchCooperativeKernel, 9000, 9000)
+#define PFN_cuLaunchCooperativeKernelMultiDevice  PFN_cuLaunchCooperativeKernelMultiDevice_v9000
+#define PFN_cuLaunchHostFunc  __API_TYPEDEF_PTSZ(PFN_cuLaunchHostFunc, 10000, 10000)
+#define PFN_cuFuncSetBlockShape  PFN_cuFuncSetBlockShape_v2000
+#define PFN_cuFuncSetSharedSize  PFN_cuFuncSetSharedSize_v2000
+#define PFN_cuParamSetSize  PFN_cuParamSetSize_v2000
+#define PFN_cuParamSeti  PFN_cuParamSeti_v2000
+#define PFN_cuParamSetf  PFN_cuParamSetf_v2000
+#define PFN_cuParamSetv  PFN_cuParamSetv_v2000
+#define PFN_cuLaunch  PFN_cuLaunch_v2000
+#define PFN_cuLaunchGrid  PFN_cuLaunchGrid_v2000
+#define PFN_cuLaunchGridAsync  PFN_cuLaunchGridAsync_v2000
+#define PFN_cuParamSetTexRef  PFN_cuParamSetTexRef_v2000
+#define PFN_cuGraphCreate  PFN_cuGraphCreate_v10000
+#define PFN_cuGraphAddKernelNode  PFN_cuGraphAddKernelNode_v10000
+#define PFN_cuGraphKernelNodeGetParams  PFN_cuGraphKernelNodeGetParams_v10000
+#define PFN_cuGraphKernelNodeSetParams  PFN_cuGraphKernelNodeSetParams_v10000
+#define PFN_cuGraphAddMemcpyNode  PFN_cuGraphAddMemcpyNode_v10000
+#define PFN_cuGraphMemcpyNodeGetParams  PFN_cuGraphMemcpyNodeGetParams_v10000
+#define PFN_cuGraphMemcpyNodeSetParams  PFN_cuGraphMemcpyNodeSetParams_v10000
+#define PFN_cuGraphAddMemsetNode  PFN_cuGraphAddMemsetNode_v10000
+#define PFN_cuGraphMemsetNodeGetParams  PFN_cuGraphMemsetNodeGetParams_v10000
+#define PFN_cuGraphMemsetNodeSetParams  PFN_cuGraphMemsetNodeSetParams_v10000
+#define PFN_cuGraphAddHostNode  PFN_cuGraphAddHostNode_v10000
+#define PFN_cuGraphHostNodeGetParams  PFN_cuGraphHostNodeGetParams_v10000
+#define PFN_cuGraphHostNodeSetParams  PFN_cuGraphHostNodeSetParams_v10000
+#define PFN_cuGraphAddChildGraphNode  PFN_cuGraphAddChildGraphNode_v10000
+#define PFN_cuGraphChildGraphNodeGetGraph  PFN_cuGraphChildGraphNodeGetGraph_v10000
+#define PFN_cuGraphAddEmptyNode  PFN_cuGraphAddEmptyNode_v10000
+#define PFN_cuGraphAddEventRecordNode  PFN_cuGraphAddEventRecordNode_v11010
+#define PFN_cuGraphEventRecordNodeGetEvent  PFN_cuGraphEventRecordNodeGetEvent_v11010
+#define PFN_cuGraphEventRecordNodeSetEvent  PFN_cuGraphEventRecordNodeSetEvent_v11010
+#define PFN_cuGraphAddEventWaitNode  PFN_cuGraphAddEventWaitNode_v11010
+#define PFN_cuGraphEventWaitNodeGetEvent  PFN_cuGraphEventWaitNodeGetEvent_v11010
+#define PFN_cuGraphEventWaitNodeSetEvent  PFN_cuGraphEventWaitNodeSetEvent_v11010
+#define PFN_cuGraphAddExternalSemaphoresSignalNode  PFN_cuGraphAddExternalSemaphoresSignalNode_v11020
+#define PFN_cuGraphExternalSemaphoresSignalNodeGetParams  PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020
+#define PFN_cuGraphExternalSemaphoresSignalNodeSetParams  PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020
+#define PFN_cuGraphAddExternalSemaphoresWaitNode  PFN_cuGraphAddExternalSemaphoresWaitNode_v11020
+#define PFN_cuGraphExternalSemaphoresWaitNodeGetParams  PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020
+#define PFN_cuGraphExternalSemaphoresWaitNodeSetParams  PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020
+#define PFN_cuGraphAddBatchMemOpNode PFN_cuGraphAddBatchMemOpNode_v11070
+#define PFN_cuGraphBatchMemOpNodeGetParams PFN_cuGraphBatchMemOpNodeGetParams_v11070
+#define PFN_cuGraphBatchMemOpNodeSetParams PFN_cuGraphBatchMemOpNodeSetParams_v11070
+#define PFN_cuGraphExecBatchMemOpNodeSetParams PFN_cuGraphExecBatchMemOpNodeSetParams_v11070
+#define PFN_cuGraphClone  PFN_cuGraphClone_v10000
+#define PFN_cuGraphNodeFindInClone  PFN_cuGraphNodeFindInClone_v10000
+#define PFN_cuGraphNodeGetType  PFN_cuGraphNodeGetType_v10000
+#define PFN_cuGraphGetNodes  PFN_cuGraphGetNodes_v10000
+#define PFN_cuGraphGetRootNodes  PFN_cuGraphGetRootNodes_v10000
+#define PFN_cuGraphGetEdges  PFN_cuGraphGetEdges_v10000
+#define PFN_cuGraphNodeGetDependencies  PFN_cuGraphNodeGetDependencies_v10000
+#define PFN_cuGraphNodeGetDependentNodes  PFN_cuGraphNodeGetDependentNodes_v10000
+#define PFN_cuGraphAddDependencies  PFN_cuGraphAddDependencies_v10000
+#define PFN_cuGraphRemoveDependencies  PFN_cuGraphRemoveDependencies_v10000
+#define PFN_cuGraphDestroyNode  PFN_cuGraphDestroyNode_v10000
+#define PFN_cuGraphInstantiate  PFN_cuGraphInstantiate_v11000
+#define PFN_cuGraphInstantiateWithFlags  PFN_cuGraphInstantiateWithFlags_v11040
+#define PFN_cuGraphExecKernelNodeSetParams  PFN_cuGraphExecKernelNodeSetParams_v10010
+#define PFN_cuGraphExecMemcpyNodeSetParams  PFN_cuGraphExecMemcpyNodeSetParams_v10020
+#define PFN_cuGraphExecMemsetNodeSetParams  PFN_cuGraphExecMemsetNodeSetParams_v10020
+#define PFN_cuGraphExecHostNodeSetParams  PFN_cuGraphExecHostNodeSetParams_v10020
+#define PFN_cuGraphExecChildGraphNodeSetParams  PFN_cuGraphExecChildGraphNodeSetParams_v11010
+#define PFN_cuGraphExecEventRecordNodeSetEvent  PFN_cuGraphExecEventRecordNodeSetEvent_v11010
+#define PFN_cuGraphExecEventWaitNodeSetEvent  PFN_cuGraphExecEventWaitNodeSetEvent_v11010
+#define PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams  PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020
+#define PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams  PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020
+#define PFN_cuGraphUpload  __API_TYPEDEF_PTSZ(PFN_cuGraphUpload, 11010, 11010)
+#define PFN_cuGraphLaunch  __API_TYPEDEF_PTSZ(PFN_cuGraphLaunch, 10000, 10000)
+#define PFN_cuGraphExecDestroy  PFN_cuGraphExecDestroy_v10000
+#define PFN_cuGraphDestroy  PFN_cuGraphDestroy_v10000
+#define PFN_cuGraphExecUpdate  PFN_cuGraphExecUpdate_v10020
+#define PFN_cuGraphKernelNodeCopyAttributes  PFN_cuGraphKernelNodeCopyAttributes_v11000
+#define PFN_cuGraphKernelNodeGetAttribute  PFN_cuGraphKernelNodeGetAttribute_v11000
+#define PFN_cuGraphKernelNodeSetAttribute  PFN_cuGraphKernelNodeSetAttribute_v11000
+#define PFN_cuGraphDebugDotPrint  PFN_cuGraphDebugDotPrint_v11030
+#define PFN_cuGraphAddMemAllocNode  PFN_cuGraphAddMemAllocNode_v11040
+#define PFN_cuGraphMemAllocNodeGetParams PFN_cuGraphMemAllocNodeGetParams_v11040
+#define PFN_cuGraphAddMemFreeNode  PFN_cuGraphAddMemFreeNode_v11040
+#define PFN_cuGraphMemFreeNodeGetParams PFN_cuGraphMemFreeNodeGetParams_v11040
+#define PFN_cuGraphNodeSetEnabled PFN_cuGraphNodeSetEnabled_v11060
+#define PFN_cuGraphNodeGetEnabled PFN_cuGraphNodeGetEnabled_v11060
+#define PFN_cuDeviceGraphMemTrim  PFN_cuDeviceGraphMemTrim_v11040
+#define PFN_cuDeviceGetGraphMemAttribute  PFN_cuDeviceGetGraphMemAttribute_v11040
+#define PFN_cuDeviceSetGraphMemAttribute  PFN_cuDeviceSetGraphMemAttribute_v11040
+#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor  PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050
+#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags  PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000
+#define PFN_cuOccupancyMaxPotentialBlockSize  PFN_cuOccupancyMaxPotentialBlockSize_v6050
+#define PFN_cuOccupancyMaxPotentialBlockSizeWithFlags  PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000
+#define PFN_cuOccupancyAvailableDynamicSMemPerBlock  PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020
+#define PFN_cuOccupancyMaxPotentialClusterSize  PFN_cuOccupancyMaxPotentialClusterSize_v11070
+#define PFN_cuOccupancyMaxActiveClusters  PFN_cuOccupancyMaxActiveClusters_v11070
+#define PFN_cuTexRefSetArray  PFN_cuTexRefSetArray_v2000
+#define PFN_cuTexRefSetMipmappedArray  PFN_cuTexRefSetMipmappedArray_v5000
+#define PFN_cuTexRefSetAddress  PFN_cuTexRefSetAddress_v3020
+#define PFN_cuTexRefSetAddress2D  PFN_cuTexRefSetAddress2D_v4010
+#define PFN_cuTexRefSetFormat  PFN_cuTexRefSetFormat_v2000
+#define PFN_cuTexRefSetAddressMode  PFN_cuTexRefSetAddressMode_v2000
+#define PFN_cuTexRefSetFilterMode  PFN_cuTexRefSetFilterMode_v2000
+#define PFN_cuTexRefSetMipmapFilterMode  PFN_cuTexRefSetMipmapFilterMode_v5000
+#define PFN_cuTexRefSetMipmapLevelBias  PFN_cuTexRefSetMipmapLevelBias_v5000
+#define PFN_cuTexRefSetMipmapLevelClamp  PFN_cuTexRefSetMipmapLevelClamp_v5000
+#define PFN_cuTexRefSetMaxAnisotropy  PFN_cuTexRefSetMaxAnisotropy_v5000
+#define PFN_cuTexRefSetBorderColor  PFN_cuTexRefSetBorderColor_v8000
+#define PFN_cuTexRefSetFlags  PFN_cuTexRefSetFlags_v2000
+#define PFN_cuTexRefGetAddress  PFN_cuTexRefGetAddress_v3020
+#define PFN_cuTexRefGetArray  PFN_cuTexRefGetArray_v2000
+#define PFN_cuTexRefGetMipmappedArray  PFN_cuTexRefGetMipmappedArray_v5000
+#define PFN_cuTexRefGetAddressMode  PFN_cuTexRefGetAddressMode_v2000
+#define PFN_cuTexRefGetFilterMode  PFN_cuTexRefGetFilterMode_v2000
+#define PFN_cuTexRefGetFormat  PFN_cuTexRefGetFormat_v2000
+#define PFN_cuTexRefGetMipmapFilterMode  PFN_cuTexRefGetMipmapFilterMode_v5000
+#define PFN_cuTexRefGetMipmapLevelBias  PFN_cuTexRefGetMipmapLevelBias_v5000
+#define PFN_cuTexRefGetMipmapLevelClamp  PFN_cuTexRefGetMipmapLevelClamp_v5000
+#define PFN_cuTexRefGetMaxAnisotropy  PFN_cuTexRefGetMaxAnisotropy_v5000
+#define PFN_cuTexRefGetBorderColor  PFN_cuTexRefGetBorderColor_v8000
+#define PFN_cuTexRefGetFlags  PFN_cuTexRefGetFlags_v2000
+#define PFN_cuTexRefCreate  PFN_cuTexRefCreate_v2000
+#define PFN_cuTexRefDestroy  PFN_cuTexRefDestroy_v2000
+#define PFN_cuSurfRefSetArray  PFN_cuSurfRefSetArray_v3000
+#define PFN_cuSurfRefGetArray  PFN_cuSurfRefGetArray_v3000
+#define PFN_cuTexObjectCreate  PFN_cuTexObjectCreate_v5000
+#define PFN_cuTexObjectDestroy  PFN_cuTexObjectDestroy_v5000
+#define PFN_cuTexObjectGetResourceDesc  PFN_cuTexObjectGetResourceDesc_v5000
+#define PFN_cuTexObjectGetTextureDesc  PFN_cuTexObjectGetTextureDesc_v5000
+#define PFN_cuTexObjectGetResourceViewDesc  PFN_cuTexObjectGetResourceViewDesc_v5000
+#define PFN_cuSurfObjectCreate  PFN_cuSurfObjectCreate_v5000
+#define PFN_cuSurfObjectDestroy  PFN_cuSurfObjectDestroy_v5000
+#define PFN_cuSurfObjectGetResourceDesc  PFN_cuSurfObjectGetResourceDesc_v5000
+#define PFN_cuDeviceCanAccessPeer  PFN_cuDeviceCanAccessPeer_v4000
+#define PFN_cuCtxEnablePeerAccess  PFN_cuCtxEnablePeerAccess_v4000
+#define PFN_cuCtxDisablePeerAccess  PFN_cuCtxDisablePeerAccess_v4000
+#define PFN_cuDeviceGetP2PAttribute  PFN_cuDeviceGetP2PAttribute_v8000
+#define PFN_cuGraphicsUnregisterResource  PFN_cuGraphicsUnregisterResource_v3000
+#define PFN_cuGraphicsSubResourceGetMappedArray  PFN_cuGraphicsSubResourceGetMappedArray_v3000
+#define PFN_cuGraphicsResourceGetMappedMipmappedArray  PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000
+#define PFN_cuGraphicsResourceGetMappedPointer  PFN_cuGraphicsResourceGetMappedPointer_v3020
+#define PFN_cuGraphicsResourceSetMapFlags  PFN_cuGraphicsResourceSetMapFlags_v6050
+#define PFN_cuGraphicsMapResources  __API_TYPEDEF_PTSZ(PFN_cuGraphicsMapResources, 3000, 7000)
+#define PFN_cuGraphicsUnmapResources  __API_TYPEDEF_PTSZ(PFN_cuGraphicsUnmapResources, 3000, 7000)
+#define PFN_cuGetExportTable  PFN_cuGetExportTable_v3000
+#define PFN_cuFuncGetModule  PFN_cuFuncGetModule_v11000
+#define PFN_cuFlushGPUDirectRDMAWrites PFN_cuFlushGPUDirectRDMAWrites_v11030
+#define PFN_cuGetProcAddress  PFN_cuGetProcAddress_v11030
+#define PFN_cuUserObjectCreate  PFN_cuUserObjectCreate_v11030
+#define PFN_cuUserObjectRetain  PFN_cuUserObjectRetain_v11030
+#define PFN_cuUserObjectRelease  PFN_cuUserObjectRelease_v11030
+#define PFN_cuGraphRetainUserObject  PFN_cuGraphRetainUserObject_v11030
+#define PFN_cuGraphReleaseUserObject  PFN_cuGraphReleaseUserObject_v11030
+#define PFN_cuModuleGetLoadingMode  PFN_cuModuleGetLoadingMode_v11070
+#define PFN_cuMemGetHandleForAddressRange  PFN_cuMemGetHandleForAddressRange_v11070
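+
+/*
+ * The unversioned PFN_* macros above always name the newest typedef for a
+ * given entry point, so code that loads the driver API dynamically can stay
+ * version-agnostic; a minimal sketch, assuming the CUDA 11.x cuGetProcAddress
+ * signature and CU_GET_PROC_ADDRESS_DEFAULT, with error handling omitted:
+ *
+ *   PFN_cuMemAlloc pfnMemAlloc = NULL;   // expands to PFN_cuMemAlloc_v3020
+ *   cuGetProcAddress("cuMemAlloc", (void **)&pfnMemAlloc,
+ *                    11000, CU_GET_PROC_ADDRESS_DEFAULT);
+ *   CUdeviceptr dptr = 0;
+ *   if (pfnMemAlloc)
+ *       pfnMemAlloc(&dptr, 1 << 20);     // allocate 1 MiB of device memory
+ */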
+
+/*
+ * Type definitions for functions defined in cuda.h
+ */
+typedef CUresult (CUDAAPI *PFN_cuGetErrorString_v6000)(CUresult error, const char **pStr);
+typedef CUresult (CUDAAPI *PFN_cuGetErrorName_v6000)(CUresult error, const char **pStr);
+typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGet_v2000)(CUdevice_v1 *device, int ordinal);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetCount_v2000)(int *count);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetName_v2000)(char *name, int len, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v9020)(CUuuid *uuid, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v11040)(CUuuid *uuid, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetLuid_v10000)(char *luid, unsigned int *deviceNodeMask, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v3020)(size_t *bytes, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010)(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetAttribute_v2000)(int *pi, CUdevice_attribute attrib, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetNvSciSyncAttributes_v10020)(void *nvSciSyncAttrList, CUdevice_v1 dev, int flags);
+typedef CUresult (CUDAAPI *PFN_cuDeviceSetMemPool_v11020)(CUdevice_v1 dev, CUmemoryPool pool);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetMemPool_v11020)(CUmemoryPool *pool, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetDefaultMemPool_v11020)(CUmemoryPool *pool_out, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetProperties_v2000)(CUdevprop_v1 *prop, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceComputeCapability_v2000)(int *major, int *minor, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRetain_v7000)(CUcontext *pctx, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v11000)(CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v11000)(CUdevice_v1 dev, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxGetState_v7000)(CUdevice_v1 dev, unsigned int *flags, int *active);
+typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v11000)(CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetExecAffinitySupport_v11040)(int *pi, CUexecAffinityType type, CUdevice dev);
+typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v3020)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v11040)(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v4000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v4000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v4000)(CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetCurrent_v4000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetCurrent_v4000)(CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetDevice_v2000)(CUdevice_v1 *device);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetFlags_v7000)(unsigned int *flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxSynchronize_v2000)(void);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetLimit_v3010)(CUlimit limit, size_t value);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetLimit_v3010)(size_t *pvalue, CUlimit limit);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetCacheConfig_v3020)(CUfunc_cache *pconfig);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetCacheConfig_v3020)(CUfunc_cache config);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetSharedMemConfig_v4020)(CUsharedconfig *pConfig);
+typedef CUresult (CUDAAPI *PFN_cuCtxSetSharedMemConfig_v4020)(CUsharedconfig config);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetApiVersion_v3020)(CUcontext ctx, unsigned int *version);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetStreamPriorityRange_v5050)(int *leastPriority, int *greatestPriority);
+typedef CUresult (CUDAAPI *PFN_cuCtxResetPersistingL2Cache_v11000)(void);
+typedef CUresult (CUDAAPI *PFN_cuCtxAttach_v2000)(CUcontext *pctx, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxDetach_v2000)(CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuCtxGetExecAffinity_v11040)(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type);
+typedef CUresult (CUDAAPI *PFN_cuModuleLoad_v2000)(CUmodule *module, const char *fname);
+typedef CUresult (CUDAAPI *PFN_cuModuleLoadData_v2000)(CUmodule *module, const void *image);
+typedef CUresult (CUDAAPI *PFN_cuModuleLoadDataEx_v2010)(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult (CUDAAPI *PFN_cuModuleLoadFatBinary_v2000)(CUmodule *module, const void *fatCubin);
+typedef CUresult (CUDAAPI *PFN_cuModuleUnload_v2000)(CUmodule hmod);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetFunction_v2000)(CUfunction *hfunc, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v3020)(CUdeviceptr_v2 *dptr, size_t *bytes, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetTexRef_v2000)(CUtexref *pTexRef, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetSurfRef_v3000)(CUsurfref *pSurfRef, CUmodule hmod, const char *name);
+typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v6050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v6050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v6050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
+typedef CUresult (CUDAAPI *PFN_cuLinkComplete_v5050)(CUlinkState state, void **cubinOut, size_t *sizeOut);
+typedef CUresult (CUDAAPI *PFN_cuLinkDestroy_v5050)(CUlinkState state);
+typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v3020)(size_t *free, size_t *total);
+typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v3020)(CUdeviceptr_v2 *dptr, size_t bytesize);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v3020)(CUdeviceptr_v2 *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
+typedef CUresult (CUDAAPI *PFN_cuMemFree_v3020)(CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v3020)(CUdeviceptr_v2 *pbase, size_t *psize, CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v3020)(void **pp, size_t bytesize);
+typedef CUresult (CUDAAPI *PFN_cuMemFreeHost_v2000)(void *p);
+typedef CUresult (CUDAAPI *PFN_cuMemHostAlloc_v2020)(void **pp, size_t bytesize, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v3020)(CUdeviceptr_v2 *pdptr, void *p, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuMemHostGetFlags_v2030)(unsigned int *pFlags, void *p);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocManaged_v6000)(CUdeviceptr_v2 *dptr, size_t bytesize, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetByPCIBusId_v4010)(CUdevice_v1 *dev, const char *pciBusId);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetPCIBusId_v4010)(char *pciBusId, int len, CUdevice_v1 dev);
+typedef CUresult (CUDAAPI *PFN_cuIpcGetEventHandle_v4010)(CUipcEventHandle_v1 *pHandle, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuIpcOpenEventHandle_v4010)(CUevent *phEvent, CUipcEventHandle_v1 handle);
+typedef CUresult (CUDAAPI *PFN_cuIpcGetMemHandle_v4010)(CUipcMemHandle_v1 *pHandle, CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v11000)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuIpcCloseMemHandle_v4010)(CUdeviceptr_v2 dptr);
+typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v6050)(void *p, size_t bytesize, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuMemHostUnregister_v4000)(void *p);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy_v7000_ptds)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v7000_ptds)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v7000_ptds)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v7000_ptds)(const CUDA_MEMCPY3D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v7000_ptds)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v7000_ptsz)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v7000_ptsz)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v7000_ptsz)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v7000_ptsz)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v7000_ptsz)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v7000_ptsz)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v7000_ptsz)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v2 *pAllocateArray);
+typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v3020)(CUDA_ARRAY_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
+typedef CUresult (CUDAAPI *PFN_cuArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUarray array);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUmipmappedArray mipmap);
+typedef CUresult (CUDAAPI *PFN_cuArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUarray array, CUdevice device);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUmipmappedArray mipmap, CUdevice device);
+typedef CUresult (CUDAAPI *PFN_cuArrayGetPlane_v11020)(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx);
+typedef CUresult (CUDAAPI *PFN_cuArrayDestroy_v2000)(CUarray hArray);
+typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pAllocateArray);
+typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v3020)(CUDA_ARRAY3D_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayCreate_v5000)(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pMipmappedArrayDesc, unsigned int numMipmapLevels);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetLevel_v5000)(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level);
+typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayDestroy_v5000)(CUmipmappedArray hMipmappedArray);
+typedef CUresult (CUDAAPI *PFN_cuMemAddressReserve_v10020)(CUdeviceptr_v2 *ptr, size_t size, size_t alignment, CUdeviceptr_v2 addr, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemAddressFree_v10020)(CUdeviceptr_v2 ptr, size_t size);
+typedef CUresult (CUDAAPI *PFN_cuMemCreate_v10020)(CUmemGenericAllocationHandle_v1 *handle, size_t size, const CUmemAllocationProp_v1 *prop, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemRelease_v10020)(CUmemGenericAllocationHandle_v1 handle);
+typedef CUresult (CUDAAPI *PFN_cuMemMap_v10020)(CUdeviceptr_v2 ptr, size_t size, size_t offset, CUmemGenericAllocationHandle_v1 handle, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010_ptsz)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemUnmap_v10020)(CUdeviceptr_v2 ptr, size_t size);
+typedef CUresult (CUDAAPI *PFN_cuMemSetAccess_v10020)(CUdeviceptr_v2 ptr, size_t size, const CUmemAccessDesc_v1 *desc, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAccess_v10020)(unsigned long long *flags, const CUmemLocation_v1 *location, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuMemExportToShareableHandle_v10020)(void *shareableHandle, CUmemGenericAllocationHandle_v1 handle, CUmemAllocationHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemImportFromShareableHandle_v10020)(CUmemGenericAllocationHandle_v1 *handle, void *osHandle, CUmemAllocationHandleType shHandleType);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationGranularity_v10020)(size_t *granularity, const CUmemAllocationProp_v1 *prop, CUmemAllocationGranularity_flags option);
+typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationPropertiesFromHandle_v10020)(CUmemAllocationProp_v1 *prop, CUmemGenericAllocationHandle_v1 handle);
+typedef CUresult (CUDAAPI *PFN_cuMemRetainAllocationHandle_v11000)(CUmemGenericAllocationHandle_v1 *handle, void *addr);
+typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020_ptsz)(CUdeviceptr_v2 dptr, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolTrimTo_v11020)(CUmemoryPool pool, size_t minBytesToKeep);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAccess_v11020)(CUmemoryPool pool, const CUmemAccessDesc_v1 *map, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAccess_v11020)(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation_v1 *location);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolCreate_v11020)(CUmemoryPool *pool, const CUmemPoolProps_v1 *poolProps);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolDestroy_v11020)(CUmemoryPool pool);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolExportToShareableHandle_v11020)(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolImportFromShareableHandle_v11020)(CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolExportPointer_v11020)(CUmemPoolPtrExportData_v1 *shareData_out, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuMemPoolImportPointer_v11020)(CUdeviceptr_v2 *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData_v1 *shareData);
+typedef CUresult (CUDAAPI *PFN_cuPointerGetAttribute_v4000)(void *data, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000_ptsz)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAdvise_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUmem_advise advice, CUdevice_v1 device);
+typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttribute_v8000)(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr_v2 devPtr, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttributes_v8000)(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr_v2 devPtr, size_t count);
+typedef CUresult (CUDAAPI *PFN_cuPointerSetAttribute_v6000)(const void *value, CUpointer_attribute attribute, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuPointerGetAttributes_v7000)(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr_v2 ptr);
+typedef CUresult (CUDAAPI *PFN_cuStreamCreate_v2000)(CUstream *phStream, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamCreateWithPriority_v5050)(CUstream *phStream, unsigned int flags, int priority);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v7000_ptsz)(CUstream hStream, int *priority);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v7000_ptsz)(CUstream hStream, unsigned int *flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020_ptsz)(CUstream hStream, CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v7000_ptsz)(CUstream hStream, CUevent hEvent, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v7000_ptsz)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010_ptsz)(CUstream hStream, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuThreadExchangeStreamCaptureMode_v10010)(CUstreamCaptureMode *mode);
+typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000_ptsz)(CUstream hStream, CUgraph *phGraph);
+typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030_ptsz)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v7000_ptsz)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v7000_ptsz)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v7000_ptsz)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v4000)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000_ptsz)(CUstream dst, CUstream src);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *value);
+typedef CUresult (CUDAAPI *PFN_cuEventCreate_v2000)(CUevent *phEvent, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuEventRecord_v7000_ptsz)(CUevent hEvent, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010_ptsz)(CUevent hEvent, CUstream hStream, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuEventQuery_v2000)(CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuEventSynchronize_v2000)(CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v4000)(CUevent hEvent);
+typedef CUresult (CUDAAPI *PFN_cuEventElapsedTime_v2000)(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+typedef CUresult (CUDAAPI *PFN_cuImportExternalMemory_v10000)(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 *memHandleDesc);
+typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedBuffer_v10000)(CUdeviceptr_v2 *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 *bufferDesc);
+typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedMipmappedArray_v10000)(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 *mipmapDesc);
+typedef CUresult (CUDAAPI *PFN_cuDestroyExternalMemory_v10000)(CUexternalMemory extMem);
+typedef CUresult (CUDAAPI *PFN_cuImportExternalSemaphore_v10000)(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 *semHandleDesc);
+typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuDestroyExternalSemaphore_v10000)(CUexternalSemaphore extSem);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams_v1 *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuFuncGetAttribute_v2020)(int *pi, CUfunction_attribute attrib, CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetAttribute_v9000)(CUfunction hfunc, CUfunction_attribute attrib, int value);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetCacheConfig_v3000)(CUfunction hfunc, CUfunc_cache config);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedMemConfig_v4020)(CUfunction hfunc, CUsharedconfig config);
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v7000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060_ptsz)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
+typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
+typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernelMultiDevice_v9000)(CUDA_LAUNCH_PARAMS_v1 *launchParamsList, unsigned int numDevices, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000_ptsz)(CUstream hStream, CUhostFn fn, void *userData);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetBlockShape_v2000)(CUfunction hfunc, int x, int y, int z);
+typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedSize_v2000)(CUfunction hfunc, unsigned int bytes);
+typedef CUresult (CUDAAPI *PFN_cuParamSetSize_v2000)(CUfunction hfunc, unsigned int numbytes);
+typedef CUresult (CUDAAPI *PFN_cuParamSeti_v2000)(CUfunction hfunc, int offset, unsigned int value);
+typedef CUresult (CUDAAPI *PFN_cuParamSetf_v2000)(CUfunction hfunc, int offset, float value);
+typedef CUresult (CUDAAPI *PFN_cuParamSetv_v2000)(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes);
+typedef CUresult (CUDAAPI *PFN_cuLaunch_v2000)(CUfunction f);
+typedef CUresult (CUDAAPI *PFN_cuLaunchGrid_v2000)(CUfunction f, int grid_width, int grid_height);
+typedef CUresult (CUDAAPI *PFN_cuLaunchGridAsync_v2000)(CUfunction f, int grid_width, int grid_height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuParamSetTexRef_v2000)(CUfunction hfunc, int texunit, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuGraphCreate_v10000)(CUgraph *phGraph, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddKernelNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetParams_v10000)(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemcpyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMCPY3D_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemsetNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddHostNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeGetParams_v10000)(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddChildGraphNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphChildGraphNodeGetGraph_v10000)(CUgraphNode hNode, CUgraph *phGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddEmptyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddEventRecordNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddEventWaitNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresSignalNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *params_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresWaitNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *params_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddBatchMemOpNode_v11070)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeGetParams_v11070)(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeSetParams_v11070)(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecBatchMemOpNodeSetParams_v11070)(CUgraphExec graphExec, CUgraphNode node, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphClone_v10000)(CUgraph *phGraphClone, CUgraph originalGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeFindInClone_v10000)(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetType_v10000)(CUgraphNode hNode, CUgraphNodeType *type);
+typedef CUresult (CUDAAPI *PFN_cuGraphGetNodes_v10000)(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes);
+typedef CUresult (CUDAAPI *PFN_cuGraphGetRootNodes_v10000)(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes);
+typedef CUresult (CUDAAPI *PFN_cuGraphGetEdges_v10000)(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependencies_v10000)(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependentNodes_v10000)(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphRemoveDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies);
+typedef CUresult (CUDAAPI *PFN_cuGraphDestroyNode_v10000)(CUgraphNode hNode);
+typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v11000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
+typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithFlags_v11040)(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecKernelNodeSetParams_v10010)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecMemcpyNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecMemsetNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecHostNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecChildGraphNodeSetParams_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecEventRecordNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecEventWaitNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000_ptsz)(CUgraphExec hGraphExec, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecDestroy_v10000)(CUgraphExec hGraphExec);
+typedef CUresult (CUDAAPI *PFN_cuGraphDestroy_v10000)(CUgraph hGraph);
+typedef CUresult (CUDAAPI *PFN_cuGraphExecUpdate_v10020)(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeCopyAttributes_v11000)(CUgraphNode dst, CUgraphNode src);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue_v1 *value_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue_v1 *value);
+typedef CUresult (CUDAAPI *PFN_cuGraphDebugDotPrint_v11030)(CUgraph hGraph, const char *path, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemAllocNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemAllocNodeGetParams_v11040)(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphAddMemFreeNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr);
+typedef CUresult (CUDAAPI *PFN_cuGraphMemFreeNodeGetParams_v11040)(CUgraphNode hNode, CUdeviceptr *dptr_out);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeSetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled);
+typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGraphMemTrim_v11040)(CUdevice device);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
+typedef CUresult (CUDAAPI *PFN_cuDeviceSetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSize_v6050)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020)(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialClusterSize_v11070)(int *clusterSize, CUfunction func, const CUlaunchConfig *config);
+typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveClusters_v11070)(int *numClusters, CUfunction func, const CUlaunchConfig *config);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetArray_v2000)(CUtexref hTexRef, CUarray hArray, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmappedArray_v5000)(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v3020)(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr_v2 dptr, size_t bytes);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v4010)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetFormat_v2000)(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddressMode_v2000)(CUtexref hTexRef, int dim, CUaddress_mode am);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetFilterMode_v2000)(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapFilterMode_v5000)(CUtexref hTexRef, CUfilter_mode fm);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelBias_v5000)(CUtexref hTexRef, float bias);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelClamp_v5000)(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetMaxAnisotropy_v5000)(CUtexref hTexRef, unsigned int maxAniso);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetBorderColor_v8000)(CUtexref hTexRef, float *pBorderColor);
+typedef CUresult (CUDAAPI *PFN_cuTexRefSetFlags_v2000)(CUtexref hTexRef, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v3020)(CUdeviceptr_v2 *pdptr, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetArray_v2000)(CUarray *phArray, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmappedArray_v5000)(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddressMode_v2000)(CUaddress_mode *pam, CUtexref hTexRef, int dim);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetFilterMode_v2000)(CUfilter_mode *pfm, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetFormat_v2000)(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapFilterMode_v5000)(CUfilter_mode *pfm, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelBias_v5000)(float *pbias, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelClamp_v5000)(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetMaxAnisotropy_v5000)(int *pmaxAniso, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetBorderColor_v8000)(float *pBorderColor, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefGetFlags_v2000)(unsigned int *pFlags, CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefCreate_v2000)(CUtexref *pTexRef);
+typedef CUresult (CUDAAPI *PFN_cuTexRefDestroy_v2000)(CUtexref hTexRef);
+typedef CUresult (CUDAAPI *PFN_cuSurfRefSetArray_v3000)(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuSurfRefGetArray_v3000)(CUarray *phArray, CUsurfref hSurfRef);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectCreate_v5000)(CUtexObject_v1 *pTexObject, const CUDA_RESOURCE_DESC_v1 *pResDesc, const CUDA_TEXTURE_DESC_v1 *pTexDesc, const CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectDestroy_v5000)(CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectGetTextureDesc_v5000)(CUDA_TEXTURE_DESC_v1 *pTexDesc, CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceViewDesc_v5000)(CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc, CUtexObject_v1 texObject);
+typedef CUresult (CUDAAPI *PFN_cuSurfObjectCreate_v5000)(CUsurfObject_v1 *pSurfObject, const CUDA_RESOURCE_DESC_v1 *pResDesc);
+typedef CUresult (CUDAAPI *PFN_cuSurfObjectDestroy_v5000)(CUsurfObject_v1 surfObject);
+typedef CUresult (CUDAAPI *PFN_cuSurfObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUsurfObject_v1 surfObject);
+typedef CUresult (CUDAAPI *PFN_cuDeviceCanAccessPeer_v4000)(int *canAccessPeer, CUdevice_v1 dev, CUdevice_v1 peerDev);
+typedef CUresult (CUDAAPI *PFN_cuCtxEnablePeerAccess_v4000)(CUcontext peerContext, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuCtxDisablePeerAccess_v4000)(CUcontext peerContext);
+typedef CUresult (CUDAAPI *PFN_cuDeviceGetP2PAttribute_v8000)(int *value, CUdevice_P2PAttribute attrib, CUdevice_v1 srcDevice, CUdevice_v1 dstDevice);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsUnregisterResource_v3000)(CUgraphicsResource resource);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsSubResourceGetMappedArray_v3000)(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000)(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, CUgraphicsResource resource);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v6050)(CUgraphicsResource resource, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGetExportTable_v3000)(const void **ppExportTable, const CUuuid *pExportTableId);
+typedef CUresult (CUDAAPI *PFN_cuFuncGetModule_v11000)(CUmodule *hmod, CUfunction hfunc);
+typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v3020)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v3020)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v3020)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v3020)(const CUDA_MEMCPY2D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v3020)(const CUDA_MEMCPY3D_v2 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v3020)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v3020)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount);
+typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy);
+typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v5050)(CUstream hStream, int *priority);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v5050)(CUstream hStream, unsigned int *flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020)(CUstream hStream, CUcontext *pctx);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v3020)(CUstream hStream, CUevent hEvent, unsigned int Flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v5000)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v6000)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v2000)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v2000)(CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuEventRecord_v2000)(CUevent hEvent, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010)(CUevent hEvent, CUstream hStream, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v4000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra);
+typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000)(CUstream hStream, CUhostFn fn, void *userData);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams);
+typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream);
+typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010)(CUstream hStream, CUstreamCaptureMode mode);
+typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000)(CUstream hStream, CUgraph *phGraph);
+typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000)(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out);
+typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010)(CUgraphExec hGraph, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000)(CUgraphExec hGraph, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000)(CUstream dstStream, CUstream srcStream);
+typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value);
+typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *param);
+typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020)(CUdeviceptr_v2 dptr, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream);
+typedef CUresult (CUDAAPI *PFN_cuFlushGPUDirectRDMAWrites_v11030)(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope);
+typedef CUresult (CUDAAPI *PFN_cuUserObjectCreate_v11030)(CUuserObject *object_out, void *ptr, CUhostFn destroy, unsigned int initialRefcount, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuUserObjectRetain_v11030)(CUuserObject object, unsigned int count);
+typedef CUresult (CUDAAPI *PFN_cuUserObjectRelease_v11030)(CUuserObject object, unsigned int count);
+typedef CUresult (CUDAAPI *PFN_cuGraphRetainUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags);
+typedef CUresult (CUDAAPI *PFN_cuGraphReleaseUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count);
+typedef CUresult (CUDAAPI *PFN_cuModuleGetLoadingMode_v11070)(CUmoduleLoadingMode *mode);
+typedef CUresult (CUDAAPI *PFN_cuMemGetHandleForAddressRange_v11070)(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags);
+
+/*
+ * Type definitions for older versioned functions in cuda.h
+ */
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v4000)(void *p, size_t bytesize, unsigned int Flags);
+    typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v3000)(CUgraphicsResource resource, unsigned int flags);
+    typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v5050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut);
+    typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v5050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues);
+    typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v5050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v3020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch);
+    typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v2000)(unsigned int *bytes, CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v2000)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v2000)(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name);
+    typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v2000)(unsigned int *free, unsigned int *total);
+    typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v2000)(CUdeviceptr_v1 *dptr, unsigned int bytesize);
+    typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v2000)(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes);
+    typedef CUresult (CUDAAPI *PFN_cuMemFree_v2000)(CUdeviceptr_v1 dptr);
+    typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v2000)(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr);
+    typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v2000)(void **pp, unsigned int bytesize);
+    typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v2020)(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v2000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v2000)(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v2000)(const CUDA_MEMCPY2D_v1 *pCopy);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v2000)(const CUDA_MEMCPY3D_v1 *pCopy);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v2000)(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v2000)(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v2000)(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v2000)(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height);
+    typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height);
+    typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray);
+    typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v2000)(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray);
+    typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v2000)(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v2000)(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v2020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch);
+    typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v2000)(CUdeviceptr_v1 *pdptr, CUtexref hTexRef);
+    typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource);
+    typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v2000)(CUcontext ctx);
+    typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v2000)(CUcontext *pctx);
+    typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v2000)(CUcontext ctx);
+    typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v2000)(CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v2000)(CUevent hEvent);
+    typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v7000)(CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v7000)(CUdevice_v1 dev);
+    typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v7000)(CUdevice_v1 dev, unsigned int flags);
+    typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000)(CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000_ptsz)(CUstream hStream);
+    typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v4010)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags);
+    typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v10000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize);
+#endif
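+
+/*
+ * Usage sketch (illustrative, not part of the upstream header): these PFN_*
+ * typedefs pair with the driver entry-point loader so that a specific
+ * versioned implementation can be fetched at run time. hostBuf, devPtr and
+ * numBytes below are placeholders.
+ *
+ *     PFN_cuMemcpyDtoH_v2000 pfnMemcpyDtoH = NULL;
+ *     CUresult status = cuGetProcAddress("cuMemcpyDtoH",
+ *                                        (void **)&pfnMemcpyDtoH,
+ *                                        2000, CU_GET_PROC_ADDRESS_DEFAULT);
+ *     if (status == CUDA_SUCCESS && pfnMemcpyDtoH != NULL) {
+ *         status = pfnMemcpyDtoH(hostBuf, devPtr, numBytes);
+ *     }
+ */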
+
+#ifdef __cplusplus
+}
+#endif // __cplusplus
+
+#endif // file guard
diff --git a/ext/cudart/include/cuda_awbarrier.h b/ext/cudart/include/cuda_awbarrier.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a7fe8a370330454f8a49e083899a50f7dc527ce
--- /dev/null
+++ b/ext/cudart/include/cuda_awbarrier.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_AWBARRIER_H_
+# define _CUDA_AWBARRIER_H_
+
+# include "cuda_awbarrier_primitives.h"
+
+# if !defined(_CUDA_AWBARRIER_SM_TARGET)
+#  error This file requires compute capability 7.0 or greater.
+# endif
+
+# if !defined(_CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER)
+#  error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+             -std=c++11 compiler option.
+# endif
+
+_CUDA_AWBARRIER_BEGIN_NAMESPACE
+
+class awbarrier {
+public:
+    class arrival_token {
+    public:
+        arrival_token() = default;
+        ~arrival_token() = default;
+        _CUDA_AWBARRIER_QUALIFIER uint32_t pending_count() const;
+    private:
+        _CUDA_AWBARRIER_QUALIFIER arrival_token(uint64_t token);
+        uint64_t token;
+        friend awbarrier;
+    };
+    awbarrier() = default;
+    awbarrier(const awbarrier&) = delete;
+    awbarrier& operator=(const awbarrier&) = delete;
+    ~awbarrier() = default;
+
+    _CUDA_AWBARRIER_QUALIFIER arrival_token arrive();
+    _CUDA_AWBARRIER_QUALIFIER arrival_token arrive_and_drop();
+    _CUDA_AWBARRIER_QUALIFIER bool timed_wait(arrival_token token, uint32_t hint_cycles);
+    _CUDA_AWBARRIER_QUALIFIER void wait(arrival_token token);
+    _CUDA_AWBARRIER_QUALIFIER void arrive_and_wait();
+    _CUDA_AWBARRIER_STATIC_QUALIFIER __host__ constexpr uint32_t max();
+private:
+    uint64_t barrier;
+    friend _CUDA_AWBARRIER_QUALIFIER void init(awbarrier* barrier, uint32_t expected_count);
+    friend _CUDA_AWBARRIER_QUALIFIER void inval(awbarrier* barrier);
+    friend class pipeline;
+};
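+
+/*
+ * Usage sketch (illustrative, not part of the upstream header): a block-wide
+ * arrive/wait barrier placed in shared memory. compute_partial_result() is a
+ * placeholder for independent per-thread work.
+ *
+ *     __global__ void kernel() {
+ *         __shared__ nvcuda::experimental::awbarrier bar;
+ *         if (threadIdx.x == 0) {
+ *             nvcuda::experimental::init(&bar, blockDim.x);
+ *         }
+ *         __syncthreads();                        // publish the initialization
+ *
+ *         awbarrier::arrival_token token = bar.arrive();  // signal arrival
+ *         compute_partial_result();               // overlap independent work
+ *         bar.wait(token);                        // block until all arrived
+ *     }
+ */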
+
+_CUDA_AWBARRIER_QUALIFIER
+uint32_t awbarrier::arrival_token::pending_count() const
+{
+    const uint32_t pending_count = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(this->token);
+#if (__CUDA_ARCH__ >= 900)
+    return pending_count;
+#else
+    return (pending_count >> 15);
+#endif
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+awbarrier::arrival_token::arrival_token(uint64_t token)
+    : token(token)
+{
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+void init(awbarrier* barrier, uint32_t expected_count)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT);
+
+#if (__CUDA_ARCH__ >= 900)
+    const uint32_t init_count = expected_count;
+#else
+    const uint32_t init_count = (expected_count << 15) + expected_count;
+#endif
+
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(&barrier->barrier, init_count);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+void inval(awbarrier* barrier)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(&barrier->barrier);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+awbarrier::arrival_token awbarrier::arrive()
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+ #if (__CUDA_ARCH__ < 900)
+    const uint32_t arrive_count = 1 << 15;
+    const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<false>(&this->barrier, arrive_count);
+    (void)
+#else
+    const uint64_t token =
+ #endif
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier);
+
+    return arrival_token(token);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+awbarrier::arrival_token awbarrier::arrive_and_drop()
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+ #if (__CUDA_ARCH__ < 900)
+    const uint32_t arrive_count = 1 << 15;
+    const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<true>(&this->barrier, arrive_count);
+    (void)
+#else
+    const uint64_t token =
+ #endif
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier);
+
+    return arrival_token(token);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+bool awbarrier::timed_wait(arrival_token token, uint32_t hint_cycles)
+{
+    constexpr uint64_t max_busy_wait_cycles = 1024;
+    constexpr uint32_t max_sleep_ns = 1 << 20;
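+    // Annotation (not in the upstream header): poll with test_wait, busy-wait
+    // for roughly max_busy_wait_cycles, then back off with __nanosleep(),
+    // doubling the sleep interval up to max_sleep_ns (~1 ms), until the phase
+    // completes or about hint_cycles have elapsed.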
+
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
+        return true;
+    }
+
+    uint64_t start_cycles = clock64();
+    uint64_t elapsed_cycles = 0;
+    uint32_t sleep_ns = 32;
+    while (elapsed_cycles < hint_cycles) {
+        if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
+            return true;
+        }
+
+        if (elapsed_cycles > max_busy_wait_cycles) {
+            __nanosleep(sleep_ns);
+            if (sleep_ns < max_sleep_ns) {
+                sleep_ns *= 2;
+            }
+        }
+
+        elapsed_cycles = clock64() - start_cycles;
+    }
+
+    return false;
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+void awbarrier::wait(arrival_token token)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    while (!timed_wait(token, ~0u));
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+void awbarrier::arrive_and_wait()
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    this->wait(this->arrive());
+}
+
+_CUDA_AWBARRIER_QUALIFIER __host__
+constexpr uint32_t awbarrier::max()
+{
+    return _CUDA_AWBARRIER_MAX_COUNT;
+}
+
+_CUDA_AWBARRIER_END_NAMESPACE
+
+#endif /* !_CUDA_AWBARRIER_H_ */
diff --git a/ext/cudart/include/cuda_awbarrier_helpers.h b/ext/cudart/include/cuda_awbarrier_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..a112fea7830daf2934afff4aa6c14f0787a9f161
--- /dev/null
+++ b/ext/cudart/include/cuda_awbarrier_helpers.h
@@ -0,0 +1,350 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_AWBARRIER_HELPERS_H_
+#define _CUDA_AWBARRIER_HELPERS_H_
+
+#define _CUDA_AWBARRIER_NAMESPACE       nvcuda::experimental
+#define _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
+#define _CUDA_AWBARRIER_END_NAMESPACE   } }
+
+#define _CUDA_AWBARRIER_INTERNAL_NAMESPACE       _CUDA_AWBARRIER_NAMESPACE::__awbarrier_internal
+#define _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace __awbarrier_internal {
+#define _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE   } _CUDA_AWBARRIER_END_NAMESPACE
+
+# if !defined(_CUDA_AWBARRIER_QUALIFIER)
+#  define _CUDA_AWBARRIER_QUALIFIER inline __device__
+# endif
+# if !defined(_CUDA_AWBARRIER_STATIC_QUALIFIER)
+#  define _CUDA_AWBARRIER_STATIC_QUALIFIER static inline __device__
+#endif
+
+#if defined(__CUDA_ARCH__)
+#if  (__CUDA_ARCH__ >= 800)
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_80
+#elif (__CUDA_ARCH__ >= 700)
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
+#endif // No support < 700
+#else // !defined(__CUDA_ARCH__)
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
+#endif // defined(__CUDA_ARCH__)
+
+#define _CUDA_AWBARRIER_MAX_COUNT ((1 << 14) - 1)
+
+#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
+# define _CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER
+#endif
+
+#if !defined(_CUDA_AWBARRIER_DEBUG)
+# if defined(__CUDACC_DEBUG__)
+#  define _CUDA_AWBARRIER_DEBUG 1
+# else
+#  define _CUDA_AWBARRIER_DEBUG 0
+# endif
+#endif
+
+#if defined(_CUDA_AWBARRIER_DEBUG) && (_CUDA_AWBARRIER_DEBUG == 1) && !defined(NDEBUG)
+# if !defined(__CUDACC_RTC__)
+#  include <cassert>
+# endif
+# define _CUDA_AWBARRIER_ASSERT(x) assert((x));
+# define _CUDA_AWBARRIER_ABORT() assert(0);
+#else
+# define _CUDA_AWBARRIER_ASSERT(x)
+# define _CUDA_AWBARRIER_ABORT() __trap();
+#endif
+
+#if defined(__CUDACC_RTC__)
+typedef unsigned short     uint16_t;
+typedef unsigned int       uint32_t;
+typedef unsigned long long uint64_t;
+typedef uint64_t           uintptr_t;
+#else
+# include <stdint.h>
+#endif
+
+#if defined(_CUDA_AWBARRIER_SM_TARGET)
+
+typedef uint64_t __mbarrier_t;
+typedef uint64_t __mbarrier_token_t;
+
+_CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE
+
+extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
+
+namespace _CUDA_AWBARRIER_SM_70 {
+    union AWBarrier {
+        struct {
+            uint32_t expected;
+            uint32_t pending;
+        } split;
+        uint64_t raw;
+    };
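+
+    /*
+     * Annotation (not part of the upstream header): on sm_70-class devices the
+     * arrive/wait barrier is emulated in software. The 64-bit barrier word is
+     * split into an "expected" and a "pending" counter; arrivals atomicAdd into
+     * "pending", and completion of a phase is signalled by a flip of the top
+     * bit, which __awbarrier_test_wait compares against the bit captured in the
+     * arrival token.
+     */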
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+        _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
+
+        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
+
+        awbarrier->split.expected = 0x40000000 - expected_count;
+        awbarrier->split.pending = 0x80000000 - expected_count;
+    }
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    void __awbarrier_inval(uint64_t* barrier) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    }
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    uint32_t __awbarrier_token_pending_count(uint64_t token) {
+        const uint32_t pending = token >> 32;
+        return 0x80000000 - (pending & 0x7fffffff);
+    }
+
+    template<bool _Drop>
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    uint64_t __awbarrier_arrive_drop(uint64_t* barrier) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
+
+        while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
+
+        if (_Drop) {
+            (void)atomicAdd_block(&awbarrier->split.expected, 1);
+        }
+
+        __threadfence_block();
+
+        const uint32_t old_pending = atomicAdd_block(&awbarrier->split.pending, 1);
+        const uint32_t new_pending = old_pending + 1;
+        const bool reset = (old_pending ^ new_pending) & 0x80000000;
+
+        if (reset) {
+            __threadfence_block();
+
+            uint32_t new_expected = *reinterpret_cast<volatile uint32_t*>(&awbarrier->split.expected);
+            new_expected &= ~0x40000000;
+            if (new_expected & 0x20000000) {
+                new_expected |= 0x40000000;
+            }
+            atomicAdd_block(&awbarrier->split.pending, new_expected);
+        }
+
+        return static_cast<uint64_t>(old_pending) << 32;
+    }
+
+    template<bool _Drop>
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+        _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
+
+        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
+
+        while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
+
+        if (_Drop) {
+            (void)atomicAdd_block(&awbarrier->split.expected, count);
+        }
+
+        return static_cast<uint64_t>(atomicAdd_block(&awbarrier->split.pending, count)) << 32;
+    }
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+        volatile AWBarrier* awbarrier = reinterpret_cast<volatile AWBarrier*>(barrier);
+
+        return ((token >> 32) ^ awbarrier->split.pending) & 0x80000000;
+    }
+}; // namespace _CUDA_AWBARRIER_SM_70
+
+namespace _CUDA_AWBARRIER_SM_80 {
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+        _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
+
+        asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
+                :
+                : "r"(__nvvm_get_smem_pointer(barrier)), "r"(expected_count)
+                : "memory");
+    }
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    void __awbarrier_inval(uint64_t* barrier) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+        asm volatile ("mbarrier.inval.shared.b64 [%0];"
+                :
+                : "r"(__nvvm_get_smem_pointer(barrier))
+                : "memory");
+    }
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    uint32_t __awbarrier_token_pending_count(uint64_t token) {
+        uint32_t __pending_count;
+
+        asm ("mbarrier.pending_count.b64 %0, %1;"
+                : "=r"(__pending_count)
+                : "l"(token));
+        return __pending_count;
+    }
+
+    template<bool _Drop>
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    uint64_t __awbarrier_arrive_drop(uint64_t* barrier) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+        uint64_t token;
+
+        if (_Drop) {
+            asm volatile ("mbarrier.arrive_drop.shared.b64 %0, [%1];"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier))
+                    : "memory");
+        } else {
+            asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier))
+                    : "memory");
+        }
+
+        return token;
+    }
+
+    template<bool _Drop>
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+        _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
+
+        uint64_t token;
+
+        if (_Drop) {
+            asm volatile ("mbarrier.arrive_drop.noComplete.shared.b64 %0, [%1], %2;"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
+                    : "memory");
+        } else {
+            asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
+                    : "memory");
+        }
+
+        return token;
+    }
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+        uint16_t __wait_complete;
+
+        asm volatile ("{"
+                "    .reg .pred %%p;"
+                "    mbarrier.test_wait.shared.b64 %%p, [%1], %2;"
+                "    selp.u16 %0, 1, 0, %%p;"
+                "}"
+                : "=h"(__wait_complete)
+                : "r"(__nvvm_get_smem_pointer(barrier)), "l"(token)
+                : "memory");
+        return bool(__wait_complete);
+    }
+
+}; // namespace _CUDA_AWBARRIER_SM_80
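+
+/*
+ * Annotation (not part of the upstream header): on sm_80 and newer the same
+ * operations map directly onto the hardware mbarrier PTX instructions
+ * (mbarrier.init / .inval / .arrive / .arrive_drop / .test_wait), with
+ * __nvvm_get_smem_pointer() converting a generic pointer to the 32-bit
+ * shared-memory address form those instructions expect.
+ */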
+
+_CUDA_AWBARRIER_QUALIFIER
+void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
+{
+    _CUDA_AWBARRIER_SM_TARGET::__awbarrier_init(barrier, expected_count);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+void awbarrier_inval(uint64_t* barrier)
+{
+    _CUDA_AWBARRIER_SM_TARGET::__awbarrier_inval(barrier);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+uint32_t awbarrier_token_pending_count(uint64_t token)
+{
+    return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_token_pending_count(token);
+}
+
+template<bool _Drop>
+_CUDA_AWBARRIER_QUALIFIER
+uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t arrive_count)
+{
+    return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop_no_complete<_Drop>(barrier, arrive_count);
+}
+
+template<bool _Drop>
+_CUDA_AWBARRIER_QUALIFIER
+uint64_t awbarrier_arrive_drop(uint64_t* barrier)
+{
+    return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop<_Drop>(barrier);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
+{
+    return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_test_wait(barrier, token);
+}
+
+_CUDA_AWBARRIER_END_INTERNAL_NAMESPACE
+
+#endif /* defined(_CUDA_AWBARRIER_SM_TARGET) */
+
+#endif /* !_CUDA_AWBARRIER_HELPERS_H_ */
diff --git a/ext/cudart/include/cuda_awbarrier_primitives.h b/ext/cudart/include/cuda_awbarrier_primitives.h
new file mode 100644
index 0000000000000000000000000000000000000000..647110a3351477cdd8a197f53c5877648964ab8e
--- /dev/null
+++ b/ext/cudart/include/cuda_awbarrier_primitives.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_AWBARRIER_PRIMITIVES_H_
+#define _CUDA_AWBARRIER_PRIMITIVES_H_
+
+#include "cuda_awbarrier_helpers.h"
+
+#if !defined(_CUDA_AWBARRIER_SM_TARGET)
+# error This file requires compute capability 7.0 or greater.
+#endif
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER __host__
+uint32_t __mbarrier_maximum_count() {
+    return _CUDA_AWBARRIER_MAX_COUNT;
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+void __mbarrier_init(__mbarrier_t* barrier, uint32_t expected_count) {
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(barrier, expected_count);
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+void __mbarrier_inval(__mbarrier_t* barrier) {
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(barrier);
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+__mbarrier_token_t __mbarrier_arrive(__mbarrier_t* barrier) {
+    return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(barrier);
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+__mbarrier_token_t __mbarrier_arrive_and_drop(__mbarrier_t* barrier) {
+    return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(barrier);
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+bool __mbarrier_test_wait(__mbarrier_t* barrier, __mbarrier_token_t token) {
+    return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(barrier, token);
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+uint32_t __mbarrier_token_pending_count(__mbarrier_token_t token) {
+    return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(token);
+}
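+
+/*
+ * Usage sketch (illustrative, not part of the upstream header): the primitive
+ * interface operates on a raw __mbarrier_t in shared memory and mirrors the
+ * nvcuda::experimental::awbarrier class.
+ *
+ *     __shared__ __mbarrier_t bar;
+ *     if (threadIdx.x == 0) {
+ *         __mbarrier_init(&bar, blockDim.x);
+ *     }
+ *     __syncthreads();
+ *
+ *     __mbarrier_token_t token = __mbarrier_arrive(&bar);
+ *     while (!__mbarrier_test_wait(&bar, token)) {
+ *         // spin; production code would typically add a backoff
+ *     }
+ */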
+
+#endif /* !_CUDA_AWBARRIER_PRIMITIVES_H_ */
diff --git a/ext/cudart/include/cuda_bf16.h b/ext/cudart/include/cuda_bf16.h
new file mode 100644
index 0000000000000000000000000000000000000000..695863c5122984c5d343e9aa70e583d5e19f6d2e
--- /dev/null
+++ b/ext/cudart/include/cuda_bf16.h
@@ -0,0 +1,3749 @@
+/*
+* Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee.  Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users.  These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+/**
+* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16 Bfloat16 Precision Intrinsics
+* This section describes nv_bfloat16 precision intrinsic functions that are
+* only supported in device code.
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT16_ARITHMETIC Bfloat16 Arithmetic Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT162_ARITHMETIC Bfloat162 Arithmetic Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT16_COMPARISON Bfloat16 Comparison Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT162_COMPARISON Bfloat162 Comparison Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT16_MISC Bfloat16 Precision Conversion and Data Movement
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT16_FUNCTIONS Bfloat16 Math Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__BFLOAT162_FUNCTIONS Bfloat162 Math Functions
+* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16
+* To use these functions, include the header file \p cuda_bf16.h in your program.
+*/
+
+#ifndef __CUDA_BF16_H__
+#define __CUDA_BF16_H__
+
+#define ___CUDA_BF16_STRINGIFY_INNERMOST(x) #x
+#define __CUDA_BF16_STRINGIFY(x) ___CUDA_BF16_STRINGIFY_INNERMOST(x)
+
+#if defined(__cplusplus)
+#if defined(__CUDACC__)
+#define __CUDA_BF16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__
+#else
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static
+#endif /* defined(__CUDACC__) */
+
+#define __CUDA_BF16_TYPES_EXIST__
+
+/* Forward-declaration of structures defined in "cuda_bf16.hpp" */
+
+/**
+ * \brief nv_bfloat16 datatype 
+ * 
+ * \details This structure implements the datatype for storing 
+ * nv_bfloat16 floating-point numbers. The structure implements 
+ * assignment operators and type conversions. 16 bits are used in
+ * total: 1 sign bit, 8 exponent bits, and 7 explicitly stored
+ * significand bits, giving 8 bits of effective significand precision.
+ * 
+ */
+struct __nv_bfloat16;
+
+/**
+ * \brief nv_bfloat162 datatype
+ * 
+ * \details This structure implements the datatype for storing two 
+ * nv_bfloat16 floating-point numbers. 
+ * The structure implements assignment operators and type conversions. 
+ * 
+ */
+struct __nv_bfloat162;
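+
+/*
+ * Conversion sketch (illustrative, not part of the upstream header):
+ * nv_bfloat16 keeps the full float exponent range but only 8 bits of
+ * significand, i.e. roughly 2-3 significant decimal digits.
+ *
+ *     float x = 1.0f / 3.0f;                     // 0.33333334f
+ *     __nv_bfloat16 bx = __float2bfloat16(x);    // round-to-nearest-even
+ *     float back = __bfloat162float(bx);         // 0.333984375f
+ */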
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts double number to nv_bfloat16 precision in round-to-nearest-even mode
+* and returns \p nv_bfloat16 with converted value.
+*
+* \details Converts double number \p a to nv_bfloat16 precision in round-to-nearest-even mode.
+* \param[in] a - double. Is only being read.
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode
+* and returns \p nv_bfloat16 with converted value. 
+* 
+* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. 
+* \param[in] a - float. Is only being read. 
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode
+* and returns \p nv_bfloat16 with converted value.
+*
+* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode.
+* \param[in] a - float. Is only being read. 
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-towards-zero mode
+* and returns \p nv_bfloat16 with converted value.
+* 
+* \details Converts float number \p a to nv_bfloat16 precision in round-towards-zero mode.
+* \param[in] a - float. Is only being read. 
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-down mode
+* and returns \p nv_bfloat16 with converted value.
+* 
+* \details Converts float number \p a to nv_bfloat16 precision in round-down mode.
+* \param[in] a - float. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts float number to nv_bfloat16 precision in round-up mode
+* and returns \p nv_bfloat16 with converted value.
+* 
+* \details Converts float number \p a to nv_bfloat16 precision in round-up mode.
+* \param[in] a - float. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p a converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts \p nv_bfloat16 number to float.
+* 
+* \details Converts nv_bfloat16 number \p a to float.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* 
+* \returns float
+* - \p a converted to float. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts input to nv_bfloat16 precision in round-to-nearest-even mode and
+* populates both halves of \p nv_bfloat162 with converted value.
+*
+* \details Converts input \p a to nv_bfloat16 precision in round-to-nearest-even mode and
+* populates both halves of \p nv_bfloat162 with converted value.
+* \param[in] a - float. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 value with both halves equal to the converted nv_bfloat16
+* precision number.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts both input floats to nv_bfloat16 precision in round-to-nearest-even
+* mode and returns \p nv_bfloat162 with converted values.
+*
+* \details Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode
+* and combines the results into one \p nv_bfloat162 number. Low 16 bits of the return
+* value correspond to the input \p a, high 16 bits correspond to the input \p
+* b.
+* \param[in] a - float. Is only being read. 
+* \param[in] b - float. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 value with corresponding halves equal to the
+* converted input floats.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts low 16 bits of \p nv_bfloat162 to float and returns the result
+* 
+* \details Converts low 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns float
+* - The low 16 bits of \p a converted to float.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts high 16 bits of \p nv_bfloat162 to float and returns the result
+* 
+* \details Converts high 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns float
+* - The high 16 bits of \p a converted to float.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a);
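+
+/*
+ * Packing sketch (illustrative, not part of the upstream header): two floats
+ * packed into one __nv_bfloat162 and read back individually.
+ *
+ *     __nv_bfloat162 pair = __floats2bfloat162_rn(1.5f, -2.0f);
+ *     float lo = __low2float(pair);              // 1.5f  (exact in bfloat16)
+ *     float hi = __high2float(pair);             // -2.0f (exact in bfloat16)
+ */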
+
+#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts both components of float2 number to nv_bfloat16 precision in
+* round-to-nearest-even mode and returns \p nv_bfloat162 with converted values.
+* 
+* \details Converts both components of float2 to nv_bfloat16 precision in round-to-nearest
+* mode and combines the results into one \p nv_bfloat162 number. Low 16 bits of the
+* return value correspond to \p a.x and high 16 bits of the return value
+* correspond to \p a.y.
+* \param[in] a - float2. Is only being read. 
+*  
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 which has corresponding halves equal to the
+* converted float2 components.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Converts both halves of \p nv_bfloat162 to float2 and returns the result.
+* 
+* \details Converts both halves of \p nv_bfloat162 input \p a to float2 and returns the
+* result.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns float2
+* - \p a converted to float2.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-to-nearest-even mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-towards-zero mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-down mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-down mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed integer in round-up mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in
+* round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-to-nearest-even mode.
+* 
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-towards-zero mode.
+* 
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-down mode.
+* 
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed integer to a nv_bfloat16 in round-up mode.
+* 
+* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i);
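+
+/*
+ * Rounding-mode sketch (illustrative, not part of the upstream header):
+ * 257 needs 9 significand bits, so it falls exactly between the representable
+ * neighbours 256 and 258 and each mode resolves it differently.
+ *
+ *     __int2bfloat16_rn(257);   // -> 256 (ties to even)
+ *     __int2bfloat16_rz(257);   // -> 256 (toward zero)
+ *     __int2bfloat16_rd(257);   // -> 256 (toward -infinity)
+ *     __int2bfloat16_ru(257);   // -> 258 (toward +infinity)
+ */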
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed short integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed short integer in round-towards-zero mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed short
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed short integer in round-down mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed short integer in round-up mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed short integer to a nv_bfloat16 in round-to-nearest-even
+* mode.
+* 
+* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed short integer to a nv_bfloat16 in round-towards-zero mode.
+* 
+* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed short integer to a nv_bfloat16 in round-down mode.
+* 
+* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed short integer to a nv_bfloat16 in round-up mode.
+* 
+* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned integer in round-to-nearest-even mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
+* in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned int
+* - \p h converted to an unsigned integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned integer in round-towards-zero mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
+* in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned int
+* - \p h converted to an unsigned integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned integer in round-down mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
+* in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned integer in round-up mode.
+*
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer
+* in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned integer to a nv_bfloat16 in round-to-nearest-even mode.
+* 
+* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned integer to a nv_bfloat16 in round-towards-zero mode.
+* 
+* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.  
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned integer to a nv_bfloat16 in round-down mode.
+* 
+* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned integer to a nv_bfloat16 in round-up mode.
+* 
+* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned short integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned short integer in round-towards-zero
+* mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned short integer in round-down mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer. 
+*/
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned short integer in round-up mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer. 
+*/
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned short integer to a nv_bfloat16 in round-to-nearest-even
+* mode.
+* 
+* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned short integer to a nv_bfloat16 in round-towards-zero
+* mode.
+* 
+* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned short integer to a nv_bfloat16 in round-down mode.
+* 
+* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned short integer to a nv_bfloat16 in round-up mode.
+* 
+* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
+* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-towards-zero
+* mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
+* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-down mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
+* integer in round-down mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-up mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit
+* integer in round-up mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-to-nearest-even
+* mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-towards-zero
+* mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-down mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16.  
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-up mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
+* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-towards-zero mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
+* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-down mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
+* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-up mode.
+* 
+* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit
+* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-to-nearest-even
+* mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-towards-zero mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-towards-zero mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-down mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-down mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-up mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point
+* value in round-up mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - \p i converted to nv_bfloat16. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i);
+
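+/*
+* Illustrative usage sketch (an editorial addition, not part of the original header;
+* kernel and buffer names are placeholders): the directed-rounding conversions
+* declared above can be paired so that the round-down and round-up results bracket
+* the exact integer value.
+*
+*   __global__ void convert_bounds(const unsigned int *in, __nv_bfloat16 *lo,
+*                                  __nv_bfloat16 *hi, int n)
+*   {
+*       int idx = blockIdx.x * blockDim.x + threadIdx.x;
+*       if (idx < n) {
+*           lo[idx] = __uint2bfloat16_rd(in[idx]); // rounds towards negative infinity
+*           hi[idx] = __uint2bfloat16_ru(in[idx]); // rounds towards positive infinity
+*       }
+*   }
+*/
+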
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Truncate input argument to the integral part.
+* 
+* \details Round \p h to the nearest integer value that does not exceed \p h in
+* magnitude.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The truncated integer value. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculate ceiling of the input argument.
+* 
+* \details Compute the smallest integer value not less than \p h.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The smallest integer value not less than \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+* 
+* \details Calculate the largest integer value which is less than or equal to \p h.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The largest integer value which is less than or equal to \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+* 
+* \details Round \p h to the nearest integer value in nv_bfloat16 floating-point
+* format, with halfway cases rounded to the nearest even integer value.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The nearest integer to \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h);
+
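+/*
+* Illustrative sketch (an editorial addition, not part of the original header; the
+* helper name is a placeholder): the four scalar rounding functions above differ
+* only in rounding direction. For an input equal to 2.5: htrunc gives 2, hfloor
+* gives 2, hceil gives 3, and hrint gives 2 (halfway cases go to the nearest even
+* integer).
+*
+*   __device__ __nv_bfloat16 round_toward_zero(const __nv_bfloat16 h)
+*   {
+*       return htrunc(h); // drops the fractional part, keeping the sign
+*   }
+*/
+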
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Truncate \p nv_bfloat162 vector input argument to the integral part.
+* 
+* \details Round each component of vector \p h to the nearest integer value that does
+* not exceed \p h in magnitude.
+* \param[in] h - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The truncated \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculate \p nv_bfloat162 vector ceiling of the input argument.
+* 
+* \details For each component of vector \p h compute the smallest integer value not less
+* than \p h.
+* \param[in] h - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector of smallest integers not less than \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+* 
+* \details For each component of vector \p h calculate the largest integer value which
+* is less than or equal to \p h.
+* \param[in] h - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector of largest integers which are less than or equal to \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Round input to nearest integer value in nv_bfloat16 floating-point
+* number.
+* 
+* \details Round each component of \p nv_bfloat162 vector \p h to the nearest integer value in
+* nv_bfloat16 floating-point format, with halfway cases rounded to the
+* nearest even integer value.
+* \param[in] h - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector of rounded integer values. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Returns \p nv_bfloat162 with both halves equal to the input value.
+* 
+* \details Returns a \p nv_bfloat162 number with both halves equal to the input
+* \p nv_bfloat16 number \p a.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector which has both its halves equal to the input \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Swaps both halves of the \p nv_bfloat162 input.
+* 
+* \details Swaps both halves of the \p nv_bfloat162 input and returns a new \p nv_bfloat162 number
+* with swapped halves.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - \p a with its halves being swapped. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines
+* into one \p nv_bfloat162 number. 
+* 
+* \details Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines into
+* one \p nv_bfloat162 number. The low 16 bits of input \p a are stored in the low 16 bits of
+* the return value, and the low 16 bits of input \p b are stored in the high 16 bits of
+* the return value. 
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The low 16 bits of \p a and of \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and
+* combines into one \p nv_bfloat162 number.
+* 
+* \details Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and combines into
+* one \p nv_bfloat162 number. The high 16 bits of input \p a are stored in the low 16 bits of
+* the return value, and the high 16 bits of input \p b are stored in the high 16 bits of
+* the return value.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The high 16 bits of \p a and of \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Returns high 16 bits of \p nv_bfloat162 input.
+*
+* \details Returns high 16 bits of \p nv_bfloat162 input \p a.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The high 16 bits of the input. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Returns low 16 bits of \p nv_bfloat162 input.
+*
+* \details Returns low 16 bits of \p nv_bfloat162 input \p a.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat16
+* - Returns \p nv_bfloat16 which contains low 16 bits of the input \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Checks if the input \p nv_bfloat16 number is infinite.
+* 
+* \details Checks if the input \p nv_bfloat16 number \p a is infinite. 
+* \param[in] a - nv_bfloat16. Is only being read. 
+* 
+* \returns int 
+* - -1 iff \p a is equal to negative infinity, 
+* - 1 iff \p a is equal to positive infinity, 
+* - 0 otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ int __hisinf(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number.
+* 
+* \details Combines the two input \p nv_bfloat16 numbers \p a and \p b into one \p nv_bfloat162 number.
+* Input \p a is stored in the low 16 bits of the return value, and input \p b is stored
+* in the high 16 bits of the return value.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The nv_bfloat162 with one nv_bfloat16 equal to \p a and the other to \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Extracts low 16 bits from \p nv_bfloat162 input.
+* 
+* \details Extracts low 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162
+* number which has both halves equal to the extracted bits.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The nv_bfloat162 with both halves equal to the low 16 bits of the input. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Extracts high 16 bits from \p nv_bfloat162 input.
+* 
+* \details Extracts high 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162
+* number which has both halves equal to the extracted bits.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The nv_bfloat162 with both halves equal to the high 16 bits of the input. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a);
+
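+/*
+* Illustrative sketch (an editorial addition, not part of the original header; names
+* are placeholders): packing two scalars into an __nv_bfloat162 with the helpers
+* above and taking them apart again.
+*
+*   __device__ void pack_unpack(const __nv_bfloat16 a, const __nv_bfloat16 b)
+*   {
+*       const __nv_bfloat162 v  = __halves2bfloat162(a, b); // a in low half, b in high half
+*       const __nv_bfloat16 lo  = __low2bfloat16(v);        // == a
+*       const __nv_bfloat16 hi  = __high2bfloat16(v);       // == b
+*       const __nv_bfloat162 sw = __lowhigh2highlow(v);     // halves swapped: (b, a)
+*       (void)lo; (void)hi; (void)sw;
+*   }
+*/
+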
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Reinterprets bits in a \p nv_bfloat16 as a signed short integer.
+* 
+* \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h
+* as a signed short integer. 
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns short int
+* - The reinterpreted value. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Reinterprets bits in a \p nv_bfloat16 as an unsigned short integer.
+* 
+* \details Reinterprets the bits in the nv_bfloat16 floating-point \p h
+* as an unsigned short number.
+* \param[in] h - nv_bfloat16. Is only being read. 
+* 
+* \returns unsigned short int
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Reinterprets bits in a signed short integer as a \p nv_bfloat16.
+* 
+* \details Reinterprets the bits in the signed short integer \p i as a
+* nv_bfloat16 floating-point number.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Reinterprets bits in an unsigned short integer as a \p nv_bfloat16.
+* 
+* \details Reinterprets the bits in the unsigned short integer \p i as a
+* nv_bfloat16 floating-point number.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i);
+
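+/*
+* Illustrative sketch (an editorial addition, not part of the original header; the
+* helper name is a placeholder): the *_as_* intrinsics above reinterpret bits
+* without changing them, so bit-level manipulations such as flipping the sign stay
+* exact.
+*
+*   __device__ __nv_bfloat16 flip_sign(const __nv_bfloat16 h)
+*   {
+*       const unsigned short bits = __bfloat16_as_ushort(h);
+*       return __ushort_as_bfloat16((unsigned short)(bits ^ 0x8000U)); // toggle the sign bit
+*   }
+*/
+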
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize    32
+#define __local_warpSize
+#endif
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. 
+* 
+* \details Returns the value of var held by the thread whose ID is given by delta. 
+* If width is less than warpSize then each subsection of the warp behaves as a separate 
+* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], 
+* the value returned corresponds to the value of var held by the thread with ID delta modulo width (i.e. 
+* within the same subsection). width must have a value which is a power of 2; 
+* results are undefined if width is not a power of 2, or is a number greater than 
+* warpSize. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - nv_bfloat162. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. 
+* The value of var held by the resulting lane ID is returned: in effect, var is shifted up 
+* the warp by delta threads. If width is less than warpSize then each subsection of the warp 
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index 
+* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. 
+* width must have a value which is a power of 2; results are undefined if width is not a power of 2, 
+* or is a number greater than warpSize. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - nv_bfloat162. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by adding delta to the caller's thread ID. 
+* The value of var held by the resulting thread ID is returned: this has the effect 
+* of shifting var down the warp by delta threads. If width is less than warpSize then 
+* each subsection of the warp behaves as a separate entity with a starting logical 
+* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread 
+* will not wrap around the value of width and so the upper delta threads 
+* will remain unchanged. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - nv_bfloat162. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* 
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p delta: 
+* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each 
+* group of width consecutive threads are able to access elements from earlier groups of threads, 
+* however if they attempt to access elements from later groups of threads their own value of var 
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree 
+* reduction and broadcast. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - nv_bfloat162. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. 
+* 
+* \details Returns the value of var held by the thread whose ID is given by delta. 
+* If width is less than warpSize then each subsection of the warp behaves as a separate 
+* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], 
+* the value returned corresponds to the value of var held by the thread with ID delta modulo width (i.e. 
+* within the same subsection). width must have a value which is a power of 2; 
+* results are undefined if width is not a power of 2, or is a number greater than 
+* warpSize. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - nv_bfloat16. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. 
+* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. 
+* The value of var held by the resulting lane ID is returned: in effect, var is shifted up 
+* the warp by delta threads. If width is less than warpSize then each subsection of the warp 
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index 
+* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. 
+* width must have a value which is a power of 2; results are undefined if width is not a power of 2, 
+* or is a number greater than warpSize. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - nv_bfloat16. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by adding delta to the caller's thread ID. 
+* The value of var held by the resulting thread ID is returned: this has the effect 
+* of shifting var down the warp by delta threads. If width is less than warpSize then 
+* each subsection of the warp behaves as a separate entity with a starting logical 
+* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread 
+* will not wrap around the value of width and so the upper delta threads 
+* will remain unchanged. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - nv_bfloat16. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* 
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with \p delta: 
+* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each 
+* group of width consecutive threads are able to access elements from earlier groups of threads, 
+* however if they attempt to access elements from later groups of threads their own value of var 
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree 
+* reduction and broadcast. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - nv_bfloat16. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width = warpSize);
+
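+/*
+* Illustrative sketch (an editorial addition, not part of the original header;
+* assumes a fully active warp, i.e. mask 0xffffffff, and uses __hadd2 declared
+* later in this header): a tree-style warp reduction over packed bfloat16 pairs
+* built on __shfl_down_sync from above.
+*
+*   __device__ __nv_bfloat162 warp_sum(__nv_bfloat162 v)
+*   {
+*       for (int offset = 16; offset > 0; offset >>= 1) {
+*           v = __hadd2(v, __shfl_down_sync(0xffffffffU, v, offset));
+*       }
+*       return v; // lane 0 now holds the per-warp sum in each half
+*   }
+*/
+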
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const  __nv_bfloat162 *const ptr);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed by `ptr`
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value);
+
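+/*
+* Illustrative sketch (an editorial addition, not part of the original header;
+* kernel and buffer names are placeholders): a streaming copy that reads through
+* the read-only data path with __ldg and writes with the global-level cache hint
+* of __stcg, both declared above.
+*
+*   __global__ void copy_bf16x2(const __nv_bfloat162 *__restrict__ src,
+*                               __nv_bfloat162 *dst, const int n)
+*   {
+*       const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+*       if (idx < n) {
+*           __stcg(dst + idx, __ldg(src + idx));
+*       }
+*   }
+*/
+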
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector if-equal comparison.
+* 
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector result of if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector not-equal comparison.
+* 
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector result of not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 result of less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The vector result of greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The nv_bfloat162 vector result of less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-than comparison.
+* 
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector result of greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison.
+* 
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The vector result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The vector result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The vector result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 vector result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The vector result of unordered less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison.
+*
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The \p nv_bfloat162 vector result of unordered greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Determine whether \p nv_bfloat162 argument is a NaN.
+*
+* \details Determine whether each nv_bfloat16 of input \p nv_bfloat162 number \p a is a NaN.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The nv_bfloat162 with the corresponding \p nv_bfloat16 results set to
+* 1.0 for NaN, 0.0 otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a);
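+
+/*
+* Illustrative sketch (an editorial addition, not part of the original header; the
+* helper name is a placeholder): the vector comparisons above return per-half masks
+* of 1.0 or 0.0, which can be inspected with the conversion and unpacking helpers
+* declared earlier in this header.
+*
+*   __device__ bool both_halves_equal(const __nv_bfloat162 a, const __nv_bfloat162 b)
+*   {
+*       const __nv_bfloat162 eq = __heq2(a, b); // 1.0 per half where a == b, else 0.0
+*       return (__bfloat162ushort_rz(__low2bfloat16(eq))  != 0U)
+*           && (__bfloat162ushort_rz(__high2bfloat16(eq)) != 0U);
+*   }
+*/
+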
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The sum of vectors \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The subtraction of vector \p b from \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The result of elementwise multiplying the vectors \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest
+* mode. Prevents floating-point contractions of mul+add into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The sum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
+* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The subtraction of vector \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode. Prevents floating-point contractions of mul+add
+* or sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise multiplying the vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector division in round-to-nearest-even mode.
+*
+* \details Divides \p nv_bfloat162 input vector \p a by input vector \p b in round-to-nearest
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-103
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise division of \p a with \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
+* returns the result.
+*
+* \details Calculates the absolute value of both halves of the input \p nv_bfloat162 number and
+* returns the result.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - Returns \p a with the absolute value of both halves. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0].
+* NaN results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The sum of \p a and \p b, with respect to saturation. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The subtraction of vector \p b from \p a, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The result of elementwise multiplication of vectors \p a and \p b, 
+* with respect to saturation. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even
+* mode.
+*
+* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat162 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-105
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* \param[in] c - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even
+* mode, with saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat162 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the
+* results to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* \param[in] c - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, 
+* with respect to saturation. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
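+/*
+* Editorial usage sketch (not part of the original NVIDIA header): contrasts
+* __hfma2 with __hfma2_sat on a packed pair. Both round once; the _sat form
+* additionally clamps each half to [0.0, 1.0] and flushes NaN results to +0.0.
+* The kernel and parameter names are illustrative assumptions.
+*/
+static __global__ void bf16x2_fma_vs_sat(const __nv_bfloat162 *w,
+                                         const __nv_bfloat162 *dw,
+                                         const __nv_bfloat162 step,
+                                         __nv_bfloat162 *raw,
+                                         __nv_bfloat162 *clamped,
+                                         const int n)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) {
+        raw[i]     = __hfma2(step, dw[i], w[i]);     /* fma, single rounding   */
+        clamped[i] = __hfma2_sat(step, dw[i], w[i]); /* same, then clamp [0,1] */
+    }
+}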
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Negates both halves of the input \p nv_bfloat162 number and returns the
+* result.
+*
+* \details Negates both halves of the input \p nv_bfloat162 number \p a and returns the result.
+* \internal
+* \req DEEPLEARN-SRM_REQ-101
+* \endinternal
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - Returns \p a with both halves negated. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Calculates the absolute value of input \p nv_bfloat16 number and returns the result.
+*
+* \details Calculates the absolute value of input \p nv_bfloat16 number and returns the result.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The absolute value of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-94
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The sum of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p nv_bfloat16 input \p b from input \p a in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-97
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The result of subtracting \p b from \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-99
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The result of multiplying \p a and \p b. 
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-94
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The sum of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p nv_bfloat16 input \p b from input \p a in
+* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-97
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of subtracting \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode. Prevents floating-point contractions of mul+add or sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-99
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of multiplying \p a and \p b.
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 division in round-to-nearest-even mode.
+* 
+* \details Divides \p nv_bfloat16 input \p a by input \p b in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-98
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+* 
+* \returns nv_bfloat16
+* - The result of dividing \p a by \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__  __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b);
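+/*
+* Editorial usage sketch (not part of the original NVIDIA header): a scalar
+* round trip through the arithmetic above. __float2bfloat16 and
+* __bfloat162float are the conversion helpers declared earlier in this header;
+* the kernel name and the particular formula are illustrative assumptions.
+*/
+static __global__ void bf16_scalar_demo(const float *x, float *out, const int n)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) {
+        const __nv_bfloat16 a   = __float2bfloat16(x[i]);
+        const __nv_bfloat16 one = __float2bfloat16(1.0f);
+        const __nv_bfloat16 two = __float2bfloat16(2.0f);
+        /* (a * 2 + 1) with two roundings; __hfma would round only once. */
+        const __nv_bfloat16 t = __hadd_rn(__hmul_rn(a, two), one);
+        /* Divide by (a + 1) in round-to-nearest-even mode. */
+        out[i] = __bfloat162float(__hdiv(t, __hadd(a, one)));
+    }
+}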
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat16 add of inputs \p a and \p b, in round-to-nearest-even mode,
+* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The sum of \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Subtracts \p nv_bfloat16 input \p b from input \p a in
+* round-to-nearest-even mode, and clamps the result to range [0.0, 1.0].
+* NaN results are flushed to +0.0.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The result of subtraction of \p b from \p a, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode, and clamps the result to range [0.0, 1.0].
+* NaN results are flushed to +0.0.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The result of multiplying \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode.
+*
+* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat16 add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-96
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+* \param[in] c - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The result of fused multiply-add operation on \p a, \p b, and \p c.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat16 add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the result
+* to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+* \param[in] c - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The result of fused multiply-add operation on \p a, \p b, and \p c, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Negates input \p nv_bfloat16 number and returns the result.
+*
+* \details Negates input \p nv_bfloat16 number and returns the result.
+* \internal
+* \req DEEPLEARN-SRM_REQ-100
+* \endinternal
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The negation of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector if-equal comparison and returns boolean true
+* iff both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of if-equal comparison
+* of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector not-equal comparison and returns boolean
+* true iff both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of not-equal comparison
+* of vectors \p a and \p b are true, 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-equal comparison and returns boolean
+* true iff both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of less-equal comparison
+* of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-equal comparison and returns boolean
+* true iff both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of greater-equal
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector less-than comparison and returns boolean
+* true iff both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of less-than comparison
+* of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector greater-than comparison and returns boolean
+* true iff both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+* 
+* \returns bool 
+* - true if both \p nv_bfloat16 results of greater-than
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison and returns
+* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered if-equal
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison and returns
+* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered not-equal
+* comparison of vectors \p a and \p b are true;
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison and returns
+* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered less-equal
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison and
+* returns boolean true iff both \p nv_bfloat16 results are true, boolean false
+* otherwise.
+*
+* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered
+* greater-equal comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered less-than comparison and returns
+* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise.
+*
+* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered less-than comparison of 
+* vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison and
+* returns boolean true iff both \p nv_bfloat16 results are true, boolean false
+* otherwise.
+*
+* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* \param[in] b - nv_bfloat162. Is only being read. 
+*
+* \returns bool
+* - true if both \p nv_bfloat16 results of unordered
+* greater-than comparison of vectors \p a and \p b are true;
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b);
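+/*
+* Editorial usage sketch (not part of the original NVIDIA header): contrasts
+* the ordered and unordered whole-vector reductions above. __hblt2 asks "are
+* both halves strictly less?" and a NaN half makes it false, while __hbltu2
+* treats a NaN half as true. Kernel and parameter names are illustrative.
+*/
+static __global__ void bf16x2_range_check(const __nv_bfloat162 *v,
+                                          const __nv_bfloat162 hi,
+                                          unsigned char *both_below,
+                                          unsigned char *below_or_nan,
+                                          const int n)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) {
+        both_below[i]   = __hblt2(v[i], hi)  ? 1 : 0; /* NaN half => false */
+        below_or_nan[i] = __hbltu2(v[i], hi) ? 1 : 0; /* NaN half => true  */
+    }
+}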
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 if-equal comparison.
+*
+* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of if-equal comparison of \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 not-equal comparison.
+*
+* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of not-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 less-equal comparison.
+*
+* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of less-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 greater-equal comparison.
+*
+* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of greater-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 less-than comparison.
+*
+* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of less-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 greater-than comparison.
+*
+* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of greater-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered if-equal comparison.
+*
+* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered if-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered not-equal comparison.
+*
+* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered not-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered less-equal comparison.
+*
+* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered less-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered greater-equal comparison.
+*
+* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered greater-equal comparison of \p a
+* and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered less-than comparison.
+*
+* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered less-than comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Performs \p nv_bfloat16 unordered greater-than comparison.
+*
+* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - nv_bfloat16. Is only being read. 
+* \param[in] b - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered greater-than comparison of \p a
+* and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Determine whether \p nv_bfloat16 argument is a NaN.
+*
+* \details Determine whether \p nv_bfloat16 value \p a is a NaN.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns bool
+* - true iff argument is NaN. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a);
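+/*
+* Editorial usage sketch (not part of the original NVIDIA header): screens a
+* scalar against a threshold while routing NaNs explicitly. __hlt is the
+* ordered test (NaN compares false), __hltu the unordered one (NaN compares
+* true), and __hisnan isolates NaN inputs. The helper name is illustrative.
+*/
+static __device__ int bf16_classify(const __nv_bfloat16 v, const __nv_bfloat16 thresh)
+{
+    if (__hisnan(v)) {
+        return 2;                    /* NaN input                              */
+    }
+    /* For non-NaN inputs the ordered (__hlt) and unordered (__hltu) tests
+       agree, so either could be used here; __hlt makes the intent explicit. */
+    return __hlt(v, thresh) ? 0 : 1; /* 0: below threshold, 1: at or above it */
+}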
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Calculates \p nv_bfloat16 maximum of two input values.
+*
+* \details Calculates \p nv_bfloat16 max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b. 
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Calculates \p nv_bfloat16 minimum of two input values.
+*
+* \details Calculates \p nv_bfloat16 min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Calculates \p nv_bfloat16 maximum of two input values, NaNs pass through.
+*
+* \details Calculates \p nv_bfloat16 max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b.
+* - If either of inputs is NaN, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_COMPARISON
+* \brief Calculates \p nv_bfloat16 minimum of two input values, NaNs pass through.
+*
+* \details Calculates \p nv_bfloat16 min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either of inputs is NaN, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b);
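+/*
+* Editorial usage sketch (not part of the original NVIDIA header): a clamp
+* helper built from the minimum/maximum functions above. With __hmin/__hmax a
+* NaN input falls through to one of the bounds, because the non-NaN operand is
+* returned; the _nan variants would propagate a canonical NaN instead. The
+* helper name is an illustrative assumption.
+*/
+static __device__ __nv_bfloat16 bf16_clamp(const __nv_bfloat16 v,
+                                           const __nv_bfloat16 lo,
+                                           const __nv_bfloat16 hi)
+{
+    /* NaN v: __hmax(v, lo) returns lo, so the final result is a bound, not
+       NaN. Swap in __hmax_nan/__hmin_nan if NaN should pass through. */
+    return __hmin(__hmax(v, lo), hi);
+}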
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode with relu saturation.
+*
+* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat16 add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* Then negative result is clamped to 0.
+* NaN result is converted to canonical NaN.
+* \param[in] a - nv_bfloat16. Is only being read.
+* \param[in] b - nv_bfloat16. Is only being read.
+* \param[in] c - nv_bfloat16. Is only being read.
+*
+* \returns nv_bfloat16
+* - The result of fused multiply-add operation on \p a, \p b, and \p c with relu saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Calculates \p nv_bfloat162 vector maximum of two inputs.
+*
+* \details Calculates \p nv_bfloat162 vector max(\p a, \p b).
+* Elementwise \p nv_bfloat16 operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise maximum of vectors \p a  and \p b
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Calculates \p nv_bfloat162 vector minimum of two inputs.
+*
+* \details Calculates \p nv_bfloat162 vector min(\p a, \p b).
+* Elementwise \p nv_bfloat16 operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either of inputs is NaN, the other input is returned.
+* - If both inputs are NaNs, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise minimum of vectors \p a  and \p b
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Calculates \p nv_bfloat162 vector maximum of two inputs, NaNs pass through.
+*
+* \details Calculates \p nv_bfloat162 vector max(\p a, \p b).
+* Elementwise \p nv_bfloat16 operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either of inputs is NaN, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise maximum of vectors \p a  and \p b, with NaNs pass through
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_COMPARISON
+* \brief Calculates \p nv_bfloat162 vector minimum of two inputs, NaNs pass through.
+*
+* \details Calculates \p nv_bfloat162 vector min(\p a, \p b).
+* Elementwise \p nv_bfloat16 operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either of inputs is NaN, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise minimum of vectors \p a  and \p b, with NaNs pass through
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even
+* mode with relu saturation.
+*
+* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b,
+* then performs a \p nv_bfloat162 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* Then negative result is clamped to 0.
+* NaN result is converted to canonical NaN.
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+* \param[in] c - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs fast complex multiply-accumulate
+*
+* \details Interprets vector \p nv_bfloat162 input pairs \p a, \p b, and \p c as
+* complex numbers in \p nv_bfloat16 precision and performs
+* complex multiply-accumulate operation: a*b + c
+* \param[in] a - nv_bfloat162. Is only being read.
+* \param[in] b - nv_bfloat162. Is only being read.
+* \param[in] c - nv_bfloat162. Is only being read.
+*
+* \returns nv_bfloat162
+* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c);
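+/*
+* Editorial usage sketch (not part of the original NVIDIA header): treats each
+* __nv_bfloat162 as one complex number in nv_bfloat16 precision and uses
+* __hcmadd to accumulate a running complex product, acc = a[i]*b[i] + acc.
+* The kernel is deliberately single-threaded to keep the accumulation pattern
+* obvious; its name and parameters are illustrative assumptions.
+*/
+static __global__ void bf16_complex_dot(const __nv_bfloat162 *a,
+                                        const __nv_bfloat162 *b,
+                                        __nv_bfloat162 *result,
+                                        const int n)
+{
+    if (blockIdx.x == 0 && threadIdx.x == 0) {
+        __nv_bfloat162 acc = __floats2bfloat162_rn(0.0f, 0.0f);
+        for (int i = 0; i < n; ++i) {
+            acc = __hcmadd(a[i], b[i], acc); /* complex multiply-accumulate */
+        }
+        *result = acc;
+    }
+}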
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 square root in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 square root of input \p a in round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The square root of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 reciprocal square root in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat16 reciprocal square root of input \p a in round-to-nearest
+* mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The reciprocal square root of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 reciprocal of input \p a in round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The reciprocal of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 natural logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 natural logarithm of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The natural logarithm of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 binary logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 binary logarithm of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The binary logarithm of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 decimal logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 decimal logarithm of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The decimal logarithm of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 natural exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 natural exponential function of input \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The natural exponential function on \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 binary exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 binary exponential function of input \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The binary exponential function on \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 decimal exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 decimal exponential function of input \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The decimal exponential function on \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 cosine in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 cosine of input \p a in round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The cosine of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS
+* \brief Calculates \p nv_bfloat16 sine in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat16 sine of input \p a in round-to-nearest-even mode.
+* \param[in] a - nv_bfloat16. Is only being read. 
+*
+* \returns nv_bfloat16
+* - The sine of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a);
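+/*
+* Editorial usage sketch (not part of the original NVIDIA header): chains the
+* scalar math functions above into a softplus-like transform, sqrt(log(1 +
+* exp(x))). This is a numerical sketch only, with no overflow handling; the
+* kernel name is an illustrative assumption.
+*/
+static __global__ void bf16_softplus_sqrt(const __nv_bfloat16 *x,
+                                          __nv_bfloat16 *out,
+                                          const int n)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) {
+        const __nv_bfloat16 one = __float2bfloat16(1.0f);
+        const __nv_bfloat16 sp  = hlog(__hadd(hexp(x[i]), one)); /* log(1+e^x) */
+        out[i] = hsqrt(sp);
+    }
+}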
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector square root in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 square root of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise square root on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector reciprocal square root in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 reciprocal square root of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise reciprocal square root on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 reciprocal of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise reciprocal on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector natural logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 natural logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise natural logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector binary logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 binary logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise binary logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector decimal logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p nv_bfloat162 decimal logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise decimal logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise exponential function on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector binary exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p nv_bfloat162 binary exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+*
+* \returns nv_bfloat162
+* - The elementwise binary exponential function on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector decimal exponential function in
+* round-to-nearest-even mode.
+* 
+* \details Calculates \p nv_bfloat162 decimal exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The elementwise decimal exponential function on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector cosine in round-to-nearest-even mode.
+* 
+* \details Calculates \p nv_bfloat162 cosine of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The elementwise cosine on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a);
+/**
+* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS
+* \brief Calculates \p nv_bfloat162 vector sine in round-to-nearest-even mode.
+* 
+* \details Calculates \p nv_bfloat162 sine of input vector \p a in round-to-nearest-even mode.
+* \param[in] a - nv_bfloat162. Is only being read. 
+* 
+* \returns nv_bfloat162
+* - The elementwise sine on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a);
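+/*
+* Editorial usage sketch (not part of the original NVIDIA header): builds a
+* packed logistic sigmoid, 1 / (1 + exp(-x)), from the vector functions above
+* (h2exp, h2rcp) plus __hneg2 and __hadd2. The kernel name is an illustrative
+* assumption.
+*/
+static __global__ void bf16x2_sigmoid(const __nv_bfloat162 *x,
+                                      __nv_bfloat162 *out,
+                                      const int n)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) {
+        const __nv_bfloat162 one = __floats2bfloat162_rn(1.0f, 1.0f);
+        const __nv_bfloat162 e   = h2exp(__hneg2(x[i])); /* exp(-x), per half */
+        out[i] = h2rcp(__hadd2(one, e));                 /* 1 / (1 + exp(-x)) */
+    }
+}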
+
+/**
+* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC
+* \brief Performs a vector add of \p val to the value stored at \p address in global or shared memory, and writes this
+* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the
+* two nv_bfloat16 elements; the entire __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access.
+* 
+* \details The location of \p address must be in global or shared memory. This operation has undefined
+* behavior otherwise. This operation is only supported by devices of compute capability 8.x and higher.
+* 
+* \param[in] address - __nv_bfloat162*. An address in global or shared memory.
+* \param[in] val - __nv_bfloat162. The value to be added.
+* 
+* \returns __nv_bfloat162
+* - The old value read from \p address.
+* 
+* \note_ref_guide_atomic
+*/
+__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val);
+
+/**
+* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC
+* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value
+* back to \p address. This operation is performed in one atomic operation.
+* 
+* \details The location of \p address must be in global or shared memory. This operation has undefined
+* behavior otherwise. This operation is only supported by devices of compute capability 8.x and higher.
+* 
+* \param[in] address - __nv_bfloat16*. An address in global or shared memory.
+* \param[in] val - __nv_bfloat16. The value to be added.
+* 
+* \returns __nv_bfloat16
+* - The old value read from \p address.
+* 
+* \note_ref_guide_atomic
+*/
+__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val);
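+/*
+* Editorial usage sketch (not part of the original NVIDIA header): bins values
+* into per-bucket accumulators with the bfloat16 atomicAdd overloads above
+* (compute capability 8.x and higher). Note that the __nv_bfloat162 overload
+* is atomic per nv_bfloat16 half, not as one 32-bit word. Kernel and parameter
+* names are illustrative assumptions.
+*/
+static __global__ void bf16_binned_sum(const __nv_bfloat16 *values,
+                                       const int *bin_of,
+                                       __nv_bfloat16 *bins,
+                                       const int n)
+{
+    const int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) {
+        /* Each thread adds its value into the accumulator for its bin. */
+        atomicAdd(&bins[bin_of[i]], values[i]);
+    }
+}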
+
+#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */
+
+#undef __CUDA_BF16_DECL__
+#undef __CUDA_HOSTDEVICE_BF16_DECL__
+
+#endif /* defined(__cplusplus) */
+
+/* Note the .hpp file is included even for host-side compilation, to capture the "nv_bfloat16" & "nv_bfloat162" definitions */
+#include "cuda_bf16.hpp"
+#undef ___CUDA_BF16_STRINGIFY_INNERMOST
+#undef __CUDA_BF16_STRINGIFY
+
+#endif /* end of include guard: __CUDA_BF16_H__ */
diff --git a/ext/cudart/include/cuda_bf16.hpp b/ext/cudart/include/cuda_bf16.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1e3858dc86d7a42270edff1c4ba801007494a38b
--- /dev/null
+++ b/ext/cudart/include/cuda_bf16.hpp
@@ -0,0 +1,2683 @@
+/*
+* Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee.  Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users.  These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+#if !defined(__CUDA_BF16_HPP__)
+#define __CUDA_BF16_HPP__
+
+#if !defined(__CUDA_BF16_H__)
+#error "Do not include this file directly. Instead, include cuda_bf16.h."
+#endif
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_BF16
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_BF16
+#endif
+
+/* C++11 header for std::move. 
+ * In RTC mode, std::move is provided implicitly; don't include the header
+ */
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__)
+#include <utility>
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) */
+
+/* C++ header for std::memcpy (used for type punning in host-side implementations).
+ * When compiling as a CUDA source file memcpy is provided implicitly.
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+
+/* Set up function decorations */
+#if defined(__CUDACC__)
+#define __CUDA_BF16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__
+#endif /* defined(__CUDACC__) */
+
+/* Set up structure-alignment attribute */
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+#define __CUDA_ALIGN__(n) alignas(n)    /* C++11 kindly gives us a keyword for this */
+#else /* defined(__CPP_VERSION_AT_LEAST_11_BF16)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+#endif /* defined(__CUDACC__) */
+
+/* Macros to allow nv_bfloat16 & nv_bfloat162 to be used by inline assembly */
+#define __BFLOAT16_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+#define __BFLOAT16_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
+#define __BFLOAT162_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#define __BFLOAT162_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
+
+/**
+* Types which allow static initialization of "nv_bfloat16" and "nv_bfloat162" until
+* these become an actual builtin. Note this initialization is as a
+* bitfield representation of "nv_bfloat16", and not a conversion from short->nv_bfloat16.
+* Such a representation will be deprecated in a future version of CUDA. 
+* (Note these are visible to non-nvcc compilers, including C-only compilation)
+*/
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __nv_bfloat16_raw;
+
+typedef struct __CUDA_ALIGN__(4) {
+    unsigned short x;
+    unsigned short y;
+} __nv_bfloat162_raw;
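+
+/*
+* Usage sketch (illustrative only, not part of the upstream NVIDIA header): because the
+* raw types are plain structs holding the bit pattern, they permit static initialization.
+* 0x3F80 is the bfloat16 bit pattern of 1.0f; the constant names are hypothetical.
+*
+*   static const __nv_bfloat16_raw  kOneRaw  = { 0x3F80 };
+*   static const __nv_bfloat162_raw kOnesRaw = { 0x3F80, 0x3F80 };
+*/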
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/* Hide GCC member initialization list warnings because of host/device in-function init requirement */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Weffc++"
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+/* class' : multiple assignment operators specified
+   The class has multiple assignment operators of a single type. This warning is informational */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( push )
+#pragma warning( disable:4522 )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+struct __CUDA_ALIGN__(2) __nv_bfloat16 {
+protected:
+    unsigned short __x;
+
+public:
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+    __nv_bfloat16() = default;
+#else
+    __CUDA_HOSTDEVICE__ __nv_bfloat16() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+    /* Convert to/from __nv_bfloat16_raw */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const __nv_bfloat16_raw &hr) : __x(hr.x) { }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const volatile __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const { __nv_bfloat16_raw ret; ret.x = __x; return ret; }
+    __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const volatile { __nv_bfloat16_raw ret; ret.x = __x; return ret; }
+
+#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__)
+    /* Construct from float/double */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const float f) { __x = __float2bfloat16(f).__x;  }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const double f) { __x = __double2bfloat16(f).__x;  }
+
+    __CUDA_HOSTDEVICE__ operator float() const { return __bfloat162float(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const float f) { __x = __float2bfloat16(f).__x; return *this; }
+
+    /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const double f) { __x = __double2bfloat16(f).__x; return *this; }
+
+/* Member functions only available to nvcc compilation so far */
+#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+    /* Allow automatic construction from types supported natively in hardware */
+    /* Note we do avoid constructor init-list because of special host/device compilation rules */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(short val) { __x = __short2bfloat16_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(int val) { __x = __int2bfloat16_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned int val) { __x = __uint2bfloat16_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(long long val) { __x = __ll2bfloat16_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; }
+
+    /* Allow automatic casts to supported builtin types, matching all that are permitted with float */
+    __CUDA_HOSTDEVICE__ operator short() const { return __bfloat162short_rz(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(short val) { __x = __short2bfloat16_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator unsigned short() const { return __bfloat162ushort_rz(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator int() const { return __bfloat162int_rz(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(int val) { __x = __int2bfloat16_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator unsigned int() const { return __bfloat162uint_rz(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator long long() const { return __bfloat162ll_rz(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(long long val) { __x = __ll2bfloat16_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __bfloat162ull_rz(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; return *this; }
+
+    /* Boolean conversion - note both 0 and -0 must return false */
+    __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFF) != 0; }
+#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */
+#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */
+};
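+
+/*
+* Usage sketch (illustrative only, not part of the upstream NVIDIA header): host-side
+* round trip through the implicit float conversions above, assuming
+* __CUDA_NO_BFLOAT16_CONVERSIONS__ is not defined. bfloat16 keeps only the top 7 explicit
+* mantissa bits of a float, so the round trip is lossy.
+*
+*   float bf16_round_trip(void)
+*   {
+*       const __nv_bfloat16 h = 3.14159f;   // float -> bfloat16, round-to-nearest-even
+*       return static_cast<float>(h);       // bfloat16 -> float, yields 3.140625f
+*   }
+*/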
+
+/* Global-space operator functions are only available to nvcc compilation */
+#if defined(__CUDACC__)
+
+#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)
+#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__)
+/* Some basic arithmetic operations expected of a builtin */
+__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hadd(lh, rh); }
+__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hsub(lh, rh); }
+__device__ __forceinline__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hmul(lh, rh); }
+__device__ __forceinline__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hdiv(lh, rh); }
+
+__device__ __forceinline__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hadd(lh, rh); return lh; }
+__device__ __forceinline__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hsub(lh, rh); return lh; }
+__device__ __forceinline__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hmul(lh, rh); return lh; }
+__device__ __forceinline__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hdiv(lh, rh); return lh; }
+
+/* Note for increment and decrement we use the raw value 0x3F80 equating to nv_bfloat16(1.0f), to avoid the extra conversion */
+__device__ __forceinline__ __nv_bfloat16 &operator++(__nv_bfloat16 &h)      { __nv_bfloat16_raw one; one.x = 0x3F80; h += one; return h; }
+__device__ __forceinline__ __nv_bfloat16 &operator--(__nv_bfloat16 &h)      { __nv_bfloat16_raw one; one.x = 0x3F80; h -= one; return h; }
+__device__ __forceinline__ __nv_bfloat16  operator++(__nv_bfloat16 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat16 ret = h;
+    __nv_bfloat16_raw one;
+    one.x = 0x3F80;
+    h += one;
+    return ret;
+}
+__device__ __forceinline__ __nv_bfloat16  operator--(__nv_bfloat16 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat16 ret = h;
+    __nv_bfloat16_raw one;
+    one.x = 0x3F80;
+    h -= one;
+    return ret;
+}
+/* Unary plus and inverse operators */
+__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16 &h) { return h; }
+__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16 &h) { return __hneg(h); }
+
+/* Some basic comparison operations to make it look like a builtin */
+__device__ __forceinline__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __heq(lh, rh); }
+__device__ __forceinline__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hneu(lh, rh); }
+__device__ __forceinline__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hgt(lh, rh); }
+__device__ __forceinline__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hlt(lh, rh); }
+__device__ __forceinline__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hge(lh, rh); }
+__device__ __forceinline__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hle(lh, rh); }
+#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */
+#endif /* __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) */
+#endif /* defined(__CUDACC__) */
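+
+/*
+* Usage sketch (illustrative only, not part of the upstream NVIDIA header): with the
+* operators above, device code on compute capability 8.x+ (and without
+* __CUDA_NO_BFLOAT16_OPERATORS__ defined) can treat __nv_bfloat16 much like a builtin
+* floating-point type. The kernel name is hypothetical.
+*
+*   __global__ void axpy_bf16(__nv_bfloat16 a, const __nv_bfloat16 *x,
+*                             __nv_bfloat16 *y, int n)
+*   {
+*       const int i = blockIdx.x * blockDim.x + threadIdx.x;
+*       if (i < n) {
+*           y[i] += a * x[i];   // lowers to __hmul() and __hadd()
+*       }
+*   }
+*/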
+
+/* __nv_bfloat162 is visible to non-nvcc host compilers */
+struct __CUDA_ALIGN__(4) __nv_bfloat162 {
+    __nv_bfloat16 x;
+    __nv_bfloat16 y;
+
+    // All construct/copy/assign/move
+public:
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+    __nv_bfloat162() = default;
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(__nv_bfloat162 &&src) { __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src)); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(__nv_bfloat162 &&src) { __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src)); return *this; }
+#else
+    __CUDA_HOSTDEVICE__ __nv_bfloat162() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat16 &a, const __nv_bfloat16 &b) : x(a), y(b) { }
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162 &src) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162 &src) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); return *this; }
+
+    /* Convert to/from __nv_bfloat162_raw */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162_raw &h2r ) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162_raw &h2r) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); return *this; }
+    __CUDA_HOSTDEVICE__ operator __nv_bfloat162_raw() const { __nv_bfloat162_raw ret; ret.x = 0U; ret.y = 0U; __BFLOAT162_TO_UI(ret) = __BFLOAT162_TO_CUI(*this); return ret; }
+};
+
+/* Global-space operator functions are only available to nvcc compilation */
+#if defined(__CUDACC__)
+
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) && !defined(__CUDA_NO_BFLOAT162_OPERATORS__)
+
+__device__ __forceinline__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hadd2(lh, rh); }
+__device__ __forceinline__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hsub2(lh, rh); }
+__device__ __forceinline__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hmul2(lh, rh); }
+__device__ __forceinline__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __h2div(lh, rh); }
+
+__device__ __forceinline__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hadd2(lh, rh); return lh; }
+__device__ __forceinline__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hsub2(lh, rh); return lh; }
+__device__ __forceinline__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hmul2(lh, rh); return lh; }
+__device__ __forceinline__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __h2div(lh, rh); return lh; }
+
+__device__ __forceinline__ __nv_bfloat162 &operator++(__nv_bfloat162 &h)      { __nv_bfloat162_raw one; one.x = 0x3F80; one.y = 0x3F80; h = __hadd2(h, one); return h; }
+__device__ __forceinline__ __nv_bfloat162 &operator--(__nv_bfloat162 &h)      { __nv_bfloat162_raw one; one.x = 0x3F80; one.y = 0x3F80; h = __hsub2(h, one); return h; }
+__device__ __forceinline__ __nv_bfloat162  operator++(__nv_bfloat162 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat162 ret = h;
+    __nv_bfloat162_raw one;
+    one.x = 0x3F80;
+    one.y = 0x3F80;
+    h = __hadd2(h, one);
+    return ret;
+}
+__device__ __forceinline__ __nv_bfloat162  operator--(__nv_bfloat162 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat162 ret = h;
+    __nv_bfloat162_raw one;
+    one.x = 0x3F80;
+    one.y = 0x3F80;
+    h = __hsub2(h, one);
+    return ret;
+}
+__device__ __forceinline__ __nv_bfloat162 operator+(const __nv_bfloat162 &h) { return h; }
+__device__ __forceinline__ __nv_bfloat162 operator-(const __nv_bfloat162 &h) { return __hneg2(h); }
+
+__device__ __forceinline__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbeq2(lh, rh); }
+__device__ __forceinline__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbneu2(lh, rh); }
+__device__ __forceinline__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbgt2(lh, rh); }
+__device__ __forceinline__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hblt2(lh, rh); }
+__device__ __forceinline__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbge2(lh, rh); }
+__device__ __forceinline__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hble2(lh, rh); }
+
+#endif /* (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) && !defined(__CUDA_NO_BFLOAT162_OPERATORS__) */
+#endif /* defined(__CUDACC__) */
+
+/* Restore warning for multiple assignment operators */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( pop )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+/* Restore -Weffc++ warnings from here on */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic pop
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_ALIGN__
+
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short __internal_float2bfloat16(const float f, unsigned int &sign, unsigned int &remainder)
+{
+    unsigned int x;
+
+#if defined(__CUDA_ARCH__)
+    x = __float_as_uint(f);
+#elif defined(__CUDACC__)
+    (void)memcpy(&x, &f, sizeof(f));
+#else
+    (void)std::memcpy(&x, &f, sizeof(f));
+#endif
+
+    if ((x & 0x7fffffffU) > 0x7f800000U) {
+        sign = 0U;
+        remainder = 0U;
+        return static_cast<unsigned short>(0x7fffU);
+    }
+    sign = x >> 31U;
+    remainder = x << 16U;
+    return static_cast<unsigned short>(x >> 16U);
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double x)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("{  cvt.rn.bf16.f64 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "d"(x));
+    return val;
+#else
+
+    float f = static_cast<float>(x);
+    const double d = static_cast<double>(f);
+    unsigned int u;
+
+#if defined(__CUDA_ARCH__)
+    u = __float_as_uint(f);
+#elif defined(__CUDACC__)
+    (void)memcpy(&u, &f, sizeof(f));
+#else
+    (void)std::memcpy(&u, &f, sizeof(f));
+#endif
+    bool x_is_not_nan = ((u << (unsigned)1U) <= (unsigned)0xFF000000U);
+
+
+    if ((x > 0.0) && (d > x)) {
+        u--;
+    }
+    if ((x < 0.0) && (d < x)) {
+        u--;
+    }
+    if ((d != x) && x_is_not_nan) {
+        u |= 1U;
+    }
+
+#if defined(__CUDA_ARCH__)
+    f = __int_as_float(static_cast<int>(u));
+#elif defined(__CUDACC__)
+    (void)memcpy(&f, &u, sizeof(f));
+#else
+    (void)std::memcpy(&f, &u, sizeof(f));
+#endif
+
+    return __float2bfloat16(f);
+
+#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a)
+{
+    __nv_bfloat16 val;
+#if __CUDA_ARCH__ >= 800
+    asm("{  cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+#else
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
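+
+/*
+* Worked example (illustrative only, not part of the upstream NVIDIA header): the host
+* path above keeps the top 16 bits of the float and treats the dropped 16 bits as the
+* "remainder" used for round-to-nearest-even. For 1.00390625f (bit pattern 0x3F808000)
+* the remainder is exactly 0x80000000, a tie; the kept bits 0x3F80 are already even, so
+* the result stays 0x3F80, i.e. 1.0 in bfloat16.
+*/
+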
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a)
+{
+    __nv_bfloat16 val;
+#if __CUDA_ARCH__ >= 800
+    asm("{  cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+#else
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a)
+{
+    __nv_bfloat16 val;
+#if __CUDA_ARCH__ >= 800
+    asm("{  cvt.rz.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+#else
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("{  cvt.rm.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+    return val;
+#else
+    __nv_bfloat16 val;
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    if ((remainder != 0U) && (sign != 0U)) {
+        r.x++;
+    }
+    val = r;
+    return val;
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("{  cvt.rp.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+    return val;
+#else
+    __nv_bfloat16 val;
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    if ((remainder != 0U) && (sign == 0U)) {
+        r.x++;
+    }
+    val = r;
+    return val;
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a)
+{
+    __nv_bfloat162 val;
+#if __CUDA_ARCH__ >= 800
+    asm("{.reg .b16 low;\n"
+        "  cvt.rn.bf16.f32 low, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a));
+#else
+    val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(a));
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b)
+{
+    __nv_bfloat162 val;
+#if __CUDA_ARCH__ >= 800
+    asm("{ cvt.rn.bf16x2.f32 %0, %2, %1;}\n"
+        : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a), "f"(b));
+#else
+    val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(b));
+#endif
+    return val;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_bfloat162float(const unsigned short h)
+{
+    float f;
+#if defined(__CUDA_ARCH__)
+    #if (__CUDA_ARCH__ >= 900)
+        asm("{ cvt.f32.bf16 %0, %1;}\n" : "=f"(f) : "h"(h));
+    #else
+        asm("{ mov.b32 %0, {0,%1};}\n" : "=f"(f) : "h"(h));
+    #endif
+#else
+    unsigned int u = static_cast<unsigned int>(h) << 16;
+    #if defined(__CUDACC__)
+        (void)memcpy(&f, &u, sizeof(f));
+    #else
+        (void)std::memcpy(&f, &u, sizeof(f));
+    #endif
+#endif
+    return f;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a)
+{
+    return __internal_bfloat162float(static_cast<__nv_bfloat16_raw>(a).x);
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a)
+{
+    return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).x);
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a)
+{
+    return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).y);
+}
+
+#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+
+/* CUDA vector-types compatible vector creation function (note returns __nv_bfloat162, not nv_bfloat162) */
+__VECTOR_FUNCTIONS_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y)
+{
+    __nv_bfloat162 t; t.x = x; t.y = y; return t;
+}
+#undef __VECTOR_FUNCTIONS_DECL__
+
+
+/* Definitions of intrinsics */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a)
+{
+    __nv_bfloat162 val = __floats2bfloat162_rn(a.x, a.y);
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a)
+{
+    float hi_float;
+    float lo_float;
+    lo_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).x);
+    hi_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).y);
+    return make_float2(lo_float, hi_float);
+}
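+
+/*
+* Usage sketch (illustrative only, not part of the upstream NVIDIA header): packing a
+* float2 into a __nv_bfloat162 and unpacking it again with the helpers above. The
+* function name is hypothetical.
+*
+*   __device__ float2 pack_unpack(const float2 v)
+*   {
+*       const __nv_bfloat162 p = __float22bfloat162_rn(v);   // both lanes rounded to nearest-even
+*       return __bfloat1622float2(p);                        // bfloat16 -> float is exact
+*   }
+*/
+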
+__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    int val;
+    asm("{  cvt.rni.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    return __float2int_rn(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    int val;
+    asm("{  cvt.rzi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    const float f = __bfloat162float(h);
+    int   i;
+    i = static_cast<int>(f);
+#if !(defined __CUDA_ARCH__)
+    const int max_val = (int)0x7fffffffU;
+    const int min_val = (int)0x80000000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        i = 0;
+    } else if (f >= static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    }
+#endif
+    return i;
+#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+}
+__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    int val;
+    asm("{  cvt.rmi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    return __float2int_rd(__bfloat162float(h));
+#endif
+}
+__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    int val;
+    asm("{  cvt.rpi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    return __float2int_ru(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i)
+{
+#if (defined __CUDA_ARCH__)
+    #if (__CUDA_ARCH__ >= 900)
+        __nv_bfloat16 val;
+       asm("cvt.rn.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+       return val;
+    #else
+        const float ru = __int2float_ru(i);
+        const float rd = __int2float_rd(i);
+        float rz = __int2float_rz(i);
+        if (ru != rd) {
+            rz = __uint_as_float(__float_as_uint(rz) | 1U);
+        }
+        return __float2bfloat16_rn(rz);
+    #endif
+#else
+    const double d = static_cast<double>(i);
+    return __double2bfloat16(d);
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rz.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#else
+    return __float2bfloat16_rz(__int2float_rz(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rm.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#else
+    return __float2bfloat16_rd(__int2float_rd(i));
+#endif
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rp.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#else
+    return __float2bfloat16_ru(__int2float_ru(i));
+#endif
+}
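+
+/*
+* Worked example (illustrative only, not part of the upstream NVIDIA header): the _rn,
+* _rz, _rd and _ru suffixes select the rounding mode. 33554433 (2^25 + 1) is not
+* representable in bfloat16, so __int2bfloat16_rd(33554433) rounds down to 2^25 while
+* __int2bfloat16_ru(33554433) rounds up to 2^25 + 2^18.
+*/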
+
+__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h)
+{
+   short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rni.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rni.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#endif
+   return val;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h)
+{
+   short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rzi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#elif (defined __CUDA_ARCH__)
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rzi.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+    const float f = __bfloat162float(h);
+    val = static_cast<short int>(f);
+    const short int max_val = (short int)0x7fffU;
+    const short int min_val = (short int)0x8000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        val = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        val = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        val = min_val;
+    }
+#endif
+   return val;
+}
+__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h)
+{
+   short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rmi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rmi.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#endif
+   return val;
+}
+__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h)
+{
+   short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rpi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rpi.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#endif
+   return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rn.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    const float f = static_cast<float>(i);
+    return __float2bfloat16_rn(f);
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rz.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    return __float2bfloat16_rz(__int2float_rz(static_cast<int>(i)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rm.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    return __float2bfloat16_rd(__int2float_rd(static_cast<int>(i)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rp.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    return __float2bfloat16_ru(__int2float_ru(static_cast<int>(i)));
+#endif
+}
+
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned int val;
+    asm("{  cvt.rni.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    return __float2uint_rn(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned int val;
+    asm("{  cvt.rzi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+
+    const float f = __bfloat162float(h);
+    unsigned int i;
+    i = static_cast<unsigned int>(f);
+#if !(defined __CUDA_ARCH__)
+    const unsigned int max_val = 0xffffffffU;
+    const unsigned int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        i = 0U;
+    } else if (f >= static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    }
+#endif
+    return i;
+
+#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+}
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned int val;
+    asm("{  cvt.rmi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    return __float2uint_rd(__bfloat162float(h));
+#endif
+}
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned int val;
+    asm("{  cvt.rpi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    return __float2uint_ru(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rn.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#elif (defined __CUDA_ARCH__)
+    const float ru = __uint2float_ru(i);
+    const float rd = __uint2float_rd(i);
+    float rz = __uint2float_rz(i);
+    if (ru != rd) {
+        rz = __uint_as_float(__float_as_uint(rz) | 1U);
+    }
+    return __float2bfloat16_rn(rz);
+#else
+    const double d = static_cast<double>(i);
+    return __double2bfloat16(d);
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rz.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#else
+    return __float2bfloat16_rz(__uint2float_rz(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rm.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#else
+    return __float2bfloat16_rd(__uint2float_rd(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rp.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#else
+    return __float2bfloat16_ru(__uint2float_ru(i));
+#endif
+}
+
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h)
+{
+   unsigned short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rni.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rni.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#endif
+   return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h)
+{
+   unsigned short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rzi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#elif (defined __CUDA_ARCH__)
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rzi.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+    const float f = __bfloat162float(h);
+    val = static_cast<unsigned short int>(f);
+    const unsigned short int max_val = 0xffffU;
+    const unsigned short int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        val = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        val = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        val = min_val;
+    }
+#endif
+   return val;
+}
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h)
+{
+   unsigned short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rmi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rmi.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#endif
+   return val;
+}
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h)
+{
+   unsigned short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rpi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rpi.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#endif
+   return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rn.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    const float f = static_cast<float>(i);
+    return __float2bfloat16_rn(f);
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rz.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    return __float2bfloat16_rz(__uint2float_rz(static_cast<unsigned int>(i)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rm.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    return __float2bfloat16_rd(__uint2float_rd(static_cast<unsigned int>(i)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rp.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    return __float2bfloat16_ru(__uint2float_ru(static_cast<unsigned int>(i)));
+#endif
+}
+
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned long long int i;
+    asm("cvt.rni.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    return __float2ull_rn(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h)
+{
+    unsigned long long int i;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    asm("cvt.rzi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    const float f = __bfloat162float(h);
+    i = static_cast<unsigned long long int>(f);
+#if !(defined __CUDA_ARCH__)
+    const unsigned long long int max_val = 0xffffffffffffffffULL;
+    const unsigned long long int min_val = 0ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        i = 0x8000000000000000ULL;
+    } else if (f >= static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    }
+#endif
+#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    return i;
+}
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned long long int i;
+    asm("cvt.rmi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    return __float2ull_rd(__bfloat162float(h));
+#endif
+}
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned long long int i;
+    asm("cvt.rpi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    return __float2ull_ru(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rn.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#elif (defined __CUDA_ARCH__)
+    const float ru = __ull2float_ru(i);
+    const float rd = __ull2float_rd(i);
+    float rz = __ull2float_rz(i);
+    if (ru != rd) {
+        rz = __uint_as_float(__float_as_uint(rz) | 1U);
+    }
+    return __float2bfloat16_rn(rz);
+#else
+    float f = static_cast<float>(i);
+    const unsigned long long int uf = static_cast<unsigned long long int>(f);
+    unsigned int u;
+
+    #if defined(__CUDA_ARCH__)
+        u = __float_as_uint(f);
+    #elif defined(__CUDACC__)
+        (void)memcpy(&u, &f, sizeof(f));
+    #else
+        (void)std::memcpy(&u, &f, sizeof(f));
+    #endif
+
+    // round up happened here
+    // note: no need to handle round up to f == 0x1.p64 specially
+    if (uf > i) {
+        u--;
+    }
+    if (uf != i) {
+        u |= 1U;
+    }
+
+    #if defined(__CUDA_ARCH__)
+        f = __int_as_float(static_cast<int>(u));
+    #elif defined(__CUDACC__)
+        (void)memcpy(&f, &u, sizeof(f));
+    #else
+        (void)std::memcpy(&f, &u, sizeof(f));
+    #endif
+
+    return __float2bfloat16_rn(f);
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rz.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#else
+    return __float2bfloat16_rz(__ull2float_rz(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rm.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#else
+    return __float2bfloat16_rd(__ull2float_rd(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rp.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#else
+    return __float2bfloat16_ru(__ull2float_ru(i));
+#endif
+}
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    long long int i;
+    asm("cvt.rni.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    return __float2ll_rn(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h)
+{
+    long long int i;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    asm("cvt.rzi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+    const float f = __bfloat162float(h);
+    i = static_cast<long long int>(f);
+#if !(defined __CUDA_ARCH__)
+    const long long int max_val = (long long int)0x7fffffffffffffffULL;
+    const long long int min_val = (long long int)0x8000000000000000ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        i = min_val;
+    } else if (f >= static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    }
+#endif
+#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    return i;
+}
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    long long int i;
+    asm("cvt.rmi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    return __float2ll_rd(__bfloat162float(h));
+#endif
+}
+__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    long long int i;
+    asm("cvt.rpi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    return __float2ll_ru(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rn.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#elif (defined __CUDA_ARCH__)
+    const float ru = __ll2float_ru(i);
+    const float rd = __ll2float_rd(i);
+    float rz = __ll2float_rz(i);
+    if (ru != rd) {
+        rz = __uint_as_float(__float_as_uint(rz) | 1U);
+    }
+    return __float2bfloat16_rn(rz);
+#else
+    float f = static_cast<float>(i);
+    const long long int lf = static_cast<long long int>(f);
+    unsigned int u;
+
+    #if defined(__CUDA_ARCH__)
+        u = __float_as_uint(f);
+    #elif defined(__CUDACC__)
+        (void)memcpy(&u, &f, sizeof(f));
+    #else
+        (void)std::memcpy(&u, &f, sizeof(f));
+    #endif
+
+    if ((f > 0.0f) && (lf > i)) {
+        u--;
+    }
+    if ((f < 0.0f) && (lf < i)) {
+        u--;
+    }
+    if (lf != i) {
+        u |= 1U;
+    }
+
+    #if defined(__CUDA_ARCH__)
+        f = __int_as_float(static_cast<int>(u));
+    #elif defined(__CUDACC__)
+        (void)memcpy(&f, &u, sizeof(f));
+    #else
+        (void)std::memcpy(&f, &u, sizeof(f));
+    #endif
+
+    return __float2bfloat16_rn(f);
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rz.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#else
+    return __float2bfloat16_rz(__ll2float_rz(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rm.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#else
+    return __float2bfloat16_rd(__ll2float_rd(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rp.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#else
+    return __float2bfloat16_ru(__ll2float_ru(i));
+#endif
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 r;
+    asm("cvt.rzi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h)));
+    return r;
+#else
+    return __float2bfloat16_rz(truncf(__bfloat162float(h)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 r;
+    asm("cvt.rpi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h)));
+    return r;
+#else
+    return __float2bfloat16_ru(ceilf(__bfloat162float(h)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 r;
+    asm("cvt.rmi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h)));
+    return r;
+#else
+    return __float2bfloat16_rd(floorf(__bfloat162float(h)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 r;
+    asm("cvt.rni.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h)));
+    return r;
+#else
+    return __float2bfloat16_rn(rintf(__bfloat162float(h)));
+#endif
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h)
+{
+    const __nv_bfloat16 low = __float2bfloat16_rz(truncf(__low2float(h)));
+    const __nv_bfloat16 high = __float2bfloat16_rz(truncf(__high2float(h)));
+    return __nv_bfloat162(low, high);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h)
+{
+    const __nv_bfloat16 low = __float2bfloat16_ru(ceilf(__low2float(h)));
+    const __nv_bfloat16 high = __float2bfloat16_ru(ceilf(__high2float(h)));
+    return __nv_bfloat162(low, high);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h)
+{
+    const __nv_bfloat16 low = __float2bfloat16_rd(floorf(__low2float(h)));
+    const __nv_bfloat16 high = __float2bfloat16_rd(floorf(__high2float(h)));
+    return __nv_bfloat162(low, high);
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h)
+{
+    return __halves2bfloat162(hrint(__low2bfloat16(h)), hrint(__high2bfloat16(h)));
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+    asm("{.reg .b16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n"
+        "  mov.b32 {blow,bhigh}, %2;\n"
+        "  mov.b32 %0, {alow,blow};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+    asm("{.reg .b16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n"
+        "  mov.b32 {blow,bhigh}, %2;\n"
+        "  mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a)
+{
+    __nv_bfloat16 ret;
+    asm("{.reg .b16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b16 %0, low;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return ret;
+}
+__CUDA_BF16_DECL__ int __hisinf(const __nv_bfloat16 a)
+{
+    int retval;
+    if (__BFLOAT16_TO_CUS(a) == 0xFF80U) {
+        retval = -1;
+    } else if (__BFLOAT16_TO_CUS(a) == 0x7F80U) {
+        retval = 1;
+    } else {
+        retval = 0;
+    }
+    return retval;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 val;
+    asm("{.reg .b16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 val;
+    asm("{.reg .b16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {high,high};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a)
+{
+    __nv_bfloat16 ret;
+    asm("{.reg .b16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b16 %0, high;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat162 val;
+    asm("{  mov.b32 %0, {%1,%2};}\n"
+        : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a)
+{
+    __nv_bfloat162 val;
+    asm("{  mov.b32 %0, {%1,%1};}\n"
+        : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 val;
+    asm("{.reg .b16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {high,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h)
+{
+    return static_cast<short int>(__BFLOAT16_TO_CUS(h));
+}
+__CUDA_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h)
+{
+    return __BFLOAT16_TO_CUS(h);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i)
+{
+    __nv_bfloat16 h;
+    __BFLOAT16_TO_US(h) = static_cast<unsigned short int>(i);
+    return h;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i)
+{
+    __nv_bfloat16 h;
+    __BFLOAT16_TO_US(h) = i;
+    return h;
+}
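+
+/*
+* Usage sketch (illustrative only, not part of the upstream NVIDIA header): the *_as_*
+* helpers above reinterpret bits without any numeric conversion, e.g. producing negative
+* zero directly from its bfloat16 bit pattern 0x8000. The function name is hypothetical.
+*
+*   __device__ __nv_bfloat16 negative_zero_bf16(void)
+*   {
+*       return __ushort_as_bfloat16((unsigned short)0x8000U);   // -0.0 in bfloat16
+*   }
+*/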
+
+/******************************************************************************
+*                           __nv_bfloat16, __nv_bfloat162 warp shuffle                     *
+******************************************************************************/
+#define __SHUFFLE_SYNC_BFLOAT162_MACRO(name) /* do */ {\
+   __nv_bfloat162 r; \
+   asm volatile ("{" __CUDA_BF16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \
+       :"=r"(__BFLOAT162_TO_UI(r)): "r"(__BFLOAT162_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
+   return r; \
+} /* while(0) */
+
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.idx.b32)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.up.b32)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.down.b32)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.bfly.b32)
+}
+
+#undef __SHUFFLE_SYNC_BFLOAT162_MACRO
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width)
+{
+    const __nv_bfloat162 temp1 = __halves2bfloat162(var, var);
+    const __nv_bfloat162 temp2 = __shfl_sync(mask, temp1, delta, width);
+    return __low2bfloat16(temp2);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width)
+{
+    const __nv_bfloat162 temp1 = __halves2bfloat162(var, var);
+    const __nv_bfloat162 temp2 = __shfl_up_sync(mask, temp1, delta, width);
+    return __low2bfloat16(temp2);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width)
+{
+    const __nv_bfloat162 temp1 = __halves2bfloat162(var, var);
+    const __nv_bfloat162 temp2 = __shfl_down_sync(mask, temp1, delta, width);
+    return __low2bfloat16(temp2);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width)
+{
+    const __nv_bfloat162 temp1 = __halves2bfloat162(var, var);
+    const __nv_bfloat162 temp2 = __shfl_xor_sync(mask, temp1, delta, width);
+    return __low2bfloat16(temp2);
+}
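+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the original
+ * NVIDIA header): the overloads above let warp-level reductions operate
+ * directly on __nv_bfloat16 values.  The kernel below is hypothetical and
+ * assumes one full 32-thread warp per block; the width argument is passed
+ * explicitly.
+ */
+#if 0 /* example only -- not compiled */
+__global__ void example_warp_sum(const __nv_bfloat16 *in, float *out)
+{
+    __nv_bfloat16 v = in[threadIdx.x];
+    /* tree reduction: each step folds the upper half of the warp into the lower half */
+    for (unsigned int offset = 16U; offset > 0U; offset /= 2U) {
+        v = __hadd(v, __shfl_down_sync(0xFFFFFFFFU, v, offset, 32));
+    }
+    if (threadIdx.x == 0U) {
+        *out = __bfloat162float(v); /* lane 0 now holds the warp total */
+    }
+}
+#endif /* example only */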
+
+/******************************************************************************
+*        __nv_bfloat16 and __nv_bfloat162 cache-hint loads (__ldg etc.) and stores         *
+******************************************************************************/
+
+#if defined(__cplusplus)
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __LDG_PTR   "l"
+#else
+#define __LDG_PTR   "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.nc.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.nc.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.cg.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.cg.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.ca.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.ca.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.cs.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.cs.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.lu.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.lu.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.cv.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.cv.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.wb.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.wb.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.cg.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.cg.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.cs.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.cs.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.wt.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.wt.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+
+#undef __LDG_PTR
+#endif /*defined(__cplusplus) */
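+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the original
+ * NVIDIA header): the cache-hinted loads and stores above map directly onto
+ * PTX ld.global/st.global qualifiers.  A one-pass copy that should not
+ * displace useful cache lines can pair the streaming variants __ldcs and
+ * __stcs.  Kernel and parameter names are hypothetical.
+ */
+#if 0 /* example only -- not compiled */
+__global__ void example_streaming_copy(const __nv_bfloat162 *src,
+                                       __nv_bfloat162 *dst,
+                                       const unsigned int n)
+{
+    const unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (i < n) {
+        /* "evict first" load followed by a streaming store of the packed pair */
+        __stcs(dst + i, __ldcs(src + i));
+    }
+}
+#endif /* example only */
+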
+/******************************************************************************
+*                             __nv_bfloat162 comparison                             *
+******************************************************************************/
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\
+   __nv_bfloat162 val; \
+   asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val; \
+}
+#else
+#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\
+   __nv_bfloat162 val; \
+   asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\
+        "  and.b32 high_a, %1, 0xffff0000U;\n"\
+        "  and.b32 high_b, %2, 0xffff0000U;\n"\
+        "  shl.b32 low_a, %1, 16;\n"\
+        "  shl.b32 low_b, %2, 16;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\
+        "  shr.u32 low_res, low_res, 16;\n"\
+        "  or.b32  %0, high_res, low_res;}\n"\
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val; \
+}
+#endif
+
+__CUDA_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.eq)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.ne)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.le)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.ge)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.lt)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.gt)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.equ)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.neu)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.leu)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.geu)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.ltu)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.gtu)
+}
+#undef __COMPARISON_OP_BFLOAT162_MACRO
+
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\
+   __nv_bfloat162 val; \
+   bool retval; \
+   asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   if (__BFLOAT162_TO_CUI(val) == 0x3F803F80U) {\
+      retval = true; \
+   } else { \
+      retval = false; \
+   }\
+   return retval;\
+}
+#else
+
+#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\
+   unsigned int val; \
+   asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\
+        "  and.b32 high_a, %1, 0xffff0000U;\n"\
+        "  and.b32 high_b, %2, 0xffff0000U;\n"\
+        "  shl.b32 low_a, %1, 16;\n"\
+        "  shl.b32 low_b, %2, 16;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\
+        "  and.b32 %0, high_res, low_res;}\n"\
+        :"=r"(val) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return (val != 0U) ? true : false; \
+}
+#endif
+
+__CUDA_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.eq)
+}
+__CUDA_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ne)
+}
+__CUDA_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.le)
+}
+__CUDA_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ge)
+}
+__CUDA_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.lt)
+}
+__CUDA_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gt)
+}
+__CUDA_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.equ)
+}
+__CUDA_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.neu)
+}
+__CUDA_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.leu)
+}
+__CUDA_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.geu)
+}
+__CUDA_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ltu)
+}
+__CUDA_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gtu)
+}
+#undef __BOOL_COMPARISON_OP_BFLOAT162_MACRO
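+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the original
+ * NVIDIA header): the packed comparisons above write 1.0 or 0.0 into each
+ * 16-bit lane, so their result doubles as a branch-free mask.  The helper
+ * below is hypothetical; it zeroes negative lanes (a ReLU) by multiplying
+ * each lane with the outcome of a >= comparison against zero.
+ */
+#if 0 /* example only -- not compiled */
+__device__ __nv_bfloat162 example_relu2(const __nv_bfloat162 x)
+{
+    const __nv_bfloat162 zero = __bfloat162bfloat162(__ushort_as_bfloat16(0U));
+    const __nv_bfloat162 mask = __hge2(x, zero); /* 1.0 where x >= 0.0, else 0.0 */
+    return __hmul2(x, mask);
+}
+#endif /* example only */
+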
+/******************************************************************************
+*                             __nv_bfloat16 comparison                              *
+******************************************************************************/
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\
+   unsigned short val; \
+   asm( "{ .reg .pred __$temp3;\n" \
+        "  setp." __CUDA_BF16_STRINGIFY(name) ".bf16  __$temp3, %1, %2;\n" \
+        "  selp.u16 %0, 1, 0, __$temp3;}" \
+        : "=h"(val) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); \
+   return (val != 0U) ? true : false; \
+}
+#else
+#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\
+   unsigned int val; \
+   asm( "{.reg .b32 a,b;\n"\
+        "  mov.b32 a, {0, %1};\n"\
+        "  mov.b32 b, {0, %2};\n"\
+        "  set." __CUDA_BF16_STRINGIFY(name) ".f32.f32 %0, a, b;}\n"\
+        :"=r"(val) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return (val != 0U) ? true : false; \
+}
+#endif
+__CUDA_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(eq)
+}
+__CUDA_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(ne)
+}
+__CUDA_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(le)
+}
+__CUDA_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(ge)
+}
+__CUDA_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(lt)
+}
+__CUDA_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(gt)
+}
+__CUDA_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(equ)
+}
+__CUDA_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(neu)
+}
+__CUDA_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(leu)
+}
+__CUDA_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(geu)
+}
+__CUDA_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(ltu)
+}
+__CUDA_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(gtu)
+}
+#undef __COMPARISON_OP_BFLOAT16_MACRO
+/******************************************************************************
+*                            __nv_bfloat162 arithmetic                             *
+******************************************************************************/
+#define __BINARY_OP_BFLOAT162_MACRO(name) /* do */ {\
+   __nv_bfloat162 val; \
+   asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\
+        " .reg .b16 low,high;\n"\
+        "  and.b32 high_a, %1, 0xffff0000U;\n"\
+        "  and.b32 high_b, %2, 0xffff0000U;\n"\
+        "  shl.b32 low_a, %1, 16;\n"\
+        "  shl.b32 low_b, %2, 16;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32 low_res, low_a, low_b;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32 high_res, high_a, high_b;\n"\
+        "  cvt.rn.bf16.f32 low, low_res;\n"\
+        "  cvt.rn.bf16.f32 high, high_res;\n"\
+        "  mov.b32 %0, {low,high};}\n"\
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val; \
+} /* while(0) */
+
+__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ add.bf16x2 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x3f803f80U;\n"
+        "  fma.rn.bf16x2 %0,%1,c,%2;}\n"
+#endif
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ sub.bf16x2 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0xbf80bf80U;\n"
+        "  fma.rn.bf16x2 %0,%2,c,%1;}\n"
+#endif
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ mul.bf16x2 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x80008000U;\n"
+        "  fma.rn.bf16x2 %0,%1,%2,c;}\n"
+#endif
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ add.rn.bf16x2 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x3f803f80U;\n"
+        "  fma.rn.bf16x2 %0,%1,c,%2;}\n"
+#endif
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ sub.rn.bf16x2 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0xbf80bf80U;\n"
+        "  fma.rn.bf16x2 %0,%2,c,%1;}\n"
+#endif
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ mul.rn.bf16x2 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x80008000U;\n"
+        "  fma.rn.bf16x2 %0,%1,%2,c;}\n"
+#endif
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+   asm( "{.reg .b32 f, one, zero;\n"
+        "  mov.b32 one, 0x3f803f80U;\n"
+        "  mov.b32 zero, 0;\n"
+        "  fma.rn.bf16x2 f,%1,one,%2;\n"
+        "  max.bf16x2 f, f, zero;\n"
+        "  min.bf16x2 %0, f, one;\n}"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+   asm( "{.reg .b32 f, one, zero, mone;\n"
+        "  mov.b32 one, 0x3f803f80U;\n"
+        "  mov.b32 zero, 0;\n"
+        "  mov.b32 mone, 0xbf80bf80U;\n"
+        "  fma.rn.bf16x2 f,%2,mone,%1;\n"
+        "  max.bf16x2 f, f, zero;\n"
+        "  min.bf16x2 %0, f, one;\n}"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+   asm( "{.reg .b32 f, one, zero, mzero;\n"
+        "  mov.b32 one, 0x3f803f80U;\n"
+        "  mov.b32 zero, 0;\n"
+        "  mov.b32 mzero, 0x80008000U;\n"
+        "  fma.rn.bf16x2 f,%1,%2,mzero;\n"
+        "  max.bf16x2 f, f, zero;\n"
+        "  min.bf16x2 %0, f, one;\n}"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c)
+{
+    __nv_bfloat162 val;
+    asm( "{fma.rn.bf16x2 %0,%1,%2,%3;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c)
+{
+    __nv_bfloat162 val;
+    asm( "{ .reg .b32 f, one, zero;\n"
+         "  mov.b32 one, 0x3f803f80U;\n"
+         "  mov.b32 zero, 0;\n"
+         "  fma.rn.bf16x2 f, %1, %2, %3;\n"
+         "  max.bf16x2 f, f, zero;\n"
+         "  min.bf16x2 %0, f, one;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b) {
+    __nv_bfloat16 ha, hb;
+
+    ha = __low2bfloat16(a);
+    hb = __low2bfloat16(b);
+
+    const __nv_bfloat16 v1 = __hdiv(ha, hb);
+
+    ha = __high2bfloat16(a);
+    hb = __high2bfloat16(b);
+
+    const __nv_bfloat16 v2 = __hdiv(ha, hb);
+
+    return __halves2bfloat162(v1, v2);
+}
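+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the original
+ * NVIDIA header): since __nv_bfloat162 packs two values into one 32-bit
+ * register, an AXPY-style update can process two elements per thread with a
+ * single fused multiply-add.  Kernel and parameter names are hypothetical;
+ * n2 counts element pairs.
+ */
+#if 0 /* example only -- not compiled */
+__global__ void example_axpy_bf16x2(const __nv_bfloat16 alpha,
+                                    const __nv_bfloat162 *x,
+                                    __nv_bfloat162 *y,
+                                    const unsigned int n2)
+{
+    const unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (i < n2) {
+        const __nv_bfloat162 a2 = __bfloat162bfloat162(alpha); /* broadcast the scalar */
+        y[i] = __hfma2(a2, x[i], y[i]);                        /* per-lane y = a*x + y */
+    }
+}
+#endif /* example only */
+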
+/******************************************************************************
+*                             __nv_bfloat16 arithmetic                             *
+******************************************************************************/
+#define __BINARY_OP_BFLOAT16_MACRO(name) /* do */ {\
+   __nv_bfloat16 val; \
+   asm( "{.reg .b32 a,b,res;\n"\
+        "  mov.b32 a, {0,%1};\n"\
+        "  mov.b32 b, {0,%2};\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32 res, a, b;\n"\
+        "  cvt.rn.bf16.f32 %0, res;}\n"\
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val; \
+} /* while(0) */
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ add.bf16 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x3f80U;\n"
+        "  fma.rn.bf16 %0,%1,c,%2;}\n"
+#endif
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ sub.bf16 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0xbf80U;\n"
+        "  fma.rn.bf16 %0,%2,c,%1;}\n"
+#endif
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ mul.bf16 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x8000U;\n"
+        "  fma.rn.bf16 %0,%1,%2,c;}\n"
+#endif
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ add.rn.bf16 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x3f80U;\n"
+        "  fma.rn.bf16 %0,%1,c,%2;}\n"
+#endif
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ sub.rn.bf16 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0xbf80U;\n"
+        "  fma.rn.bf16 %0,%2,c,%1;}\n"
+#endif
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ mul.rn.bf16 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x8000U;\n"
+        "  fma.rn.bf16 %0,%1,%2,c;}\n"
+#endif
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat16 val;
+    asm( "{ .reg .b16 f, one, zero;\n"
+         "  mov.b16 one, 0x3f80U;\n"
+         "  mov.b16 zero, 0;\n"
+         "  fma.rn.bf16 f, %1, one, %2;\n"
+         "  max.bf16 f, f, zero;\n"
+         "  min.bf16 %0, f, one;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat16 val;
+    asm( "{ .reg .b16 f, one, zero, mone;\n"
+         "  mov.b16 one, 0x3f80U;\n"
+         "  mov.b16 zero, 0;\n"
+         "  mov.b16 mone, 0xbf80U;\n"
+         "  fma.rn.bf16 f, %2, mone, %1;\n"
+         "  max.bf16 f, f, zero;\n"
+         "  min.bf16 %0, f, one;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat16 val;
+    asm( "{ .reg .b16 f, one, zero, mzero;\n"
+         "  mov.b16 one, 0x3f80U;\n"
+         "  mov.b16 zero, 0;\n"
+         "  mov.b16 mzero, 0x8000U;\n"
+         "  fma.rn.bf16 f, %1, %2, mzero;\n"
+         "  max.bf16 f, f, zero;\n"
+         "  min.bf16 %0, f, one;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
+{
+    __nv_bfloat16 val;
+    asm( "{fma.rn.bf16 %0,%1,%2,%3;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
+{
+    __nv_bfloat16 val;
+    asm( "{ .reg .b16 f, one, zero;\n"
+         "  mov.b16 one, 0x3f80U;\n"
+         "  mov.b16 zero, 0;\n"
+         "  fma.rn.bf16 f, %1, %2, %3;\n"
+         "  max.bf16 f, f, zero;\n"
+         "  min.bf16 %0, f, one;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) {
+    __BINARY_OP_BFLOAT16_MACRO(div.rn)
+}
+
+/******************************************************************************
+*                  __nv_bfloat16 and __nv_bfloat162 math functions                  *
+******************************************************************************/
+#define __APPROX_FCAST(fun) /* do */ {\
+   __nv_bfloat16 val;\
+   asm("{.reg.b32         f;        \n"\
+                " .reg.b16         r;        \n"\
+                "  mov.b16         r,%1;     \n"\
+                "  mov.b32         f,{0,r};  \n"\
+                "  " __CUDA_BF16_STRINGIFY(fun) ".approx.f32   f,f;  \n"\
+                "  cvt.rn.bf16.f32    r,f;  \n"\
+                "  mov.b16         %0,r;     \n"\
+                "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));\
+   return val;\
+} /* while(0) */
+#define __APPROX_FCAST2(fun) /* do */ {\
+   __nv_bfloat162 val;\
+   asm("{.reg.b16         hl, hu;         \n"\
+                " .reg.b32         fl, fu;         \n"\
+                "  mov.b32         {hl, hu}, %1;   \n"\
+                "  mov.b32         fl, {0,hl};     \n"\
+                "  mov.b32         fu, {0,hu};     \n"\
+                "  " __CUDA_BF16_STRINGIFY(fun) ".approx.f32   fl, fl;     \n"\
+                "  " __CUDA_BF16_STRINGIFY(fun) ".approx.f32   fu, fu;     \n"\
+                "  cvt.rn.bf16.f32    hl, fl;     \n"\
+                "  cvt.rn.bf16.f32    hu, fu;     \n"\
+                "  mov.b32         %0, {hl, hu};   \n"\
+                "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));       \
+   return val;\
+} /* while(0) */
+__CUDA_BF16_DECL__ __nv_bfloat16 __hsin_internal(const __nv_bfloat16 a) {
+    float f = __bfloat162float(a);
+    f = sinf(f);
+    return __float2bfloat16_rn(f);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a) {
+    return __hsin_internal(a);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a) {
+    const __nv_bfloat16 l = __low2bfloat16(a);
+    const __nv_bfloat16 h = __high2bfloat16(a);
+    return __halves2bfloat162(__hsin_internal(l), __hsin_internal(h));
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hcos_internal(const __nv_bfloat16 a) {
+    float f = __bfloat162float(a);
+    f = cosf(f);
+    return __float2bfloat16_rn(f);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a) {
+    return __hcos_internal(a);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a) {
+    const __nv_bfloat16 l = __low2bfloat16(a);
+    const __nv_bfloat16 h = __high2bfloat16(a);
+    return __halves2bfloat162(__hcos_internal(l), __hcos_internal(h));
+}
+
+#define __BF16_SPEC_CASE2(i,r, spc, ulp) \
+   "{.reg.b32 spc, ulp, p;\n"\
+   "  mov.b32 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\
+   "  mov.b32 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\
+   "  set.eq.f16x2.f16x2 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\
+   "  fma.rn.bf16x2 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n"
+#define __BF16_SPEC_CASE(i,r, spc, ulp) \
+   "{.reg.b16 spc, ulp, p;\n"\
+   "  mov.b16 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\
+   "  mov.b16 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\
+   "  set.eq.f16.f16 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\
+   "  fma.rn.bf16 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n"
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a) {
+    __nv_bfloat16 val;
+    asm("{.reg.b32          f, C;           \n"
+        " .reg.b16          h,r;            \n"
+        "  mov.b16          h,%1;           \n"
+        "  mov.b32          f,{0,h};        \n"
+        "  mov.b32          C, 0x3FB8AA3CU;  \n"
+        "  mul.f32          f,f,C;          \n"
+        "  ex2.approx.f32   f,f;            \n"
+        "  cvt.rn.bf16.f32 r,f;            \n"
+        "  mov.b16          %0,r;           \n"
+        "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a) {
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu, C;   \n"
+        "  mov.b32         {hl, hu}, %1;   \n"
+        "  mov.b32         h, %1;          \n"
+        "  mov.b32         fl, {0,hl};     \n"
+        "  mov.b32         fu, {0,hu};     \n"
+        "  mov.b32         C, 0x3FB8AA3CU;  \n"
+        "  mul.f32         fl,fl,C;        \n"
+        "  mul.f32         fu,fu,C;        \n"
+        "  ex2.approx.f32      fl, fl;     \n"
+        "  ex2.approx.f32      fu, fu;     \n"
+        "  cvt.rn.bf16.f32    hl, fl;     \n"
+        "  cvt.rn.bf16.f32    hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n"
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a) {
+    __APPROX_FCAST(ex2)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a) {
+    __APPROX_FCAST2(ex2)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a) {
+    __nv_bfloat16 val;
+    asm("{.reg.b16         h, r;           \n"
+        " .reg.b32         f, C;           \n"
+        "  mov.b16         h, %1;          \n"
+        "  mov.b32         f, {0,h};       \n"
+        "  mov.b32         C, 0x40549A78U;  \n"
+        "  mul.f32         f,f,C;          \n"
+        "  ex2.approx.f32      f, f;       \n"
+        "  cvt.rn.bf16.f32    r, f;       \n"
+        __BF16_SPEC_CASE(%1, r, 0xBC95U,0xBF00U)
+        "  mov.b16         %0, r;          \n"
+        "}":"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a) {
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu, C;   \n"
+        "  mov.b32         {hl, hu}, %1;   \n"
+        "  mov.b32         fl, {0,hl};     \n"
+        "  mov.b32         fu, {0,hu};     \n"
+        "  mov.b32         C, 0x40549A78U;  \n"
+        "  mul.f32         fl,fl,C;        \n"
+        "  mul.f32         fu,fu,C;        \n"
+        "  ex2.approx.f32      fl, fl;     \n"
+        "  ex2.approx.f32      fu, fu;     \n"
+        "  cvt.rn.bf16.f32    hl, fl;     \n"
+        "  cvt.rn.bf16.f32    hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n"
+        __BF16_SPEC_CASE2(%1, r, 0xBC95BC95U,0xBF00BF00U)
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a) {
+    __APPROX_FCAST(lg2)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a) {
+    __APPROX_FCAST2(lg2)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a) {
+    __nv_bfloat16 val;
+    asm("{.reg.b32         f, C;           \n"
+        " .reg.b16         r,h;            \n"
+        "  mov.b16         h,%1;           \n"
+        "  mov.b32         f,{0,h};        \n"
+        "  lg2.approx.f32      f,f;        \n"
+        "  mov.b32         C, 0x3f317218U; \n"
+        "  mul.f32         f,f,C;          \n"
+        "  cvt.rn.bf16.f32    r,f;        \n"
+        "  mov.b16         %0,r;           \n"
+        "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a) {
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n"
+        "  mov.b32         h, %1;              \n"
+        "  mov.b32         fl, {0,hl};         \n"
+        "  mov.b32         fu, {0,hu};         \n"
+        "  lg2.approx.f32      fl, fl;         \n"
+        "  lg2.approx.f32      fu, fu;         \n"
+        "  mov.b32         C, 0x3f317218U;     \n"
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.bf16.f32    hl, fl;         \n"
+        "  cvt.rn.bf16.f32    hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n"
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a) {
+    __nv_bfloat16 val;
+    asm("{.reg.b16         h, r;           \n"
+        " .reg.b32         f, C;           \n"
+        "  mov.b16         h, %1;          \n"
+        "  mov.b32         f, {0,h};           \n"
+        "  lg2.approx.f32      f, f;       \n"
+        "  mov.b32         C, 0x3E9A209BU;  \n"
+        "  mul.f32         f,f,C;          \n"
+        "  cvt.rn.bf16.f32    r, f;       \n"
+        "  mov.b16         %0, r;          \n"
+        "}":"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a) {
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n"
+        "  mov.b32         h, %1;              \n"
+        "  mov.b32         fl, {0,hl};         \n"
+        "  mov.b32         fu, {0,hu};         \n"
+        "  lg2.approx.f32      fl, fl;         \n"
+        "  lg2.approx.f32      fu, fu;         \n"
+        "  mov.b32         C, 0x3E9A209BU;      \n"
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.bf16.f32    hl, fl;         \n"
+        "  cvt.rn.bf16.f32    hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n"
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+#undef __BF16_SPEC_CASE2
+#undef __BF16_SPEC_CASE
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a) {
+    __APPROX_FCAST2(rcp)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a) {
+    __APPROX_FCAST(rcp)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a) {
+    __APPROX_FCAST2(rsqrt)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a) {
+    __APPROX_FCAST(rsqrt)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a) {
+    __APPROX_FCAST2(sqrt)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a) {
+    __APPROX_FCAST(sqrt)
+}
+#undef __APPROX_FCAST
+#undef __APPROX_FCAST2
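+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the original
+ * NVIDIA header): most of the unary math functions above lower to f32
+ * ".approx" instructions, which is normally adequate at bf16 precision.
+ * The hypothetical helper below normalizes a 2-D vector packed in a
+ * __nv_bfloat162, using hrsqrt for the reciprocal length.
+ */
+#if 0 /* example only -- not compiled */
+__device__ __nv_bfloat162 example_normalize2(const __nv_bfloat162 v)
+{
+    const __nv_bfloat16 x = __low2bfloat16(v);
+    const __nv_bfloat16 y = __high2bfloat16(v);
+    /* squared length, then its approximate reciprocal square root */
+    const __nv_bfloat16 len2    = __hfma(x, x, __hmul(y, y));
+    const __nv_bfloat16 inv_len = hrsqrt(len2);
+    return __halves2bfloat162(__hmul(x, inv_len), __hmul(y, inv_len));
+}
+#endif /* example only */
+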
+__CUDA_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a)
+{
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat162 r;
+    asm("{set.nan.bf16x2.bf16x2 %0,%1,%1;\n}"
+        :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return r;
+#else
+    const __nv_bfloat162 b = a;
+    __BINARY_OP_BFLOAT162_MACRO(set.nan.f32)
+#endif
+}
+__CUDA_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a)
+{
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 r;
+    asm("{set.nan.bf16.bf16 %0,%1,%1;\n}"
+        :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return __BFLOAT16_TO_CUS(r) != 0U;
+#else
+    unsigned int r;
+    asm( "{.reg .b32 a;\n"
+         "  mov.b32 a, {0,%1};\n"
+         "  set.nan.f32.f32 %0, a, a;}\n"
+         :"=r"(r) : "h"(__BFLOAT16_TO_CUS(a)));
+    return r != 0U;
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 r;
+    asm("{neg.bf16x2 %0,%1;\n}"
+        :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return r;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a)
+{
+    __nv_bfloat16 r;
+    asm("{neg.bf16 %0,%1;\n}"
+        :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return r;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 r;
+    asm("{abs.bf16x2 %0,%1;\n}"
+        :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return r;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a)
+{
+    __nv_bfloat16 r;
+    asm("{abs.bf16 %0,%1;\n}"
+        :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return r;
+}
+/******************************************************************************
+*                    __nv_bfloat16 maximum/minimum and relu fma                    *
+******************************************************************************/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+   asm( "{ max.bf16 %0,%1,%2;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat16 val;
+    asm( "{ min.bf16 %0,%1,%2;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat16 val;
+    asm( "{ max.NaN.bf16 %0,%1,%2;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat16 val;
+    asm( "{ min.NaN.bf16 %0,%1,%2;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
+{
+    __nv_bfloat16 val;
+    asm( "{ fma.rn.relu.bf16 %0,%1,%2,%3;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c)));
+    return val;
+}
+/******************************************************************************
+*                   __nv_bfloat162 maximum/minimum and relu fma                    *
+******************************************************************************/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+    asm( "{ max.bf16x2 %0,%1,%2;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+    asm( "{ min.bf16x2 %0,%1,%2;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+    asm( "{ max.NaN.bf16x2 %0,%1,%2;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+    asm( "{ min.NaN.bf16x2 %0,%1,%2;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c)
+{
+    __nv_bfloat162 val;
+    asm( "{ fma.rn.relu.bf16x2 %0,%1,%2,%3;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c)));
+    return val;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c)
+{
+    // fast version of complex multiply-accumulate
+    // (a.re, a.im) * (b.re, b.im) + (c.re, c.im)
+    // acc.re = (c.re + a.re*b.re) - a.im*b.im
+    // acc.im = (c.im + a.re*b.im) + a.im*b.re
+    __nv_bfloat16 real_tmp = __hfma(a.x, b.x, c.x);
+    __nv_bfloat16 img_tmp  = __hfma(a.x, b.y, c.y);
+    real_tmp = __hfma(__hneg(a.y), b.y, real_tmp);
+    img_tmp  = __hfma(a.y,         b.x, img_tmp);
+    return make_bfloat162(real_tmp, img_tmp);
+}
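+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the original
+ * NVIDIA header): with the (real, imag) packing used by __hcmadd above, a
+ * plain complex multiply is __hcmadd with a zero accumulator, and a running
+ * sum of complex products is a loop of __hcmadd calls.  The function below
+ * is hypothetical.
+ */
+#if 0 /* example only -- not compiled */
+__device__ __nv_bfloat162 example_complex_sum_of_products(const __nv_bfloat162 *a,
+                                                          const __nv_bfloat162 *b,
+                                                          const int n)
+{
+    /* acc accumulates sum_i a[i]*b[i] as a packed (real, imag) bf16 pair */
+    __nv_bfloat162 acc = __bfloat162bfloat162(__ushort_as_bfloat16(0U));
+    for (int i = 0; i < n; ++i) {
+        acc = __hcmadd(a[i], b[i], acc);
+    }
+    return acc;
+}
+#endif /* example only */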
+
+
+/* Define __PTR for atomicAdd prototypes below, undef after done */
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __PTR   "l"
+#else
+#define __PTR   "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+
+__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat162 r;
+    asm volatile ("{ atom.add.noftz.bf16x2 %0,[%1],%2; }\n"
+                  : "=r"(__BFLOAT162_TO_UI(r)) : __PTR(address), "r"(__BFLOAT162_TO_CUI(val))
+                  : "memory");
+   return r;
+#else
+    unsigned int* address_as_uint = (unsigned int*)address;
+    unsigned int old = *address_as_uint, assumed;
+    do {
+        assumed = old;
+        __nv_bfloat162 new_val = __hadd2(val, *(__nv_bfloat162*)&assumed);
+        old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val);
+    } while (assumed != old);
+    return *(__nv_bfloat162*)&old;
+#endif
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 r;
+    asm volatile ("{ atom.add.noftz.bf16 %0,[%1],%2; }\n"
+                  : "=h"(__BFLOAT16_TO_US(r))
+                  : __PTR(address), "h"(__BFLOAT16_TO_CUS(val))
+                  : "memory");
+   return r;
+#else
+    unsigned short int* address_as_us = (unsigned short int*)address;
+    unsigned short int old = *address_as_us, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_us, assumed,
+            __bfloat16_as_ushort(__hadd(val, __ushort_as_bfloat16(assumed))));
+    } while (assumed != old);
+    return __ushort_as_bfloat16(old);
+#endif
+}
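+
+/*
+ * Illustrative usage sketch (editorial addition, not part of the original
+ * NVIDIA header): on __CUDA_ARCH__ >= 900 the overloads above emit a native
+ * bf16 atomic add, otherwise they fall back to the atomicCAS loops shown
+ * above.  The hypothetical kernel below folds one value per thread into a
+ * single bf16 accumulator.
+ */
+#if 0 /* example only -- not compiled */
+__global__ void example_accumulate(__nv_bfloat16 *sum, const float *contrib)
+{
+    const unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
+    /* convert to bf16, then let the overload above do the read-modify-write */
+    (void)atomicAdd(sum, __float2bfloat16(contrib[i]));
+}
+#endif /* example only */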
+
+#undef __PTR
+#undef __CUDA_BF16_DECL__
+#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */
+#endif /* defined(__cplusplus) */
+
+#undef __BINARY_OP_BFLOAT162_MACRO
+#undef __BINARY_OP_BFLOAT16_MACRO
+
+#undef __CUDA_HOSTDEVICE_BF16_DECL__
+#undef __CUDA_BF16_DECL__
+
+/* Define first-class types "nv_bfloat16" and "nv_bfloat162", unless user specifies otherwise via "#define CUDA_NO_BFLOAT16" */
+/* C cannot ever have these types defined here, because __nv_bfloat16 and __nv_bfloat162 are C++ classes */
+#if defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16)
+typedef __nv_bfloat16  nv_bfloat16;
+typedef __nv_bfloat162 nv_bfloat162;
+
+#endif /* defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) */
+ 
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+#undef __CPP_VERSION_AT_LEAST_11_BF16
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+#endif /* end of include guard: __CUDA_BF16_HPP__ */
diff --git a/ext/cudart/include/cuda_d3d10_interop.h b/ext/cudart/include/cuda_d3d10_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f485047c8dccbe4080452bc24e4e46ba87c2adb
--- /dev/null
+++ b/ext/cudart/include/cuda_d3d10_interop.h
@@ -0,0 +1,724 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_D3D10_INTEROP_H__)
+#define __CUDA_D3D10_INTEROP_H__
+
+#include "cuda_runtime_api.h"
+
+/** \cond impl_private */
+#if !defined(__dv)
+
+#if defined(__cplusplus)
+
+#define __dv(v) \
+        = v
+
+#else /* __cplusplus */
+
+#define __dv(v)
+
+#endif /* __cplusplus */
+
+#endif /* !__dv */
+/** \endcond impl_private */
+
+#include <d3d10_1.h>
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+/** \endcond impl_private */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \addtogroup CUDART_D3D10 Direct3D 10 Interoperability
+ * This section describes the Direct3D 10 interoperability functions of the CUDA
+ * runtime application programming interface. Note that mapping of Direct3D 10
+ * resources is performed with the graphics-API-agnostic resource mapping
+ * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
+ *
+ * @{
+ */
+
+/**
+ * CUDA devices corresponding to a D3D10 device
+ */
+enum cudaD3D10DeviceList
+{
+  cudaD3D10DeviceListAll           = 1, /**< The CUDA devices for all GPUs used by a D3D10 device */
+  cudaD3D10DeviceListCurrentFrame  = 2, /**< The CUDA devices for the GPUs used by a D3D10 device in its currently rendering frame */
+  cudaD3D10DeviceListNextFrame     = 3  /**< The CUDA devices for the GPUs to be used by a D3D10 device in the next frame  */
+};
+
+/**
+ * \brief Registers a Direct3D 10 resource for access by CUDA
+ * 
+ * Registers the Direct3D 10 resource \p pD3DResource for access by CUDA.  
+ *
+ * If this call is successful, then the application will be able to map and
+ * unmap this resource until it is unregistered through
+ * ::cudaGraphicsUnregisterResource(). Also on success, this call will increase the
+ * internal reference count on \p pD3DResource. This reference count will be
+ * decremented when this resource is unregistered through
+ * ::cudaGraphicsUnregisterResource().
+ *
+ * This call is potentially high-overhead and should not be called every frame
+ * in interactive applications.
+ *
+ * The type of \p pD3DResource must be one of the following.
+ *
+ * - ::ID3D10Buffer: may be accessed via a device pointer
+ * - ::ID3D10Texture1D: individual subresources of the texture may be accessed via arrays
+ * - ::ID3D10Texture2D: individual subresources of the texture may be accessed via arrays
+ * - ::ID3D10Texture3D: individual subresources of the texture may be accessed via arrays
+ *
+ * The \p flags argument may be used to specify additional parameters at register
+ * time.  The valid values for this parameter are 
+ *
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used.
+ * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * Not all Direct3D resources of the above types may be used for
+ * interoperability with CUDA.  The following are some limitations.
+ *
+ * - The primary rendertarget may not be registered with CUDA.
+ * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
+ *   or 32-bit integer or floating-point data cannot be shared.
+ * - Surfaces of depth or stencil formats cannot be shared.
+ *
+ * A complete list of supported DXGI formats is as follows. For compactness the
+ * notation A_{B,C,D} represents A_B, A_C, and A_D.
+ * - DXGI_FORMAT_A8_UNORM
+ * - DXGI_FORMAT_B8G8R8A8_UNORM
+ * - DXGI_FORMAT_B8G8R8X8_UNORM
+ * - DXGI_FORMAT_R16_FLOAT
+ * - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R32_FLOAT
+ * - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT}
+ * - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT}
+ * - DXGI_FORMAT_R32_{SINT,UINT}
+ * - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB}
+ * - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM}
+ *
+ * If \p pD3DResource is of incorrect type or is already registered, then 
+ * ::cudaErrorInvalidResourceHandle is returned. 
+ * If \p pD3DResource cannot be registered, then ::cudaErrorUnknown is returned.
+ *
+ * \param resource - Pointer to returned resource handle
+ * \param pD3DResource - Direct3D resource to register
+ * \param flags        - Parameters for resource registration
+ * 
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray, 
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsD3D10RegisterResource 
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsD3D10RegisterResource(struct cudaGraphicsResource **resource, ID3D10Resource *pD3DResource, unsigned int flags);
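+
+/*
+ * Illustrative usage sketch (editorial addition, not part of this header): a
+ * typical flow registers the Direct3D buffer once, then maps it around the
+ * kernel launches that consume it.  The function, buffer and stream names
+ * below are hypothetical and error checking is omitted for brevity.
+ */
+#if 0 /* example only -- not compiled */
+void example_register_and_map(ID3D10Buffer *buffer, cudaStream_t stream)
+{
+    struct cudaGraphicsResource *resource = NULL;
+    void  *devPtr = NULL;
+    size_t size   = 0;
+
+    /* one-time registration; comparatively expensive, do not repeat per frame */
+    cudaGraphicsD3D10RegisterResource(&resource, buffer, cudaGraphicsRegisterFlagsNone);
+
+    /* per-frame: map, fetch the device pointer, launch work, unmap */
+    cudaGraphicsMapResources(1, &resource, stream);
+    cudaGraphicsResourceGetMappedPointer(&devPtr, &size, resource);
+    /* ... launch kernels that read or write devPtr ... */
+    cudaGraphicsUnmapResources(1, &resource, stream);
+
+    cudaGraphicsUnregisterResource(resource);
+}
+#endif /* example only */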
+
+/**
+ * \brief Gets the device number for an adapter
+ *
+ * Returns in \p *device the CUDA-compatible device corresponding to the
+ * adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters. This call
+ * will succeed only if a device on adapter \p pAdapter is CUDA-compatible.
+ *
+ * \param device   - Returns the device corresponding to pAdapter
+ * \param pAdapter - D3D10 adapter to get device for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsD3D10RegisterResource,
+ * ::cuD3D10GetDevice 
+ */
+extern __host__ cudaError_t CUDARTAPI cudaD3D10GetDevice(int *device, IDXGIAdapter *pAdapter);
+
+/**
+ * \brief Gets the CUDA devices corresponding to a Direct3D 10 device
+ * 
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding 
+ * to the Direct3D 10 device \p pD3D10Device.
+ * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
+ * corresponding to the Direct3D 10 device \p pD3D10Device.
+ *
+ * If any of the GPUs being used to render \p pD3D10Device are not CUDA-capable then the
+ * call will return ::cudaErrorNoDevice.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D10Device
+ * \param pCudaDevices     - Returned CUDA devices corresponding to \p pD3D10Device
+ * \param cudaDeviceCount  - The size of the output device array \p pCudaDevices
+ * \param pD3D10Device     - Direct3D 10 device to query for CUDA devices
+ * \param deviceList       - The set of devices to return.  This set may be
+ *                           ::cudaD3D10DeviceListAll for all devices, 
+ *                           ::cudaD3D10DeviceListCurrentFrame for the devices used to
+ *                           render the current frame (in SLI), or
+ *                           ::cudaD3D10DeviceListNextFrame for the devices used to
+ *                           render the next frame (in SLI).
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNoDevice,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray, 
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuD3D10GetDevices 
+ */
+extern __host__ cudaError_t CUDARTAPI cudaD3D10GetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, ID3D10Device *pD3D10Device, enum cudaD3D10DeviceList deviceList);
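+
+/*
+ * A minimal usage sketch (illustrative only, not part of the original header),
+ * assuming pD3D10Device is an ID3D10Device created by the application:
+ *
+ *   int devices[8];
+ *   unsigned int count = 0;
+ *   if (cudaD3D10GetDevices(&count, devices, 8, pD3D10Device,
+ *                           cudaD3D10DeviceListAll) == cudaSuccess && count > 0) {
+ *       cudaSetDevice(devices[0]);   // pick the first CUDA device driving pD3D10Device
+ *   }
+ */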
+
+/** @} */ /* END CUDART_D3D10 */
+
+/**
+ * \addtogroup CUDART_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED]
+ * This section describes deprecated Direct3D 10 interoperability functions.
+ *
+ * @{
+ */
+
+/**
+ * CUDA D3D10 Register Flags
+ */
+enum cudaD3D10RegisterFlags
+{
+  cudaD3D10RegisterFlagsNone  = 0,  /**< Default; Resource can be accessed through a void* */
+  cudaD3D10RegisterFlagsArray = 1   /**< Resource can be accessed through a CUarray* */
+};
+
+/**
+ * CUDA D3D10 Map Flags
+ */
+enum cudaD3D10MapFlags
+{
+  cudaD3D10MapFlagsNone         = 0,  /**< Default; Assume resource can be read/written */
+  cudaD3D10MapFlagsReadOnly     = 1,  /**< CUDA kernels will not write to this resource */
+  cudaD3D10MapFlagsWriteDiscard = 2   /**< CUDA kernels will only write to and will not read from this resource */
+};
+
+/**
+ * \brief Gets the Direct3D device against which the current CUDA context was
+ * created
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0.
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA device with a D3D10
+ * device in order to achieve maximum interoperability performance.
+ *
+ * \param ppD3D10Device - Returns the Direct3D device for this thread
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaD3D10SetDirect3DDevice
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10GetDirect3DDevice(ID3D10Device **ppD3D10Device);
+
+/**
+ * \brief Sets the Direct3D 10 device to use for interoperability with 
+ * a CUDA device
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0.
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA device with a D3D10
+ * device in order to achieve maximum interoperability performance.
+ *
+ * \param pD3D10Device - Direct3D device to use for interoperability
+ * \param device       - The CUDA device to use.  This device must be among the devices
+ *                       returned when querying ::cudaD3D10DeviceListAll from ::cudaD3D10GetDevices,
+ *                       or may be set to -1 to automatically select an appropriate CUDA device.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorSetOnActiveProcess
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaD3D10GetDevice,
+ * ::cudaGraphicsD3D10RegisterResource,
+ * ::cudaDeviceReset
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10SetDirect3DDevice(ID3D10Device *pD3D10Device, int device __dv(-1));
+
+/**
+ * \brief Registers a Direct3D 10 resource for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Registers the Direct3D resource \p pResource for access by CUDA.
+ *
+ * If this call is successful, then the application will be able to map and
+ * unmap this resource until it is unregistered through
+ * ::cudaD3D10UnregisterResource(). Also on success, this call will increase
+ * the internal reference count on \p pResource. This reference count will be
+ * decremented when this resource is unregistered through
+ * ::cudaD3D10UnregisterResource().
+ *
+ * This call potentially has high overhead and should not be called every frame
+ * in interactive applications.
+ *
+ * The type of \p pResource must be one of the following:
+ *
+ * - ::ID3D10Buffer: Cannot be used with \p flags set to
+ * \p cudaD3D10RegisterFlagsArray.
+ * - ::ID3D10Texture1D: No restrictions.
+ * - ::ID3D10Texture2D: No restrictions.
+ * - ::ID3D10Texture3D: No restrictions.
+ *
+ * The \p flags argument specifies the mechanism through which CUDA will
+ * access the Direct3D resource. The following values are allowed.
+ *
+ * - ::cudaD3D10RegisterFlagsNone: Specifies that CUDA will access this
+ * resource through a \p void*. The pointer, size, and pitch for each
+ * subresource of this resource may be queried through
+ * ::cudaD3D10ResourceGetMappedPointer(), ::cudaD3D10ResourceGetMappedSize(),
+ * and ::cudaD3D10ResourceGetMappedPitch() respectively. This option is valid
+ * for all resource types.
+ * - ::cudaD3D10RegisterFlagsArray: Specifies that CUDA will access this
+ * resource through a \p CUarray queried on a sub-resource basis through
+ * ::cudaD3D10ResourceGetMappedArray(). This option is only valid for resources
+ * of type ::ID3D10Texture1D, ::ID3D10Texture2D, and ::ID3D10Texture3D.
+ *
+ * Not all Direct3D resources of the above types may be used for
+ * interoperability with CUDA. The following are some limitations.
+ *
+ * - The primary rendertarget may not be registered with CUDA.
+ * - Resources allocated as shared may not be registered with CUDA.
+ * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
+ *   or 32-bit integer or floating-point data cannot be shared.
+ * - Surfaces of depth or stencil formats cannot be shared.
+ *
+ * If Direct3D interoperability is not initialized on this context then
+ * ::cudaErrorInvalidDevice is returned. If \p pResource is of incorrect type
+ * or is already registered then ::cudaErrorInvalidResourceHandle is returned.
+ * If \p pResource cannot be registered then ::cudaErrorUnknown is returned.
+ *
+ * \param pResource - Resource to register
+ * \param flags     - Parameters for resource registration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsD3D10RegisterResource
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10RegisterResource(ID3D10Resource *pResource, unsigned int flags);
+
+/**
+ * \brief Unregisters a Direct3D resource
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unregisters the Direct3D resource \p resource so it is not accessible by
+ * CUDA unless registered again.
+ *
+ * If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle
+ * is returned.
+ *
+ * \param pResource - Resource to unregister
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnregisterResource
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10UnregisterResource(ID3D10Resource *pResource);
+
+/**
+ * \brief Maps Direct3D Resources for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.  
+ *
+ * Maps the \p count Direct3D resources in \p ppResources for access by CUDA.
+ *
+ * The resources in \p ppResources may be accessed in CUDA kernels until they
+ * are unmapped. Direct3D should not access any resources while they are
+ * mapped by CUDA. If an application does so, the results are undefined.
+ *
+ * This function provides the synchronization guarantee that any Direct3D
+ * calls issued before ::cudaD3D10MapResources() will complete before any CUDA
+ * kernels issued after ::cudaD3D10MapResources() begin.
+ *
+ * If any of \p ppResources have not been registered for use with CUDA or if
+ * \p ppResources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
+ * is returned. If any of \p ppResources are presently mapped for access by
+ * CUDA then ::cudaErrorUnknown is returned.
+ *
+ * \param count       - Number of resources to map for CUDA
+ * \param ppResources - Resources to map for CUDA
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsMapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10MapResources(int count, ID3D10Resource **ppResources);
+
+/**
+ * \brief Unmaps Direct3D resources
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.   
+ *
+ * Unmaps the \p count Direct3D resources in \p ppResources.
+ *
+ * This function provides the synchronization guarantee that any CUDA kernels
+ * issued before ::cudaD3D10UnmapResources() will complete before any Direct3D
+ * calls issued after ::cudaD3D10UnmapResources() begin.
+ *
+ * If any of \p ppResources have not been registered for use with CUDA or if
+ * \p ppResources contains any duplicate entries, then
+ * ::cudaErrorInvalidResourceHandle is returned. If any of \p ppResources are
+ * not presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
+ *
+ * \param count       - Number of resources to unmap for CUDA
+ * \param ppResources - Resources to unmap for CUDA
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnmapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10UnmapResources(int count, ID3D10Resource **ppResources);
+
+/**
+ * \brief Gets an array through which to access a subresource of a Direct3D
+ * resource which has been mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Returns in \p *ppArray an array through which the subresource of the mapped
+ * Direct3D resource \p pResource which corresponds to \p subResource may be
+ * accessed. The value set in \p ppArray may change every time that
+ * \p pResource is mapped.
+ *
+ * If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is
+ * returned. If \p pResource was not registered with usage flags
+ * ::cudaD3D10RegisterFlagsArray, then ::cudaErrorInvalidResourceHandle is
+ * returned. If \p pResource is not mapped then ::cudaErrorUnknown is returned.
+ *
+ * For usage requirements of the \p subResource parameter, see
+ * ::cudaD3D10ResourceGetMappedPointer().
+ *
+ * \param ppArray     - Returned array corresponding to subresource
+ * \param pResource   - Mapped resource to access
+ * \param subResource - Subresource of pResource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsSubResourceGetMappedArray
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedArray(cudaArray **ppArray, ID3D10Resource *pResource, unsigned int subResource);
+
+/**
+ * \brief Set usage flags for mapping a Direct3D resource
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Set usage flags for mapping the Direct3D resource \p pResource.  
+ *
+ * Changes to flags will take effect the next time \p pResource is mapped.
+ * The \p flags argument may be any of the following:
+ *
+ * - ::cudaD3D10MapFlagsNone: Specifies no hints about how this resource will
+ * be used. It is therefore assumed that this resource will be read from and
+ * written to by CUDA kernels. This is the default value.
+ * - ::cudaD3D10MapFlagsReadOnly: Specifies that CUDA kernels which access
+ * this resource will not write to this resource.
+ * - ::cudaD3D10MapFlagsWriteDiscard: Specifies that CUDA kernels which access
+ * this resource will not read from this resource and will write over the
+ * entire contents of the resource, so none of the data previously stored in
+ * the resource will be preserved.
+ *
+ * If \p pResource has not been registered for use with CUDA then
+ * ::cudaErrorInvalidResourceHandle is returned. If \p pResource is presently mapped
+ * for access by CUDA then ::cudaErrorUnknown is returned.
+ *
+ * \param pResource - Registered resource to set flags for
+ * \param flags     - Parameters for resource mapping
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsResourceSetMapFlags
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceSetMapFlags(ID3D10Resource *pResource, unsigned int flags); 
+
+/**
+ * \brief Gets the dimensions of a registered Direct3D surface
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the
+ * subresource of the mapped Direct3D resource \p pResource which corresponds
+ * to \p subResource.
+ *
+ * Since anti-aliased surfaces may have multiple samples per pixel, it is
+ * possible that the dimensions of a resource will be an integer factor larger
+ * than the dimensions reported by the Direct3D runtime.
+ *
+ * The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D
+ * surfaces, the value returned in \p *pDepth will be 0.
+ *
+ * If \p pResource is not of type ::ID3D10Texture1D, ::ID3D10Texture2D, or
+ * ::ID3D10Texture3D, or if \p pResource has not been registered for use with
+ * CUDA, then ::cudaErrorInvalidResourceHandle is returned.
+ *
+ * For usage requirements of \p subResource parameters see
+ * ::cudaD3D10ResourceGetMappedPointer().
+ *
+ * \param pWidth      - Returned width of surface
+ * \param pHeight     - Returned height of surface
+ * \param pDepth      - Returned depth of surface
+ * \param pResource   - Registered resource to access
+ * \param subResource - Subresource of pResource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsSubResourceGetMappedArray
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, ID3D10Resource *pResource, unsigned int subResource); 
+
+/**
+ * \brief Gets a pointer through which to access a subresource of a Direct3D
+ * resource which has been mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Returns in \p *pPointer the base pointer of the subresource of the mapped
+ * Direct3D resource \p pResource which corresponds to \p subResource. The
+ * value set in \p pPointer may change every time that \p pResource is mapped.
+ *
+ * If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is
+ * returned. If \p pResource was not registered with usage flags
+ * ::cudaD3D10RegisterFlagsNone, then ::cudaErrorInvalidResourceHandle is
+ * returned. If \p pResource is not mapped then ::cudaErrorUnknown is returned.
+ *
+ * If \p pResource is of type ::ID3D10Buffer then \p subResource must be 0.
+ * If \p pResource is of any other type, then the value of \p subResource must
+ * come from the subresource calculation in ::D3D10CalcSubresource().
+ *
+ * \param pPointer    - Returned pointer corresponding to subresource
+ * \param pResource   - Mapped resource to access
+ * \param subResource - Subresource of pResource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsResourceGetMappedPointer
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedPointer(void **pPointer, ID3D10Resource *pResource, unsigned int subResource);
+
+/**
+ * \brief Gets the size of a subresource of a Direct3D resource which has been
+ * mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Returns in \p *pSize the size of the subresource of the mapped Direct3D
+ * resource \p pResource which corresponds to \p subResource. The value set in
+ * \p pSize may change every time that \p pResource is mapped.
+ *
+ * If \p pResource has not been registered for use with CUDA then
+ * ::cudaErrorInvalidResourceHandle is returned. If \p pResource was not registered
+ * with usage flags ::cudaD3D10RegisterFlagsNone, then
+ * ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped for
+ * access by CUDA then ::cudaErrorUnknown is returned.
+ *
+ * For usage requirements of the \p subResource parameter see
+ * ::cudaD3D10ResourceGetMappedPointer().
+ *
+ * \param pSize       - Returned size of subresource
+ * \param pResource   - Mapped resource to access
+ * \param subResource - Subresource of pResource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsResourceGetMappedPointer
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedSize(size_t *pSize, ID3D10Resource *pResource, unsigned int subResource);
+
+/**
+ * \brief Gets the pitch of a subresource of a Direct3D resource which has been
+ * mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of
+ * the subresource of the mapped Direct3D resource \p pResource, which
+ * corresponds to \p subResource. The values set in \p pPitch and
+ * \p pPitchSlice may change every time that \p pResource is mapped.
+ *
+ * The pitch and Z-slice pitch values may be used to compute the location of a
+ * sample on a surface as follows.
+ *
+ * For a 2D surface, the byte offset of the sample at position \b x, \b y from
+ * the base pointer of the surface is:
+ *
+ * \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
+ *
+ * For a 3D surface, the byte offset of the sample at position \b x, \b y,
+ * \b z from the base pointer of the surface is:
+ *
+ * \b z * \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
+ *
+ * Both parameters \p pPitch and \p pPitchSlice are optional and may be set to
+ * NULL.
+ *
+ * If \p pResource is not of type ::ID3D10Texture1D, ::ID3D10Texture2D, or
+ * ::ID3D10Texture3D, or if \p pResource has not been registered for use with
+ * CUDA, then ::cudaErrorInvalidResourceHandle is returned. If \p pResource was
+ * not registered with usage flags ::cudaD3D10RegisterFlagsNone, then
+ * ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped
+ * for access by CUDA then ::cudaErrorUnknown is returned.
+ *
+ * For usage requirements of the \p subResource parameter see
+ * ::cudaD3D10ResourceGetMappedPointer().
+ *
+ * \param pPitch      - Returned pitch of subresource
+ * \param pPitchSlice - Returned Z-slice pitch of subresource
+ * \param pResource   - Mapped resource to access
+ * \param subResource - Subresource of pResource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsSubResourceGetMappedArray
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, ID3D10Resource *pResource, unsigned int subResource);
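+
+/*
+ * A minimal sketch of the addressing formula above (deprecated path,
+ * illustrative only, not part of the original header), assuming pTex is an
+ * ID3D10Texture2D that was registered with cudaD3D10RegisterFlagsNone and is
+ * currently mapped:
+ *
+ *   void  *base  = NULL;
+ *   size_t pitch = 0;
+ *   cudaD3D10ResourceGetMappedPointer(&base, pTex, 0);
+ *   cudaD3D10ResourceGetMappedPitch(&pitch, NULL, pTex, 0);   // pPitchSlice may be NULL
+ *   // byte offset of a 4-byte pixel at (x, y) in subresource 0:
+ *   //   unsigned char *p = (unsigned char *)base + y * pitch + 4 * x;
+ */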
+
+/** @} */ /* END CUDART_D3D10_DEPRECATED */
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#undef __dv
+#undef __CUDA_DEPRECATED
+
+#endif /* __CUDA_D3D10_INTEROP_H__ */
diff --git a/ext/cudart/include/cuda_d3d11_interop.h b/ext/cudart/include/cuda_d3d11_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e0c14cf8d2d0e4b739cc9af0651d52b4afc4b5a
--- /dev/null
+++ b/ext/cudart/include/cuda_d3d11_interop.h
@@ -0,0 +1,323 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_D3D11_INTEROP_H__)
+#define __CUDA_D3D11_INTEROP_H__
+
+#include "cuda_runtime_api.h"
+
+/** \cond impl_private */
+#if !defined(__dv)
+
+#if defined(__cplusplus)
+
+#define __dv(v) \
+        = v
+
+#else /* __cplusplus */
+
+#define __dv(v)
+
+#endif /* __cplusplus */
+
+#endif /* !__dv */
+/** \endcond impl_private */
+
+#include <d3d11.h>
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+/** \endcond impl_private */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \addtogroup CUDART_D3D11 Direct3D 11 Interoperability
+ * This section describes the Direct3D 11 interoperability functions of the CUDA
+ * runtime application programming interface. Note that mapping of Direct3D 11
+ * resources is performed with the graphics API agnostic, resource mapping 
+ * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
+ *
+ * @{
+ */
+
+/**
+ * CUDA devices corresponding to a D3D11 device
+ */
+enum cudaD3D11DeviceList
+{
+  cudaD3D11DeviceListAll           = 1, /**< The CUDA devices for all GPUs used by a D3D11 device */
+  cudaD3D11DeviceListCurrentFrame  = 2, /**< The CUDA devices for the GPUs used by a D3D11 device in its currently rendering frame */
+  cudaD3D11DeviceListNextFrame     = 3  /**< The CUDA devices for the GPUs to be used by a D3D11 device in the next frame  */
+};
+
+/**
+ * \brief Register a Direct3D 11 resource for access by CUDA
+ * 
+ * Registers the Direct3D 11 resource \p pD3DResource for access by CUDA.  
+ *
+ * If this call is successful, then the application will be able to map and
+ * unmap this resource until it is unregistered through
+ * ::cudaGraphicsUnregisterResource(). Also on success, this call will increase the
+ * internal reference count on \p pD3DResource. This reference count will be
+ * decremented when this resource is unregistered through
+ * ::cudaGraphicsUnregisterResource().
+ *
+ * This call potentially has high overhead and should not be called every frame
+ * in interactive applications.
+ *
+ * The type of \p pD3DResource must be one of the following.
+ *
+ * - ::ID3D11Buffer: may be accessed via a device pointer
+ * - ::ID3D11Texture1D: individual subresources of the texture may be accessed via arrays
+ * - ::ID3D11Texture2D: individual subresources of the texture may be accessed via arrays
+ * - ::ID3D11Texture3D: individual subresources of the texture may be accessed via arrays
+ *
+ * The \p flags argument may be used to specify additional parameters at register
+ * time.  The valid values for this parameter are 
+ *
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used.
+ * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * Not all Direct3D resources of the above types may be used for
+ * interoperability with CUDA.  The following are some limitations.
+ *
+ * - The primary rendertarget may not be registered with CUDA.
+ * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
+ *   or 32-bit integer or floating-point data cannot be shared.
+ * - Surfaces of depth or stencil formats cannot be shared.
+ *
+ * A complete list of supported DXGI formats is as follows. For compactness the
+ * notation A_{B,C,D} represents A_B, A_C, and A_D.
+ * - DXGI_FORMAT_A8_UNORM
+ * - DXGI_FORMAT_B8G8R8A8_UNORM
+ * - DXGI_FORMAT_B8G8R8X8_UNORM
+ * - DXGI_FORMAT_R16_FLOAT
+ * - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R32_FLOAT
+ * - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT}
+ * - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT}
+ * - DXGI_FORMAT_R32_{SINT,UINT}
+ * - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB}
+ * - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM}
+ * - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM}
+ *
+ * If \p pD3DResource is of incorrect type or is already registered, then 
+ * ::cudaErrorInvalidResourceHandle is returned. 
+ * If \p pD3DResource cannot be registered, then ::cudaErrorUnknown is returned.
+ *
+ * \param resource - Pointer to returned resource handle
+ * \param pD3DResource - Direct3D resource to register
+ * \param flags        - Parameters for resource registration
+ * 
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray, 
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsD3D11RegisterResource 
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsD3D11RegisterResource(struct cudaGraphicsResource **resource, ID3D11Resource *pD3DResource, unsigned int flags);
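+
+/*
+ * A minimal usage sketch (illustrative only, not part of the original header),
+ * assuming pBuffer is an ID3D11Buffer created by the application:
+ *
+ *   cudaGraphicsResource *res = NULL;
+ *   cudaGraphicsD3D11RegisterResource(&res, pBuffer, cudaGraphicsRegisterFlagsNone);
+ *   cudaGraphicsMapResources(1, &res, 0);
+ *   void  *devPtr = NULL;
+ *   size_t size   = 0;
+ *   cudaGraphicsResourceGetMappedPointer(&devPtr, &size, res);  // linear device memory
+ *   // ... read or write devPtr from CUDA kernels ...
+ *   cudaGraphicsUnmapResources(1, &res, 0);
+ *   cudaGraphicsUnregisterResource(res);
+ */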
+
+/**
+ * \brief Gets the device number for an adapter
+ *
+ * Returns in \p *device the CUDA-compatible device corresponding to the
+ * adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters. This call
+ * will succeed only if a device on adapter \p pAdapter is CUDA-compatible.
+ *
+ * \param device   - Returns the device corresponding to pAdapter
+ * \param pAdapter - D3D11 adapter to get device for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray, 
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuD3D11GetDevice 
+ */
+extern __host__ cudaError_t CUDARTAPI cudaD3D11GetDevice(int *device, IDXGIAdapter *pAdapter);
+
+/**
+ * \brief Gets the CUDA devices corresponding to a Direct3D 11 device
+ * 
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding 
+ * to the Direct3D 11 device \p pD3D11Device.
+ * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
+ * corresponding to the Direct3D 11 device \p pD3D11Device.
+ *
+ * If any of the GPUs being used to render \p pD3D11Device are not CUDA capable then the
+ * call will return ::cudaErrorNoDevice.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D11Device
+ * \param pCudaDevices     - Returned CUDA devices corresponding to \p pD3D11Device
+ * \param cudaDeviceCount  - The size of the output device array \p pCudaDevices
+ * \param pD3D11Device     - Direct3D 11 device to query for CUDA devices
+ * \param deviceList       - The set of devices to return.  This set may be
+ *                           ::cudaD3D11DeviceListAll for all devices, 
+ *                           ::cudaD3D11DeviceListCurrentFrame for the devices used to
+ *                           render the current frame (in SLI), or
+ *                           ::cudaD3D11DeviceListNextFrame for the devices used to
+ *                           render the next frame (in SLI).
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNoDevice,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray, 
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuD3D11GetDevices 
+ */
+extern __host__ cudaError_t CUDARTAPI cudaD3D11GetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, ID3D11Device *pD3D11Device, enum cudaD3D11DeviceList deviceList);
+
+/** @} */ /* END CUDART_D3D11 */
+
+/**
+ * \addtogroup CUDART_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED]
+ * This section describes deprecated Direct3D 11 interoperability functions.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets the Direct3D device against which the current CUDA context was
+ * created
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0. 
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA device with a D3D11
+ * device in order to achieve maximum interoperability performance.
+ *
+ * \param ppD3D11Device - Returns the Direct3D device for this thread
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaD3D11SetDirect3DDevice
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D11GetDirect3DDevice(ID3D11Device **ppD3D11Device);
+
+/**
+ * \brief Sets the Direct3D 11 device to use for interoperability with 
+ * a CUDA device
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0. 
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA device with a D3D11
+ * device in order to achieve maximum interoperability performance.
+ *
+ * \param pD3D11Device - Direct3D device to use for interoperability
+ * \param device       - The CUDA device to use.  This device must be among the devices
+ *                       returned when querying ::cudaD3D11DeviceListAll from ::cudaD3D11GetDevices,
+ *                       or may be set to -1 to automatically select an appropriate CUDA device.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorSetOnActiveProcess
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaD3D11GetDevice,
+ * ::cudaGraphicsD3D11RegisterResource,
+ * ::cudaDeviceReset
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D11SetDirect3DDevice(ID3D11Device *pD3D11Device, int device __dv(-1));
+
+/** @} */ /* END CUDART_D3D11_DEPRECATED */
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#undef __dv
+#undef __CUDA_DEPRECATED
+
+#endif /* __CUDA_D3D11_INTEROP_H__ */
diff --git a/ext/cudart/include/cuda_d3d9_interop.h b/ext/cudart/include/cuda_d3d9_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..5bac47db597c48bcfab5971cd47645679348294a
--- /dev/null
+++ b/ext/cudart/include/cuda_d3d9_interop.h
@@ -0,0 +1,782 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_D3D9_INTEROP_H__)
+#define __CUDA_D3D9_INTEROP_H__
+
+#include "cuda_runtime_api.h"
+
+/** \cond impl_private */
+#if !defined(__dv)
+
+#if defined(__cplusplus)
+
+#define __dv(v) \
+        = v
+
+#else /* __cplusplus */
+
+#define __dv(v)
+
+#endif /* __cplusplus */
+
+#endif /* !__dv */
+/** \endcond impl_private */
+
+#include <d3d9.h>
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+/** \endcond impl_private */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \addtogroup CUDART_D3D9 Direct3D 9 Interoperability
+ * This section describes the Direct3D 9 interoperability functions of the CUDA
+ * runtime application programming interface. Note that mapping of Direct3D 9
+ * resources is performed with the graphics API agnostic, resource mapping 
+ * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
+ *
+ * @{
+ */
+
+/**
+ * CUDA devices corresponding to a D3D9 device
+ */
+enum cudaD3D9DeviceList
+{
+  cudaD3D9DeviceListAll           = 1, /**< The CUDA devices for all GPUs used by a D3D9 device */
+  cudaD3D9DeviceListCurrentFrame  = 2, /**< The CUDA devices for the GPUs used by a D3D9 device in its currently rendering frame */
+  cudaD3D9DeviceListNextFrame     = 3  /**< The CUDA devices for the GPUs to be used by a D3D9 device in the next frame  */
+};
+
+/**
+ * \brief Gets the Direct3D device against which the current CUDA context was
+ * created
+ *
+ * Returns in \p *ppD3D9Device the Direct3D device against which this CUDA
+ * context was created in ::cudaD3D9SetDirect3DDevice().
+ *
+ * \param ppD3D9Device - Returns the Direct3D device for this thread
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidGraphicsContext,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaD3D9SetDirect3DDevice,
+ * ::cuD3D9GetDirect3DDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaD3D9GetDirect3DDevice(IDirect3DDevice9 **ppD3D9Device);
+
+/**
+ * \brief Register a Direct3D 9 resource for access by CUDA
+ * 
+ * Registers the Direct3D 9 resource \p pD3DResource for access by CUDA.  
+ *
+ * If this call is successful then the application will be able to map and
+ * unmap this resource until it is unregistered through
+ * ::cudaGraphicsUnregisterResource(). Also on success, this call will increase the
+ * internal reference count on \p pD3DResource. This reference count will be
+ * decremented when this resource is unregistered through
+ * ::cudaGraphicsUnregisterResource().
+ *
+ * This call potentially has high overhead and should not be called every frame
+ * in interactive applications.
+ *
+ * The type of \p pD3DResource must be one of the following.
+ *
+ * - ::IDirect3DVertexBuffer9: may be accessed through a device pointer
+ * - ::IDirect3DIndexBuffer9: may be accessed through a device pointer
+ * - ::IDirect3DSurface9: may be accessed through an array.
+ *     Only stand-alone objects of type ::IDirect3DSurface9
+ *     may be explicitly shared. In particular, individual mipmap levels and faces
+ *     of cube maps may not be registered directly. To access individual surfaces
+ *     associated with a texture, one must register the base texture object.
+ * - ::IDirect3DBaseTexture9: individual surfaces on this texture may be accessed
+ *     through an array.
+ *
+ * The \p flags argument may be used to specify additional parameters at register
+ * time.  The valid values for this parameter are 
+ *
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used.
+ * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * Not all Direct3D resources of the above types may be used for
+ * interoperability with CUDA.  The following are some limitations.
+ *
+ * - The primary rendertarget may not be registered with CUDA.
+ * - Resources allocated as shared may not be registered with CUDA.
+ * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
+ *   or 32-bit integer or floating-point data cannot be shared.
+ * - Surfaces of depth or stencil formats cannot be shared.
+ *
+ * A complete list of supported formats is as follows:
+ * - D3DFMT_L8
+ * - D3DFMT_L16
+ * - D3DFMT_A8R8G8B8
+ * - D3DFMT_X8R8G8B8
+ * - D3DFMT_G16R16
+ * - D3DFMT_A8B8G8R8
+ * - D3DFMT_A8
+ * - D3DFMT_A8L8
+ * - D3DFMT_Q8W8V8U8
+ * - D3DFMT_V16U16
+ * - D3DFMT_A16B16G16R16F
+ * - D3DFMT_A16B16G16R16
+ * - D3DFMT_R32F
+ * - D3DFMT_G16R16F
+ * - D3DFMT_A32B32G32R32F
+ * - D3DFMT_G32R32F
+ * - D3DFMT_R16F
+ *
+ * If \p pD3DResource is of incorrect type or is already registered, then 
+ * ::cudaErrorInvalidResourceHandle is returned. 
+ * If \p pD3DResource cannot be registered, then ::cudaErrorUnknown is returned.
+ *
+ * \param resource - Pointer to returned resource handle
+ * \param pD3DResource - Direct3D resource to register
+ * \param flags        - Parameters for resource registration
+ * 
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaD3D9SetDirect3DDevice,
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsD3D9RegisterResource
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsD3D9RegisterResource(struct cudaGraphicsResource **resource, IDirect3DResource9 *pD3DResource, unsigned int flags);
+
+/**
+ * \brief Gets the device number for an adapter
+ *
+ * Returns in \p *device the CUDA-compatible device corresponding to the
+ * adapter name \p pszAdapterName obtained from ::EnumDisplayDevices or
+ * ::IDirect3D9::GetAdapterIdentifier(). If no device on the adapter with name
+ * \p pszAdapterName is CUDA-compatible then the call will fail.
+ *
+ * \param device         - Returns the device corresponding to pszAdapterName
+ * \param pszAdapterName - D3D9 adapter to get device for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaD3D9SetDirect3DDevice,
+ * ::cudaGraphicsD3D9RegisterResource,
+ * ::cuD3D9GetDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaD3D9GetDevice(int *device, const char *pszAdapterName);
+
+/**
+ * \brief Gets the CUDA devices corresponding to a Direct3D 9 device
+ * 
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding 
+ * to the Direct3D 9 device \p pD3D9Device.
+ * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices
+ * corresponding to the Direct3D 9 device \p pD3D9Device.
+ *
+ * If any of the GPUs being used to render \p pD3D9Device are not CUDA capable then the
+ * call will return ::cudaErrorNoDevice.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D9Device
+ * \param pCudaDevices     - Returned CUDA devices corresponding to \p pD3D9Device
+ * \param cudaDeviceCount  - The size of the output device array \p pCudaDevices
+ * \param pD3D9Device      - Direct3D 9 device to query for CUDA devices
+ * \param deviceList       - The set of devices to return.  This set may be
+ *                           ::cudaD3D9DeviceListAll for all devices, 
+ *                           ::cudaD3D9DeviceListCurrentFrame for the devices used to
+ *                           render the current frame (in SLI), or
+ *                           ::cudaD3D9DeviceListNextFrame for the devices used to
+ *                           render the next frame (in SLI).
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNoDevice,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray, 
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuD3D9GetDevices 
+ */
+extern __host__ cudaError_t CUDARTAPI cudaD3D9GetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, IDirect3DDevice9 *pD3D9Device, enum cudaD3D9DeviceList deviceList);
+
+/**
+ * \brief Sets the Direct3D 9 device to use for interoperability with 
+ * a CUDA device
+ *
+ * Records \p pD3D9Device as the Direct3D 9 device to use for Direct3D 9
+ * interoperability with the CUDA device \p device and sets \p device as 
+ * the current device for the calling host thread.
+ * 
+ * If \p device has already been initialized then this call will fail with 
+ * the error ::cudaErrorSetOnActiveProcess.  In this case it is necessary 
+ * to reset \p device using ::cudaDeviceReset() before Direct3D 9 
+ * interoperability on \p device may be enabled.
+ *
+ * Successfully initializing CUDA interoperability with \p pD3D9Device 
+ * will increase the internal reference count on \p pD3D9Device.  This 
+ * reference count will be decremented when \p device is reset using 
+ * ::cudaDeviceReset().
+ *
+ * Note that this function is never required for correct functionality.  Use of 
+ * this function will result in accelerated interoperability only when the
+ * operating system is Windows Vista or Windows 7, and the device \p pD3D9Device
+ * is not an IDirect3DDevice9Ex.  In all other circumstances, this function is
+ * not necessary.
+ *
+ * \param pD3D9Device - Direct3D device to use for this thread
+ * \param device      - The CUDA device to use.  This device must be among the devices
+ *                      returned when querying ::cudaD3D9DeviceListAll from ::cudaD3D9GetDevices,
+ *                      or may be set to -1 to automatically select an appropriate CUDA device.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorSetOnActiveProcess
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaD3D9GetDevice,
+ * ::cudaGraphicsD3D9RegisterResource,
+ * ::cudaDeviceReset
+ */
+extern __host__ cudaError_t CUDARTAPI cudaD3D9SetDirect3DDevice(IDirect3DDevice9 *pD3D9Device, int device __dv(-1));
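+
+/*
+ * A minimal sketch of the D3D9 flow (illustrative only, not part of the
+ * original header), assuming pD3D9Device is an IDirect3DDevice9 and pVB is an
+ * IDirect3DVertexBuffer9 created by the application:
+ *
+ *   cudaD3D9SetDirect3DDevice(pD3D9Device, -1);   // never required for correctness (see note above)
+ *   cudaGraphicsResource *res = NULL;
+ *   cudaGraphicsD3D9RegisterResource(&res, pVB, cudaGraphicsRegisterFlagsNone);
+ *   cudaGraphicsMapResources(1, &res, 0);
+ *   void  *devPtr = NULL;
+ *   size_t size   = 0;
+ *   cudaGraphicsResourceGetMappedPointer(&devPtr, &size, res);
+ *   // ... fill the vertex buffer from a CUDA kernel ...
+ *   cudaGraphicsUnmapResources(1, &res, 0);
+ *   cudaGraphicsUnregisterResource(res);
+ */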
+
+/** @} */ /* END CUDART_D3D9 */
+
+/**
+ * \addtogroup CUDART_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED]
+ * This section describes deprecated Direct3D 9 interoperability functions.
+ *
+ * @{
+ */
+
+/**
+ * CUDA D3D9 Register Flags
+ */
+enum cudaD3D9RegisterFlags
+{
+  cudaD3D9RegisterFlagsNone  = 0,  /**< Default; Resource can be accessed through a void* */
+  cudaD3D9RegisterFlagsArray = 1   /**< Resource can be accessed through a CUarray* */
+};
+
+/**
+ * CUDA D3D9 Map Flags
+ */
+enum cudaD3D9MapFlags
+{
+  cudaD3D9MapFlagsNone         = 0,  /**< Default; Assume resource can be read/written */
+  cudaD3D9MapFlagsReadOnly     = 1,  /**< CUDA kernels will not write to this resource */
+  cudaD3D9MapFlagsWriteDiscard = 2   /**< CUDA kernels will only write to and will not read from this resource */
+};
+
+/**
+ * \brief Registers a Direct3D resource for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Registers the Direct3D resource \p pResource for access by CUDA.
+ *
+ * If this call is successful, then the application will be able to map and
+ * unmap this resource until it is unregistered through
+ * ::cudaD3D9UnregisterResource(). Also on success, this call will increase
+ * the internal reference count on \p pResource. This reference count will be
+ * decremented when this resource is unregistered through
+ * ::cudaD3D9UnregisterResource().
+ *
+ * This call potentially has high overhead and should not be called every frame
+ * in interactive applications.
+ *
+ * The type of \p pResource must be one of the following.
+ *
+ * - ::IDirect3DVertexBuffer9: No notes.
+ * - ::IDirect3DIndexBuffer9: No notes.
+ * - ::IDirect3DSurface9: Only stand-alone objects of type ::IDirect3DSurface9
+ * may be explicitly shared. In particular, individual mipmap levels and faces
+ * of cube maps may not be registered directly. To access individual surfaces
+ * associated with a texture, one must register the base texture object.
+ * - ::IDirect3DBaseTexture9: When a texture is registered, all surfaces
+ * associated with all mipmap levels of all faces of the texture will be
+ * accessible to CUDA.
+ *
+ * The \p flags argument specifies the mechanism through which CUDA will
+ * access the Direct3D resource. The following value is allowed:
+ *
+ * - ::cudaD3D9RegisterFlagsNone: Specifies that CUDA will access this
+ * resource through a \p void*. The pointer, size, and pitch for each
+ * subresource of this resource may be queried through
+ * ::cudaD3D9ResourceGetMappedPointer(), ::cudaD3D9ResourceGetMappedSize(),
+ * and ::cudaD3D9ResourceGetMappedPitch() respectively. This option is valid
+ * for all resource types.
+ *
+ * Not all Direct3D resources of the above types may be used for
+ * interoperability with CUDA. The following are some limitations:
+ *
+ * - The primary rendertarget may not be registered with CUDA.
+ * - Resources allocated as shared may not be registered with CUDA.
+ * - Any resources allocated in ::D3DPOOL_SYSTEMMEM or ::D3DPOOL_MANAGED may
+ *   not be registered with CUDA.
+ * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16,
+ *   or 32-bit integer or floating-point data cannot be shared.
+ * - Surfaces of depth or stencil formats cannot be shared.
+ *
+ * If Direct3D interoperability is not initialized on this context, then
+ * ::cudaErrorInvalidDevice is returned. If \p pResource is of incorrect type
+ * (e.g, is a non-stand-alone ::IDirect3DSurface9) or is already registered,
+ * then ::cudaErrorInvalidResourceHandle is returned. If \p pResource cannot
+ * be registered then ::cudaErrorUnknown is returned.
+ *
+ * \param pResource - Resource to register
+ * \param flags     - Parameters for resource registration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsD3D9RegisterResource
+ */
+extern __host__ cudaError_t CUDARTAPI cudaD3D9RegisterResource(IDirect3DResource9 *pResource, unsigned int flags);
+
+/**
+ * \brief Unregisters a Direct3D resource for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unregisters the Direct3D resource \p pResource so it is not accessible by
+ * CUDA unless registered again.
+ *
+ * If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is
+ * returned.
+ *
+ * \param pResource - Resource to unregister
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnregisterResource(IDirect3DResource9 *pResource);
+
+/**
+ * \brief Map Direct3D resources for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Maps the \p count Direct3D resources in \p ppResources for access by CUDA.
+ *
+ * The resources in \p ppResources may be accessed in CUDA kernels until they
+ * are unmapped. Direct3D should not access any resources while they are
+ * mapped by CUDA. If an application does so, the results are undefined.
+ *
+ * This function provides the synchronization guarantee that any Direct3D
+ * calls issued before ::cudaD3D9MapResources() will complete before any CUDA
+ * kernels issued after ::cudaD3D9MapResources() begin.
+ *
+ * If any of \p ppResources have not been registered for use with CUDA or if
+ * \p ppResources contains any duplicate entries then
+ * ::cudaErrorInvalidResourceHandle is returned. If any of \p ppResources are
+ * presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
+ *
+ * \param count       - Number of resources to map for CUDA
+ * \param ppResources - Resources to map for CUDA
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsMapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9MapResources(int count, IDirect3DResource9 **ppResources);
+
+/**
+ * \brief Unmap Direct3D resources for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Unmaps the \p count Direct3D resources in \p ppResources.  
+ *
+ * This function provides the synchronization guarantee that any CUDA kernels
+ * issued before ::cudaD3D9UnmapResources() will complete before any Direct3D
+ * calls issued after ::cudaD3D9UnmapResources() begin.
+ *
+ * If any of \p ppResources have not been registered for use with CUDA or if
+ * \p ppResources contains any duplicate entries, then
+ * ::cudaErrorInvalidResourceHandle is returned. If any of \p ppResources are
+ * not presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
+ *
+ * \param count       - Number of resources to unmap for CUDA
+ * \param ppResources - Resources to unmap for CUDA
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnmapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnmapResources(int count, IDirect3DResource9 **ppResources);
+
+/**
+ * \brief Set usage flags for mapping a Direct3D resource
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Set flags for mapping the Direct3D resource \p pResource.
+ *
+ * Changes to flags will take effect the next time \p pResource is mapped.
+ * The \p flags argument may be any of the following:
+ *
+ * - ::cudaD3D9MapFlagsNone: Specifies no hints about how this resource will
+ * be used. It is therefore assumed that this resource will be read from and
+ * written to by CUDA kernels. This is the default value.
+ * - ::cudaD3D9MapFlagsReadOnly: Specifies that CUDA kernels which access this
+ * resource will not write to this resource.
+ * - ::cudaD3D9MapFlagsWriteDiscard: Specifies that CUDA kernels which access
+ * this resource will not read from this resource and will write over the
+ * entire contents of the resource, so none of the data previously stored in
+ * the resource will be preserved.
+ *
+ * If \p pResource has not been registered for use with CUDA, then
+ * ::cudaErrorInvalidResourceHandle is returned. If \p pResource is presently
+ * mapped for access by CUDA, then ::cudaErrorUnknown is returned.
+ *
+ * \param pResource - Registered resource to set flags for
+ * \param flags     - Parameters for resource mapping
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphicsResourceSetMapFlags
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceSetMapFlags(IDirect3DResource9 *pResource, unsigned int flags); 
+
+/**
+ * \brief Get the dimensions of a registered Direct3D surface
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the
+ * subresource of the mapped Direct3D resource \p pResource which corresponds
+ * to \p face and \p level.
+ *
+ * Since anti-aliased surfaces may have multiple samples per pixel, it is
+ * possible that the dimensions of a resource will be an integer factor larger
+ * than the dimensions reported by the Direct3D runtime.
+ *
+ * The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D
+ * surfaces, the value returned in \p *pDepth will be 0.
+ *
+ * If \p pResource is not of type ::IDirect3DBaseTexture9 or
+ * ::IDirect3DSurface9 or if \p pResource has not been registered for use with
+ * CUDA, then ::cudaErrorInvalidResourceHandle is returned.
+ *
+ * For usage requirements of \p face and \p level parameters, see
+ * ::cudaD3D9ResourceGetMappedPointer.
+ *
+ * \param pWidth    - Returned width of surface
+ * \param pHeight   - Returned height of surface
+ * \param pDepth    - Returned depth of surface
+ * \param pResource - Registered resource to access
+ * \param face      - Face of resource to access
+ * \param level     - Level of resource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsSubResourceGetMappedArray
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, IDirect3DResource9 *pResource, unsigned int face, unsigned int level); 
+
+/**
+ * \brief Get an array through which to access a subresource of a Direct3D
+ * resource which has been mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pArray an array through which the subresource of the mapped
+ * Direct3D resource \p pResource, which corresponds to \p face and \p level,
+ * may be accessed. The value set in \p pArray may change every time that
+ * \p pResource is mapped.
+ *
+ * If \p pResource is not registered then ::cudaErrorInvalidResourceHandle is
+ * returned. If \p pResource was not registered with usage flags
+ * ::cudaD3D9RegisterFlagsArray, then ::cudaErrorInvalidResourceHandle is
+ * returned. If \p pResource is not mapped, then ::cudaErrorUnknown is
+ * returned.
+ *
+ * For usage requirements of \p face and \p level parameters, see
+ * ::cudaD3D9ResourceGetMappedPointer().
+ *
+ * \param ppArray   - Returned array corresponding to subresource
+ * \param pResource - Mapped resource to access
+ * \param face      - Face of resource to access
+ * \param level     - Level of resource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsSubResourceGetMappedArray
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedArray(cudaArray **ppArray, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
+
+/**
+ * \brief Get a pointer through which to access a subresource of a Direct3D
+ * resource which has been mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pPointer the base pointer of the subresource of the mapped
+ * Direct3D resource \p pResource, which corresponds to \p face and \p level.
+ * The value set in \p pPointer may change every time that \p pResource is
+ * mapped.
+ *
+ * If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is
+ * returned. If \p pResource was not registered with usage flags
+ * ::cudaD3D9RegisterFlagsNone, then ::cudaErrorInvalidResourceHandle is
+ * returned. If \p pResource is not mapped, then ::cudaErrorUnknown is
+ * returned.
+ *
+ * If \p pResource is of type ::IDirect3DCubeTexture9, then \p face must be one
+ * of the values enumerated by type ::D3DCUBEMAP_FACES. For all other types,
+ * \p face must be 0. If \p face is invalid, then ::cudaErrorInvalidValue is
+ * returned.
+ *
+ * If \p pResource is of type ::IDirect3DBaseTexture9, then \p level must
+ * correspond to a valid mipmap level. Only mipmap level 0 is supported for
+ * now. For all other types \p level must be 0. If \p level is invalid, then
+ * ::cudaErrorInvalidValue is returned.
+ *
+ * \param pPointer  - Returned pointer corresponding to subresource
+ * \param pResource - Mapped resource to access
+ * \param face      - Face of resource to access
+ * \param level     - Level of resource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsResourceGetMappedPointer
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedPointer(void **pPointer, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
+
+/**
+ * \brief Get the size of a subresource of a Direct3D resource which has been
+ * mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pSize the size of the subresource of the mapped Direct3D
+ * resource \p pResource, which corresponds to \p face and \p level. The value
+ * set in \p pSize may change every time that \p pResource is mapped.
+ *
+ * If \p pResource has not been registered for use with CUDA then
+ * ::cudaErrorInvalidResourceHandle is returned. If \p pResource was not
+ * registered with usage flags ::cudaD3D9RegisterFlagsNone, then
+ * ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped
+ * for access by CUDA then ::cudaErrorUnknown is returned.
+ *
+ * For usage requirements of \p face and \p level parameters, see
+ * ::cudaD3D9ResourceGetMappedPointer().
+ *
+ * \param pSize     - Returned size of subresource
+ * \param pResource - Mapped resource to access
+ * \param face      - Face of resource to access
+ * \param level     - Level of resource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsResourceGetMappedPointer
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedSize(size_t *pSize, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
+
+/**
+ * \brief Get the pitch of a subresource of a Direct3D resource which has been
+ * mapped for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0.
+ *
+ * Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of
+ * the subresource of the mapped Direct3D resource \p pResource, which
+ * corresponds to \p face and \p level. The values set in \p pPitch and
+ * \p pPitchSlice may change every time that \p pResource is mapped.
+ *
+ * The pitch and Z-slice pitch values may be used to compute the location of a
+ * sample on a surface as follows.
+ *
+ * For a 2D surface, the byte offset of the sample at position \b x, \b y from
+ * the base pointer of the surface is:
+ *
+ * \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
+ *
+ * For a 3D surface, the byte offset of the sample at position \b x, \b y,
+ * \b z from the base pointer of the surface is:
+ *
+ * \b z * \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x
+ *
+ * Both parameters \p pPitch and \p pPitchSlice are optional and may be set to
+ * NULL.
+ *
+ * If \p pResource is not of type ::IDirect3DBaseTexture9 or one of its
+ * sub-types or if \p pResource has not been registered for use with CUDA,
+ * then ::cudaErrorInvalidResourceHandle is returned. If \p pResource was not
+ * registered with usage flags ::cudaD3D9RegisterFlagsNone, then
+ * ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped
+ * for access by CUDA then ::cudaErrorUnknown is returned.
+ *
+ * For usage requirements of \p face and \p level parameters, see
+ * ::cudaD3D9ResourceGetMappedPointer().
+ *
+ * \param pPitch      - Returned pitch of subresource
+ * \param pPitchSlice - Returned Z-slice pitch of subresource
+ * \param pResource   - Mapped resource to access
+ * \param face        - Face of resource to access
+ * \param level       - Level of resource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsResourceGetMappedPointer
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, IDirect3DResource9 *pResource, unsigned int face, unsigned int level);
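+/*
+ * Illustrative sketch (not part of the original header): combining the pitch
+ * and Z-slice pitch returned above into a byte offset, following the formulas
+ * in the description. It assumes \p pResource is a registered and currently
+ * mapped 3D resource, that x, y and z are in scope, and a 4-byte-per-pixel
+ * format.
+ *
+ * \code
+ * unsigned char *base = NULL;
+ * size_t pitch = 0, slicePitch = 0;
+ *
+ * cudaD3D9ResourceGetMappedPointer((void **)&base, pResource, 0, 0);
+ * cudaD3D9ResourceGetMappedPitch(&pitch, &slicePitch, pResource, 0, 0);
+ *
+ * size_t bytesPerPixel = 4;
+ * unsigned char *sample = base + z * slicePitch + y * pitch + bytesPerPixel * x;
+ * \endcode
+ */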
+
+/* D3D9 1.x interop interface */
+
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9Begin(IDirect3DDevice9 *pDevice);
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9End(void);
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9RegisterVertexBuffer(IDirect3DVertexBuffer9 *pVB);
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnregisterVertexBuffer(IDirect3DVertexBuffer9 *pVB);
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9MapVertexBuffer(void **dptr, IDirect3DVertexBuffer9 *pVB);
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnmapVertexBuffer(IDirect3DVertexBuffer9 *pVB);
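+/*
+ * Illustrative sketch of the legacy vertex-buffer flow declared above; pDevice
+ * and pVB are assumed to be an existing Direct3D 9 device and vertex buffer.
+ * These entry points are deprecated and shown only for completeness.
+ *
+ * \code
+ * void *dptr = NULL;
+ *
+ * cudaD3D9Begin(pDevice);
+ * cudaD3D9RegisterVertexBuffer(pVB);
+ * cudaD3D9MapVertexBuffer(&dptr, pVB);   // dptr is now accessible to CUDA kernels
+ * // ... launch kernels that read/write dptr ...
+ * cudaD3D9UnmapVertexBuffer(pVB);
+ * cudaD3D9UnregisterVertexBuffer(pVB);
+ * cudaD3D9End();
+ * \endcode
+ */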
+
+/** @} */ /* END CUDART_D3D9_DEPRECATED */
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#undef __dv
+#undef __CUDA_DEPRECATED
+
+#endif /* __CUDA_D3D9_INTEROP_H__ */
diff --git a/ext/cudart/include/cuda_device_runtime_api.h b/ext/cudart/include/cuda_device_runtime_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..e3bc812d777c2d2ba612e6b55f5f101c75fb24e3
--- /dev/null
+++ b/ext/cudart/include/cuda_device_runtime_api.h
@@ -0,0 +1,268 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_DEVICE_RUNTIME_API_H__)
+#define __CUDA_DEVICE_RUNTIME_API_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC_RTC__)
+
+#if !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) && !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__)
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct cudaFuncAttributes;
+
+
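+/* The inline definitions below are no-op stubs that simply report
+ * ::cudaErrorUnknown. Per the preprocessor guards above, they appear to be
+ * selected only when device code references these entry points but the CUDA
+ * device runtime (relocatable device code / dynamic parallelism) is not
+ * linked in. */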
+inline __device__  cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s) 
+{ 
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c) 
+{ 
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device)
+{
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaGetDevice(int *device)
+{
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize)
+{
+  return cudaErrorUnknown;
+}
+
+inline __device__  cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags)
+{
+  return cudaErrorUnknown;
+}
+
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) &&  !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */
+
+#endif /* !defined(__CUDACC_RTC__) */
+
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+# define __DEPRECATED__(msg)
+#elif defined(_WIN32)
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if defined(__CUDA_ARCH__) && !defined(__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING)
+# define __CDPRT_DEPRECATED(func_name) __DEPRECATED__("Use of "#func_name" from device code is deprecated. Moreover, such use will cause this module to fail to load on sm_90+ devices. If calls to "#func_name" from device code cannot be removed for older devices at this time, you may guard them with __CUDA_ARCH__ macros to remove them only for sm_90+ devices, making sure to generate code for compute_90 for the macros to take effect. Note that this mitigation will no longer work when support for "#func_name" from device code is eventually dropped for all devices. Disable this warning with -D__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING.")
+#else
+# define __CDPRT_DEPRECATED(func_name)
+#endif
+
+#if defined(__cplusplus) && defined(__CUDACC__)         /* Visible to nvcc front-end only */
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)   // Visible to SM>=3.5 and "__host__ __device__" only
+
+#include "driver_types.h"
+#include "crt/host_defines.h"
+
+extern "C"
+{
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
+#if (__CUDA_ARCH__ < 900)
+// cudaDeviceSynchronize is removed on sm_90+
+extern __device__ __cudart_builtin__ __CDPRT_DEPRECATED(cudaDeviceSynchronize) cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
+#endif
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaDeviceSynchronizeDeprecationAvoidance(void);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
+extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
+extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
+
+/**
+ * \ingroup CUDART_EXECUTION
+ * \brief Obtains a parameter buffer
+ *
+ * Obtains a parameter buffer which can be filled with parameters for a kernel launch.
+ * Parameters passed to ::cudaLaunchDevice must be allocated via this function.
+ *
+ * This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
+ * CUDA user code should use <<< >>> to launch kernels.
+ *
+ * \param alignment - Specifies alignment requirement of the parameter buffer
+ * \param size      - Specifies size requirement in bytes
+ *
+ * \return
+ * Returns a pointer to the allocated parameter buffer
+ * \notefnerr
+ *
+ * \sa cudaLaunchDevice
+ */
+extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size);
+
+/**
+ * \ingroup CUDART_EXECUTION
+ * \brief Launches a specified kernel
+ *
+ * Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained
+ * by calling ::cudaGetParameterBuffer().
+ *
+ * This is a low level API and can only be accessed from Parallel Thread Execution (PTX).
+ * CUDA user code should use <<< >>> to launch the kernels.
+ *
+ * \param func            - Pointer to the kernel to be launched
+ * \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. (Optional)
+ * \param gridDimension   - Specifies grid dimensions
+ * \param blockDimension  - Specifies block dimensions
+ * \param sharedMemSize   - Specifies size of shared memory
+ * \param stream          - Specifies the stream to be used
+ *
+ * \return
+ * ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources
+ * \notefnerr
+ * \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming
+ * Guide for the detailed descriptions of launch configuration and parameter layout respectively.
+ *
+ * \sa cudaGetParameterBuffer
+ */
+extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream);
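+/*
+ * Illustrative sketch: as noted above, ::cudaGetParameterBuffer and
+ * ::cudaLaunchDevice are PTX-level entry points, and CUDA C++ device code
+ * launches child kernels with the <<< >>> syntax instead (this requires
+ * relocatable device code and sm_35 or newer). The kernel names below are
+ * placeholders, not part of this header.
+ *
+ * \code
+ * __global__ void childKernel(int *data) { data[threadIdx.x] += 1; }
+ *
+ * __global__ void parentKernel(int *data)
+ * {
+ *     if (threadIdx.x == 0) {
+ *         childKernel<<<1, 32>>>(data);          // device-side launch
+ *         cudaError_t err = cudaGetLastError();  // check for launch errors
+ *         (void)err;
+ *     }
+ * }
+ * \endcode
+ */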
+
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__)
+    // When compiling for the device and per thread default stream is enabled, add
+    // a static inline redirect to the per thread stream entry points.
+
+    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
+    cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream)
+    {
+        return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream);
+    }
+
+    static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI
+    cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream)
+    {
+        return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream);
+    }
+#else
+    extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream);
+    extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream);
+#endif
+
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
+
+extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronizeGrid(unsigned long long handle, unsigned int flags);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle);
+extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle);
+}
+
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size);
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry);
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize);
+template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags);
+
+
+#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)
+#endif /* defined(__cplusplus) && defined(__CUDACC__) */
+
+#undef __DEPRECATED__
+#undef __CDPRT_DEPRECATED
+
+#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */
diff --git a/ext/cudart/include/cuda_egl_interop.h b/ext/cudart/include/cuda_egl_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..40ab01b33e0e9bec536192676c2a804809276fc4
--- /dev/null
+++ b/ext/cudart/include/cuda_egl_interop.h
@@ -0,0 +1,642 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_EGL_INTEROP_H__)
+#define __CUDA_EGL_INTEROP_H__
+
+#include "cuda_runtime_api.h"
+#include "cuda_runtime.h"
+#include "cudart_platform.h"
+#include "EGL/egl.h"
+#include "EGL/eglext.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \addtogroup CUDART_TYPES
+ * @{
+ */
+
+ /**
+ * Maximum number of planes per frame
+ */
+#define CUDA_EGL_MAX_PLANES 3
+
+/**
+ * CUDA EglFrame type - array or pointer
+ */
+typedef enum cudaEglFrameType_enum
+{
+    cudaEglFrameTypeArray = 0,  /**< Frame type CUDA array */
+    cudaEglFrameTypePitch = 1,  /**< Frame type CUDA pointer */
+} cudaEglFrameType;
+
+/**
+ * Resource location flags- sysmem or vidmem
+ *
+ * For a CUDA context on an iGPU, video and system memory are equivalent, so
+ * these flags have no effect on execution.
+ *
+ * For a CUDA context on a dGPU, applications can use these
+ * ::cudaEglResourceLocationFlags to hint at the desired location:
+ *
+ * ::cudaEglResourceLocationSysmem - the frame data is made resident in system
+ * memory for access by CUDA.
+ *
+ * ::cudaEglResourceLocationVidmem - the frame data is made resident in dedicated
+ * video memory for access by CUDA.
+ *
+ * There may be additional latency due to new allocation and data migration if
+ * the frame is produced in a different memory.
+ */
+typedef enum cudaEglResourceLocationFlags_enum {
+    cudaEglResourceLocationSysmem   = 0x00,       /**< Resource location sysmem */
+    cudaEglResourceLocationVidmem   = 0x01,       /**< Resource location vidmem */
+} cudaEglResourceLocationFlags;
+
+/**
+ * CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
+ */
+typedef enum cudaEglColorFormat_enum {
+    cudaEglColorFormatYUV420Planar            = 0,  /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420SemiPlanar        = 1,  /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
+    cudaEglColorFormatYUV422Planar            = 2,  /**< Y, U, V  each in a separate  surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV422SemiPlanar        = 3,  /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
+    cudaEglColorFormatARGB                    = 6,  /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
+    cudaEglColorFormatRGBA                    = 7,  /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
+    cudaEglColorFormatL                       = 8,  /**< single luminance channel in one surface. */
+    cudaEglColorFormatR                       = 9,  /**< single color channel in one surface. */
+    cudaEglColorFormatYUV444Planar            = 10, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV444SemiPlanar        = 11, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
+    cudaEglColorFormatYUYV422                 = 12, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+    cudaEglColorFormatUYVY422                 = 13, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
+    cudaEglColorFormatABGR                    = 14, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
+    cudaEglColorFormatBGRA                    = 15, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
+    cudaEglColorFormatA                       = 16, /**< Alpha color format - one channel in one surface. */
+    cudaEglColorFormatRG                      = 17, /**< R/G color format - two channels in one surface with GR byte ordering */
+    cudaEglColorFormatAYUV                    = 18, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    cudaEglColorFormatYVU444SemiPlanar        = 19, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422SemiPlanar        = 20, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420SemiPlanar        = 21, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_444SemiPlanar = 22, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar = 23, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY12V12U12_444SemiPlanar = 24, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY12V12U12_420SemiPlanar = 25, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatVYUY_ER                 = 26, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
+    cudaEglColorFormatUYVY_ER                 = 27, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
+    cudaEglColorFormatYUYV_ER                 = 28, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
+    cudaEglColorFormatYVYU_ER                 = 29, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
+    cudaEglColorFormatYUVA_ER                 = 31, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    cudaEglColorFormatAYUV_ER                 = 32, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    cudaEglColorFormatYUV444Planar_ER         = 33, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV422Planar_ER         = 34, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV420Planar_ER         = 35, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV444SemiPlanar_ER     = 36, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV422SemiPlanar_ER     = 37, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV420SemiPlanar_ER     = 38, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU444Planar_ER         = 39, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422Planar_ER         = 40, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420Planar_ER         = 41, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU444SemiPlanar_ER     = 42, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422SemiPlanar_ER     = 43, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420SemiPlanar_ER     = 44, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatBayerRGGB               = 45, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
+    cudaEglColorFormatBayerBGGR               = 46, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
+    cudaEglColorFormatBayerGRBG               = 47, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
+    cudaEglColorFormatBayerGBRG               = 48, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
+    cudaEglColorFormatBayer10RGGB             = 49, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer10BGGR             = 50, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer10GRBG             = 51, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer10GBRG             = 52, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer12RGGB             = 53, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12BGGR             = 54, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12GRBG             = 55, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12GBRG             = 56, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer14RGGB             = 57, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer14BGGR             = 58, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer14GRBG             = 59, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer14GBRG             = 60, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer20RGGB             = 61, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatBayer20BGGR             = 62, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatBayer20GRBG             = 63, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatBayer20GBRG             = 64, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatYVU444Planar            = 65, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422Planar            = 66, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420Planar            = 67, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatBayerIspRGGB            = 68, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerIspBGGR            = 69, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerIspGRBG            = 70, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerIspGBRG            = 71, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerBCCR               = 72, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
+    cudaEglColorFormatBayerRCCB               = 73, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
+    cudaEglColorFormatBayerCRBC               = 74, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
+    cudaEglColorFormatBayerCBRC               = 75, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
+    cudaEglColorFormatBayer10CCCC             = 76, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer12BCCR             = 77, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12RCCB             = 78, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12CRBC             = 79, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12CBRC             = 80, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12CCCC             = 81, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatY                       = 82, /**< Color format for single Y plane. */
+    cudaEglColorFormatYUV420SemiPlanar_2020   = 83, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420SemiPlanar_2020   = 84, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420Planar_2020       = 85, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420Planar_2020       = 86, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420SemiPlanar_709    = 87, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420SemiPlanar_709    = 88, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420Planar_709        = 89, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420Planar_709        = 90, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_709  = 91, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_2020 = 92, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_422SemiPlanar_2020 = 93, /**< Y10, V10U10  in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height =  Y height. */
+    cudaEglColorFormatY10V10U10_422SemiPlanar      = 94, /**< Y10, V10U10  in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height =  Y height. */
+    cudaEglColorFormatY10V10U10_422SemiPlanar_709  = 95, /**< Y10, V10U10  in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height =  Y height. */
+    cudaEglColorFormatY_ER                         = 96, /**< Extended Range Color format for single Y plane. */
+    cudaEglColorFormatY_709_ER                     = 97, /**< Extended Range Color format for single Y plane. */
+    cudaEglColorFormatY10_ER                       = 98, /**< Extended Range Color format for single Y10 plane. */
+    cudaEglColorFormatY10_709_ER                   = 99, /**< Extended Range Color format for single Y10 plane. */
+    cudaEglColorFormatY12_ER                       = 100, /**< Extended Range Color format for single Y12 plane. */
+    cudaEglColorFormatY12_709_ER                   = 101, /**< Extended Range Color format for single Y12 plane. */
+    cudaEglColorFormatYUVA                         = 102, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    cudaEglColorFormatYVYU                         = 104, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
+    cudaEglColorFormatVYUY                         = 105, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_ER     = 106, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_709_ER = 107, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_444SemiPlanar_ER     = 108, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */ 
+    cudaEglColorFormatY10V10U10_444SemiPlanar_709_ER = 109, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY12V12U12_420SemiPlanar_ER     = 110, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY12V12U12_420SemiPlanar_709_ER = 111, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY12V12U12_444SemiPlanar_ER     = 112, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY12V12U12_444SemiPlanar_709_ER = 113, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+} cudaEglColorFormat;
+
+/**
+ * CUDA EGL Plane Descriptor - structure defining each plane of a CUDA EGLFrame
+ */
+typedef struct cudaEglPlaneDesc_st {
+    unsigned int width;                         /**< Width of plane */
+    unsigned int height;                        /**< Height of plane */
+    unsigned int depth;                         /**< Depth of plane */
+    unsigned int pitch;                         /**< Pitch of plane */
+    unsigned int numChannels;                   /**< Number of channels for the plane */
+    struct cudaChannelFormatDesc channelDesc;   /**< Channel Format Descriptor */
+    unsigned int reserved[4];                   /**< Reserved for future use */
+} cudaEglPlaneDesc;
+
+/**
+ * CUDA EGLFrame Descriptor - structure defining one frame of EGL.
+ *
+ * Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
+ * Each plane of EGLFrame is represented by ::cudaEglPlaneDesc which is defined as:
+ * \code
+ * typedef struct cudaEglPlaneDesc_st {
+ *     unsigned int width;
+ *     unsigned int height;
+ *     unsigned int depth;
+ *     unsigned int pitch;
+ *     unsigned int numChannels;
+ *     struct cudaChannelFormatDesc channelDesc;
+ *     unsigned int reserved[4];
+ * } cudaEglPlaneDesc;
+ * \endcode
+ */
+typedef struct cudaEglFrame_st {
+   union {
+       cudaArray_t            pArray[CUDA_EGL_MAX_PLANES];     /**< Array of CUDA arrays corresponding to each plane*/
+       struct cudaPitchedPtr  pPitch[CUDA_EGL_MAX_PLANES];     /**< Array of Pointers corresponding to each plane*/
+   } frame;
+   cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];     /**< CUDA EGL Plane Descriptor ::cudaEglPlaneDesc*/
+   unsigned int planeCount;                             /**< Number of planes */
+   cudaEglFrameType frameType;                          /**< Array or Pitch */
+   cudaEglColorFormat eglColorFormat;                   /**< CUDA EGL Color Format*/
+} cudaEglFrame;
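+
+/*
+ * Illustrative sketch of filling a pitch-type ::cudaEglFrame as described
+ * above. It assumes devPtr is a tightly packed device allocation of
+ * width * height bytes holding a single 8-bit plane; devPtr, width and height
+ * are placeholders, not part of this header. Fields not set below are left
+ * zero, which is appropriate for a 2D plane.
+ *
+ * \code
+ * cudaEglFrame frame;
+ * memset(&frame, 0, sizeof(frame));
+ *
+ * frame.frameType      = cudaEglFrameTypePitch;
+ * frame.eglColorFormat = cudaEglColorFormatR;
+ * frame.planeCount     = 1;
+ *
+ * frame.frame.pPitch[0] = make_cudaPitchedPtr(devPtr, width, width, height);
+ *
+ * frame.planeDesc[0].width       = width;
+ * frame.planeDesc[0].height      = height;
+ * frame.planeDesc[0].pitch       = width;
+ * frame.planeDesc[0].numChannels = 1;
+ * frame.planeDesc[0].channelDesc = cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsigned);
+ * \endcode
+ */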
+
+/**
+ * CUDA EGLStream Connection
+ */
+typedef struct  CUeglStreamConnection_st *cudaEglStreamConnection;
+
+/** @} */ /* END CUDART_TYPES */
+
+/**
+ * \addtogroup CUDART_EGL EGL Interoperability
+ * This section describes the EGL interoperability functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Registers an EGL image
+ *
+ * Registers the EGLImageKHR specified by \p image for access by
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
+ * Additional Mapping/Unmapping is not required for the registered resource and
+ * ::cudaGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
+ *
+ * The application is responsible for synchronizing access to shared objects.
+ * It must ensure that any pending operations which access the objects have completed
+ * before passing control to CUDA. This may be accomplished by issuing and waiting for
+ * a glFinish command on all GL contexts (for OpenGL, and likewise for other APIs).
+ * The application is also responsible for ensuring that any pending operations on the
+ * registered CUDA resource have completed prior to executing subsequent commands in other
+ * APIs accessing the same memory objects.
+ * This can be accomplished by calling cuCtxSynchronize or, preferably, cuEventSynchronize.
+ *
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * An EGLImageKHR is an object which can be used to create an EGLImage target resource. It is defined as a void pointer:
+ * typedef void* EGLImageKHR
+ *
+ * \param pCudaResource   - Pointer to the returned object handle
+ * \param image           - An EGLImageKHR image which can be used to create target resource.
+ * \param flags           - Map flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsResourceGetMappedEglFrame,
+ * ::cuGraphicsEGLRegisterImage
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsEGLRegisterImage(struct cudaGraphicsResource **pCudaResource, EGLImageKHR image, unsigned int flags);
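+
+/*
+ * Illustrative sketch (eglImage is assumed to be a valid EGLImageKHR created
+ * elsewhere): register the image, read back its ::cudaEglFrame description via
+ * ::cudaGraphicsResourceGetMappedEglFrame (declared later in this header), and
+ * unregister it when done. No explicit map/unmap step is required.
+ *
+ * \code
+ * struct cudaGraphicsResource *res = NULL;
+ * cudaEglFrame frame;
+ *
+ * cudaGraphicsEGLRegisterImage(&res, eglImage, cudaGraphicsRegisterFlagsNone);
+ * cudaGraphicsResourceGetMappedEglFrame(&frame, res, 0, 0);
+ * // ... access frame.frame.pArray[...] or frame.frame.pPitch[...] from CUDA ...
+ * cudaGraphicsUnregisterResource(res);
+ * \endcode
+ */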
+
+/**
+ * \brief Connect CUDA to EGLStream as a consumer.
+ *
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn              - Pointer to the returned connection handle
+ * \param eglStream         - EGLStreamKHR handle
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerConnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnect(cudaEglStreamConnection *conn, EGLStreamKHR eglStream);
+
+/**
+ * \brief Connect CUDA to EGLStream as a consumer with given flags.
+ *
+ * Connect CUDA as a consumer to the EGLStreamKHR specified by \p eglStream with the specified \p flags defined by
+ * ::cudaEglResourceLocationFlags.
+ *
+ * The flags specify whether the consumer wants to access frames from system memory or video memory.
+ * Default is ::cudaEglResourceLocationVidmem.
+ *
+ * \param conn              - Pointer to the returned connection handle
+ * \param eglStream         - EGLStreamKHR handle
+ * \param flags             - Flags denote intended location - system or video.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerConnectWithFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnectWithFlags(cudaEglStreamConnection *conn, EGLStreamKHR eglStream, unsigned int flags);
+
+/**
+ * \brief Disconnect CUDA as a consumer from an EGLStream.
+ *
+ * Disconnect CUDA as a consumer from an EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerConnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerDisconnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerDisconnect(cudaEglStreamConnection *conn);
+
+/**
+ * \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
+ *
+ * Acquire an image frame from EGLStreamKHR.
+ * ::cudaGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
+ * ::cudaEglFrame.
+ *
+ * \param conn            - Connection on which to acquire
+ * \param pCudaResource   - CUDA resource on which the EGLStream frame will be mapped for use.
+ * \param pStream         - CUDA stream for synchronization and any data migrations
+ * implied by ::cudaEglResourceLocationFlags.
+ * \param timeout         - Desired timeout in usec.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * ::cudaErrorLaunchTimeout
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerConnect,
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerAcquireFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerAcquireFrame(cudaEglStreamConnection *conn,
+        cudaGraphicsResource_t *pCudaResource, cudaStream_t *pStream, unsigned int timeout);
+/**
+ * \brief Releases the last frame acquired from the EGLStream.
+ *
+ * Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
+ *
+ * \param conn            - Connection on which to release
+ * \param pCudaResource   - CUDA resource whose corresponding frame is to be released
+ * \param pStream         - CUDA stream on which release will be done.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerConnect,
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cuEGLStreamConsumerReleaseFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerReleaseFrame(cudaEglStreamConnection *conn,
+                                                  cudaGraphicsResource_t pCudaResource, cudaStream_t *pStream);
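+
+/*
+ * Illustrative sketch of the consumer-side acquire/release cycle documented
+ * above. It assumes eglStream is a valid EGLStreamKHR, stream is a previously
+ * created CUDA stream, and keepRunning is application state; these names are
+ * placeholders, not part of this header.
+ *
+ * \code
+ * cudaEglStreamConnection conn;
+ * cudaGraphicsResource_t  res = NULL;
+ * cudaEglFrame            frame;
+ *
+ * cudaEGLStreamConsumerConnect(&conn, eglStream);
+ * while (keepRunning) {
+ *     if (cudaEGLStreamConsumerAcquireFrame(&conn, &res, &stream, 16000) != cudaSuccess)
+ *         continue;                                   // e.g. timeout, retry
+ *     cudaGraphicsResourceGetMappedEglFrame(&frame, res, 0, 0);
+ *     // ... process the frame on `stream` ...
+ *     cudaEGLStreamConsumerReleaseFrame(&conn, res, &stream);
+ * }
+ * cudaEGLStreamConsumerDisconnect(&conn);
+ * \endcode
+ */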
+
+/**
+ * \brief Connect CUDA to EGLStream as a producer.
+ *
+ * Connect CUDA as a producer to the EGLStreamKHR specified by \p eglStream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn   - Pointer to the returned connection handle
+ * \param eglStream - EGLStreamKHR handle
+ * \param width  - width of the image to be submitted to the stream
+ * \param height - height of the image to be submitted to the stream
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerDisconnect,
+ * ::cudaEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerReturnFrame,
+ * ::cuEGLStreamProducerConnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerConnect(cudaEglStreamConnection *conn,
+                                                EGLStreamKHR eglStream, EGLint width, EGLint height);
+
+/**
+ * \brief Disconnect CUDA as a producer from an EGLStream.
+ *
+ * Disconnect CUDA as a producer from an EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerConnect,
+ * ::cudaEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerReturnFrame,
+ * ::cuEGLStreamProducerDisconnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerDisconnect(cudaEglStreamConnection *conn);
+
+/**
+ * \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
+ *
+ * The ::cudaEglFrame is defined as:
+ * \code
+ * typedef struct cudaEglFrame_st {
+ *     union {
+ *         cudaArray_t            pArray[CUDA_EGL_MAX_PLANES];
+ *         struct cudaPitchedPtr  pPitch[CUDA_EGL_MAX_PLANES];
+ *     } frame;
+ *     cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
+ *     unsigned int planeCount;
+ *     cudaEglFrameType frameType;
+ *     cudaEglColorFormat eglColorFormat;
+ * } cudaEglFrame;
+ * \endcode
+ *
+ * For ::cudaEglFrame of type ::cudaEglFrameTypePitch, the application may present a sub-region of a memory
+ * allocation. In that case, ::cudaPitchedPtr::ptr will specify the start address of the sub-region in
+ * the allocation and ::cudaEglPlaneDesc will specify the dimensions of the sub-region.
+ *
+ * \param conn            - Connection on which to present the CUDA array
+ * \param eglframe        - CUDA EGLStream Producer Frame handle to be sent to the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to present the frame.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerConnect,
+ * ::cudaEGLStreamProducerDisconnect,
+ * ::cudaEGLStreamProducerReturnFrame,
+ * ::cuEGLStreamProducerPresentFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection *conn,
+                                                 cudaEglFrame eglframe, cudaStream_t *pStream);
+
+/**
+ * \brief Return the CUDA eglFrame to the EGLStream last released by the consumer.
+ *
+ * This API can potentially return ::cudaErrorLaunchTimeout if the consumer has not
+ * returned a frame to the EGL stream. If a timeout is returned, the application can retry.
+ *
+ * \param conn            - Connection on which to present the CUDA array
+ * \param eglframe        - CUDA EGLStream Producer Frame handle returned from the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to return the frame.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerConnect,
+ * ::cudaEGLStreamProducerDisconnect,
+ * ::cudaEGLStreamProducerPresentFrame,
+ * ::cuEGLStreamProducerReturnFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection *conn,
+                                                cudaEglFrame *eglframe, cudaStream_t *pStream);
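+
+/*
+ * Usage sketch: a minimal producer round trip using the declarations above,
+ * assuming an already-created EGLStreamKHR handle eglStream, image dimensions
+ * width and height, and a populated cudaEglFrame named frame; error handling
+ * is omitted for brevity.
+ *
+ * \code
+ * cudaEglStreamConnection conn;
+ * cudaStream_t stream = 0;
+ * cudaEGLStreamProducerConnect(&conn, eglStream, width, height);
+ * cudaEGLStreamProducerPresentFrame(&conn, frame, &stream);
+ * // Once the consumer has released the frame, reclaim it for reuse;
+ * // ReturnFrame may time out until the consumer is done, so retry.
+ * cudaEglFrame returned;
+ * while (cudaEGLStreamProducerReturnFrame(&conn, &returned, &stream) == cudaErrorLaunchTimeout) {
+ *     // consumer has not returned a frame yet
+ * }
+ * cudaEGLStreamProducerDisconnect(&conn);
+ * \endcode
+ */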
+
+/**
+ * \brief Get an eglFrame through which to access a registered EGL graphics resource.
+ *
+ * Returns in \p *eglFrame an eglFrame through which the registered graphics resource
+ * \p resource may be accessed.
+ * This API can only be called for EGL graphics resources.
+ *
+ * The ::cudaEglFrame is defined as
+ * \code
+ * typedef struct cudaEglFrame_st {
+ *     union {
+ *         cudaArray_t             pArray[CUDA_EGL_MAX_PLANES];
+ *         struct cudaPitchedPtr   pPitch[CUDA_EGL_MAX_PLANES];
+ *     } frame;
+ *     cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
+ *     unsigned int planeCount;
+ *     cudaEglFrameType frameType;
+ *     cudaEglColorFormat eglColorFormat;
+ * } cudaEglFrame;
+ * \endcode
+ *
+ *
+ * \param eglFrame   - Returned eglFrame.
+ * \param resource   - Registered resource to access.
+ * \param index      - Index for cubemap surfaces.
+ * \param mipLevel   - Mipmap level for the subresource to access.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \note For a multiplanar \p *eglFrame, only the pitch of the first plane (unsigned int cudaEglPlaneDesc::pitch) should be considered by the application.
+ *
+ * \sa
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsResourceGetMappedEglFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame,
+                                        cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel);
+
+/**
+ * \brief Creates an event from EGLSync object
+ *
+ * Creates an event \p *phEvent from an EGLSyncKHR \p eglSync with the flags specified
+ * via \p flags. Valid flags include:
+ * - ::cudaEventDefault: Default event creation flag.
+ * - ::cudaEventBlockingSync: Specifies that the created event should use blocking
+ * synchronization.  A CPU thread that uses ::cudaEventSynchronize() to wait on
+ * an event created with this flag will block until the event has actually
+ * been completed.
+ *
+ * ::cudaEventRecord and timing data are not supported for events created from an EGLSync object.
+ *
+ * EGLSyncKHR is an opaque handle to an EGL sync object, defined as
+ * typedef void* EGLSyncKHR.
+ *
+ * \param phEvent - Returns newly created event
+ * \param eglSync - Opaque handle to EGLSync object
+ * \param flags   - Event creation flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ *
+ * \sa
+ * ::cudaEventQuery,
+ * ::cudaEventSynchronize,
+ * ::cudaEventDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventCreateFromEGLSync(cudaEvent_t *phEvent, EGLSyncKHR eglSync, unsigned int flags);
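+
+/*
+ * Usage sketch: waiting on an EGL sync object from CUDA, assuming an
+ * EGLSyncKHR handle named sync obtained from EGL (for example via
+ * eglCreateSyncKHR); error handling is omitted for brevity.
+ *
+ * \code
+ * cudaEvent_t event;
+ * cudaEventCreateFromEGLSync(&event, sync, cudaEventDefault);
+ * cudaEventSynchronize(event);  // block the calling thread until the EGL sync signals
+ * cudaEventDestroy(event);
+ * \endcode
+ */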
+
+/** @} */ /* END CUDART_EGL */
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif /* __CUDA_EGL_INTEROP_H__ */
+
diff --git a/ext/cudart/include/cuda_fp16.h b/ext/cudart/include/cuda_fp16.h
new file mode 100644
index 0000000000000000000000000000000000000000..8dc1c29c78d23517ab9f9c5ebfb9da3a531b0240
--- /dev/null
+++ b/ext/cudart/include/cuda_fp16.h
@@ -0,0 +1,3794 @@
+/*
+* Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee.  Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users.  These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+/**
+* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics
+* This section describes half precision intrinsic functions that are
+* only supported in device code.
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion and Data Movement
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+/**
+* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions
+* \ingroup CUDA_MATH_INTRINSIC_HALF
+* To use these functions, include the header file \p cuda_fp16.h in your program.
+*/
+
+#ifndef __CUDA_FP16_H__
+#define __CUDA_FP16_H__
+
+#define ___CUDA_FP16_STRINGIFY_INNERMOST(x) #x
+#define __CUDA_FP16_STRINGIFY(x) ___CUDA_FP16_STRINGIFY_INNERMOST(x)
+
+#if defined(__cplusplus)
+#if defined(__CUDACC__)
+#define __CUDA_FP16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
+#else
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static
+#endif /* defined(__CUDACC__) */
+
+#define __CUDA_FP16_TYPES_EXIST__
+
+/* Forward-declaration of structures defined in "cuda_fp16.hpp" */
+
+/**
+ * \brief half datatype 
+ * 
+ * \details This structure implements the datatype for storing 
+ * half-precision floating-point numbers. The structure implements 
+ * assignment operators and type conversions. 
+ * 16 bits are used in total: 1 sign bit, 5 exponent bits, and 10
+ * significand bits, giving an effective precision of 11 bits.
+ * There are 15361 representable numbers within the interval
+ * [0.0, 1.0], endpoints included.
+ * On average this corresponds to log10(2**11) ~ 3.311 decimal digits.
+ * 
+ * \internal
+ * \req IEEE 754-2008 compliant implementation of half-precision 
+ * floating-point numbers. 
+ * \endinternal
+ */
+struct __half;
+
+/**
+ * \brief half2 datatype
+ * 
+ * \details This structure implements the datatype for storing two 
+ * half-precision floating-point numbers. 
+ * The structure implements assignment operators and type conversions. 
+ * 
+ * \internal
+ * \req Vectorized version of half.
+ * \endinternal
+ */
+struct __half2;
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts double number to half precision in round-to-nearest-even mode
+* and returns \p half with converted value.
+*
+* \details Converts double number \p a to half precision in round-to-nearest-even mode.
+* \param[in] a - double. Is only being read.
+* \returns half
+* - \p a converted to half.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-to-nearest-even mode
+* and returns \p half with converted value. 
+* 
+* \details Converts float number \p a to half precision in round-to-nearest-even mode. 
+* \param[in] a - float. Is only being read. 
+* \returns half
+* - \p a converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-to-nearest-even mode
+* and returns \p half with converted value.
+*
+* \details Converts float number \p a to half precision in round-to-nearest-even mode.
+* \param[in] a - float. Is only being read. 
+* \returns half
+* - \p a converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-towards-zero mode
+* and returns \p half with converted value.
+* 
+* \details Converts float number \p a to half precision in round-towards-zero mode.
+* \param[in] a - float. Is only being read. 
+* \returns half
+* - \p a converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-down mode
+* and returns \p half with converted value.
+* 
+* \details Converts float number \p a to half precision in round-down mode.
+* \param[in] a - float. Is only being read. 
+* 
+* \returns half
+* - \p a converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts float number to half precision in round-up mode
+* and returns \p half with converted value.
+* 
+* \details Converts float number \p a to half precision in round-up mode.
+* \param[in] a - float. Is only being read. 
+* 
+* \returns half
+* - \p a converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts \p half number to float.
+* 
+* \details Converts half number \p a to float.
+* \param[in] a - half. Is only being read.
+* 
+* \returns float
+* - \p a converted to float. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a);
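+
+/*
+ * Usage sketch: a round trip through half precision using the host/device
+ * conversions declared above, assuming cuda_fp16.h is included in the
+ * translation unit.
+ *
+ * \code
+ * float in = 1.0f / 3.0f;
+ * __half h = __float2half(in);   // round-to-nearest-even
+ * float out = __half2float(h);   // 0.333251953125f, the nearest half value
+ * \endcode
+ */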
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts input to half precision in round-to-nearest-even mode and
+* populates both halves of \p half2 with converted value.
+*
+* \details Converts input \p a to half precision in round-to-nearest-even mode and
+* populates both halves of \p half2 with converted value.
+* \param[in] a - float. Is only being read. 
+*
+* \returns half2
+* - The \p half2 value with both halves equal to the converted half
+* precision number.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts both input floats to half precision in round-to-nearest-even
+* mode and returns \p half2 with converted values.
+*
+* \details Converts both input floats to half precision in round-to-nearest-even mode
+* and combines the results into one \p half2 number. Low 16 bits of the return
+* value correspond to the input \p a, high 16 bits correspond to the input \p
+* b.
+* \param[in] a - float. Is only being read. 
+* \param[in] b - float. Is only being read. 
+* 
+* \returns half2
+* - The \p half2 value with corresponding halves equal to the
+* converted input floats.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts low 16 bits of \p half2 to float and returns the result
+* 
+* \details Converts low 16 bits of \p half2 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns float
+* - The low 16 bits of \p a converted to float.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts high 16 bits of \p half2 to float and returns the result
+* 
+* \details Converts high 16 bits of \p half2 input \p a to 32-bit floating-point number
+* and returns the result.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns float
+* - The high 16 bits of \p a converted to float.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a);
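+
+/*
+ * Usage sketch: packing two floats into a __half2 and unpacking them again
+ * with the conversion helpers declared above.
+ *
+ * \code
+ * __half2 packed = __floats2half2_rn(1.5f, -2.0f);  // low half <- 1.5, high half <- -2.0
+ * float lo = __low2float(packed);                   // 1.5f
+ * float hi = __high2float(packed);                  // -2.0f
+ * \endcode
+ */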
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed short integer in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed short
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns short int
+* - \p h converted to a signed short integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-towards-zero
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed integer in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed integer in
+* round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns int
+* - \p h converted to a signed integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned integer in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned integer
+* in round-towards-zero mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode.
+*
+* \details Convert the half-precision floating-point value \p h to a signed 64-bit
+* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - half. Is only being read.
+*
+* \returns long long int
+* - \p h converted to a signed 64-bit integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero
+* mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
+* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - half. Is only being read.
+*
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h);
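+
+/*
+ * Usage sketch: the _rz conversions declared above truncate toward zero and
+ * are callable from both host and device code; note the documented NaN
+ * behavior.
+ *
+ * \code
+ * __half h = __float2half(-3.75f);
+ * int i = __half2int_rz(h);                             // -3 (truncated toward zero)
+ * unsigned int u = __half2uint_rz(__float2half(7.9f));  // 7
+ * \endcode
+ */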
+
+#if defined(__CUDACC__)
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts both components of float2 number to half precision in
+* round-to-nearest-even mode and returns \p half2 with converted values.
+* 
+* \details Converts both components of float2 to half precision in round-to-nearest-even
+* mode and combines the results into one \p half2 number. Low 16 bits of the
+* return value correspond to \p a.x and high 16 bits of the return value
+* correspond to \p a.y.
+* \param[in] a - float2. Is only being read. 
+*  
+* \returns half2
+* - The \p half2 which has corresponding halves equal to the
+* converted float2 components.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Converts both halves of \p half2 to float2 and returns the result.
+* 
+* \details Converts both halves of \p half2 input \p a to float2 and returns the
+* result.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns float2
+* - \p a converted to float2.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed integer in round-to-nearest-even mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed integer in
+* round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __half2int_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed integer in round-down mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed integer in
+* round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __half2int_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed integer in round-up mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed integer in
+* round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns int
+* - \p h converted to a signed integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __half2int_ru(const __half h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed integer to a half in round-to-nearest-even mode.
+* 
+* \details Convert the signed integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed integer to a half in round-towards-zero mode.
+* 
+* \details Convert the signed integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __int2half_rz(const int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed integer to a half in round-down mode.
+* 
+* \details Convert the signed integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __int2half_rd(const int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed integer to a half in round-up mode.
+* 
+* \details Convert the signed integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __int2half_ru(const int i);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed short integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ short int __half2short_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed short integer in round-down mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ short int __half2short_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed short integer in round-up mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns short int
+* - \p h converted to a signed short integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ short int __half2short_ru(const __half h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed short integer to a half in round-to-nearest-even
+* mode.
+* 
+* \details Convert the signed short integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed short integer to a half in round-towards-zero mode.
+* 
+* \details Convert the signed short integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __short2half_rz(const short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed short integer to a half in round-down mode.
+* 
+* \details Convert the signed short integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __short2half_rd(const short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed short integer to a half in round-up mode.
+* 
+* \details Convert the signed short integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __short2half_ru(const short int i);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned integer in round-to-nearest-even mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned integer
+* in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned int
+* - \p h converted to an unsigned integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned integer in round-down mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned integer
+* in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned integer in round-up mode.
+*
+* \details Convert the half-precision floating-point value \p h to an unsigned integer
+* in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+*
+* \returns unsigned int
+* - \p h converted to an unsigned integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-to-nearest-even mode.
+* 
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-towards-zero mode.
+* 
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half.  
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-down mode.
+* 
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned integer to a half in round-up mode.
+* 
+* \details Convert the unsigned integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - unsigned int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-to-nearest-even mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-down mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-down mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer. 
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned short integer in round-up mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned short
+* integer in round-up mode. NaN inputs are converted to 0.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned short int
+* - \p h converted to an unsigned short integer. 
+*/
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-to-nearest-even
+* mode.
+* 
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-towards-zero
+* mode.
+* 
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-down mode.
+* 
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned short integer to a half in round-up mode.
+* 
+* \details Convert the unsigned short integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
+* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-down mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
+* integer in round-down mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to an unsigned 64-bit integer in round-up mode.
+* 
+* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit
+* integer in round-up mode. NaN inputs return 0x8000000000000000.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned long long int
+* - \p h converted to an unsigned 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even
+* mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero
+* mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned 64-bit integer to a half in round-down mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half.  
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert an unsigned 64-bit integer to a half in round-up mode.
+* 
+* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - unsigned long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even
+* mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed 64-bit
+* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-down mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed 64-bit
+* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a half to a signed 64-bit integer in round-up mode.
+* 
+* \details Convert the half-precision floating-point value \p h to a signed 64-bit
+* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns long long int
+* - \p h converted to a signed 64-bit integer. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even
+* mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-to-nearest-even mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-towards-zero mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+*/
+__CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-down mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-down mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Convert a signed 64-bit integer to a half in round-up mode.
+* 
+* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point
+* value in round-up mode.
+* \param[in] i - long long int. Is only being read. 
+* 
+* \returns half
+* - \p i converted to half. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i);
+
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Truncate input argument to the integral part.
+* 
+* \details Round \p h to the nearest integer value that does not exceed \p h in
+* magnitude.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns half
+* - The truncated integer value. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half htrunc(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculate ceiling of the input argument.
+* 
+* \details Compute the smallest integer value not less than \p h.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns half
+* - The smallest integer value not less than \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hceil(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+* 
+* \details Calculate the largest integer value which is less than or equal to \p h.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns half
+* - The largest integer value which is less than or equal to \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hfloor(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Round input to nearest integer value in half-precision floating-point
+* number.
+* 
+* \details Round \p h to the nearest integer value in half-precision floating-point
+* format, with halfway cases rounded to the nearest even integer value.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns half
+* - The nearest integer to \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hrint(const __half h);
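+
+/*
+ * Usage sketch: applying the device-only rounding functions above inside a
+ * kernel, assuming hypothetical device arrays in and out of length n and a
+ * device where half-precision math functions are supported.
+ *
+ * \code
+ * __global__ void round_half(const __half *in, __half *out, int n)
+ * {
+ *     int i = blockIdx.x * blockDim.x + threadIdx.x;
+ *     if (i < n) {
+ *         out[i] = hrint(in[i]);  // nearest integer, ties to even; see also htrunc/hceil/hfloor
+ *     }
+ * }
+ * \endcode
+ */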
+
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Truncate \p half2 vector input argument to the integral part.
+* 
+* \details Round each component of vector \p h to the nearest integer value that does
+* not exceed \p h in magnitude.
+* \param[in] h - half2. Is only being read. 
+* 
+* \returns half2
+* - The truncated \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculate \p half2 vector ceiling of the input argument.
+* 
+* \details For each component of vector \p h compute the smallest integer value not less
+* than \p h.
+* \param[in] h - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector of smallest integers not less than \p h. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculate the largest integer less than or equal to \p h.
+* 
+* \details For each component of vector \p h calculate the largest integer value which
+* is less than or equal to \p h.
+* \param[in] h - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector of largest integers which are less than or equal to \p h.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Round input to nearest integer value in half-precision floating-point
+* number.
+* 
+* \details Round each component of \p half2 vector \p h to the nearest integer value in
+* half-precision floating-point format, with halfway cases rounded to the
+* nearest even integer value.
+* \param[in] h - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector of rounded integer values. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns \p half2 with both halves equal to the input value.
+* 
+* \details Returns a \p half2 number with both halves equal to the input
+* \p half number \p a.
+* \param[in] a - half. Is only being read. 
+* 
+* \returns half2
+* - The vector which has both its halves equal to the input \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __half2half2(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Swaps both halves of the \p half2 input.
+* 
+* \details Swaps both halves of the \p half2 input and returns a new \p half2 number
+* with swapped halves.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns half2
+* - \p a with its halves being swapped. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines
+* into one \p half2 number. 
+* 
+* \details Extracts low 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. Low 16 bits from input \p a are stored in low 16 bits of
+* the return value, low 16 bits from input \p b are stored in high 16 bits of
+* the return value.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns half2
+* - The low 16 bits of \p a and of \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from each of the two \p half2 inputs and
+* combines into one \p half2 number.
+* 
+* \details Extracts high 16 bits from each of the two \p half2 inputs and combines into
+* one \p half2 number. High 16 bits from input \p a are stored in low 16 bits of
+* the return value, high 16 bits from input \p b are stored in high 16 bits of
+* the return value.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns half2
+* - The high 16 bits of \p a and of \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns high 16 bits of \p half2 input.
+*
+* \details Returns high 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half
+* - The high 16 bits of the input. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __high2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Returns low 16 bits of \p half2 input.
+*
+* \details Returns low 16 bits of \p half2 input \p a.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half
+* - Returns \p half which contains low 16 bits of the input \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __low2half(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Checks if the input \p half number is infinite.
+* 
+* \details Checks if the input \p half number \p a is infinite. 
+* \param[in] a - half. Is only being read. 
+* 
+* \returns int 
+* - -1 iff \p a is equal to negative infinity, 
+* - 1 iff \p a is equal to positive infinity, 
+* - 0 otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ int __hisinf(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Combines two \p half numbers into one \p half2 number.
+* 
+* \details Combines two input \p half numbers \p a and \p b into one \p half2 number.
+* Input \p a is stored in low 16 bits of the return value, input \p b is stored
+* in high 16 bits of the return value.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+* 
+* \returns half2
+* - The half2 with one half equal to \p a and the other to \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts low 16 bits from \p half2 input.
+* 
+* \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns half2
+* - The half2 with both halves equal to the low 16 bits of the input. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Extracts high 16 bits from \p half2 input.
+* 
+* \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2
+* number which has both halves equal to the extracted bits.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns half2
+* - The half2 with both halves equal to the high 16 bits of the input. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a);
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a \p half as a signed short integer.
+* 
+* \details Reinterprets the bits in the half-precision floating-point number \p h
+* as a signed short integer. 
+* \param[in] h - half. Is only being read. 
+* 
+* \returns short int
+* - The reinterpreted value. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ short int __half_as_short(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a \p half as an unsigned short integer.
+* 
+* \details Reinterprets the bits in the half-precision floating-point \p h
+* as an unsigned short number.
+* \param[in] h - half. Is only being read. 
+* 
+* \returns unsigned short int
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in a signed short integer as a \p half.
+* 
+* \details Reinterprets the bits in the signed short integer \p i as a
+* half-precision floating-point number.
+* \param[in] i - short int. Is only being read. 
+* 
+* \returns half
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __short_as_half(const short int i);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Reinterprets bits in an unsigned short integer as a \p half.
+* 
+* \details Reinterprets the bits in the unsigned short integer \p i as a
+* half-precision floating-point number.
+* \param[in] i - unsigned short int. Is only being read. 
+* 
+* \returns half
+* - The reinterpreted value.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i);
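+
+/*
+* Illustrative usage sketch (not part of the original NVIDIA documentation):
+* flips the sign of a half by toggling bit 15 of its binary16 encoding via the
+* reinterpret helpers above. The function name flip_sign is hypothetical.
+*
+*   __device__ __half flip_sign(const __half h)
+*   {
+*       const unsigned short bits = __half_as_ushort(h);
+*       return __ushort_as_half((unsigned short)(bits ^ 0x8000u));
+*   }
+*/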
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half maximum of two input values.
+*
+* \details Calculates \p half max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b.
+* - If either input is NaN, the other input is returned.
+* - If both inputs are NaN, the canonical NaN is returned.
+* - If both inputs are zeros of opposite sign, +0.0 is considered greater than -0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The maximum of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half minimum of two input values.
+*
+* \details Calculates \p half min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either input is NaN, the other input is returned.
+* - If both inputs are NaN, the canonical NaN is returned.
+* - If both inputs are zeros of opposite sign, +0.0 is considered greater than -0.0.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The minimum of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector maximum of two inputs.
+*
+* \details Calculates \p half2 vector max(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either input is NaN, the other input is returned.
+* - If both inputs are NaN, the canonical NaN is returned.
+* - If both inputs are zeros of opposite sign, +0.0 is considered greater than -0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise maximum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector minimum of two inputs.
+*
+* \details Calculates \p half2 vector min(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either input is NaN, the other input is returned.
+* - If both inputs are NaN, the canonical NaN is returned.
+* - If both inputs are zeros of opposite sign, +0.0 is considered greater than -0.0.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise minimum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b);
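+
+/*
+* Illustrative usage sketch (not part of the original NVIDIA documentation):
+* clamps each half of a half2 to the range [lo, hi] by combining __hmax2 and
+* __hmin2. The function name clamp2 is hypothetical and lo <= hi is assumed
+* per lane.
+*
+*   __device__ __half2 clamp2(const __half2 x, const __half2 lo, const __half2 hi)
+*   {
+*       // max(x, lo) raises values below lo; min(.., hi) lowers values above hi.
+*       return __hmin2(__hmax2(x, lo), hi);
+*   }
+*/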
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize    32
+#define __local_warpSize
+#endif
+
+#if defined(_WIN32)
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+#define __WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(const __half2 var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(const __half2 var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(const __half var, const int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(const __half var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(const __half var, const unsigned int delta, const int width = warpSize);
+__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(const __half var, const int delta, const int width = warpSize);
+#endif
+
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. 
+* 
+* \details Returns the value of var held by the thread whose ID is given by delta. 
+* If width is less than warpSize then each subsection of the warp behaves as a separate 
+* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], 
+* the value returned corresponds to the value of var held by the thread with ID delta modulo width (i.e.
+* within the same subsection). width must have a value which is a power of 2; 
+* results are undefined if width is not a power of 2, or is a number greater than 
+* warpSize. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - half2. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. 
+* The value of var held by the resulting lane ID is returned: in effect, var is shifted up 
+* the warp by delta threads. If width is less than warpSize then each subsection of the warp 
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index 
+* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. 
+* width must have a value which is a power of 2; results are undefined if width is not a power of 2, 
+* or is a number greater than warpSize. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - half2. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by adding delta to the caller's thread ID. 
+* The value of var held by the resulting thread ID is returned: this has the effect 
+* of shifting var down the warp by delta threads. If width is less than warpSize then 
+* each subsection of the warp behaves as a separate entity with a starting logical 
+* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread 
+* will not wrap around the value of width and so the upper delta threads 
+* will remain unchanged. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - half2. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* 
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: 
+* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each
+* group of width consecutive threads is able to access elements from earlier groups of threads;
+* however, if they attempt to access elements from later groups of threads, their own value of var
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree 
+* reduction and broadcast. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - half2. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. 
+* 
+* \details Returns the value of var held by the thread whose ID is given by delta. 
+* If width is less than warpSize then each subsection of the warp behaves as a separate 
+* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], 
+* the value returned corresponds to the value of var held by the thread with ID delta modulo width (i.e.
+* within the same subsection). width must have a value which is a power of 2; 
+* results are undefined if width is not a power of 2, or is a number greater than 
+* warpSize. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - half. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by var from the source thread ID as half. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. 
+* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. 
+* The value of var held by the resulting lane ID is returned: in effect, var is shifted up 
+* the warp by delta threads. If width is less than warpSize then each subsection of the warp 
+* behaves as a separate entity with a starting logical thread ID of 0. The source thread index 
+* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. 
+* width must have a value which is a power of 2; results are undefined if width is not a power of 2, 
+* or is a number greater than warpSize. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - half. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by var from the source thread ID as half. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. 
+* 
+* \details Calculates a source thread ID by adding delta to the caller's thread ID. 
+* The value of var held by the resulting thread ID is returned: this has the effect 
+* of shifting var down the warp by delta threads. If width is less than warpSize then 
+* each subsection of the warp behaves as a separate entity with a starting logical 
+* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread 
+* will not wrap around the value of width and so the upper delta threads 
+* will remain unchanged. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - half. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by var from the source thread ID as half. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. 
+* 
+* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: 
+* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each
+* group of width consecutive threads is able to access elements from earlier groups of threads;
+* however, if they attempt to access elements from later groups of threads, their own value of var
+* will be returned. This mode implements a butterfly addressing pattern such as is used in tree 
+* reduction and broadcast. 
+* \param[in] mask - unsigned int. Is only being read. 
+* \param[in] var - half. Is only being read. 
+* \param[in] delta - int. Is only being read. 
+* \param[in] width - int. Is only being read. 
+* 
+* \returns Returns the 2-byte word referenced by var from the source thread ID as half. 
+* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior not reentrant, not thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var, const int delta, const int width = warpSize);
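+
+/*
+* Illustrative usage sketch (not part of the original NVIDIA documentation):
+* a warp-wide sum of a half2 value built from __shfl_down_sync and __hadd2,
+* assuming all 32 lanes participate (mask 0xffffffff) and a device of compute
+* capability 5.3 or higher for __hadd2. The helper name warp_sum2 is
+* hypothetical; after the loop, lane 0 holds the total.
+*
+*   __device__ __half2 warp_sum2(__half2 v)
+*   {
+*       for (int offset = 16; offset > 0; offset /= 2) {
+*           v = __hadd2(v, __shfl_down_sync(0xffffffffU, v, offset));
+*       }
+*       return v;
+*   }
+*/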
+
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) */
+
+#if defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldg(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.nc` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldcg(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cg` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldca(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.ca` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldcs(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cs` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldlu(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.lu` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half2 __ldcv(const  __half2 *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `ld.global.cv` load instruction.
+* \param[in] ptr - memory location
+* \returns The value pointed to by `ptr`
+*/
+__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wb` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cg` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.cs` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value);
+/**
+* \ingroup CUDA_MATH__HALF_MISC
+* \brief Generates a `st.global.wt` store instruction.
+* \param[out] ptr - memory location
+* \param[in] value - the value to be stored
+*/
+__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value);
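+
+/*
+* Illustrative usage sketch (not part of the original NVIDIA documentation):
+* a streaming copy that pairs the evict-first load __ldcs with the matching
+* store __stcs, hinting that the data is touched only once. The kernel name
+* stream_copy2 is hypothetical.
+*
+*   __global__ void stream_copy2(const __half2* __restrict__ src, __half2* __restrict__ dst, int n)
+*   {
+*       int i = blockIdx.x * blockDim.x + threadIdx.x;
+*       if (i < n) {
+*           __stcs(dst + i, __ldcs(src + i));  // ld.global.cs + st.global.cs
+*       }
+*   }
+*/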
+#endif /*defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )*/
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs half2 vector if-equal comparison.
+* 
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector result of if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector not-equal comparison.
+* 
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector result of not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The \p half2 result of less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-equal comparison.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The vector result of greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-than comparison.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The half2 vector result of less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-than comparison.
+* 
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector result of greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b);
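+
+/*
+* Illustrative usage sketch (not part of the original NVIDIA documentation):
+* because the vector comparisons above return 1.0 or 0.0 per half, they can be
+* used as multiplicative masks. The hypothetical relu2 below zeroes the
+* non-positive lanes with a __hgt2 mask; NaN lanes still propagate NaN, since
+* NaN * 0.0 is NaN.
+*
+*   __device__ __half2 relu2(const __half2 x)
+*   {
+*       const __half2 zero = __float2half2_rn(0.0f);
+*       return __hmul2(x, __hgt2(x, zero));  // mask is 1.0 where x > 0, else 0.0
+*   }
+*/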
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered if-equal comparison.
+* 
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns half2
+* - The vector result of unordered if-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The vector result of unordered not-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The vector result of unordered less-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-equal comparison.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-than comparison.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The vector result of unordered less-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-than comparison.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Determine whether \p half2 argument is a NaN.
+*
+* \details Determine whether each half of input \p half2 number \p a is a NaN.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The half2 with the corresponding \p half results set to
+* 1.0 for NaN, 0.0 otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The sum of vectors \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half2 input vector \p b from input vector \p a in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The subtraction of vector \p b from \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The result of elementwise multiplying the vectors \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector addition in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest
+* mode. Prevents floating-point contractions of mul+add into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-95
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The sum of vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half2 input vector \p b from input vector \p a in
+* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub
+* into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-104
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The subtraction of vector \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode. Prevents floating-point contractions of
+* mul+add or sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-102
+* \endinternal
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise multiplying the vectors \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector division in round-to-nearest-even mode.
+*
+* \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-103
+* \endinternal
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise division of \p a with \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Calculates the absolute value of both halves of the input \p half2 number and
+* returns the result.
+*
+* \details Calculates the absolute value of both halves of the input \p half2 number and
+* returns the result.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - Returns \p a with the absolute value of both halves. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __habs2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest
+* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The sum of \p a and \p b, with respect to saturation. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Subtracts \p half2 input vector \p b from input vector \p a in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The subtraction of vector \p b from \p a, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in
+* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN
+* results are flushed to +0.0.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns half2
+* - The result of elementwise multiplication of vectors \p a and \p b, 
+* with respect to saturation. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
+* mode.
+*
+* \details Performs \p half2 vector multiply on inputs \p a and \p b,
+* then performs a \p half2 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-105
+* \endinternal
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* \param[in] c - half2. Is only being read. 
+*
+* \returns half2
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c);
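+
+/*
+* Illustrative usage sketch (not part of the original NVIDIA documentation):
+* a packed AXPY (y = a * x + y) that updates two half elements per thread with
+* a single __hfma2. The kernel name haxpy2 is hypothetical; n2 counts half2
+* elements.
+*
+*   __global__ void haxpy2(const __half a, const __half2* x, __half2* y, int n2)
+*   {
+*       int i = blockIdx.x * blockDim.x + threadIdx.x;
+*       if (i < n2) {
+*           const __half2 a2 = __half2half2(a);  // broadcast the scalar into both halves
+*           y[i] = __hfma2(a2, x[i], y[i]);      // one fused multiply-add per half
+*       }
+*   }
+*/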
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
+* mode, with saturation to [0.0, 1.0].
+*
+* \details Performs \p half2 vector multiply on inputs \p a and \p b,
+* then performs a \p half2 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the
+* results to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* \param[in] c - half2. Is only being read. 
+*
+* \returns half2
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, 
+* with respect to saturation. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Negates both halves of the input \p half2 number and returns the
+* result.
+*
+* \details Negates both halves of the input \p half2 number \p a and returns the result.
+* \internal
+* \req DEEPLEARN-SRM_REQ-101
+* \endinternal
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - Returns \p a with both halves negated. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Calculates the absolute value of input \p half number and returns the result.
+*
+* \details Calculates the absolute value of input \p half number and returns the result.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The absolute value of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __habs(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half addition in round-to-nearest-even mode.
+*
+* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-94
+* \endinternal
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns half
+* - The sum of \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half input \p b from input \p a in round-to-nearest
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-97
+* \endinternal
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns half
+* - The result of subtracting \p b from \p a. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-99
+* \endinternal
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns half
+* - The result of multiplying \p a and \p b. 
+*/
+__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half addition in round-to-nearest-even mode.
+*
+* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even
+* mode. Prevents floating-point contractions of mul+add into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-94
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The sum of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half subtraction in round-to-nearest-even mode.
+*
+* \details Subtracts \p half input \p b from input \p a in round-to-nearest
+* mode. Prevents floating-point contractions of mul+sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-97
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The result of subtracting \p b from \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half multiplication in round-to-nearest-even mode.
+*
+* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest
+* mode. Prevents floating-point contractions of mul+add or sub into fma.
+* \internal
+* \req DEEPLEARN-SRM_REQ-99
+* \endinternal
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* - The result of multiplying \p a and \p b.
+*/
+__CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half division in round-to-nearest-even mode.
+* 
+* \details Divides \p half input \p a by input \p b in round-to-nearest
+* mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-98
+* \endinternal
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+* 
+* \returns half
+* - The result of dividing \p a by \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__  __half __hdiv(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half addition in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode,
+* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns half
+* - The sum of \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half subtraction in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Subtracts \p half input \p b from input \p a in round-to-nearest
+* mode,
+* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns half
+* - The result of subtraction of \p b from \p a, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half multiplication in round-to-nearest-even mode, with
+* saturation to [0.0, 1.0].
+*
+* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest
+* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to
+* +0.0.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns half
+* - The result of multiplying \p a and \p b, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode.
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* \internal
+* \req DEEPLEARN-SRM_REQ-96
+* \endinternal
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+* \param[in] c - half. Is only being read. 
+*
+* \returns half
+* - The result of the fused multiply-add operation on \p a,
+* \p b, and \p c.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c);
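+
+/*
+* Illustrative usage sketch (not part of the original NVIDIA documentation):
+* linear interpolation between two half values written as a single fused
+* multiply-add, a + t * (b - a), so the multiply and add round only once.
+* The function name lerp_h is hypothetical.
+*
+*   __device__ __half lerp_h(const __half a, const __half b, const __half t)
+*   {
+*       return __hfma(t, __hsub(b, a), a);
+*   }
+*/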
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode,
+* with saturation to [0.0, 1.0].
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode, and clamps the result
+* to range [0.0, 1.0]. NaN results are flushed to +0.0.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+* \param[in] c - half. Is only being read. 
+*
+* \returns half
+* - The result of the fused multiply-add operation on \p a,
+* \p b, and \p c, with respect to saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Negates input \p half number and returns the result.
+*
+* \details Negates input \p half number and returns the result.
+* \internal
+* \req DEEPLEARN-SRM_REQ-100
+* \endinternal
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The negated value of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hneg(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector if-equal comparison and returns boolean true
+* iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of if-equal comparison
+* of vectors \p a and \p b are true;
+* - false otherwise.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector not-equal comparison and returns boolean
+* true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of not-equal comparison
+* of vectors \p a and \p b are true, 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-equal comparison and returns boolean
+* true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of less-equal comparison
+* of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-equal comparison and returns boolean
+* true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of greater-equal
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector less-than comparison and returns boolean
+* true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of less-than comparison
+* of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector greater-than comparison and returns boolean
+* true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate false results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+* 
+* \returns bool 
+* - true if both \p half results of greater-than
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered if-equal comparison and returns
+* boolean true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half if-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of unordered if-equal
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered not-equal comparison and returns
+* boolean true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half not-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of unordered not-equal
+* comparison of vectors \p a and \p b are true;
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-equal comparison and returns
+* boolean true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of unordered less-equal
+* comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-equal comparison and
+* returns boolean true iff both \p half results are true, boolean false
+* otherwise.
+*
+* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-equal comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of unordered
+* greater-equal comparison of vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered less-than comparison and returns
+* boolean true iff both \p half results are true, boolean false otherwise.
+*
+* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half less-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of unordered less-than comparison of 
+* vectors \p a and \p b are true; 
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Performs \p half2 vector unordered greater-than comparison and
+* returns boolean true iff both \p half results are true, boolean false
+* otherwise.
+*
+* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b.
+* The bool result is set to true only if both \p half greater-than comparisons
+* evaluate to true, or false otherwise.
+* NaN inputs generate true results.
+* \param[in] a - half2. Is only being read. 
+* \param[in] b - half2. Is only being read. 
+*
+* \returns bool
+* - true if both \p half results of unordered
+* greater-than comparison of vectors \p a and \p b are true;
+* - false otherwise. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b);
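+/*
+* Usage sketch: contrasting the ordered and unordered whole-vector
+* greater-than tests above. The kernel and buffer names are hypothetical;
+* these intrinsics need a device of compute capability 5.3 or higher.
+*
+*   __global__ void compare_pairs(const __half2 *a, const __half2 *b,
+*                                 bool *gt, bool *gtu, int n)
+*   {
+*       const int i = blockIdx.x * blockDim.x + threadIdx.x;
+*       if (i < n) {
+*           gt[i]  = __hbgt2(a[i], b[i]);   // ordered: a NaN in either lane makes that lane false
+*           gtu[i] = __hbgtu2(a[i], b[i]);  // unordered: a NaN in a lane makes that lane true
+*       }
+*   }
+*/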
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half if-equal comparison.
+*
+* \details Performs \p half if-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of if-equal comparison of \p a and \p b. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half not-equal comparison.
+*
+* \details Performs \p half not-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of not-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half less-equal comparison.
+*
+* \details Performs \p half less-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of less-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half greater-equal comparison.
+*
+* \details Performs \p half greater-equal comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of greater-equal comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half less-than comparison.
+*
+* \details Performs \p half less-than comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of less-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half greater-than comparison.
+*
+* \details Performs \p half greater-than comparison of inputs \p a and \p b.
+* NaN inputs generate false results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of greater-than comparison of \p a and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered if-equal comparison.
+*
+* \details Performs \p half if-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered if-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered not-equal comparison.
+*
+* \details Performs \p half not-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered not-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered less-equal comparison.
+*
+* \details Performs \p half less-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered less-equal comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered greater-equal comparison.
+*
+* \details Performs \p half greater-equal comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered greater-equal comparison of \p a
+* and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered less-than comparison.
+*
+* \details Performs \p half less-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered less-than comparison of \p a and
+* \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Performs \p half unordered greater-than comparison.
+*
+* \details Performs \p half greater-than comparison of inputs \p a and \p b.
+* NaN inputs generate true results.
+* \param[in] a - half. Is only being read. 
+* \param[in] b - half. Is only being read. 
+*
+* \returns bool
+* - The boolean result of unordered greater-than comparison of \p a
+* and \p b.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Determine whether \p half argument is a NaN.
+*
+* \details Determine whether \p half value \p a is a NaN.
+* \param[in] a - half. Is only being read. 
+*
+* \returns bool
+* - true iff argument is NaN. 
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ bool __hisnan(const __half a);
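+/*
+* Usage sketch: the scalar comparisons above come in ordered and unordered
+* flavors that differ only for NaN operands. A minimal device-side helper
+* (hypothetical name, compute capability 5.3+):
+*
+*   __device__ bool below_threshold(const __half x, const __half t)
+*   {
+*       // __hlt(x, t)  is false whenever x or t is NaN (ordered).
+*       // __hltu(x, t) is true  whenever x or t is NaN (unordered).
+*       // Treat NaN samples as "below" here, assuming t itself is never NaN:
+*       return __hisnan(x) || __hlt(x, t);   // equivalent to __hltu(x, t) for non-NaN t
+*   }
+*/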
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half maximum of two input values, NaNs pass through.
+*
+* \details Calculates \p half max(\p a, \p b)
+* defined as (\p a > \p b) ? \p a : \p b.
+* - If either of inputs is NaN, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_COMPARISON
+* \brief Calculates \p half minimum of two input values, NaNs pass through.
+*
+* \details Calculates \p half min(\p a, \p b)
+* defined as (\p a < \p b) ? \p a : \p b.
+* - If either of inputs is NaN, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+*
+* \returns half
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b);
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Performs \p half fused multiply-add in round-to-nearest-even mode with relu saturation.
+*
+* \details Performs \p half multiply on inputs \p a and \p b,
+* then performs a \p half add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* A negative result is then clamped to 0.
+* A NaN result is converted to the canonical NaN.
+* \param[in] a - half. Is only being read.
+* \param[in] b - half. Is only being read.
+* \param[in] c - half. Is only being read.
+*
+* \returns half
+* - The result of fused multiply-add operation on
+* \p a, \p b, and \p c with relu saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c);
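+/*
+* Usage sketch: a single fused multiply-add with relu clamping, e.g. one
+* accumulation step of a small dense layer. Names are hypothetical;
+* __hfma_relu needs compute capability 8.0+.
+*
+*   __device__ __half neuron_step(const __half w, const __half x, const __half bias)
+*   {
+*       // max(w * x + bias, 0) with one rounding; a NaN result becomes canonical NaN
+*       return __hfma_relu(w, x, bias);
+*   }
+*/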
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector maximum of two inputs, NaNs pass through.
+*
+* \details Calculates \p half2 vector max(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a > \p b) ? \p a : \p b.
+* - If either of inputs is NaN, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise maximum of vectors \p a and \p b, with NaNs passing through.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_COMPARISON
+* \brief Calculates \p half2 vector minimum of two inputs, NaNs pass through.
+*
+* \details Calculates \p half2 vector min(\p a, \p b).
+* Elementwise \p half operation is defined as
+* (\p a < \p b) ? \p a : \p b.
+* - If either of inputs is NaN, then canonical NaN is returned.
+* - If values of both inputs are 0.0, then +0.0 > -0.0
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise minimum of vectors \p a and \p b, with NaNs passing through.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b);
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even
+* mode with relu saturation.
+*
+* \details Performs \p half2 vector multiply on inputs \p a and \p b,
+* then performs a \p half2 vector add of the result with \p c,
+* rounding the result once in round-to-nearest-even mode.
+* A negative result is then clamped to 0.
+* A NaN result is converted to the canonical NaN.
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+* \param[in] c - half2. Is only being read.
+*
+* \returns half2
+* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c);
+#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) */
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Performs fast complex multiply-accumulate
+*
+* \details Interprets vector \p half2 input pairs \p a, \p b, and \p c as
+* complex numbers in \p half precision and performs
+* complex multiply-accumulate operation: a*b + c
+* \param[in] a - half2. Is only being read.
+* \param[in] b - half2. Is only being read.
+* \param[in] c - half2. Is only being read.
+*
+* \returns half2
+* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c);
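+/*
+* Usage sketch: __hcmadd treats each __half2 as one half-precision complex
+* number packed into its two lanes, so a complex dot product reduces to a
+* chain of multiply-accumulates (hypothetical helper, compute capability 5.3+):
+*
+*   __device__ __half2 complex_dot(const __half2 *x, const __half2 *y, int n)
+*   {
+*       __half2 acc = __floats2half2_rn(0.0f, 0.0f);
+*       for (int i = 0; i < n; ++i) {
+*           acc = __hcmadd(x[i], y[i], acc);   // acc = x[i] * y[i] + acc, complex arithmetic
+*       }
+*       return acc;
+*   }
+*/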
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half square root in round-to-nearest-even mode.
+*
+* \details Calculates \p half square root of input \p a in round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The square root of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hsqrt(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half reciprocal square root in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half reciprocal square root of input \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The reciprocal square root of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hrsqrt(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p half reciprocal of input \p a in round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The reciprocal of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hrcp(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half natural logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p half natural logarithm of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The natural logarithm of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hlog(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half binary logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p half binary logarithm of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The binary logarithm of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hlog2(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode.
+*
+* \details Calculates \p half decimal logarithm of input \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The decimal logarithm of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hlog10(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half natural exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half natural exponential function of input \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The natural exponential function on \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hexp(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half binary exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half binary exponential function of input \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The binary exponential function on \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hexp2(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half decimal exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half decimal exponential function of input \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The decimal exponential function on \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hexp10(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half cosine in round-to-nearest-even mode.
+*
+* \details Calculates \p half cosine of input \p a in round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The cosine of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hcos(const __half a);
+/**
+* \ingroup CUDA_MATH__HALF_FUNCTIONS
+* \brief Calculates \p half sine in round-to-nearest-even mode.
+*
+* \details Calculates \p half sine of input \p a in round-to-nearest-even mode.
+* \param[in] a - half. Is only being read. 
+*
+* \returns half
+* - The sine of \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half hsin(const __half a);
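+/*
+* Usage sketch: the scalar math functions above compose like their float
+* counterparts, with each call rounding to nearest-even independently. A
+* hypothetical device helper evaluating exp(-x*x) (compute capability 5.3+):
+*
+*   __device__ __half gaussian(const __half x)
+*   {
+*       return hexp(__hneg(__hmul(x, x)));
+*   }
+*/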
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector square root in round-to-nearest-even mode.
+*
+* \details Calculates \p half2 square root of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise square root on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector reciprocal square root in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half2 reciprocal square root of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise reciprocal square root on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode.
+*
+* \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise reciprocal on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 natural logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise natural logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2log(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 binary logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise binary logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even
+* mode.
+*
+* \details Calculates \p half2 decimal logarithm of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise decimal logarithm on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half2 exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise exponential function on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector binary exponential function in
+* round-to-nearest-even mode.
+*
+* \details Calculates \p half2 binary exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+*
+* \returns half2
+* - The elementwise binary exponential function on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector decimal exponential function in
+* round-to-nearest-even mode.
+* 
+* \details Calculates \p half2 decimal exponential function of input vector \p a in
+* round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns half2
+* - The elementwise decimal exponential function on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode.
+* 
+* \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even
+* mode.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns half2
+* - The elementwise cosine on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a);
+/**
+* \ingroup CUDA_MATH__HALF2_FUNCTIONS
+* \brief Calculates \p half2 vector sine in round-to-nearest-even mode.
+* 
+* \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode.
+* \param[in] a - half2. Is only being read. 
+* 
+* \returns half2
+* - The elementwise sine on vector \p a.
+* \internal
+* \exception-guarantee no-throw guarantee
+* \behavior reentrant, thread safe
+* \endinternal
+*/
+__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a);
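+/*
+* Usage sketch: the h2* functions apply to both halves of a __half2 at once,
+* so elementwise transcendentals can be evaluated two lanes at a time. A
+* hypothetical logistic-sigmoid helper (compute capability 5.3+):
+*
+*   __device__ __half2 sigmoid2(const __half2 x)
+*   {
+*       const __half2 one = __float2half2_rn(1.0f);
+*       // 1 / (1 + exp(-x)), computed independently for each half lane
+*       return h2rcp(__hadd2(one, h2exp(__hneg2(x))));
+*   }
+*/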
+
+#endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)*/
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
+
+/**
+* \ingroup CUDA_MATH__HALF2_ARITHMETIC
+* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this
+* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the
+* two __half elements; the entire __half2 is not guaranteed to be atomic as a single 32-bit access.
+* 
+* \details The location of \p address must be in global or shared memory. This operation has undefined
+* behavior otherwise. This operation is only supported by devices of compute capability 6.x and higher.
+* 
+* \param[in] address - half2*. An address in global or shared memory.
+* \param[in] val - half2. The value to be added.
+* 
+* \returns half2
+* - The old value read from \p address.
+*
+* \note_ref_guide_atomic
+*/
+__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val);
+
+#endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)*/
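+/*
+* Usage sketch: accumulating packed half-precision partial sums with the
+* __half2 overload of atomicAdd above. Each lane is updated atomically, but
+* the 32-bit pair is not a single atomic transaction. Hypothetical names;
+* compute capability 6.0+.
+*
+*   __global__ void bin_accumulate(__half2 *sums, const __half2 *vals,
+*                                  const int *bin, int n)
+*   {
+*       const int i = blockIdx.x * blockDim.x + threadIdx.x;
+*       if (i < n) {
+*           (void)atomicAdd(&sums[bin[i]], vals[i]);   // old value returned, ignored here
+*       }
+*   }
+*/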
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+
+/**
+* \ingroup CUDA_MATH__HALF_ARITHMETIC
+* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value
+* back to \p address. This operation is performed in one atomic operation.
+* 
+* \details The location of \p address must be in global or shared memory. This operation has undefined
+* behavior otherwise. This operation is only supported by devices of compute capability 7.x and higher.
+* 
+* \param[in] address - half*. An address in global or shared memory.
+* \param[in] val - half. The value to be added.
+* 
+* \returns half
+* - The old value read from \p address.
+* 
+* \note_ref_guide_atomic
+*/
+__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val);
+
+#endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)*/
+
+#endif /* defined(__CUDACC__) */
+
+#undef __CUDA_FP16_DECL__
+#undef __CUDA_HOSTDEVICE_FP16_DECL__
+
+#endif /* defined(__cplusplus) */
+
+/* Note the .hpp file is included even for host-side compilation, to capture the "half" & "half2" definitions */
+#include "cuda_fp16.hpp"
+#undef ___CUDA_FP16_STRINGIFY_INNERMOST
+#undef __CUDA_FP16_STRINGIFY
+
+#endif /* end of include guard: __CUDA_FP16_H__ */
diff --git a/ext/cudart/include/cuda_fp16.hpp b/ext/cudart/include/cuda_fp16.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..e15f46b35a90fc7940d152e02bc2cec0fe87f857
--- /dev/null
+++ b/ext/cudart/include/cuda_fp16.hpp
@@ -0,0 +1,2614 @@
+/*
+* Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee.  Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users.  These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+#if !defined(__CUDA_FP16_HPP__)
+#define __CUDA_FP16_HPP__
+
+#if !defined(__CUDA_FP16_H__)
+#error "Do not include this file directly. Instead, include cuda_fp16.h."
+#endif
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_FP16
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_FP16
+#endif
+
+/* C++11 header for std::move. 
+ * In RTC mode, std::move is provided implicitly; don't include the header
+ */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__)
+#include <utility>
+#endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */
+
+/* C++ header for std::memcpy (used for type punning in host-side implementations).
+ * When compiling as a CUDA source file memcpy is provided implicitly.
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+
+/* Set up function decorations */
+#if defined(__CUDACC__)
+#define __CUDA_FP16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__
+#endif /* defined(__CUDACC__) */
+
+/* Set up structure-alignment attribute */
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n) alignas(n)    /* C++11 kindly gives us a keyword for this */
+#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP16)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+#endif /* defined(__CUDACC__) */
+
+/* Macros to allow half & half2 to be used by inline assembly */
+#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
+#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
+
+/* Macros for half & half2 binary arithmetic */
+#define __BINARY_OP_HALF_MACRO(name) /* do */ {\
+   __half val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \
+        :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \
+   return val; \
+} /* while(0) */
+#define __BINARY_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
+   return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF_MACRO(name) /* do */ {\
+   __half val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \
+        :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \
+   return val; \
+} /* while(0) */
+#define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \
+   return val; \
+} /* while(0) */
+
+/**
+* Types which allow static initialization of "half" and "half2" until
+* these become an actual builtin. Note this initialization is as a
+* bitfield representation of "half", and not a conversion from short->half.
+* Such a representation will be deprecated in a future version of CUDA. 
+* (Note these are visible to non-nvcc compilers, including C-only compilation)
+*/
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __half_raw;
+
+typedef struct __CUDA_ALIGN__(4) {
+    unsigned short x;
+    unsigned short y;
+} __half2_raw;
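+/*
+* Usage sketch: the raw types carry the IEEE 754 binary16 bit pattern
+* directly, which is what makes static initialization possible. For example,
+* 0x3C00 encodes 1.0 and 0xC000 encodes -2.0 (hypothetical constants):
+*
+*   static const __half_raw  kOneRaw       = { 0x3C00U };
+*   static const __half2_raw kOneMinusTwo  = { 0x3C00U, 0xC000U };
+*/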
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/* Hide GCC member initialization list warnings because of host/device in-function init requirement */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Weffc++"
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+/* class' : multiple assignment operators specified
+   The class has multiple assignment operators of a single type. This warning is informational */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( push )
+#pragma warning( disable:4522 )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    unsigned short __x;
+
+public:
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+    __half() = default;
+#else
+    __CUDA_HOSTDEVICE__ __half() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+
+    /* Convert to/from __half_raw */
+    __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { }
+    __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr) { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; }
+    __CUDA_HOSTDEVICE__ operator __half_raw() const volatile { __half_raw ret; ret.x = __x; return ret; }
+
+#if !defined(__CUDA_NO_HALF_CONVERSIONS__)
+
+    /* Construct from float/double */
+    __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x;  }
+    __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x;  }
+
+    __CUDA_HOSTDEVICE__ operator float() const { return __half2float(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const float f) { __x = __float2half(f).__x; return *this; }
+
+    /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
+    __CUDA_HOSTDEVICE__ __half &operator=(const double f) { __x = __double2half(f).__x; return *this; }
+
+/* Member functions only available to nvcc compilation so far */
+#if defined(__CUDACC__)
+    /* Allow automatic construction from types supported natively in hardware */
+    /* Note we do avoid constructor init-list because of special host/device compilation rules */
+    __CUDA_HOSTDEVICE__ __half(const short val) { __x = __short2half_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __half(const unsigned short val) { __x = __ushort2half_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __half(const int val) { __x = __int2half_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __half(const unsigned int val) { __x = __uint2half_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __half(const long long val) { __x = __ll2half_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __half(const unsigned long long val) { __x = __ull2half_rn(val).__x; }
+
+    /* Allow automatic casts to supported builtin types, matching all that are permitted with float */
+    __CUDA_HOSTDEVICE__ operator short() const { return __half2short_rz(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const short val) { __x = __short2half_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator unsigned short() const { return __half2ushort_rz(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator int() const { return __half2int_rz(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const int val) { __x = __int2half_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator unsigned int() const { return __half2uint_rz(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator long long() const { return __half2ll_rz(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const long long val) { __x = __ll2half_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __half2ull_rz(*this); }
+    __CUDA_HOSTDEVICE__ __half &operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; }
+
+    /* Boolean conversion - note both 0 and -0 must return false */
+    __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFFU) != 0U; }
+#endif /* defined(__CUDACC__) */
+#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */
+};
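+/*
+* Usage sketch: under nvcc (and with conversions not disabled via
+* __CUDA_NO_HALF_CONVERSIONS__), __half converts implicitly to and from float,
+* and its bool conversion is false only for +0.0 and -0.0. Hypothetical helper:
+*
+*   __device__ float scaled_or_default(const __half h, const float scale)
+*   {
+*       return h ? (static_cast<float>(h) * scale) : 1.0f;
+*   }
+*/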
+
+/* Global-space operator functions are only available to nvcc compilation */
+#if defined(__CUDACC__)
+
+/* Arithmetic FP16 operations only supported on arch >= 5.3 */
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+#if !defined(__CUDA_NO_HALF_OPERATORS__)
+/* Some basic arithmetic operations expected of a builtin */
+__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); }
+__device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); }
+__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); }
+__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); }
+
+__device__ __forceinline__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; }
+__device__ __forceinline__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; }
+__device__ __forceinline__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; }
+__device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; }
+
+/* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */
+__device__ __forceinline__ __half &operator++(__half &h)      { __half_raw one; one.x = 0x3C00U; h += one; return h; }
+__device__ __forceinline__ __half &operator--(__half &h)      { __half_raw one; one.x = 0x3C00U; h -= one; return h; }
+__device__ __forceinline__ __half  operator++(__half &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __half ret = h;
+    __half_raw one;
+    one.x = 0x3C00U;
+    h += one;
+    return ret;
+}
+__device__ __forceinline__ __half  operator--(__half &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __half ret = h;
+    __half_raw one;
+    one.x = 0x3C00U;
+    h -= one;
+    return ret;
+}
+
+/* Unary plus and inverse operators */
+__device__ __forceinline__ __half operator+(const __half &h) { return h; }
+__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); }
+
+/* Some basic comparison operations to make it look like a builtin */
+__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); }
+__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); }
+__device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); }
+__device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); }
+__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); }
+__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); }
+#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */
+#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */
+#endif /* defined(__CUDACC__) */
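+/*
+* Usage sketch: with the operators above, device code on compute capability
+* 5.3+ can use __half much like a builtin floating type (hypothetical kernel;
+* assumes __CUDA_NO_HALF_OPERATORS__ is not defined):
+*
+*   __global__ void saxpy_fp16(int n, const __half a, const __half *x, __half *y)
+*   {
+*       const int i = blockIdx.x * blockDim.x + threadIdx.x;
+*       if (i < n) {
+*           y[i] += a * x[i];   // operator* and operator+= map to __hmul and __hadd
+*       }
+*   }
+*/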
+
+/* __half2 is visible to non-nvcc host compilers */
+struct __CUDA_ALIGN__(4) __half2 {
+    __half x;
+    __half y;
+
+    // All construct/copy/assign/move
+public:
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+    __half2() = default;
+    __CUDA_HOSTDEVICE__ __half2(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); }
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); return *this; }
+#else
+    __CUDA_HOSTDEVICE__ __half2() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+    __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { }
+    __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); }
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); return *this; }
+
+    /* Convert to/from __half2_raw */
+    __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); }
+    __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); return *this; }
+    __CUDA_HOSTDEVICE__ operator __half2_raw() const { __half2_raw ret; ret.x = 0U; ret.y = 0U; __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); return ret; }
+};
+
+/* Global-space operator functions are only available to nvcc compilation */
+#if defined(__CUDACC__)
+
+/* Arithmetic FP16x2 operations only supported on arch >= 5.3 */
+#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) && !defined(__CUDA_NO_HALF2_OPERATORS__)
+
+__device__ __forceinline__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); }
+__device__ __forceinline__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); }
+__device__ __forceinline__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); }
+__device__ __forceinline__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); }
+
+__device__ __forceinline__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; }
+__device__ __forceinline__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; }
+__device__ __forceinline__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; }
+__device__ __forceinline__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; }
+
+__device__ __forceinline__ __half2 &operator++(__half2 &h)      { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; }
+__device__ __forceinline__ __half2 &operator--(__half2 &h)      { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; }
+__device__ __forceinline__ __half2  operator++(__half2 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __half2 ret = h;
+    __half2_raw one;
+    one.x = 0x3C00U;
+    one.y = 0x3C00U;
+    h = __hadd2(h, one);
+    return ret;
+}
+__device__ __forceinline__ __half2  operator--(__half2 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __half2 ret = h;
+    __half2_raw one;
+    one.x = 0x3C00U;
+    one.y = 0x3C00U;
+    h = __hsub2(h, one);
+    return ret;
+}
+
+__device__ __forceinline__ __half2 operator+(const __half2 &h) { return h; }
+__device__ __forceinline__ __half2 operator-(const __half2 &h) { return __hneg2(h); }
+
+__device__ __forceinline__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); }
+__device__ __forceinline__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); }
+__device__ __forceinline__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); }
+__device__ __forceinline__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); }
+__device__ __forceinline__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); }
+__device__ __forceinline__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); }
+
+#endif /* (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) && !defined(__CUDA_NO_HALF2_OPERATORS__) */
+#endif /* defined(__CUDACC__) */
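+/*
+* Usage sketch: the __half2 arithmetic operators work lane-wise, while the
+* comparison operators reduce to one bool that is true only when the relation
+* holds in both lanes (they forward to __hbeq2, __hbge2, and friends).
+* Hypothetical device helper, compute capability 5.3+:
+*
+*   __device__ bool all_in_unit_range(const __half2 v)
+*   {
+*       const __half2 lo = __float2half2_rn(0.0f);
+*       const __half2 hi = __float2half2_rn(1.0f);
+*       return (v >= lo) && (v <= hi);   // each comparison must hold for both lanes
+*   }
+*/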
+
+/* Restore warning for multiple assignment operators */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( pop )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+/* Restore -Weffc++ warnings from here on */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic pop
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_ALIGN__
+
+#ifndef __CUDACC_RTC__  /* no host functions in NVRTC mode */
+static inline unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder)
+{
+    unsigned int x;
+    unsigned int u;
+    unsigned int result;
+#if defined(__CUDACC__)
+    (void)memcpy(&x, &f, sizeof(f));
+#else
+    (void)std::memcpy(&x, &f, sizeof(f));
+#endif
+    u = (x & 0x7fffffffU);
+    sign = ((x >> 16U) & 0x8000U);
+    // NaN/+Inf/-Inf
+    if (u >= 0x7f800000U) {
+        remainder = 0U;
+        result = ((u == 0x7f800000U) ? (sign | 0x7c00U) : 0x7fffU);
+    } else if (u > 0x477fefffU) { // Overflows
+        remainder = 0x80000000U;
+        result = (sign | 0x7bffU);
+    } else if (u >= 0x38800000U) { // Normal numbers
+        remainder = u << 19U;
+        u -= 0x38000000U;
+        result = (sign | (u >> 13U));
+    } else if (u < 0x33000001U) { // +0/-0
+        remainder = u;
+        result = sign;
+    } else { // Denormal numbers
+        const unsigned int exponent = u >> 23U;
+        const unsigned int shift = 0x7eU - exponent;
+        unsigned int mantissa = (u & 0x7fffffU);
+        mantissa |= 0x800000U;
+        remainder = mantissa << (32U - shift);
+        result = (sign | (mantissa >> shift));
+        result &= 0x0000FFFFU;
+    }
+    return static_cast<unsigned short>(result);
+}
+#endif  /* #if !defined(__CUDACC_RTC__) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a)
+{
+#if defined(__CUDA_ARCH__)
+    __half val;
+    asm("{  cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(a));
+    return val;
+#else
+    __half result;
+    /*
+    // Perform rounding to 11 bits of precision, convert value
+    // to float and call existing float to half conversion.
+    // By pre-rounding to 11 bits we avoid additional rounding
+    // in float to half conversion.
+    */
+    unsigned long long int absa;
+    unsigned long long int ua;
+    #if defined(__CUDACC__)
+        (void)memcpy(&ua, &a, sizeof(a));
+    #else
+        (void)std::memcpy(&ua, &a, sizeof(a));
+    #endif
+    absa = (ua & 0x7fffffffffffffffULL);
+    if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL))
+    {
+        /*
+        // |a| >= 2^16 or NaN or |a| <= 2^(-25)
+        // double-rounding is not a problem
+        */
+        result = __float2half(static_cast<float>(a));
+    }
+    else
+    {
+        /*
+        // here 2^(-25) < |a| < 2^16
+        // prepare shifter value such that a + shifter
+        // done in double precision performs round-to-nearest-even
+        // and (a + shifter) - shifter results in a rounded to
+        // 11 bits of precision. Shifter needs to have exponent of
+        // a plus 53 - 11 = 42 and a leading bit in mantissa to guard
+        // against negative values.
+        // So need to have |a| capped to avoid overflow in exponent.
+        // For inputs that are smaller than half precision minnorm
+        // we prepare fixed shifter exponent.
+        */
+        unsigned long long shifterBits;
+        if (absa >= 0x3f10000000000000ULL)
+        {
+            /*
+            // Here if |a| >= 2^(-14)
+            // add 42 to exponent bits
+            */
+            shifterBits  = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL;
+        }
+        else
+        {
+            /*
+            // 2^(-25) < |a| < 2^(-14), potentially results in denormal
+            // set exponent bits to 42 - 14 + bias
+            */
+            shifterBits = 0x41B0000000000000ULL;
+        }
+        // set leading mantissa bit to protect against negative inputs
+        shifterBits |= 0x0008000000000000ULL;
+        double shifter;
+        #if defined(__CUDACC__)
+            (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+        #else
+            (void)std::memcpy(&shifter, &shifterBits, sizeof(shifterBits));
+        #endif
+        double aShiftRound = a + shifter;
+
+        /*
+        // Prevent the compiler from optimizing away a + shifter - shifter
+        // by doing an intermediate memcpy and a harmless bitwise operation
+        */
+        unsigned long long int aShiftRoundBits;
+        #if defined(__CUDACC__)
+            (void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
+        #else
+            (void)std::memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound));
+        #endif
+
+        // the value is positive, so this operation doesn't change anything
+        aShiftRoundBits &= 0x7fffffffffffffffULL;
+
+        #if defined(__CUDACC__)
+            (void)memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
+        #else
+            (void)std::memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound));
+        #endif
+
+        result = __float2half(static_cast<float>(aShiftRound - shifter));
+    }
+
+    return result;
+#endif
+}
+
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{  cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
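+/*
+* Usage sketch: the host fallback above rounds to nearest with ties to even,
+* matching the device cvt.rn path. A small host-only check (hypothetical;
+* compile as plain C++ with cuda_fp16.h on the include path):
+*
+*   #include "cuda_fp16.h"
+*   #include <cstdio>
+*
+*   int main()
+*   {
+*       // Near 2048 the half-precision spacing is 2, so 2049.0f is an exact
+*       // tie between 2048 and 2050 and rounds to the even candidate, 2048.
+*       const __half h = __float2half_rn(2049.0f);
+*       std::printf("%f\n", __half2float(h));   // expected to print 2048.000000
+*       return 0;
+*   }
+*/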
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{  cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{  cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder != 0U) && (sign != 0U)) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a)
+{
+    __half val;
+#if defined(__CUDA_ARCH__)
+    asm("{  cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a));
+#else
+    __half_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2half(a, sign, remainder);
+    if ((remainder != 0U) && (sign == 0U)) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a)
+{
+    __half2 val;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low;\n"
+        "  cvt.rn.f16.f32 low, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a));
+#else
+    val = __half2(__float2half_rn(a), __float2half_rn(a));
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b)
+{
+    __half2 val;
+#if defined(__CUDA_ARCH__)
+#if (__CUDA_ARCH__ >= 800)
+    asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n"
+        : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+#else
+    asm("{.reg .f16 low,high;\n"
+        "  cvt.rn.f16.f32 low, %1;\n"
+        "  cvt.rn.f16.f32 high, %2;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b));
+#endif
+#else
+    val = __half2(__float2half_rn(a), __float2half_rn(b));
+#endif
+    return val;
+}
+
+#ifndef __CUDACC_RTC__  /* no host functions in NVRTC mode */
+static inline float __internal_half2float(const unsigned short h)
+{
+    unsigned int sign = ((static_cast<unsigned int>(h) >> 15U) & 1U);
+    unsigned int exponent = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU);
+    unsigned int mantissa = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U);
+    float f;
+    if (exponent == 0x1fU) { /* NaN or Inf */
+        /* discard sign of a NaN */
+        sign = ((mantissa != 0U) ? (sign >> 1U) : sign);
+        mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U);
+        exponent = 0xffU;
+    } else if (exponent == 0U) { /* Denorm or Zero */
+        if (mantissa != 0U) {
+            unsigned int msb;
+            exponent = 0x71U;
+            do {
+                msb = (mantissa & 0x400000U);
+                mantissa <<= 1U; /* normalize */
+                --exponent;
+            } while (msb == 0U);
+            mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
+        }
+    } else {
+        exponent += 0x70U;
+    }
+    const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
+#if defined(__CUDACC__)
+    (void)memcpy(&f, &u, sizeof(u));
+#else
+    (void)std::memcpy(&f, &u, sizeof(u));
+#endif
+    return f;
+}
+#endif  /* !defined(__CUDACC_RTC__) */
+
+__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a)
+{
+    float val;
+#if defined(__CUDA_ARCH__)
+    asm("{  cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a)));
+#else
+    val = __internal_half2float(static_cast<__half_raw>(a).x);
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a)
+{
+    float val;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+#else
+    val = __internal_half2float(static_cast<__half2_raw>(a).x);
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a)
+{
+    float val;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a)));
+#else
+    val = __internal_half2float(static_cast<__half2_raw>(a).y);
+#endif
+    return val;
+}
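+/* Worked example (illustrative): on the host path these conversions can be
+ * called directly, and a float that is not exactly representable in binary16
+ * rounds to the nearest half:
+ *
+ *   const __half h = __float2half_rn(0.1f);  // raw bits 0x2E66
+ *   const float  f = __half2float(h);        // 0.0999755859375f
+ *
+ * float -> half -> float is therefore lossy, while half -> float -> half is
+ * exact, since every binary16 value is representable in binary32.
+ */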
+__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h)
+{
+    short int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+#else
+    const float f = __half2float(h);
+    const short int max_val = (short int)0x7fffU;
+    const short int min_val = (short int)0x8000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<short int>(f);
+    }
+#endif
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h)
+{
+    unsigned short int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+#else
+    const float f = __half2float(h);
+    const unsigned short int max_val = 0xffffU;
+    const unsigned short int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned short int>(f);
+    }
+#endif
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h)
+{
+    int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+#else
+    const float f = __half2float(h);
+    const int max_val = (int)0x7fffffffU;
+    const int min_val = (int)0x80000000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<int>(f);
+    }
+#endif
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h)
+{
+    unsigned int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+#else
+    const float f = __half2float(h);
+    const unsigned int max_val = 0xffffffffU;
+    const unsigned int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned int>(f);
+    }
+#endif
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h)
+{
+    long long int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+#else
+    const float f = __half2float(h);
+    const long long int max_val = (long long int)0x7fffffffffffffffULL;
+    const long long int min_val = (long long int)0x8000000000000000ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = min_val;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<long long int>(f);
+    }
+#endif
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h)
+{
+    unsigned long long int i;
+#if defined __CUDA_ARCH__
+    asm("cvt.rzi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+#else
+    const float f = __half2float(h);
+    const unsigned long long int max_val = 0xffffffffffffffffULL;
+    const unsigned long long int min_val = 0ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xF800U) {
+        // NaN
+        i = 0x8000000000000000ULL;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    } else {
+        // normal value, conversion is well-defined
+        i = static_cast<unsigned long long int>(f);
+    }
+#endif
+    return i;
+}
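+/* Worked example (illustrative) for the round-towards-zero conversions above,
+ * host path: out-of-range and NaN inputs are handled explicitly rather than
+ * left to an undefined float-to-integer cast:
+ *
+ *   __half2int_rz(__float2half(65504.0f));   // 65504 (in range, truncated)
+ *   __half2int_rz(__float2half(HUGE_VALF));  // 0x7fffffff (saturated maximum)
+ *   __half2int_rz(__float2half(NAN));        // 0 (NaN maps to zero)
+ *
+ * The 64-bit variants map NaN to 0x8000000000000000 instead, as coded above.
+ */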
+
+/* Intrinsic functions only available to nvcc compilers */
+#if defined(__CUDACC__)
+
+/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */
+__VECTOR_FUNCTIONS_DECL__ __half2 make_half2(const __half x, const __half y)
+{
+    __half2 t; t.x = x; t.y = y; return t;
+}
+#undef __VECTOR_FUNCTIONS_DECL__
+
+
+/* Definitions of intrinsics */
+__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a)
+{
+    const __half2 val = __floats2half2_rn(a.x, a.y);
+    return val;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a)
+{
+    float hi_float;
+    float lo_float;
+#if defined(__CUDA_ARCH__)
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(a)));
+
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high},%1;\n"
+        "  cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(a)));
+#else
+    lo_float = __internal_half2float(((__half2_raw)a).x);
+    hi_float = __internal_half2float(((__half2_raw)a).y);
+#endif
+    return make_float2(lo_float, hi_float);
+}
+__CUDA_FP16_DECL__ int __half2int_rn(const __half h)
+{
+    int i;
+    asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ int __half2int_rd(const __half h)
+{
+    int i;
+    asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ int __half2int_ru(const __half h)
+{
+    int i;
+    asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i)
+{
+    __half h;
+#if defined(__CUDA_ARCH__)
+    asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+#else
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
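+    // Concretely: the int -> float step only rounds when |i| exceeds 2^24 =
+    // 16777216, and any such value is already far above 65504, the largest
+    // finite half, so the final result is +/-infinity either way.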
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __int2half_rz(const int i)
+{
+    __half h;
+    asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __int2half_rd(const int i)
+{
+    __half h;
+    asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __int2half_ru(const int i)
+{
+    __half h;
+    asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ short int __half2short_rn(const __half h)
+{
+    short int i;
+    asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ short int __half2short_rd(const __half h)
+{
+    short int i;
+    asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ short int __half2short_ru(const __half h)
+{
+    short int i;
+    asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i)
+{
+    __half h;
+#if defined __CUDA_ARCH__
+    asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+#else
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __short2half_rz(const short int i)
+{
+    __half h;
+    asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __short2half_rd(const short int i)
+{
+    __half h;
+    asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __short2half_ru(const short int i)
+{
+    __half h;
+    asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h)
+{
+    unsigned int i;
+    asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h)
+{
+    unsigned int i;
+    asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h)
+{
+    unsigned int i;
+    asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i)
+{
+    __half h;
+#if defined __CUDA_ARCH__
+    asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+#else
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i)
+{
+    __half h;
+    asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i)
+{
+    __half h;
+    asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i)
+{
+    __half h;
+    asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h)
+{
+    unsigned short int i;
+    asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h)
+{
+    unsigned short int i;
+    asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h)
+{
+    unsigned short int i;
+    asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i)
+{
+    __half h;
+#if defined __CUDA_ARCH__
+    asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+#else
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i)
+{
+    __half h;
+    asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i)
+{
+    __half h;
+    asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i)
+{
+    __half h;
+    asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h)
+{
+    unsigned long long int i;
+    asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h)
+{
+    unsigned long long int i;
+    asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h)
+{
+    unsigned long long int i;
+    asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i)
+{
+    __half h;
+#if defined(__CUDA_ARCH__)
+    asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+#else
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i)
+{
+    __half h;
+    asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i)
+{
+    __half h;
+    asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i)
+{
+    __half h;
+    asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h)
+{
+    long long int i;
+    asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h)
+{
+    long long int i;
+    asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h)
+{
+    long long int i;
+    asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h)));
+    return i;
+}
+__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i)
+{
+    __half h;
+#if defined(__CUDA_ARCH__)
+    asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+#else
+    // double-rounding is not a problem here: if integer
+    // has more than 24 bits, it is already too large to
+    // be represented in half precision, and result will
+    // be infinity.
+    const float  f = static_cast<float>(i);
+                 h = __float2half_rn(f);
+#endif
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i)
+{
+    __half h;
+    asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i)
+{
+    __half h;
+    asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i)
+{
+    __half h;
+    asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i));
+    return h;
+}
+
+__CUDA_FP16_DECL__ __half htrunc(const __half h)
+{
+    __half r;
+    asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hceil(const __half h)
+{
+    __half r;
+    asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hfloor(const __half h)
+{
+    __half r;
+    asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hrint(const __half h)
+{
+    __half r;
+    asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h)));
+    return r;
+}
+
+__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  cvt.rzi.f16.f16 low, low;\n"
+        "  cvt.rzi.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  cvt.rpi.f16.f16 low, low;\n"
+        "  cvt.rpi.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  cvt.rmi.f16.f16 low, low;\n"
+        "  cvt.rmi.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  cvt.rni.f16.f16 low, low;\n"
+        "  cvt.rni.f16.f16 high, high;\n"
+        "  mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b)
+{
+    __half2 val;
+    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n"
+        "  mov.b32 {blow,bhigh}, %2;\n"
+        "  mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b)
+{
+    __half2 val;
+    asm("{.reg .f16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n"
+        "  mov.b32 {blow,bhigh}, %2;\n"
+        "  mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half __low2half(const __half2 a)
+{
+    __half ret;
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a)));
+    return ret;
+}
+__CUDA_FP16_DECL__ int __hisinf(const __half a)
+{
+    int retval;
+    if (__HALF_TO_CUS(a) == 0xFC00U) {
+        retval = -1;
+    } else if (__HALF_TO_CUS(a) == 0x7C00U) {
+        retval = 1;
+    } else {
+        retval = 0;
+    }
+    return retval;
+}
+__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half __high2half(const __half2 a)
+{
+    __half ret;
+    asm("{.reg .f16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a)));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b)
+{
+    __half2 val;
+    asm("{  mov.b32 %0, {%1,%2};}\n"
+        : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 __half2half2(const __half a)
+{
+    __half2 val;
+    asm("{  mov.b32 %0, {%1,%1};}\n"
+        : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a)
+{
+    __half2 val;
+    asm("{.reg .f16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ short int __half_as_short(const __half h)
+{
+    return static_cast<short int>(__HALF_TO_CUS(h));
+}
+__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h)
+{
+    return __HALF_TO_CUS(h);
+}
+__CUDA_FP16_DECL__ __half __short_as_half(const short int i)
+{
+    __half h;
+    __HALF_TO_US(h) = static_cast<unsigned short int>(i);
+    return h;
+}
+__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i)
+{
+    __half h;
+    __HALF_TO_US(h) = i;
+    return h;
+}
+
+/******************************************************************************
+*                             __half arithmetic                             *
+******************************************************************************/
+__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b)
+{
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+    __BINARY_OP_HALF_MACRO(max)
+#else
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    float fr;
+    asm("{max.f32 %0,%1,%2;\n}"
+        :"=f"(fr) : "f"(fa), "f"(fb));
+    const __half hr = __float2half(fr);
+    return hr;
+#endif
+}
+__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b)
+{
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+    __BINARY_OP_HALF_MACRO(min)
+#else
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+    float fr;
+    asm("{min.f32 %0,%1,%2;\n}"
+        :"=f"(fr) : "f"(fa), "f"(fb));
+    const __half hr = __float2half(fr);
+    return hr;
+#endif
+}
+
+/******************************************************************************
+*                            __half2 arithmetic                             *
+******************************************************************************/
+__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b)
+{
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+    __BINARY_OP_HALF2_MACRO(max)
+#else
+    const float2 fa = __half22float2(a);
+    const float2 fb = __half22float2(b);
+    float2 fr;
+    asm("{max.f32 %0,%1,%2;\n}"
+        :"=f"(fr.x) : "f"(fa.x), "f"(fb.x));
+    asm("{max.f32 %0,%1,%2;\n}"
+        :"=f"(fr.y) : "f"(fa.y), "f"(fb.y));
+    const __half2 hr = __float22half2_rn(fr);
+    return hr;
+#endif
+}
+__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b)
+{
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+    __BINARY_OP_HALF2_MACRO(min)
+#else
+    const float2 fa = __half22float2(a);
+    const float2 fb = __half22float2(b);
+    float2 fr;
+    asm("{min.f32 %0,%1,%2;\n}"
+        :"=f"(fr.x) : "f"(fa.x), "f"(fb.x));
+    asm("{min.f32 %0,%1,%2;\n}"
+        :"=f"(fr.y) : "f"(fa.y), "f"(fb.y));
+    const __half2 hr = __float22half2_rn(fr);
+    return hr;
+#endif
+}
+
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)
+/******************************************************************************
+*                           __half, __half2 warp shuffle                     *
+******************************************************************************/
+#define __SHUFFLE_HALF2_MACRO(name) /* do */ {\
+   __half2 r; \
+   asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" \
+       :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \
+   return r; \
+} /* while(0) */
+
+#define __SHUFFLE_SYNC_HALF2_MACRO(name) /* do */ {\
+   __half2 r; \
+   asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \
+       :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
+   return r; \
+} /* while(0) */
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+
+__CUDA_FP16_DECL__ __half2 __shfl(const __half2 var, const int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_HALF2_MACRO(shfl.idx.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
+    __SHUFFLE_HALF2_MACRO(shfl.up.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_HALF2_MACRO(shfl.down.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_xor(const __half2 var, const int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_HALF2_MACRO(shfl.bfly.b32)
+}
+
+#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
+
+__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32)
+}
+__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32)
+}
+
+#undef __SHUFFLE_HALF2_MACRO
+#undef __SHUFFLE_SYNC_HALF2_MACRO
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+
+__CUDA_FP16_DECL__ __half __shfl(const __half var, const int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl(temp1, delta, width);
+    return __low2half(temp2);
+}
+__CUDA_FP16_DECL__ __half __shfl_up(const __half var, const unsigned int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl_up(temp1, delta, width);
+    return __low2half(temp2);
+}
+__CUDA_FP16_DECL__ __half __shfl_down(const __half var, const unsigned int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl_down(temp1, delta, width);
+    return __low2half(temp2);
+}
+__CUDA_FP16_DECL__ __half __shfl_xor(const __half var, const int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl_xor(temp1, delta, width);
+    return __low2half(temp2);
+}
+
+#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
+
+__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl_sync(mask, temp1, delta, width);
+    return __low2half(temp2);
+}
+__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width);
+    return __low2half(temp2);
+}
+__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width);
+    return __low2half(temp2);
+}
+__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var, const int delta, const int width)
+{
+    const __half2 temp1 = __halves2half2(var, var);
+    const __half2 temp2 = __shfl_xor_sync(mask, temp1, delta, width);
+    return __low2half(temp2);
+}
+
+#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)*/
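+/* Usage sketch (illustrative): the packed shuffles above make warp-level
+ * reductions on __half2 data straightforward. Assuming all 32 lanes are
+ * active and an sm_53+ target for __hadd2:
+ *
+ *   __half2 v = ...;                          // per-lane packed value
+ *   for (int offset = 16; offset > 0; offset >>= 1) {
+ *       v = __hadd2(v, __shfl_down_sync(0xffffffffU, v, offset, 32));
+ *   }
+ *   // lane 0 now holds the element-wise sum over the warp
+ */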
+/******************************************************************************
+*               __half and __half2 __ldg,__ldcg,__ldca,__ldcs                *
+******************************************************************************/
+
+#if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __LDG_PTR   "l"
+#else
+#define __LDG_PTR   "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+__CUDA_FP16_DECL__ __half2 __ldg(const  __half2 *const ptr)
+{
+    __half2 ret;
+    asm ("ld.global.nc.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr)
+{
+    __half ret;
+    asm ("ld.global.nc.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldcg(const  __half2 *const ptr)
+{
+    __half2 ret;
+    asm ("ld.global.cg.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr)
+{
+    __half ret;
+    asm ("ld.global.cg.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldca(const  __half2 *const ptr)
+{
+    __half2 ret;
+    asm ("ld.global.ca.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr)
+{
+    __half ret;
+    asm ("ld.global.ca.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldcs(const  __half2 *const ptr)
+{
+    __half2 ret;
+    asm ("ld.global.cs.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr)
+{
+    __half ret;
+    asm ("ld.global.cs.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldlu(const  __half2 *const ptr)
+{
+    __half2 ret;
+    asm ("ld.global.lu.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr)
+{
+    __half ret;
+    asm ("ld.global.lu.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_FP16_DECL__ __half2 __ldcv(const  __half2 *const ptr)
+{
+    __half2 ret;
+    asm ("ld.global.cv.b32 %0, [%1];"  : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr)
+{
+    __half ret;
+    asm ("ld.global.cv.b16 %0, [%1];"  : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value)
+{
+    asm ("st.global.wb.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value)
+{
+    asm ("st.global.wb.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value)
+{
+    asm ("st.global.cg.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value)
+{
+    asm ("st.global.cg.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value)
+{
+    asm ("st.global.cs.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value)
+{
+    asm ("st.global.cs.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value)
+{
+    asm ("st.global.wt.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory");
+}
+__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value)
+{
+    asm ("st.global.wt.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__HALF_TO_CUS(value)) : "memory");
+}
+#undef __LDG_PTR
+#endif /*defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))*/
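+/* Usage sketch (illustrative) for the cache-operator accesses above: a copy
+ * kernel that marks both the loads and the stores as streaming (.cs,
+ * evict-first), which suits data that is touched exactly once:
+ *
+ *   __global__ void copy_half2(const __half2 *in, __half2 *out, int n) {
+ *       const int i = blockIdx.x * blockDim.x + threadIdx.x;
+ *       if (i < n) {
+ *           __stcs(out + i, __ldcs(in + i));
+ *       }
+ *   }
+ */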
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)
+/******************************************************************************
+*                             __half2 comparison                             *
+******************************************************************************/
+#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
+   return val; \
+} /* while(0) */
+__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b)
+{
+    __COMPARISON_OP_HALF2_MACRO(set.eq)
+}
+__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b)
+{
+    __COMPARISON_OP_HALF2_MACRO(set.ne)
+}
+__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b)
+{
+    __COMPARISON_OP_HALF2_MACRO(set.le)
+}
+__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b)
+{
+    __COMPARISON_OP_HALF2_MACRO(set.ge)
+}
+__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b)
+{
+    __COMPARISON_OP_HALF2_MACRO(set.lt)
+}
+__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b)
+{
+    __COMPARISON_OP_HALF2_MACRO(set.gt)
+}
+__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b)
+{
+    __COMPARISON_OP_HALF2_MACRO(set.equ)
+}
+__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b)
+{
+    __COMPARISON_OP_HALF2_MACRO(set.neu)
+}
+__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b)
+{
+    __COMPARISON_OP_HALF2_MACRO(set.leu)
+}
+__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b)
+{
+    __COMPARISON_OP_HALF2_MACRO(set.geu)
+}
+__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b)
+{
+    __COMPARISON_OP_HALF2_MACRO(set.ltu)
+}
+__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b)
+{
+    __COMPARISON_OP_HALF2_MACRO(set.gtu)
+}
+#undef __COMPARISON_OP_HALF2_MACRO
+#define __BOOL_COMPARISON_OP_HALF2_MACRO(name) /* do */ {\
+   __half2 val; \
+   bool retval; \
+   asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \
+        :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \
+   if (__HALF2_TO_CUI(val) == 0x3C003C00U) {\
+      retval = true; \
+   } else { \
+      retval = false; \
+   }\
+   return retval;\
+} /* while(0) */
+__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b)
+{
+    __BOOL_COMPARISON_OP_HALF2_MACRO(set.eq)
+}
+__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b)
+{
+    __BOOL_COMPARISON_OP_HALF2_MACRO(set.ne)
+}
+__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b)
+{
+    __BOOL_COMPARISON_OP_HALF2_MACRO(set.le)
+}
+__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b)
+{
+    __BOOL_COMPARISON_OP_HALF2_MACRO(set.ge)
+}
+__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b)
+{
+    __BOOL_COMPARISON_OP_HALF2_MACRO(set.lt)
+}
+__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b)
+{
+    __BOOL_COMPARISON_OP_HALF2_MACRO(set.gt)
+}
+__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b)
+{
+    __BOOL_COMPARISON_OP_HALF2_MACRO(set.equ)
+}
+__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b)
+{
+    __BOOL_COMPARISON_OP_HALF2_MACRO(set.neu)
+}
+__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b)
+{
+    __BOOL_COMPARISON_OP_HALF2_MACRO(set.leu)
+}
+__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b)
+{
+    __BOOL_COMPARISON_OP_HALF2_MACRO(set.geu)
+}
+__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b)
+{
+    __BOOL_COMPARISON_OP_HALF2_MACRO(set.ltu)
+}
+__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b)
+{
+    __BOOL_COMPARISON_OP_HALF2_MACRO(set.gtu)
+}
+#undef __BOOL_COMPARISON_OP_HALF2_MACRO
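+/* Note: __heq2 and the other mask-returning comparisons encode the per-lane
+ * result as half values (1.0 for true, 0.0 for false), while the __hb*2 forms
+ * above fold that mask into one bool that is true only when BOTH lanes pass,
+ * hence the test against 0x3C003C00U, the packed pair (1.0, 1.0). For
+ * example, comparing (1, 2) with (1, 3) gives __heq2 == (1.0, 0.0) but
+ * __hbeq2 == false.
+ */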
+/******************************************************************************
+*                             __half comparison                              *
+******************************************************************************/
+#define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\
+   unsigned short val; \
+   asm( "{ .reg .pred __$temp3;\n" \
+        "  setp." __CUDA_FP16_STRINGIFY(name) ".f16  __$temp3, %1, %2;\n" \
+        "  selp.u16 %0, 1, 0, __$temp3;}" \
+        : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \
+   return (val != 0U) ? true : false; \
+} /* while(0) */
+__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b)
+{
+    __COMPARISON_OP_HALF_MACRO(eq)
+}
+__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b)
+{
+    __COMPARISON_OP_HALF_MACRO(ne)
+}
+__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b)
+{
+    __COMPARISON_OP_HALF_MACRO(le)
+}
+__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b)
+{
+    __COMPARISON_OP_HALF_MACRO(ge)
+}
+__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b)
+{
+    __COMPARISON_OP_HALF_MACRO(lt)
+}
+__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b)
+{
+    __COMPARISON_OP_HALF_MACRO(gt)
+}
+__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b)
+{
+    __COMPARISON_OP_HALF_MACRO(equ)
+}
+__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b)
+{
+    __COMPARISON_OP_HALF_MACRO(neu)
+}
+__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b)
+{
+    __COMPARISON_OP_HALF_MACRO(leu)
+}
+__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b)
+{
+    __COMPARISON_OP_HALF_MACRO(geu)
+}
+__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b)
+{
+    __COMPARISON_OP_HALF_MACRO(ltu)
+}
+__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b)
+{
+    __COMPARISON_OP_HALF_MACRO(gtu)
+}
+#undef __COMPARISON_OP_HALF_MACRO
+/******************************************************************************
+*                            __half2 arithmetic                             *
+******************************************************************************/
+__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b)
+{
+    __BINARY_OP_HALF2_MACRO(add)
+}
+__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b)
+{
+    __BINARY_OP_HALF2_MACRO(sub)
+}
+__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b)
+{
+    __BINARY_OP_HALF2_MACRO(mul)
+}
+__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b)
+{
+    __BINARY_OP_HALF2_MACRO(add.sat)
+}
+__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b)
+{
+    __BINARY_OP_HALF2_MACRO(sub.sat)
+}
+__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b)
+{
+    __BINARY_OP_HALF2_MACRO(mul.sat)
+}
+__CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b)
+{
+    __BINARY_OP_HALF2_MACRO(add.rn)
+}
+__CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b)
+{
+    __BINARY_OP_HALF2_MACRO(sub.rn)
+}
+__CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b)
+{
+    __BINARY_OP_HALF2_MACRO(mul.rn)
+}
+__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c)
+{
+    __TERNARY_OP_HALF2_MACRO(fma.rn)
+}
+__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c)
+{
+    __TERNARY_OP_HALF2_MACRO(fma.rn.sat)
+}
+__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b) {
+    __half ha = __low2half(a);
+    __half hb = __low2half(b);
+
+    const __half v1 = __hdiv(ha, hb);
+
+    ha = __high2half(a);
+    hb = __high2half(b);
+
+    const __half v2 = __hdiv(ha, hb);
+
+    return __halves2half2(v1, v2);
+}
+/******************************************************************************
+*                             __half arithmetic                             *
+******************************************************************************/
+__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b)
+{
+    __BINARY_OP_HALF_MACRO(add)
+}
+__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b)
+{
+    __BINARY_OP_HALF_MACRO(sub)
+}
+__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b)
+{
+    __BINARY_OP_HALF_MACRO(mul)
+}
+__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b)
+{
+    __BINARY_OP_HALF_MACRO(add.sat)
+}
+__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b)
+{
+    __BINARY_OP_HALF_MACRO(sub.sat)
+}
+__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b)
+{
+    __BINARY_OP_HALF_MACRO(mul.sat)
+}
+__CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b)
+{
+    __BINARY_OP_HALF_MACRO(add.rn)
+}
+__CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b)
+{
+    __BINARY_OP_HALF_MACRO(sub.rn)
+}
+__CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b)
+{
+    __BINARY_OP_HALF_MACRO(mul.rn)
+}
+__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c)
+{
+    __TERNARY_OP_HALF_MACRO(fma.rn)
+}
+__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c)
+{
+    __TERNARY_OP_HALF_MACRO(fma.rn.sat)
+}
+__CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) {
+    __half v;
+    __half abs;
+    __half den;
+    __HALF_TO_US(den) = 0x008FU;
+
+    float rcp;
+    const float fa = __half2float(a);
+    const float fb = __half2float(b);
+
+    asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb));
+
+    float fv = rcp * fa;
+
+    v = __float2half(fv);
+    __HALF_TO_US(abs) = static_cast<unsigned short>(static_cast<unsigned int>(__HALF_TO_CUS(v)) & 0x00007FFFU);
+    if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000U))) {
+        const float err = __fmaf_rn(-fb, fv, fa);
+        fv = __fmaf_rn(rcp, err, fv);
+        v = __float2half(fv);
+    }
+    return v;
+}
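+/* Note on __hdiv above: the quotient is first formed from the fast fp32
+ * reciprocal approximation, v = a * rcp(b). When the result is nonzero but
+ * its magnitude falls below the tiny threshold 0x008F (a half denormal), one
+ * Newton-style correction is applied:
+ *
+ *   err = fma(-b, v, a);     // residual a - b*v
+ *   v   = fma(rcp, err, v);  // refine v by roughly err/b
+ *
+ * which recovers the precision that rcp.approx alone may lose in that range.
+ */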
+
+/******************************************************************************
+*                             __half2 functions                             *
+******************************************************************************/
+#define __SPEC_CASE2(i,r, spc, ulp) \
+   "{.reg.b32 spc, ulp, p;\n"\
+   "  mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
+   "  mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
+   "  set.eq.f16x2.f16x2 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
+   "  fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
+#define __SPEC_CASE(i,r, spc, ulp) \
+   "{.reg.b16 spc, ulp, p;\n"\
+   "  mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\
+   "  mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\
+   "  set.eq.f16.f16 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\
+   "  fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n"
+#define __APPROX_FCAST(fun) /* do */ {\
+   __half val;\
+   asm("{.reg.b32         f;        \n"\
+                " .reg.b16         r;        \n"\
+                "  mov.b16         r,%1;     \n"\
+                "  cvt.f32.f16     f,r;      \n"\
+                "  " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32   f,f;  \n"\
+                "  cvt.rn.f16.f32      r,f;  \n"\
+                "  mov.b16         %0,r;     \n"\
+                "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\
+   return val;\
+} /* while(0) */
+#define __APPROX_FCAST2(fun) /* do */ {\
+   __half2 val;\
+   asm("{.reg.b16         hl, hu;         \n"\
+                " .reg.b32         fl, fu;         \n"\
+                "  mov.b32         {hl, hu}, %1;   \n"\
+                "  cvt.f32.f16     fl, hl;         \n"\
+                "  cvt.f32.f16     fu, hu;         \n"\
+                "  " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32   fl, fl;     \n"\
+                "  " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32   fu, fu;     \n"\
+                "  cvt.rn.f16.f32      hl, fl;     \n"\
+                "  cvt.rn.f16.f32      hu, fu;     \n"\
+                "  mov.b32         %0, {hl, hu};   \n"\
+                "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));       \
+   return val;\
+} /* while(0) */
+static __device__ __forceinline__ float __float_simpl_sinf(float a);
+static __device__ __forceinline__ float __float_simpl_cosf(float a);
+__CUDA_FP16_DECL__ __half hsin(const __half a) {
+    const float sl = __float_simpl_sinf(__half2float(a));
+    __half r = __float2half_rn(sl);
+    asm("{\n\t"
+        "  .reg.b16 i,r,t;     \n\t"
+        "  mov.b16 r, %0;      \n\t"
+        "  mov.b16 i, %1;      \n\t"
+        "  and.b16 t, r, 0x8000U; \n\t"
+        "  abs.f16 r, r;   \n\t"
+        "  abs.f16 i, i;   \n\t"
+        __SPEC_CASE(i, r, 0X32B3U, 0x0800U)
+        __SPEC_CASE(i, r, 0X5CB0U, 0x9000U)
+        "  or.b16  r,r,t;      \n\t"
+        "  mov.b16 %0, r;      \n"
+        "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) {
+    const float sl = __float_simpl_sinf(__half2float(a.x));
+    const float sh = __float_simpl_sinf(__half2float(a.y));
+    __half2 r = __floats2half2_rn(sl, sh);
+    asm("{\n\t"
+        "  .reg.b32 i,r,t;             \n\t"
+        "  mov.b32 r, %0;              \n\t"
+        "  mov.b32 i, %1;              \n\t"
+        "  and.b32 t, r, 0x80008000U;   \n\t"
+        "  abs.f16x2 r, r;   \n\t"
+        "  abs.f16x2 i, i;   \n\t"
+        __SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U)
+        __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U)
+        "  or.b32  r, r, t;            \n\t"
+        "  mov.b32 %0, r;              \n"
+        "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half hcos(const __half a) {
+    const float cl = __float_simpl_cosf(__half2float(a));
+    __half r = __float2half_rn(cl);
+    asm("{\n\t"
+        "  .reg.b16 i,r;        \n\t"
+        "  mov.b16 r, %0;       \n\t"
+        "  mov.b16 i, %1;       \n\t"
+        "  abs.f16 i, i;        \n\t"
+        __SPEC_CASE(i, r, 0X2B7CU, 0x1000U)
+        "  mov.b16 %0, r;       \n"
+        "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) {
+    const float cl = __float_simpl_cosf(__half2float(a.x));
+    const float ch = __float_simpl_cosf(__half2float(a.y));
+    __half2 r = __floats2half2_rn(cl, ch);
+    asm("{\n\t"
+        "  .reg.b32 i,r;   \n\t"
+        "  mov.b32 r, %0;  \n\t"
+        "  mov.b32 i, %1;  \n\t"
+        "  abs.f16x2 i, i; \n\t"
+        __SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U)
+        "  mov.b32 %0, r;  \n"
+        "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
+    return r;
+}
+static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant)
+{
+    const float ar = __fmaf_rn(a, 0.636619772F, 12582912.0F);
+    const unsigned q = __float_as_uint(ar);
+    const float j = __fsub_rn(ar, 12582912.0F);
+    float t = __fmaf_rn(j, -1.5707962512969971e+000F, a);
+    t = __fmaf_rn(j, -7.5497894158615964e-008F, t);
+    *quadrant = q;
+    return t;
+}
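+/* Note on the reduction above: fma(a, 2/pi, 12582912.0f) uses the constant
+ * 12582912 = 1.5 * 2^23 to force a * 2/pi to round to the nearest integer;
+ * the quadrant is then read from the low bits of that float via
+ * __float_as_uint, and subtracting 12582912.0f back yields the rounded
+ * integer j as a float. The angle is reduced as a - j * pi/2, with pi/2 split
+ * into a high part and a low correction term so the two fmas keep extra
+ * precision.
+ */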
+static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i)
+{
+    float z;
+    const float x2 = x*x;
+    float a8;
+    float a6;
+    float a4;
+    float a2;
+    float a1;
+    float a0;
+
+    if ((i & 1U) != 0U) {
+        // cos
+        a8 =  2.44331571e-5F;
+        a6 = -1.38873163e-3F;
+        a4 =  4.16666457e-2F;
+        a2 = -5.00000000e-1F;
+        a1 = x2;
+        a0 = 1.0F;
+    }
+    else {
+        // sin
+        a8 = -1.95152959e-4F;
+        a6 =  8.33216087e-3F;
+        a4 = -1.66666546e-1F;
+        a2 = 0.0F;
+        a1 = x;
+        a0 = x;
+    }
+
+    z = __fmaf_rn(a8, x2, a6);
+    z = __fmaf_rn(z, x2, a4);
+    z = __fmaf_rn(z, x2, a2);
+    z = __fmaf_rn(z, a1, a0);
+
+    if ((i & 2U) != 0U) {
+        z = -z;
+    }
+    return z;
+}
+static __device__ __forceinline__ float __float_simpl_sinf(float a)
+{
+    float z;
+    unsigned i;
+    a = __internal_trig_reduction_kernel(a, &i);
+    z = __internal_sin_cos_kernel(a, i);
+    return z;
+}
+static __device__ __forceinline__ float __float_simpl_cosf(float a)
+{
+    float z;
+    unsigned i;
+    a = __internal_trig_reduction_kernel(a, &i);
+    z = __internal_sin_cos_kernel(a, (i & 0x3U) + 1U);
+    return z;
+}
+
+__CUDA_FP16_DECL__ __half hexp(const __half a) {
+    __half val;
+    asm("{.reg.b32         f, C, nZ;       \n"
+        " .reg.b16         h,r;            \n"
+        "  mov.b16         h,%1;           \n"
+        "  cvt.f32.f16     f,h;            \n"
+        "  mov.b32         C, 0x3fb8aa3bU; \n"
+        "  mov.b32         nZ, 0x80000000U;\n"
+        "  fma.rn.f32      f,f,C,nZ;       \n"
+        "  ex2.approx.ftz.f32  f,f;        \n"
+        "  cvt.rn.f16.f32      r,f;        \n"
+        __SPEC_CASE(h, r, 0X1F79U, 0x9400U)
+        __SPEC_CASE(h, r, 0X25CFU, 0x9400U)
+        __SPEC_CASE(h, r, 0XC13BU, 0x0400U)
+        __SPEC_CASE(h, r, 0XC1EFU, 0x0200U)
+        "  mov.b16         %0,r;           \n"
+        "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) {
+    __half2 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu,C,nZ; \n"
+        "  mov.b32         {hl, hu}, %1;   \n"
+        "  mov.b32         h, %1;          \n"
+        "  cvt.f32.f16     fl, hl;         \n"
+        "  cvt.f32.f16     fu, hu;         \n"
+        "  mov.b32         C, 0x3fb8aa3bU; \n"
+        "  mov.b32         nZ, 0x80000000U;\n"
+        "  fma.rn.f32      fl,fl,C,nZ;     \n"
+        "  fma.rn.f32      fu,fu,C,nZ;     \n"
+        "  ex2.approx.ftz.f32  fl, fl;     \n"
+        "  ex2.approx.ftz.f32  fu, fu;     \n"
+        "  cvt.rn.f16.f32      hl, fl;     \n"
+        "  cvt.rn.f16.f32      hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n"
+        __SPEC_CASE2(h, r, 0X1F791F79U, 0x94009400U)
+        __SPEC_CASE2(h, r, 0X25CF25CFU, 0x94009400U)
+        __SPEC_CASE2(h, r, 0XC13BC13BU, 0x04000400U)
+        __SPEC_CASE2(h, r, 0XC1EFC1EFU, 0x02000200U)
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half hexp2(const __half a) {
+    __half val;
+    asm("{.reg.b32         f, ULP;         \n"
+        " .reg.b16         r;              \n"
+        "  mov.b16         r,%1;           \n"
+        "  cvt.f32.f16     f,r;            \n"
+        "  ex2.approx.ftz.f32      f,f;    \n"
+        "  mov.b32         ULP, 0x33800000U;\n"
+        "  fma.rn.f32      f,f,ULP,f;      \n"
+        "  cvt.rn.f16.f32      r,f;        \n"
+        "  mov.b16         %0,r;           \n"
+        "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) {
+    __half2 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         fl, fu, ULP;    \n"
+        "  mov.b32         {hl, hu}, %1;   \n"
+        "  cvt.f32.f16     fl, hl;         \n"
+        "  cvt.f32.f16     fu, hu;         \n"
+        "  ex2.approx.ftz.f32  fl, fl;     \n"
+        "  ex2.approx.ftz.f32  fu, fu;     \n"
+        "  mov.b32         ULP, 0x33800000U;\n"
+        "  fma.rn.f32      fl,fl,ULP,fl;   \n"
+        "  fma.rn.f32      fu,fu,ULP,fu;   \n"
+        "  cvt.rn.f16.f32      hl, fl;     \n"
+        "  cvt.rn.f16.f32      hu, fu;     \n"
+        "  mov.b32         %0, {hl, hu};   \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half hexp10(const __half a) {
+    __half val;
+    asm("{.reg.b16         h,r;            \n"
+        " .reg.b32         f, C, nZ;       \n"
+        "  mov.b16         h, %1;          \n"
+        "  cvt.f32.f16     f, h;           \n"
+        "  mov.b32         C, 0x40549A78U; \n"
+        "  mov.b32         nZ, 0x80000000U;\n"
+        "  fma.rn.f32      f,f,C,nZ;       \n"
+        "  ex2.approx.ftz.f32  f, f;       \n"
+        "  cvt.rn.f16.f32      r, f;       \n"
+        __SPEC_CASE(h, r, 0x34DEU, 0x9800U)
+        __SPEC_CASE(h, r, 0x9766U, 0x9000U)
+        __SPEC_CASE(h, r, 0x9972U, 0x1000U)
+        __SPEC_CASE(h, r, 0xA5C4U, 0x1000U)
+        __SPEC_CASE(h, r, 0xBF0AU, 0x8100U)
+        "  mov.b16         %0, r;          \n"
+        "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) {
+    __half2 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu,C,nZ; \n"
+        "  mov.b32         {hl, hu}, %1;   \n"
+        "  mov.b32         h, %1;          \n"
+        "  cvt.f32.f16     fl, hl;         \n"
+        "  cvt.f32.f16     fu, hu;         \n"
+        "  mov.b32         C, 0x40549A78U; \n"
+        "  mov.b32         nZ, 0x80000000U;\n"
+        "  fma.rn.f32      fl,fl,C,nZ;     \n"
+        "  fma.rn.f32      fu,fu,C,nZ;     \n"
+        "  ex2.approx.ftz.f32  fl, fl;     \n"
+        "  ex2.approx.ftz.f32  fu, fu;     \n"
+        "  cvt.rn.f16.f32      hl, fl;     \n"
+        "  cvt.rn.f16.f32      hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n"
+        __SPEC_CASE2(h, r, 0x34DE34DEU, 0x98009800U)
+        __SPEC_CASE2(h, r, 0x97669766U, 0x90009000U)
+        __SPEC_CASE2(h, r, 0x99729972U, 0x10001000U)
+        __SPEC_CASE2(h, r, 0xA5C4A5C4U, 0x10001000U)
+        __SPEC_CASE2(h, r, 0xBF0ABF0AU, 0x81008100U)
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half hlog2(const __half a) {
+    __half val;
+    asm("{.reg.b16         h, r;           \n"
+        " .reg.b32         f;              \n"
+        "  mov.b16         h, %1;          \n"
+        "  cvt.f32.f16     f, h;           \n"
+        "  lg2.approx.ftz.f32  f, f;       \n"
+        "  cvt.rn.f16.f32      r, f;       \n"
+        __SPEC_CASE(r, r, 0xA2E2U, 0x8080U)
+        __SPEC_CASE(r, r, 0xBF46U, 0x9400U)
+        "  mov.b16         %0, r;          \n"
+        "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) {
+    __half2 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         fl, fu, r, p;   \n"
+        "  mov.b32         {hl, hu}, %1;   \n"
+        "  cvt.f32.f16     fl, hl;         \n"
+        "  cvt.f32.f16     fu, hu;         \n"
+        "  lg2.approx.ftz.f32  fl, fl;     \n"
+        "  lg2.approx.ftz.f32  fu, fu;     \n"
+        "  cvt.rn.f16.f32      hl, fl;     \n"
+        "  cvt.rn.f16.f32      hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n"
+        __SPEC_CASE2(r, r, 0xA2E2A2E2U, 0x80808080U)
+        __SPEC_CASE2(r, r, 0xBF46BF46U, 0x94009400U)
+        "  mov.b32         %0, r;          \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half hlog(const __half a) {
+    __half val;
+    asm("{.reg.b32         f, C;           \n"
+        " .reg.b16         r,h;            \n"
+        "  mov.b16         h,%1;           \n"
+        "  cvt.f32.f16     f,h;            \n"
+        "  lg2.approx.ftz.f32  f,f;        \n"
+        "  mov.b32         C, 0x3f317218U;  \n"
+        "  mul.f32         f,f,C;          \n"
+        "  cvt.rn.f16.f32      r,f;        \n"
+        __SPEC_CASE(h, r, 0X160DU, 0x9C00U)
+        __SPEC_CASE(h, r, 0X3BFEU, 0x8010U)
+        __SPEC_CASE(h, r, 0X3C0BU, 0x8080U)
+        __SPEC_CASE(h, r, 0X6051U, 0x1C00U)
+        "  mov.b16         %0,r;           \n"
+        "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2log(const __half2 a) {
+    __half2 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n"
+        "  mov.b32         h, %1;              \n"
+        "  cvt.f32.f16     fl, hl;             \n"
+        "  cvt.f32.f16     fu, hu;             \n"
+        "  lg2.approx.ftz.f32  fl, fl;         \n"
+        "  lg2.approx.ftz.f32  fu, fu;         \n"
+        "  mov.b32         C, 0x3f317218U;     \n"
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.f16.f32      hl, fl;         \n"
+        "  cvt.rn.f16.f32      hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n"
+        __SPEC_CASE2(h, r, 0X160D160DU, 0x9C009C00U)
+        __SPEC_CASE2(h, r, 0X3BFE3BFEU, 0x80108010U)
+        __SPEC_CASE2(h, r, 0X3C0B3C0BU, 0x80808080U)
+        __SPEC_CASE2(h, r, 0X60516051U, 0x1C001C00U)
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half hlog10(const __half a) {
+    __half val;
+    asm("{.reg.b16         h, r;           \n"
+        " .reg.b32         f, C;           \n"
+        "  mov.b16         h, %1;          \n"
+        "  cvt.f32.f16     f, h;           \n"
+        "  lg2.approx.ftz.f32  f, f;       \n"
+        "  mov.b32         C, 0x3E9A209BU; \n"
+        "  mul.f32         f,f,C;          \n"
+        "  cvt.rn.f16.f32      r, f;       \n"
+        __SPEC_CASE(h, r, 0x338FU, 0x1000U)
+        __SPEC_CASE(h, r, 0x33F8U, 0x9000U)
+        __SPEC_CASE(h, r, 0x57E1U, 0x9800U)
+        __SPEC_CASE(h, r, 0x719DU, 0x9C00U)
+        "  mov.b16         %0, r;          \n"
+        "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));
+    return val;
+}
+__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) {
+    __half2 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n"
+        "  mov.b32         h, %1;              \n"
+        "  cvt.f32.f16     fl, hl;             \n"
+        "  cvt.f32.f16     fu, hu;             \n"
+        "  lg2.approx.ftz.f32  fl, fl;         \n"
+        "  lg2.approx.ftz.f32  fu, fu;         \n"
+        "  mov.b32         C, 0x3E9A209BU;     \n"
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.f16.f32      hl, fl;         \n"
+        "  cvt.rn.f16.f32      hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n"
+        __SPEC_CASE2(h, r, 0x338F338FU, 0x10001000U)
+        __SPEC_CASE2(h, r, 0x33F833F8U, 0x90009000U)
+        __SPEC_CASE2(h, r, 0x57E157E1U, 0x98009800U)
+        __SPEC_CASE2(h, r, 0x719D719DU, 0x9C009C00U)
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)));
+    return val;
+}
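+/* Annotation (illustrative, not from the upstream header): the 32-bit "magic"
+ * constants used by the log/exp helpers above decode as follows:
+ *   0x3f317218 ~= 0.6931472   (ln 2;      hlog   computes lg2(x) * ln 2)
+ *   0x3E9A209B ~= 0.30103     (log10 2;   hlog10 computes lg2(x) * log10 2)
+ *   0x40549A78 ~= 3.3219280   (log2 10;   hexp10 computes ex2(x * log2 10))
+ *   0x33800000 == 2^-24       (small correction term applied in h[2]exp2)
+ * The __SPEC_CASE/__SPEC_CASE2 entries appear to patch individual inputs whose
+ * results would otherwise be off by one half-precision ulp.
+ */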
+#undef __SPEC_CASE2
+#undef __SPEC_CASE
+__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) {
+    __APPROX_FCAST2(rcp)
+}
+__CUDA_FP16_DECL__ __half hrcp(const __half a) {
+    __APPROX_FCAST(rcp)
+}
+__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) {
+    __APPROX_FCAST2(rsqrt)
+}
+__CUDA_FP16_DECL__ __half hrsqrt(const __half a) {
+    __APPROX_FCAST(rsqrt)
+}
+__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) {
+    __APPROX_FCAST2(sqrt)
+}
+__CUDA_FP16_DECL__ __half hsqrt(const __half a) {
+    __APPROX_FCAST(sqrt)
+}
+#undef __APPROX_FCAST
+#undef __APPROX_FCAST2
+__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a)
+{
+    __half2 r;
+    asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}"
+        :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a)));
+    return r;
+}
+__CUDA_FP16_DECL__ bool __hisnan(const __half a)
+{
+    __half r;
+    asm("{set.nan.f16.f16 %0,%1,%2;\n}"
+        :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a)));
+    return __HALF_TO_CUS(r) != 0U;
+}
+__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a)
+{
+    __half2 r;
+    asm("{neg.f16x2 %0,%1;\n}"
+        :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half __hneg(const __half a)
+{
+    __half r;
+    asm("{neg.f16 %0,%1;\n}"
+        :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half2 __habs2(const __half2 a)
+{
+    __half2 r;
+    asm("{abs.f16x2 %0,%1;\n}"
+        :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)));
+    return r;
+}
+__CUDA_FP16_DECL__ __half __habs(const __half a)
+{
+    __half r;
+    asm("{abs.f16 %0,%1;\n}"
+        :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)));
+    return r;
+}
+
+__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c)
+{
+    // fast version of complex multiply-accumulate
+    // (a.re, a.im) * (b.re, b.im) + (c.re, c.im)
+    // acc.re = (c.re + a.re*b.re) - a.im*b.im
+    // acc.im = (c.im + a.re*b.im) + a.im*b.re
+    __half real_tmp =  __hfma(a.x, b.x, c.x);
+    __half img_tmp  =  __hfma(a.x, b.y, c.y);
+    real_tmp = __hfma(__hneg(a.y), b.y, real_tmp);
+    img_tmp  = __hfma(a.y,         b.x, img_tmp);
+    return make_half2(real_tmp, img_tmp);
+}
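+/* Illustrative scalar reference for __hcmadd (not from the upstream header),
+ * assuming the (.x = real, .y = imaginary) layout described above:
+ *
+ *   float2 cmadd_ref(const float2 a, const float2 b, const float2 c) {
+ *       float2 acc;
+ *       acc.x = (c.x + a.x * b.x) - a.y * b.y;  // real part
+ *       acc.y = (c.y + a.x * b.y) + a.y * b.x;  // imaginary part
+ *       return acc;
+ *   }
+ */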
+
+#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)*/
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)
+__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b)
+{
+    __BINARY_OP_HALF_MACRO(max.NaN)
+}
+__CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b)
+{
+    __BINARY_OP_HALF_MACRO(min.NaN)
+}
+__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c)
+{
+    __TERNARY_OP_HALF_MACRO(fma.rn.relu)
+}
+
+__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b)
+{
+    __BINARY_OP_HALF2_MACRO(max.NaN)
+}
+__CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b)
+{
+    __BINARY_OP_HALF2_MACRO(min.NaN)
+}
+__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c)
+{
+    __TERNARY_OP_HALF2_MACRO(fma.rn.relu)
+}
+#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)*/
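+/* Minimal usage sketch for the sm_80+ helpers above (illustrative, not from
+ * the upstream header):
+ *
+ *   __half y = __hfma_relu(a, b, c);   // a*b + c, negative results clamp to +0
+ *   __half m = __hmax_nan(a, b);       // maximum that propagates NaN inputs
+ */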
+
+/* Define __PTR for atomicAdd prototypes below, undef after done */
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __PTR   "l"
+#else
+#define __PTR   "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+
+__CUDA_FP16_DECL__  __half2 atomicAdd(__half2 *const address, const __half2 val) {
+    __half2 r;
+    asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n"
+                  : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val))
+                  : "memory");
+   return r;
+}
+
+#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600*/
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
+
+__CUDA_FP16_DECL__  __half atomicAdd(__half *const address, const __half val) {
+    __half r;
+    asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n"
+                  : "=h"(__HALF_TO_US(r))
+                  : __PTR(address), "h"(__HALF_TO_CUS(val))
+                  : "memory");
+   return r;
+}
+
+#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700*/
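+/* Minimal usage sketch for the half-precision atomicAdd overloads above
+ * (illustrative, not from the upstream header); assumes a hypothetical
+ * reduction kernel in which each thread contributes one packed pair:
+ *
+ *   __global__ void sum_pairs(const __half2 *in, __half2 *out, int n) {
+ *       int i = blockIdx.x * blockDim.x + threadIdx.x;
+ *       if (i < n) {
+ *           (void)atomicAdd(out, in[i]);  // adds both halves in one operation
+ *       }
+ *   }
+ */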
+
+#undef __PTR
+
+#undef __CUDA_FP16_DECL__
+#endif /* defined(__CUDACC__) */
+#endif /* defined(__cplusplus) */
+
+#undef __TERNARY_OP_HALF2_MACRO
+#undef __TERNARY_OP_HALF_MACRO
+#undef __BINARY_OP_HALF2_MACRO
+#undef __BINARY_OP_HALF_MACRO
+
+#undef __CUDA_HOSTDEVICE_FP16_DECL__
+#undef __CUDA_FP16_DECL__
+
+#undef __HALF_TO_US
+#undef __HALF_TO_CUS
+#undef __HALF2_TO_UI
+#undef __HALF2_TO_CUI
+
+/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */
+/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */
+#if defined(__cplusplus) && !defined(CUDA_NO_HALF)
+typedef __half half;
+typedef __half2 half2;
+// for consistency with __nv_bfloat16
+typedef __half      __nv_half;
+typedef __half2     __nv_half2;
+typedef __half_raw  __nv_half_raw;
+typedef __half2_raw __nv_half2_raw;
+typedef __half        nv_half;
+typedef __half2       nv_half2;
+#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */
+
+#if defined(__CPP_VERSION_AT_LEAST_11_FP16)
+#undef __CPP_VERSION_AT_LEAST_11_FP16
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */
+
+#endif /* end of include guard: __CUDA_FP16_HPP__ */
diff --git a/ext/cudart/include/cuda_fp8.h b/ext/cudart/include/cuda_fp8.h
new file mode 100644
index 0000000000000000000000000000000000000000..13a1d733f10f9dd90d98e280cef34be1472132fd
--- /dev/null
+++ b/ext/cudart/include/cuda_fp8.h
@@ -0,0 +1,360 @@
+/*
+ * Copyright 2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef __CUDA_FP8_H__
+#define __CUDA_FP8_H__
+
+/* Set up function decorations */
+#if defined(__CUDACC__)
+#define __CUDA_FP8_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP8__ __host__ __device__
+#define __CUDA_HOSTDEVICE_FP8_DECL__ static __host__ __device__ __inline__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP8_DECL__ static __attribute__((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP8_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE_FP8__
+#endif /* defined(__CUDACC__) */
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+#define __CPP_VERSION_AT_LEAST_11_FP8
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+#define __CPP_VERSION_AT_LEAST_11_FP8
+#endif
+
+/* bring in __half_raw data type */
+#include "cuda_fp16.h"
+/* bring in __nv_bfloat16_raw data type */
+#include "cuda_bf16.h"
+/* bring in float2, double4, etc vector types */
+#include "vector_types.h"
+
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_FP8 FP8 Intrinsics
+ * This section describes fp8 intrinsic functions.
+ * To use these functions, include the header file \p cuda_fp8.h in your
+ * program.
+ */
+
+/**
+ * \defgroup CUDA_MATH_FP8_MISC FP8 Conversion and Data Movement
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ * To use these functions, include the header file \p cuda_fp8.h in your
+ * program.
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief 8-bit \p unsigned \p integer
+ * type abstraction used for storage of \p fp8 floating-point
+ * numbers.
+ */
+typedef unsigned char __nv_fp8_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief 16-bit \p unsigned \p integer
+ * type abstraction used for storage of pairs of
+ * \p fp8 floating-point numbers.
+ */
+typedef unsigned short int __nv_fp8x2_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief 32-bit \p unsigned \p integer
+ * type abstraction used for storage of tetrads of
+ * \p fp8 floating-point numbers.
+ */
+typedef unsigned int __nv_fp8x4_storage_t;
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Enumerates the modes applicable when
+ * performing a narrowing conversion to \p fp8 destination types.
+ */
+typedef enum __nv_saturation_t {
+    /**
+     * No saturation to a finite value is performed when the conversion
+     * rounds to a value outside the range of the destination type.
+     * NOTE: for fp8 of the e4m3 kind, results larger than the maximum
+     * representable finite number of the target format become NaN.
+     */
+    __NV_NOSAT,
+    /**
+     * Inputs larger in magnitude than the maximum representable
+     * finite number MAXNORM of the target format round to
+     * MAXNORM of the same sign as the input.
+     */
+    __NV_SATFINITE,
+} __nv_saturation_t;
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Enumerates the possible
+ * interpretations of the 8-bit values when referring to them as
+ * \p fp8 types.
+ */
+typedef enum __nv_fp8_interpretation_t {
+    __NV_E4M3, /**< Stands for \p fp8 numbers of \p e4m3 kind. */
+    __NV_E5M2, /**< Stands for \p fp8 numbers of \p e5m2 kind. */
+} __nv_fp8_interpretation_t;
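+/* Illustrative note (not from the upstream header): the two enumerations above
+ * are combined when calling the conversion helpers declared below, e.g.
+ *
+ *   __nv_fp8_storage_t v =
+ *       __nv_cvt_float_to_fp8(1000.0f, __NV_SATFINITE, __NV_E4M3);
+ *   // __NV_SATFINITE clamps the out-of-range input to E4M3 MAXNORM (448);
+ *   // __NV_NOSAT would instead yield NaN, since E4M3 has no infinity.
+ */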
+
+/* Forward-declaration of C-style APIs */
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p double precision \p x to \p fp8 type of the
+ * requested kind using round-to-nearest-even rounding and requested saturation
+ * mode.
+ *
+ * \details Converts input \p x to \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter,
+ * using round-to-nearest-even rounding and
+ * saturation mode specified by \p saturate parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
+                       const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p double precision numbers packed
+ * in \p double2 \p x into a vector of two values of \p fp8 type of
+ * the requested kind using round-to-nearest-even rounding and requested
+ * saturation mode.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
+ * kind specified by \p fp8_interpretation parameter, using
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
+ * parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
+                          const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p single precision \p x to \p fp8 type of the
+ * requested kind using round-to-nearest-even rounding and requested saturation
+ * mode.
+ *
+ * \details Converts input \p x to \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter,
+ * using round-to-nearest-even rounding and
+ * saturation mode specified by \p saturate parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
+                      const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p single precision numbers packed
+ * in \p float2 \p x into a vector of two values of \p fp8 type of
+ * the requested kind using round-to-nearest-even rounding and requested
+ * saturation mode.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
+ * kind specified by \p fp8_interpretation parameter, using
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
+ * parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
+                         const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p half precision \p x to \p fp8 type of the requested
+ * kind using round-to-nearest-even rounding and requested saturation mode.
+ *
+ * \details Converts input \p x to \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter,
+ * using round-to-nearest-even rounding and
+ * saturation mode specified by \p saturate parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
+                        const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p half precision numbers packed
+ * in \p __half2_raw \p x into a vector of two values of \p fp8 type of
+ * the requested kind using round-to-nearest-even rounding and requested
+ * saturation mode.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
+ * kind specified by \p fp8_interpretation parameter, using
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
+ * parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
+    const __half2_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p nv_bfloat16 precision \p x to \p fp8 type of the
+ * requested kind using round-to-nearest-even rounding and requested saturation
+ * mode.
+ *
+ * \details Converts input \p x to \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter,
+ * using round-to-nearest-even rounding and
+ * saturation mode specified by \p saturate parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
+    const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p nv_bfloat16 precision numbers packed
+ * in \p __nv_bfloat162_raw \p x into a vector of two values of \p fp8 type of
+ * the requested kind using round-to-nearest-even rounding and requested
+ * saturation mode.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
+ * kind specified by \p fp8_interpretation parameter, using
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
+ * parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp8x2(
+    const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation);
+
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p fp8 \p x of the specified kind
+ * to \p half precision.
+ *
+ * \details Converts input \p x of \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter
+ * to \p half precision.
+ *
+ * \returns
+ * - The \p __half_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
+__nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
+                        const __nv_fp8_interpretation_t fp8_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p fp8 values of the specified kind
+ * to a vector of two \p half precision values packed in \p __half2_raw
+ * structure.
+ *
+ * \details Converts input vector \p x of \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter
+ * to a vector of two \p half precision values and returns as \p __half2_raw
+ * structure.
+ *
+ * \returns
+ * - The \p __half2_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
+__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
+                           const __nv_fp8_interpretation_t fp8_interpretation);
+
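+/* Round-trip sketch for the C-style API above (illustrative, not from the
+ * upstream header): fp8 values widen back through half precision.
+ *
+ *   __nv_fp8_storage_t s =
+ *       __nv_cvt_float_to_fp8(0.5f, __NV_SATFINITE, __NV_E5M2);
+ *   __half_raw hr = __nv_cvt_fp8_to_halfraw(s, __NV_E5M2);
+ *   float back = __half2float(__half(hr));  // 0.5f, exactly representable
+ */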
+#if defined(__cplusplus)
+
+#define __CUDA_FP8_TYPES_EXIST__
+
+/* Forward-declaration of structures defined in "cuda_fp8.hpp" */
+struct __nv_fp8_e5m2;
+struct __nv_fp8x2_e5m2;
+struct __nv_fp8x4_e5m2;
+
+struct __nv_fp8_e4m3;
+struct __nv_fp8x2_e4m3;
+struct __nv_fp8x4_e4m3;
+
+#endif /* defined(__cplusplus) */
+
+#include "cuda_fp8.hpp"
+
+#undef __CUDA_FP8_DECL__
+#undef __CUDA_HOSTDEVICE_FP8__
+#undef __CUDA_HOSTDEVICE_FP8_DECL__
+
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+#undef __CPP_VERSION_AT_LEAST_11_FP8
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#endif /* end of include guard: __CUDA_FP8_H__ */
diff --git a/ext/cudart/include/cuda_fp8.hpp b/ext/cudart/include/cuda_fp8.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9bfe2b7891c994b5432837f865716e29d3ae831b
--- /dev/null
+++ b/ext/cudart/include/cuda_fp8.hpp
@@ -0,0 +1,1546 @@
+/*
+ * Copyright 2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_FP8_HPP__)
+#define __CUDA_FP8_HPP__
+
+#if !defined(__CUDA_FP8_H__)
+#error "Do not include this file directly. Instead, include cuda_fp8.h."
+#endif
+
+/* C++ header for std::memcpy (used for type punning in host-side
+ * implementations). When compiling as a CUDA source file memcpy is provided
+ * implicitly. !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#elif !defined(__cplusplus) && !defined(__CUDACC__)
+#include <string.h>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+/* Set up structure-alignment attribute */
+#if !(defined __CUDA_ALIGN__)
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas"
+ * is available) */
+#if __cplusplus >= 201103L
+#define __CUDA_ALIGN__(n)                                                      \
+    alignas(n) /* C++11 kindly gives us a keyword for this */
+#else          /* !defined(__CPP_VERSION_AT_LEAST_11_FP8)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+#endif /* defined(__CUDACC__) */
+#endif /* !(defined __CUDA_ALIGN__) */
+
+#if !(defined __CPP_VERSION_AT_LEAST_11_FP8)
+/* need c++11 for explicit operators */
+#define __CUDA_NO_FP8_CONVERSION_OPERATORS__
+#endif
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
+                       const __nv_fp8_interpretation_t fp8_interpretation) {
+    unsigned char res;
+    unsigned long long int xbits;
+
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&xbits, &x, sizeof(x));
+#else
+    (void)std::memcpy(&xbits, &x, sizeof(x));
+#endif
+    unsigned char FP8_MAXNORM;
+    unsigned char FP8_MANTISSA_MASK;
+    unsigned short int FP8_EXP_BIAS;
+    unsigned long long int FP8_SIGNIFICAND_BITS;
+    const unsigned long long int DP_INF_BITS = 0x7FF0000000000000ULL;
+    unsigned long long int FP8_MINDENORM_O2;
+    unsigned long long int FP8_OVERFLOW_THRESHOLD;
+    unsigned long long int FP8_MINNORM;
+
+    if (fp8_interpretation == __NV_E4M3) {
+        FP8_EXP_BIAS = 7U;
+        FP8_SIGNIFICAND_BITS = 4ULL;
+        FP8_MANTISSA_MASK = 0x7U;
+        FP8_MINDENORM_O2 = 0x3F50000000000000ULL; // mindenorm/2 = 2^-10
+        FP8_OVERFLOW_THRESHOLD =
+            0x407D000000000000ULL; // maxnorm + 1/2ulp = 0x1.Cp+8 + 0x1p+4
+        FP8_MAXNORM = 0x7EU;
+        FP8_MINNORM = 0x3F90000000000000ULL; // minnorm = 2^-6
+    } else {                                 //__NV_E5M2
+        FP8_EXP_BIAS = 15U;
+        FP8_SIGNIFICAND_BITS = 3ULL;
+        FP8_MANTISSA_MASK = 0x3U;
+        FP8_MINDENORM_O2 = 0x3EE0000000000000ULL; // mindenorm/2 = 2^-17
+        FP8_OVERFLOW_THRESHOLD =
+            0x40EE000000000000ULL -
+            1ULL; // maxnorm + 1/2ulp = 0x1.Ep+15, and -1 to have common code
+        FP8_MAXNORM = 0x7BU;
+        FP8_MINNORM = 0x3F10000000000000ULL; // minnorm = 2^-14
+    }
+
+    // 1/2 LSB of the target format, positioned in double precision mantissa
+    // helpful in midpoints detection during round-to-nearest-even step
+    const unsigned long long int FP8_DP_HALF_ULP =
+        (unsigned long long int)1ULL << (53ULL - FP8_SIGNIFICAND_BITS - 1ULL);
+    // prepare sign bit in target format
+    unsigned char sign = (unsigned char)((xbits >> 63ULL) << 7U);
+    // prepare exponent field in target format
+    unsigned char exp =
+        (unsigned char)((((unsigned short int)(xbits >> 52ULL)) & 0x7FFU) -
+                        1023U + FP8_EXP_BIAS);
+    // round mantissa to target format width, rounding towards zero
+    unsigned char mantissa =
+        (unsigned char)(xbits >> (53ULL - FP8_SIGNIFICAND_BITS)) &
+        FP8_MANTISSA_MASK;
+    unsigned long long int absx = xbits & 0x7FFFFFFFFFFFFFFFULL;
+
+    if (absx <= FP8_MINDENORM_O2) {
+        // zero or underflow
+        res = 0U;
+    } else if (absx > DP_INF_BITS) {
+        // NaN
+        if (fp8_interpretation == __NV_E4M3) {
+            res = 0x7FU;
+        } else {
+            // NaN --> QNaN
+            res = 0x7EU | mantissa;
+        }
+    } else if (absx > FP8_OVERFLOW_THRESHOLD) {
+        if (saturate == __NV_SATFINITE) {
+            res = FP8_MAXNORM;
+        } else {
+            // __NV_NOSAT
+            if (fp8_interpretation == __NV_E4M3) {
+                // no Inf in E4M3
+                res = 0x7FU; // NaN
+            } else {
+                res = 0x7CU; // Inf in E5M2
+            }
+        }
+    } else if (absx >= FP8_MINNORM) {
+        res = (unsigned char)((exp << (FP8_SIGNIFICAND_BITS - 1U)) | mantissa);
+        // rounded-off bits
+        unsigned long long int round =
+            xbits & ((FP8_DP_HALF_ULP << 1ULL) - 1ULL);
+        // round-to-nearest-even adjustment
+        if ((round > FP8_DP_HALF_ULP) ||
+            ((round == FP8_DP_HALF_ULP) && (mantissa & 1U))) {
+            res = (unsigned char)(res + 1U);
+        }
+    } else // Denormal range
+    {
+        unsigned char shift = (unsigned char)(1U - exp);
+        // add implicit leading bit
+        mantissa |= (unsigned char)(1U << (FP8_SIGNIFICAND_BITS - 1U));
+        // additional round-off due to denormalization
+        res = (unsigned char)(mantissa >> shift);
+
+        // rounded-off bits, including implicit leading bit
+        unsigned long long int round =
+            (xbits | ((unsigned long long int)1ULL << (53ULL - 1ULL))) &
+            ((FP8_DP_HALF_ULP << (shift + 1ULL)) - 1ULL);
+        // round-to-nearest-even adjustment
+        if ((round > (FP8_DP_HALF_ULP << shift)) ||
+            ((round == (FP8_DP_HALF_ULP << shift)) && (res & 1U))) {
+            res = (unsigned char)(res + 1U);
+        }
+    }
+
+    res |= sign;
+
+    return (__nv_fp8_storage_t)res;
+}
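+/* Worked example for the routine above (illustrative, not from the upstream
+ * header), using the __NV_E4M3 parameters:
+ *
+ *   3.14   -> 0x45 (= 3.25): exponent field 8, mantissa 100 rounds up to 101
+ *             because the discarded bits exceed FP8_DP_HALF_ULP
+ *   1000.0 -> above FP8_OVERFLOW_THRESHOLD: 0x7E (MAXNORM, 448) when
+ *             saturate == __NV_SATFINITE, 0x7F (NaN) with __NV_NOSAT
+ */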
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
+                          const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_fp8x2_storage_t storage = (__nv_fp8x2_storage_t)__nv_cvt_double_to_fp8(
+        x.y, saturate, fp8_interpretation);
+    storage = (__nv_fp8x2_storage_t)(storage << 8U);
+    storage = (__nv_fp8x2_storage_t)(storage |
+                                     __nv_cvt_double_to_fp8(
+                                         x.x, saturate, fp8_interpretation));
+    return storage;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
+                      const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_fp8_storage_t res = 0U;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    if (saturate == __NV_SATFINITE) {
+        __nv_fp8x2_storage_t storage;
+        if (fp8_interpretation == __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x), "f"(0.0f));
+        } else {
+            asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x), "f"(0.0f));
+        }
+        res = (__nv_fp8_storage_t)storage;
+    } else
+#endif
+    {
+        unsigned int xbits;
+#if defined(__CUDACC__) || (!defined __cplusplus)
+        (void)memcpy(&xbits, &x, sizeof(x));
+#else
+        (void)std::memcpy(&xbits, &x, sizeof(x));
+#endif
+
+        // isnan
+        if ((xbits & 0x7FFFFFFFU) > 0x7F800000U) {
+            // Canonical NaN
+            xbits = 0x7FFFFFFFU;
+        }
+
+        float fx;
+#if defined(__CUDACC__) || (!defined __cplusplus)
+        (void)memcpy(&fx, &xbits, sizeof(xbits));
+#else
+        (void)std::memcpy(&fx, &xbits, sizeof(xbits));
+#endif
+
+        const double dx = (double)fx;
+        res = __nv_cvt_double_to_fp8(dx, saturate, fp8_interpretation);
+    }
+    return res;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
+                         const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_fp8x2_storage_t storage;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    if (saturate == __NV_SATFINITE) {
+        if (fp8_interpretation == __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x.x), "f"(x.y));
+        } else {
+            asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n"
+                : "=h"(storage)
+                : "f"(x.x), "f"(x.y));
+        }
+    } else
+#endif
+    {
+        storage = (__nv_fp8x2_storage_t)__nv_cvt_float_to_fp8(
+            x.y, saturate, fp8_interpretation);
+        storage = (__nv_fp8x2_storage_t)(storage << 8U);
+        storage = (__nv_fp8x2_storage_t)(storage | __nv_cvt_float_to_fp8(
+                                                       x.x, saturate,
+                                                       fp8_interpretation));
+    }
+    return storage;
+}
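+/* Packing-order sketch (illustrative, not from the upstream header): the .x
+ * component of the input pair lands in the low byte of the result and .y in
+ * the high byte, in both code paths above.
+ *
+ *   float2 p = make_float2(1.0f, 2.0f);
+ *   __nv_fp8x2_storage_t s =
+ *       __nv_cvt_float2_to_fp8x2(p, __NV_SATFINITE, __NV_E4M3);
+ *   // (s & 0xFFU) encodes 1.0f (0x38), (s >> 8) encodes 2.0f (0x40)
+ */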
+
+__CUDA_HOSTDEVICE_FP8_DECL__ float
+__internal_halfraw_to_float(const __half_raw x) {
+    float f;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530)
+    asm("{cvt.f32.f16 %0, %1;}\n" : "=f"(f) : "h"(x.x));
+#else
+    const unsigned int ux = (unsigned int)x.x;
+    unsigned int sign = (ux >> 15U) & 1U;
+    unsigned int exponent = (ux >> 10U) & 0x1fU;
+    unsigned int mantissa = (ux & 0x3ffU) << 13U;
+    if (exponent == 0x1fU) { /* NaN or Inf */
+        /* discard sign of a NaN */
+        sign = ((mantissa != 0U) ? (sign >> 1U) : sign);
+        mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U);
+        exponent = 0xffU;
+    } else if (exponent == 0U) { /* Denorm or Zero */
+        if (mantissa != 0U) {
+            unsigned int msb;
+            exponent = 0x71U;
+            do {
+                msb = (mantissa & 0x400000U);
+                mantissa <<= 1U; /* normalize */
+                --exponent;
+            } while (msb == 0U);
+            mantissa &= 0x7fffffU; /* 1.mantissa is implicit */
+        }
+    } else {
+        exponent += 0x70U;
+    }
+    const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa);
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&f, &u, sizeof(u));
+#else
+    (void)std::memcpy(&f, &u, sizeof(u));
+#endif
+#endif /* (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) */
+    return f;
+}
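+/* Bit-level sanity example for the fallback path above (illustrative, not
+ * from the upstream header):
+ *
+ *   __half_raw one; one.x = 0x3C00U;  // fp16 1.0: sign 0, exponent 15, mantissa 0
+ *   // the exponent is neither 0 nor 0x1F, so it is rebiased by +0x70 to 0x7F
+ *   // giving float bits 0x3F800000U, i.e. __internal_halfraw_to_float(one) == 1.0f
+ */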
+
+__CUDA_HOSTDEVICE_FP8_DECL__ float2
+__internal_halfraw2_to_float2(const __half2_raw x) {
+    __half_raw raw;
+    float2 res;
+    raw.x = x.x;
+    res.x = __internal_halfraw_to_float(raw);
+    raw.x = x.y;
+    res.y = __internal_halfraw_to_float(raw);
+    return res;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
+                        const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_fp8_storage_t res = 0U;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    if (saturate == __NV_SATFINITE) {
+        unsigned int half2_storage = (unsigned int)(x.x);
+        __nv_fp8x2_storage_t tmp;
+        if (fp8_interpretation == __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
+                : "=h"(tmp)
+                : "r"(half2_storage));
+        } else {
+            asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
+                : "=h"(tmp)
+                : "r"(half2_storage));
+        }
+        res = (__nv_fp8_storage_t)tmp;
+    } else
+#endif
+    {
+        float fx = __internal_halfraw_to_float(x);
+        res = __nv_cvt_float_to_fp8(fx, saturate, fp8_interpretation);
+    }
+    return res;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
+    const __half2_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_fp8x2_storage_t tmp;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    if (saturate == __NV_SATFINITE) {
+        unsigned int half2_storage;
+        (void)memcpy(&half2_storage, &x, sizeof(x));
+
+        if (fp8_interpretation == __NV_E5M2) {
+            asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n"
+                : "=h"(tmp)
+                : "r"(half2_storage));
+        } else {
+            asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n"
+                : "=h"(tmp)
+                : "r"(half2_storage));
+        }
+    } else
+#endif
+    {
+        __half_raw raw;
+        raw.x = x.x;
+        __nv_fp8_storage_t lo =
+            __nv_cvt_halfraw_to_fp8(raw, saturate, fp8_interpretation);
+        raw.x = x.y;
+        __nv_fp8_storage_t hi =
+            __nv_cvt_halfraw_to_fp8(raw, saturate, fp8_interpretation);
+        tmp = hi;
+        tmp = (__nv_fp8x2_storage_t)(tmp << 8U);
+        tmp = (__nv_fp8x2_storage_t)(tmp | lo);
+    }
+    return tmp;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ float
+__internal_bf16raw_to_float(const __nv_bfloat16_raw x) {
+    const unsigned int ux = ((unsigned int)x.x) << 16U;
+    float fx;
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&fx, &ux, sizeof(ux));
+#else
+    (void)std::memcpy(&fx, &ux, sizeof(ux));
+#endif
+    return fx;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat16_raw
+__internal_float_to_bf16raw_rz(const float x) {
+    unsigned int ux;
+    __nv_bfloat16_raw r;
+#if defined(__CUDACC__) || (!defined __cplusplus)
+    (void)memcpy(&ux, &x, sizeof(x));
+#else
+    (void)std::memcpy(&ux, &x, sizeof(x));
+#endif
+    r.x = (unsigned short int)(ux >> 16U);
+    return r;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
+    const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation) {
+    const float fx = __internal_bf16raw_to_float(x);
+    const __nv_fp8_storage_t res =
+        __nv_cvt_float_to_fp8(fx, saturate, fp8_interpretation);
+    return res;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp8x2(
+    const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation) {
+    __nv_bfloat16_raw raw;
+    raw.x = x.y;
+    __nv_fp8x2_storage_t storage =
+        (__nv_fp8x2_storage_t)__nv_cvt_bfloat16raw_to_fp8(raw, saturate,
+                                                          fp8_interpretation);
+    storage = (__nv_fp8x2_storage_t)(storage << 8U);
+    raw.x = x.x;
+    storage = (__nv_fp8x2_storage_t)(storage |
+                                     __nv_cvt_bfloat16raw_to_fp8(
+                                         raw, saturate, fp8_interpretation));
+    return storage;
+}
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
+__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
+                           const __nv_fp8_interpretation_t fp8_interpretation);
+__CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
+__nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
+                        const __nv_fp8_interpretation_t fp8_interpretation) {
+    __half_raw res;
+    res.x = 0U;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    res.x =
+        __nv_cvt_fp8x2_to_halfraw2((__nv_fp8x2_storage_t)x, fp8_interpretation)
+            .x;
+#else
+    unsigned short int ur = (unsigned short int)x;
+    ur = (unsigned short int)(ur << 8U);
+
+    if (fp8_interpretation == __NV_E5M2) {
+        if ((ur & 0x7FFFU) > 0x7C00U) {
+            /* If NaN, return canonical NaN */
+            ur = 0x7FFFU;
+        }
+    } else { // __NV_E4M3
+        unsigned short int sign = ur & 0x8000U;
+        unsigned short int exponent =
+            (unsigned short int)(((ur & 0x7800U) >> 1U) + 0x2000U);
+        unsigned short int mantissa = (ur & 0x0700U) >> 1U;
+        unsigned char absx = 0x7FU & (unsigned char)x;
+
+        if (absx == 0x7FU) // NaN
+        {
+            ur = 0x7FFFU; // fp16 canonical NaN, discard sign
+        } else if (exponent == 0x2000U) {
+            // zero or denormal
+            if (mantissa != 0U) {
+                // normalize
+                mantissa = (unsigned short int)(mantissa << 1U);
+                while ((mantissa & 0x0400U) == 0U) {
+                    mantissa = (unsigned short int)(mantissa << 1U);
+                    exponent = (unsigned short int)(exponent - 0x0400U);
+                }
+                // discard implicit leading bit
+                mantissa &= 0x03FFU;
+            } else { // Zero
+                exponent = 0U;
+            }
+
+            ur = (sign | exponent) | mantissa;
+        } else {
+            ur = (sign | exponent) | mantissa;
+        }
+    }
+    res.x = ur;
+#endif
+    return res;
+}
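+/* Illustrative note (not from the upstream header): in the fallback path above
+ * the __NV_E5M2 case is just a left shift plus NaN canonicalization, because
+ * e5m2 matches the top byte of an fp16 value, e.g.
+ *
+ *   __nv_cvt_fp8_to_halfraw((__nv_fp8_storage_t)0x3CU, __NV_E5M2).x == 0x3C00U
+ *   // e5m2 1.0 widens to fp16 1.0
+ */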
+
+__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
+__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
+                           const __nv_fp8_interpretation_t fp8_interpretation) {
+    __half2_raw res;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned int half2_storage;
+    if (fp8_interpretation == __NV_E5M2) {
+        asm("{cvt.rn.f16x2.e5m2x2 %0, %1;}\n" : "=r"(half2_storage) : "h"(x));
+    } else {
+        asm("{cvt.rn.f16x2.e4m3x2 %0, %1;}\n" : "=r"(half2_storage) : "h"(x));
+    }
+    (void)memcpy(&res, &half2_storage, sizeof(half2_storage));
+#else
+    res.x =
+        __nv_cvt_fp8_to_halfraw((__nv_fp8_storage_t)x, fp8_interpretation).x;
+    res.y = __nv_cvt_fp8_to_halfraw((__nv_fp8_storage_t)(x >> 8U),
+                                    fp8_interpretation)
+                .x;
+#endif
+    return res;
+}
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/**
+ * \defgroup CUDA_MATH_FP8_E5M2_STRUCT C++ struct for handling fp8 data type of e5m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+ * \brief __nv_fp8_e5m2 datatype
+ *
+ * \details This structure implements the datatype for handling
+ * \p fp8 floating-point numbers of \p e5m2 kind:
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp8_e5m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Storage variable contains the \p fp8 floating-point data.
+     */
+    __nv_fp8_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Constructor by default.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8_e5m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Construct from wider FP types */
+    /* Note: constructor initializer lists are deliberately avoided because of
+     * special host/device compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __half f) {
+        __x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f),
+                                      __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __nv_bfloat16 f) {
+        __x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f),
+                                          __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const float f) {
+        __x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const double f) {
+        __x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E5M2);
+    }
+
+    /* Converts from integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Constructor from \p unsigned \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e5m2(const unsigned short int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Constructor from \p unsigned \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const unsigned int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Constructor from \p unsigned \p long \p long \p int data type, relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e5m2(const unsigned long long int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Constructor from \p short \p int data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const short int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Constructor from \p long \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const long long int val) {
+        __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x;
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Conversion operator to \p __half data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
+        return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E5M2));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Conversion operator to \p float data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
+        return __internal_halfraw_to_float(
+            __nv_cvt_fp8_to_halfraw(__x, __NV_E5M2));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Conversion operator to \p __nv_bfloat16 data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
+        return static_cast<__nv_bfloat16>(
+            __internal_float_to_bf16raw_rz(float(*this)));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Conversion operator to \p double data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
+        return static_cast<double>(float(*this));
+    }
+
+    /* Convert to integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
+        unsigned char i;
+        const float f = float(*this);
+        const unsigned char max_val = 0xFFU;
+        const unsigned char min_val = 0U;
+        const unsigned char bits = (*this).__x;
+        // saturation fixup
+        if ((bits & 0x7FU) > 0x7CU) {
+            // NaN
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // saturate maximum
+            i = max_val;
+        } else if (f < static_cast<float>(min_val)) {
+            // saturate minimum
+            i = min_val;
+        } else {
+            // normal value
+            i = static_cast<unsigned char>(f);
+        }
+        return i;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
+        return __half2ushort_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
+        return __half2uint_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
+        return __half2ull_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Conversion operator to \p signed \p char data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
+        signed char i;
+        const float f = float(*this);
+        const signed char max_val = (signed char)0x7FU;
+        const signed char min_val = (signed char)0x80U;
+        const unsigned char bits = (*this).__x;
+        // saturation fixup
+        if ((bits & 0x7FU) > 0x7CU) {
+            // NaN
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // saturate maximum
+            i = max_val;
+        } else if (f < static_cast<float>(min_val)) {
+            // saturate minimum
+            i = min_val;
+        } else {
+            // normal value
+            i = static_cast<signed char>(f);
+        }
+        return i;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Conversion operator to \p short \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
+        return __half2short_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Conversion operator to \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
+        return __half2int_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Conversion operator to \p long \p long \p int data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p 0x8000000000000000LL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
+        return __half2ll_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E5M2_STRUCT
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
+        return (__x & 0x7FU) != 0U;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
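+/* Minimal usage sketch for the struct above (illustrative, not from the
+ * upstream header):
+ *
+ *   __nv_fp8_e5m2 q(0.0625f);            // narrowing conversion, __NV_SATFINITE
+ *   float back = static_cast<float>(q);  // widens back through half precision
+ *   // 0.0625 == 2^-4 is exactly representable in e5m2, so back == 0.0625f
+ */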
+
+/**
+ * \defgroup CUDA_MATH_FP8X2_E5M2_STRUCT C++ struct for handling vector type of two fp8 values of e5m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+ * \brief __nv_fp8x2_e5m2 datatype
+ *
+ * \details This structure implements the datatype for handling two
+ * \p fp8 floating-point numbers of \p e5m2 kind each:
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp8x2_e5m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Storage variable contains the vector of two \p fp8 floating-point data
+     * values.
+     */
+    __nv_fp8x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Constructor by default.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x2_e5m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Construct from wider types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Constructor from \p __half2 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __half2 f) {
+        __x = __nv_cvt_halfraw2_to_fp8x2(static_cast<__half2_raw>(f),
+                                         __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Constructor from \p __nv_bfloat162 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __nv_bfloat162 f) {
+        __x = __nv_cvt_bfloat16raw2_to_fp8x2(static_cast<__nv_bfloat162_raw>(f),
+                                             __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Constructor from \p float2 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const float2 f) {
+        __x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Constructor from \p double2 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const double2 f) {
+        __x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2);
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening converts */
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Conversion operator to \p __half2 data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
+        return static_cast<__half2>(__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT
+     * Conversion operator to \p float2 data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
+        return __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2));
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+__CUDA_HOSTDEVICE_FP8_DECL__ unsigned int
+__internal_pack_u16x2_to_u32(const unsigned short int src_lo,
+                             const unsigned short int src_hi) {
+    unsigned int dst;
+#if (defined __CUDACC__) && (defined __CUDA_ARCH__)
+    asm("{  mov.b32 %0, {%1,%2};}\n" : "=r"(dst) : "h"(src_lo), "h"(src_hi));
+#else
+    dst = (static_cast<unsigned int>(src_hi) << 16U) |
+          static_cast<unsigned int>(src_lo);
+#endif
+    return dst;
+}
+
+/**
+ * \defgroup CUDA_MATH_FP8X4_E5M2_STRUCT C++ struct for handling vector type of four fp8 values of e5m2 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+ * \brief __nv_fp8x4_e5m2 datatype
+ *
+ * \details This structure implements the datatype for handling four
+ * \p fp8 floating-point numbers of \p e5m2 kind each:
+ * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp8x4_e5m2 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Storage variable contains the vector of four \p fp8 floating-point data
+     * values.
+     */
+    __nv_fp8x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Constructor by default.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x4_e5m2() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Construct from wider types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Constructor from a pair of \p __half2 data type values,
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __nv_fp8x2_storage_t rlo = __nv_cvt_halfraw2_to_fp8x2(
+            static_cast<__half2_raw>(flo), __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t rhi = __nv_cvt_halfraw2_to_fp8x2(
+            static_cast<__half2_raw>(fhi), __NV_SATFINITE, __NV_E5M2);
+        __x = __internal_pack_u16x2_to_u32(rlo, rhi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Constructor from a pair of \p __nv_bfloat162 data type values,
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_fp8x2_storage_t rlo = __nv_cvt_bfloat16raw2_to_fp8x2(
+            static_cast<__nv_bfloat162_raw>(flo), __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t rhi = __nv_cvt_bfloat16raw2_to_fp8x2(
+            static_cast<__nv_bfloat162_raw>(fhi), __NV_SATFINITE, __NV_E5M2);
+        __x = __internal_pack_u16x2_to_u32(rlo, rhi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Constructor from \p float4 vector data type,
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const float4 f) {
+        const float2 flo = {f.x, f.y};
+        const float2 fhi = {f.z, f.w};
+        const __nv_fp8x2_storage_t rlo =
+            __nv_cvt_float2_to_fp8x2(flo, __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t rhi =
+            __nv_cvt_float2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E5M2);
+        __x = __internal_pack_u16x2_to_u32(rlo, rhi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Constructor from \p double4 vector data type,
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const double4 f) {
+        const double2 flo = {f.x, f.y};
+        const double2 fhi = {f.z, f.w};
+        const __nv_fp8x2_storage_t rlo =
+            __nv_cvt_double2_to_fp8x2(flo, __NV_SATFINITE, __NV_E5M2);
+        const __nv_fp8x2_storage_t rhi =
+            __nv_cvt_double2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E5M2);
+        __x = __internal_pack_u16x2_to_u32(rlo, rhi);
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening converts */
+
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT
+     * Conversion operator to \p float4 vector data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
+        const __nv_fp8x2_storage_t slo = static_cast<__nv_fp8x2_storage_t>(__x);
+        const __nv_fp8x2_storage_t shi =
+            static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
+        float2 rlo = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(slo, __NV_E5M2));
+        float2 rhi = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(shi, __NV_E5M2));
+        float4 res = {rlo.x, rlo.y, rhi.x, rhi.y};
+        return res;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
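+
+/*
+ * A minimal packing sketch for the vector type above, assuming the
+ * conversions are enabled: four floats are converted with saturation into a
+ * single 32-bit __nv_fp8x4_e5m2 word and widened back to float4.
+ *
+ *   void example_fp8x4_e5m2(void) {
+ *       const float4 src = {0.5f, -2.0f, 1.0e6f, 0.0f}; // 1.0e6f saturates
+ *       __nv_fp8x4_e5m2 packed(src);               // four fp8 lanes in 32 bits
+ *       float4 dst = static_cast<float4>(packed);  // widen each lane back
+ *       (void)dst;
+ *   }
+ */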
+
+/**
+ * \defgroup CUDA_MATH_FP8_E4M3_STRUCT C++ struct for handling fp8 data type of e4m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+ * \brief __nv_fp8_e4m3 datatype
+ *
+ * \details This structure implements the datatype for storing
+ * \p fp8 floating-point numbers of \p e4m3 kind:
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
+ * The encoding doesn't support Infinity.
+ * NaNs are limited to 0x7F and 0xFF values.
+ *
+ * The structure implements converting constructors and operators.
+ */
+struct __CUDA_ALIGN__(1) __nv_fp8_e4m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Storage variable contains the \p fp8 floating-point data.
+     */
+    __nv_fp8_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor by default.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8_e4m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Construct from wider FP types */
+    /* Note: we avoid constructor init-lists because of special host/device
+     * compilation rules */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p __half data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __half f) {
+        __x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f),
+                                      __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __nv_bfloat16 f) {
+        __x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f),
+                                          __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p float data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const float f) {
+        __x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p double data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const double f) {
+        __x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E4M3);
+    }
+
+    /* Converts from integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p unsigned \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e4m3(const unsigned short int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p unsigned \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const unsigned int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p unsigned \p long \p long \p int data type, relies on
+     * \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__
+    __nv_fp8_e4m3(const unsigned long long int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p short \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const short int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior
+     * for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Constructor from \p long \p long \p int data type, relies on \p
+     * __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const long long int val) {
+        __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x;
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening FP converts */
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p __half data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const {
+        return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p float data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float() const {
+        return __internal_halfraw_to_float(
+            __nv_cvt_fp8_to_halfraw(__x, __NV_E4M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p __nv_bfloat16 data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const {
+        return static_cast<__nv_bfloat16>(
+            __internal_float_to_bf16raw_rz(float(*this)));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p double data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator double() const {
+        return static_cast<double>(float(*this));
+    }
+
+    /* Convert to integral */
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p unsigned \p char data type.
+     * Clamps negative and too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const {
+        unsigned char i;
+        const float f = float(*this);
+        const unsigned char max_val = 0xFFU;
+        const unsigned char min_val = 0U;
+        const unsigned char bits = (*this).__x;
+        // saturation fixup
+        if ((bits & 0x7FU) == 0x7FU) {
+            // NaN
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // saturate maximum
+            i = max_val;
+        } else if (f < static_cast<float>(min_val)) {
+            // saturate minimum
+            i = min_val;
+        } else {
+            // normal value
+            i = static_cast<unsigned char>(f);
+        }
+        return i;
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p unsigned \p short \p int data type.
+     * Clamps negative inputs to zero.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const {
+        return __half2ushort_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p unsigned \p int data type.
+     * Clamps negative inputs to zero.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const {
+        return __half2uint_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p unsigned \p long \p long \p int data type.
+     * Clamps negative inputs to zero.
+     * \p NaN inputs convert to \p 0x8000000000000000ULL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const {
+        return __half2ull_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p signed \p char data type.
+     * Clamps too large inputs to the output range.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const {
+        signed char i;
+        const float f = float(*this);
+        const signed char max_val = (signed char)0x7FU;
+        const signed char min_val = (signed char)0x80U;
+        const unsigned char bits = (*this).__x;
+        // saturation fixup
+        if ((bits & 0x7FU) == 0x7FU) {
+            // NaN
+            i = 0;
+        } else if (f > static_cast<float>(max_val)) {
+            // saturate maximum
+            i = max_val;
+        } else if (f < static_cast<float>(min_val)) {
+            // saturate minimum
+            i = min_val;
+        } else {
+            // normal value
+            i = static_cast<signed char>(f);
+        }
+        return i;
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p short \p int data type.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const {
+        return __half2short_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p int data type.
+     * \p NaN inputs convert to \p zero.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator int() const {
+        return __half2int_rz(__half(*this));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p long \p long \p int data type.
+     * \p NaN inputs convert to \p 0x8000000000000000LL.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const {
+        return __half2ll_rz(__half(*this));
+    }
+
+    /**
+     * \ingroup CUDA_MATH_FP8_E4M3_STRUCT
+     * Conversion operator to \p bool data type.
+     * +0 and -0 inputs convert to \p false.
+     * Non-zero inputs convert to \p true.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const {
+        return (__x & 0x7FU) != 0U;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
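+
+/*
+ * A minimal usage sketch for the e4m3 type, assuming the conversions are
+ * enabled and <math.h> is available for nanf(). Unlike e5m2, e4m3 encodes no
+ * Infinity, so out-of-range finite inputs saturate to the finite maximum (448)
+ * under __NV_SATFINITE.
+ *
+ *   void example_e4m3(void) {
+ *       __nv_fp8_e4m3 a(1.5f);              // exactly representable
+ *       __nv_fp8_e4m3 b(1000.0f);           // saturates to the e4m3 maximum
+ *       __nv_fp8_e4m3 c(nanf(""));          // NaN encodes as 0x7F / 0xFF
+ *       unsigned char u = (unsigned char)c; // NaN converts to 0 (see above)
+ *       double d = double(a);               // widening conversion
+ *       (void)b; (void)u; (void)d;
+ *   }
+ */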
+
+/**
+ * \defgroup CUDA_MATH_FP8X2_E4M3_STRUCT C++ struct for handling vector type of two fp8 values of e4m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+ * \brief __nv_fp8x2_e4m3 datatype
+ *
+ * \details This structure implements the datatype for storage
+ * and operations on the vector of two \p fp8 values of \p e4m3 kind each:
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
+ * The encoding doesn't support Infinity.
+ * NaNs are limited to 0x7F and 0xFF values.
+ */
+struct __CUDA_ALIGN__(2) __nv_fp8x2_e4m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Storage variable contains the vector of two \p fp8 floating-point data
+     * values.
+     */
+    __nv_fp8x2_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Constructor by default.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x2_e4m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Construct from wider types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Constructor from \p __half2 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __half2 f) {
+        __x = __nv_cvt_halfraw2_to_fp8x2(static_cast<__half2_raw>(f),
+                                         __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Constructor from \p __nv_bfloat162 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __nv_bfloat162 f) {
+        __x = __nv_cvt_bfloat16raw2_to_fp8x2(static_cast<__nv_bfloat162_raw>(f),
+                                             __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Constructor from \p float2 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const float2 f) {
+        __x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Constructor from \p double2 data type, relies on \p __NV_SATFINITE
+     * behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const double2 f) {
+        __x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3);
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening converts */
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Conversion operator to \p __half2 data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const {
+        return static_cast<__half2>(__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3));
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT
+     * Conversion operator to \p float2 data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const {
+        return __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3));
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+/**
+ * \defgroup CUDA_MATH_FP8X4_E4M3_STRUCT C++ struct for handling vector type of four fp8 values of e4m3 kind.
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ */
+
+/**
+ * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+ * \brief __nv_fp8x4_e4m3 datatype
+ *
+ * \details This structure implements the datatype for storage
+ * and operations on the vector of four \p fp8 values of \p e4m3 kind each:
+ * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits.
+ * The encoding doesn't support Infinity.
+ * NaNs are limited to 0x7F and 0xFF values.
+ */
+struct __CUDA_ALIGN__(4) __nv_fp8x4_e4m3 {
+  public:
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Storage variable contains the vector of four \p fp8 floating-point data
+     * values.
+     */
+    __nv_fp8x4_storage_t __x;
+
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Constructor by default.
+     */
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+    __nv_fp8x4_e4m3() = default;
+#else
+    __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3() {}
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+
+#if !defined(__CUDA_NO_FP8_CONVERSIONS__)
+
+    /* Construct from wider types */
+
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Constructor from a pair of \p __half2 data type values,
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __half2 flo,
+                                                     const __half2 fhi) {
+        const __nv_fp8x2_storage_t rlo = __nv_cvt_halfraw2_to_fp8x2(
+            static_cast<__half2_raw>(flo), __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t rhi = __nv_cvt_halfraw2_to_fp8x2(
+            static_cast<__half2_raw>(fhi), __NV_SATFINITE, __NV_E4M3);
+        __x = __internal_pack_u16x2_to_u32(rlo, rhi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Constructor from a pair of \p __nv_bfloat162 data type values,
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __nv_bfloat162 flo,
+                                                     const __nv_bfloat162 fhi) {
+        const __nv_fp8x2_storage_t rlo = __nv_cvt_bfloat16raw2_to_fp8x2(
+            static_cast<__nv_bfloat162_raw>(flo), __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t rhi = __nv_cvt_bfloat16raw2_to_fp8x2(
+            static_cast<__nv_bfloat162_raw>(fhi), __NV_SATFINITE, __NV_E4M3);
+        __x = __internal_pack_u16x2_to_u32(rlo, rhi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Constructor from \p float4 vector data type,
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const float4 f) {
+        const float2 flo = {f.x, f.y};
+        const float2 fhi = {f.z, f.w};
+        const __nv_fp8x2_storage_t rlo =
+            __nv_cvt_float2_to_fp8x2(flo, __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t rhi =
+            __nv_cvt_float2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E4M3);
+        __x = __internal_pack_u16x2_to_u32(rlo, rhi);
+    }
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Constructor from \p double4 vector data type,
+     * relies on \p __NV_SATFINITE behavior for out-of-range values.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const double4 f) {
+        const double2 flo = {f.x, f.y};
+        const double2 fhi = {f.z, f.w};
+        const __nv_fp8x2_storage_t rlo =
+            __nv_cvt_double2_to_fp8x2(flo, __NV_SATFINITE, __NV_E4M3);
+        const __nv_fp8x2_storage_t rhi =
+            __nv_cvt_double2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E4M3);
+        __x = __internal_pack_u16x2_to_u32(rlo, rhi);
+    }
+
+#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__)
+    /* Widening converts */
+
+    /**
+     * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT
+     * Conversion operator to \p float4 vector data type.
+     */
+    explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const {
+        const __nv_fp8x2_storage_t slo = static_cast<__nv_fp8x2_storage_t>(__x);
+        const __nv_fp8x2_storage_t shi =
+            static_cast<__nv_fp8x2_storage_t>(__x >> 16U);
+        float2 rlo = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(slo, __NV_E4M3));
+        float2 rhi = __internal_halfraw2_to_float2(
+            __nv_cvt_fp8x2_to_halfraw2(shi, __NV_E4M3));
+        float4 res = {rlo.x, rlo.y, rhi.x, rhi.y};
+        return res;
+    }
+#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */
+#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */
+};
+
+#endif /* defined(__cplusplus) */
+
+#endif /* end of include guard: __CUDA_FP8_HPP__ */
diff --git a/ext/cudart/include/cuda_gl_interop.h b/ext/cudart/include/cuda_gl_interop.h
new file mode 100644
index 0000000000000000000000000000000000000000..68f6ac94f3b7912f42f01b775a102ac323427fe1
--- /dev/null
+++ b/ext/cudart/include/cuda_gl_interop.h
@@ -0,0 +1,508 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_GL_INTEROP_H__)
+#define __CUDA_GL_INTEROP_H__
+
+#include "cuda_runtime_api.h"
+
+#if defined(__APPLE__)
+
+#include <OpenGL/gl.h>
+
+#else /* __APPLE__ */
+
+#if defined(__arm__) || defined(__aarch64__)
+#ifndef GL_VERSION
+#error Please include the appropriate gl headers before including cuda_gl_interop.h
+#endif
+#else
+#include <GL/gl.h>
+#endif
+
+#endif /* __APPLE__ */
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+/** \endcond impl_private */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \addtogroup CUDART_OPENGL OpenGL Interoperability
+ * This section describes the OpenGL interoperability functions of the CUDA
+ * runtime application programming interface. Note that mapping of OpenGL
+ * resources is performed with the graphics API agnostic resource mapping
+ * interface described in \ref CUDART_INTEROP "Graphics Interoperability".
+ *
+ * @{
+ */
+
+/**
+ * CUDA devices corresponding to the current OpenGL context
+ */
+enum cudaGLDeviceList
+{
+  cudaGLDeviceListAll           = 1, /**< The CUDA devices for all GPUs used by the current OpenGL context */
+  cudaGLDeviceListCurrentFrame  = 2, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */
+  cudaGLDeviceListNextFrame     = 3  /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame  */
+};
+
+/**
+ * \brief Gets the CUDA devices associated with the current OpenGL context
+ *
+ * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices 
+ * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices 
+ * at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to 
+ * the current OpenGL context. If any of the GPUs being used by the current OpenGL
+ * context are not CUDA capable then the call will return ::cudaErrorNoDevice.
+ *
+ * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to the 
+ *                           current OpenGL context
+ * \param pCudaDevices     - Returned CUDA devices corresponding to the current 
+ *                           OpenGL context
+ * \param cudaDeviceCount  - The size of the output device array \p pCudaDevices
+ * \param deviceList       - The set of devices to return.  This set may be
+ *                           ::cudaGLDeviceListAll for all devices, 
+ *                           ::cudaGLDeviceListCurrentFrame for the devices used to
+ *                           render the current frame (in SLI), or
+ *                           ::cudaGLDeviceListNextFrame for the devices used to
+ *                           render the next frame (in SLI).
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNoDevice,
+ * ::cudaErrorInvalidGraphicsContext,
+ * ::cudaErrorUnknown
+ *
+ * \note This function is not supported on Mac OS X.
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray, 
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGLGetDevices 
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGLGetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList);
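+
+/*
+ * A minimal calling sketch, assuming an OpenGL context is already current on
+ * this thread (created with any windowing toolkit):
+ *
+ *   unsigned int count = 0;
+ *   int devices[8];
+ *   if (cudaGLGetDevices(&count, devices, 8u, cudaGLDeviceListAll) == cudaSuccess
+ *       && count > 0u) {
+ *       cudaSetDevice(devices[0]); // pick a device that drives this context
+ *   }
+ */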
+
+/**
+ * \brief Register an OpenGL texture or renderbuffer object
+ *
+ * Registers the texture or renderbuffer object specified by \p image for access by CUDA.
+ * A handle to the registered object is returned as \p resource.
+ *
+ * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D, 
+ * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY, 
+ * or ::GL_RENDERBUFFER.
+ *
+ * The register flags \p flags specify the intended usage, as follows: 
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will
+ *   bind this resource to a surface reference.
+ * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform
+ *   texture gather operations on this resource.
+ *
+ * The following image formats are supported. For brevity, the list is abbreviated:
+ * for example, {GL_R, GL_RG} X {8, 16} expands to the four formats
+ * {GL_R8, GL_R16, GL_RG8, GL_RG16}:
+ * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY
+ * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I}
+ * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X
+ * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT}
+ *
+ * The following image classes are currently disallowed:
+ * - Textures with borders
+ * - Multisampled renderbuffers
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param image    - name of texture or renderbuffer object to be registered
+ * \param target   - Identifies the type of object specified by \p image 
+ * \param flags    - Register flags
+ * 
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources, 
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsGLRegisterImage
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cudaGraphicsResource **resource, GLuint image, GLenum target, unsigned int flags);
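+
+/*
+ * A minimal texture interop sketch, assuming `tex` names a GL_TEXTURE_2D
+ * created with glTexImage2D and that the owning GL context is current:
+ *
+ *   struct cudaGraphicsResource *res = NULL;
+ *   cudaGraphicsGLRegisterImage(&res, tex, GL_TEXTURE_2D,
+ *                               cudaGraphicsRegisterFlagsReadOnly);
+ *   cudaGraphicsMapResources(1, &res, 0);
+ *   cudaArray_t array = NULL;
+ *   cudaGraphicsSubResourceGetMappedArray(&array, res, 0, 0); // array 0, mip 0
+ *   // ... create a texture/surface object over `array` and read it in kernels ...
+ *   cudaGraphicsUnmapResources(1, &res, 0);
+ *   cudaGraphicsUnregisterResource(res);
+ */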
+
+/**
+ * \brief Registers an OpenGL buffer object
+ *
+ * Registers the buffer object specified by \p buffer for access by
+ * CUDA.  A handle to the registered object is returned as \p
+ * resource.  The register flags \p flags specify the intended usage,
+ * as follows:
+ *
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * \param resource - Pointer to the returned object handle
+ * \param buffer   - name of buffer object to be registered
+ * \param flags    - Register flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa 
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsMapResources,
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsGLRegisterBuffer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct cudaGraphicsResource **resource, GLuint buffer, unsigned int flags);
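+
+/*
+ * A minimal buffer interop sketch, assuming `vbo` names a GL buffer object
+ * created with glGenBuffers/glBufferData and a suitable CUDA device is set:
+ *
+ *   struct cudaGraphicsResource *res = NULL;
+ *   cudaGraphicsGLRegisterBuffer(&res, vbo, cudaGraphicsRegisterFlagsWriteDiscard);
+ *
+ *   cudaGraphicsMapResources(1, &res, 0);           // map on the default stream
+ *   void  *dptr  = NULL;
+ *   size_t bytes = 0;
+ *   cudaGraphicsResourceGetMappedPointer(&dptr, &bytes, res);
+ *   // ... launch kernels that write to dptr ...
+ *   cudaGraphicsUnmapResources(1, &res, 0);         // GL may use the buffer again
+ *
+ *   cudaGraphicsUnregisterResource(res);
+ */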
+
+#ifdef _WIN32
+#ifndef WGL_NV_gpu_affinity
+typedef void* HGPUNV;
+#endif
+
+/**
+ * \brief Gets the CUDA device associated with hGpu
+ *
+ * Returns the CUDA device associated with a hGpu, if applicable.
+ *
+ * \param device - Returns the device associated with hGpu, or -1 if hGpu is
+ * not a compute device.
+ * \param hGpu   - Handle to a GPU, as queried via WGL_NV_gpu_affinity
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa
+ * ::WGL_NV_gpu_affinity,
+ * ::cuWGLGetDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu);
+#endif
+
+/** @} */ /* END CUDART_OPENGL */
+
+/**
+ * \addtogroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED]
+ * This section describes deprecated OpenGL interoperability functionality.
+ *
+ * @{
+ */
+
+/**
+ * CUDA GL Map Flags
+ */
+enum cudaGLMapFlags
+{
+  cudaGLMapFlagsNone         = 0,  /**< Default; Assume resource can be read/written */
+  cudaGLMapFlagsReadOnly     = 1,  /**< CUDA kernels will not write to this resource */
+  cudaGLMapFlagsWriteDiscard = 2   /**< CUDA kernels will only write to and will not read from this resource */
+};
+
+/**
+ * \brief Sets a CUDA device to use OpenGL interoperability
+ *
+ * \deprecated This function is deprecated as of CUDA 5.0. 
+ *
+ * This function is deprecated and should no longer be used.  It is
+ * no longer necessary to associate a CUDA device with an OpenGL
+ * context in order to achieve maximum interoperability performance.
+ *
+ * \param device - Device to use for OpenGL interoperability
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorSetOnActiveProcess
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsGLRegisterBuffer, ::cudaGraphicsGLRegisterImage
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device);
+
+/**
+ * \brief Registers a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Registers the buffer object of ID \p bufObj for access by
+ * CUDA. This function must be called before CUDA can map the buffer
+ * object.  The OpenGL context used to create the buffer, or another
+ * context from the same share group, must be bound to the current
+ * thread when this is called.
+ *
+ * \param bufObj - Buffer object ID to register
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsGLRegisterBuffer
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj);
+
+/**
+ * \brief Maps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Maps the buffer object of ID \p bufObj into the address space of
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
+ * mapping.  The buffer must have previously been registered by
+ * calling ::cudaGLRegisterBufferObject().  While a buffer is mapped
+ * by CUDA, any OpenGL operation which references the buffer will
+ * result in undefined behavior.  The OpenGL context used to create
+ * the buffer, or another context from the same share group, must be
+ * bound to the current thread when this is called.
+ *
+ * All streams in the current thread are synchronized with the current
+ * GL context.
+ *
+ * \param devPtr - Returned device pointer to CUDA object
+ * \param bufObj - Buffer object ID to map
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsMapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj);
+
+/**
+ * \brief Unmaps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA.  When
+ * a buffer is unmapped, the base address returned by
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
+ * the address result in undefined behavior.  The OpenGL context used
+ * to create the buffer, or another context from the same share group,
+ * must be bound to the current thread when this is called.
+ *
+ * All streams in the current thread are synchronized with the current
+ * GL context.
+ *
+ * \param bufObj - Buffer object to unmap
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnmapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnmapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj);
+
+/**
+ * \brief Unregisters a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unregisters the buffer object of ID \p bufObj for access by CUDA
+ * and releases any CUDA resources associated with the buffer.  Once a
+ * buffer is unregistered, it may no longer be mapped by CUDA.  The GL
+ * context used to create the buffer, or another context from the
+ * same share group, must be bound to the current thread when this is
+ * called.
+ *
+ * \param bufObj - Buffer object to unregister
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnregisterResource
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj);
+
+/**
+ * \brief Set usage flags for mapping an OpenGL buffer
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Set flags for mapping the OpenGL buffer \p bufObj
+ *
+ * Changes to flags will take effect the next time \p bufObj is mapped.
+ * The \p flags argument may be any of the following:
+ *
+ * - ::cudaGLMapFlagsNone: Specifies no hints about how this buffer will
+ * be used. It is therefore assumed that this buffer will be read from and
+ * written to by CUDA kernels. This is the default value.
+ * - ::cudaGLMapFlagsReadOnly: Specifies that CUDA kernels which access this
+ * buffer will not write to the buffer.
+ * - ::cudaGLMapFlagsWriteDiscard: Specifies that CUDA kernels which access
+ * this buffer will not read from the buffer and will write over the
+ * entire contents of the buffer, so none of the data previously stored in
+ * the buffer will be preserved.
+ *
+ * If \p bufObj has not been registered for use with CUDA, then
+ * ::cudaErrorInvalidResourceHandle is returned. If \p bufObj is presently
+ * mapped for access by CUDA, then ::cudaErrorUnknown is returned.
+ *
+ * \param bufObj    - Registered buffer object to set flags for
+ * \param flags     - Parameters for buffer mapping
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsResourceSetMapFlags
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags); 
+
+/**
+ * \brief Maps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Maps the buffer object of ID \p bufObj into the address space of
+ * CUDA and returns in \p *devPtr the base pointer of the resulting
+ * mapping.  The buffer must have previously been registered by
+ * calling ::cudaGLRegisterBufferObject().  While a buffer is mapped
+ * by CUDA, any OpenGL operation which references the buffer will
+ * result in undefined behavior.  The OpenGL context used to create
+ * the buffer, or another context from the same share group, must be
+ * bound to the current thread when this is called.
+ *
+ * Stream \p stream is synchronized with the current GL context.
+ *
+ * \param devPtr - Returned device pointer to CUDA object
+ * \param bufObj - Buffer object ID to map
+ * \param stream - Stream to synchronize
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsMapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **devPtr, GLuint bufObj, cudaStream_t stream);
+
+/**
+ * \brief Unmaps a buffer object for access by CUDA
+ *
+ * \deprecated This function is deprecated as of CUDA 3.0. 
+ *
+ * Unmaps the buffer object of ID \p bufObj for access by CUDA.  When
+ * a buffer is unmapped, the base address returned by
+ * ::cudaGLMapBufferObject() is invalid and subsequent references to
+ * the address result in undefined behavior.  The OpenGL context used
+ * to create the buffer, or another context from the same share group,
+ * must be bound to the current thread when this is called.
+ *
+ * Stream \p stream is synchronized with the current GL context.
+ *
+ * \param bufObj - Buffer object to unmap
+ * \param stream - Stream to synchronize
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnmapBufferObjectFailed
+ * \notefnerr
+ *
+ * \sa ::cudaGraphicsUnmapResources
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint bufObj, cudaStream_t stream);
+
+/** @} */ /* END CUDART_OPENGL_DEPRECATED */
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#undef __CUDA_DEPRECATED
+
+#endif /* __CUDA_GL_INTEROP_H__ */
+
diff --git a/ext/cudart/include/cuda_occupancy.h b/ext/cudart/include/cuda_occupancy.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffe55709f8ccdebf7341180f043006b68c08e104
--- /dev/null
+++ b/ext/cudart/include/cuda_occupancy.h
@@ -0,0 +1,1958 @@
+/*
+ * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/**
+ * CUDA Occupancy Calculator
+ *
+ * NAME
+ *
+ *   cudaOccMaxActiveBlocksPerMultiprocessor,
+ *   cudaOccMaxPotentialOccupancyBlockSize,
+ *   cudaOccMaxPotentialOccupancyBlockSizeVariableSMem
+ *   cudaOccAvailableDynamicSMemPerBlock
+ *
+ * DESCRIPTION
+ *
+ *   The CUDA occupancy calculator provides a standalone, programmatic
+ *   interface to compute the occupancy of a function on a device. It can also
+ *   provide occupancy-oriented launch configuration suggestions.
+ *
+ *   The function and device are defined by the user through
+ *   cudaOccFuncAttributes, cudaOccDeviceProp, and cudaOccDeviceState
+ *   structures. All APIs require all 3 of them.
+ *
+ *   See the structure definition for more details about the device / function
+ *   descriptors.
+ *
+ *   See each API's prototype for API usage.
+ *
+ * COMPATIBILITY
+ *
+ *   The occupancy calculator will be updated on each major CUDA toolkit
+ *   release. It does not provide forward compatibility, i.e. newer hardware
+ *   released after this implementation will not be supported.
+ *
+ * NOTE
+ *
+ *   If there is access to CUDA runtime, and the sole intent is to calculate
+ *   occupancy related values on one of the accessible CUDA devices, using CUDA
+ *   runtime's occupancy calculation APIs is recommended.
+ *
+ */
+
+#ifndef __cuda_occupancy_h__
+#define __cuda_occupancy_h__
+
+#include <stddef.h>
+#include <limits.h>
+#include <string.h>
+
+
+// __OCC_INLINE will be undefined at the end of this header
+//
+#ifdef __CUDACC__
+#define __OCC_INLINE inline __host__ __device__
+#elif defined _MSC_VER
+#define __OCC_INLINE __inline
+#else // GNUCC assumed
+#define __OCC_INLINE inline
+#endif
+
+enum cudaOccError_enum {
+    CUDA_OCC_SUCCESS              = 0,  // no error encountered
+    CUDA_OCC_ERROR_INVALID_INPUT  = 1,  // input parameter is invalid
+    CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2,  // requested device is not supported in
+                                        // current implementation or device is
+                                        // invalid
+};
+typedef enum cudaOccError_enum       cudaOccError;
+
+typedef struct cudaOccResult         cudaOccResult;
+typedef struct cudaOccDeviceProp     cudaOccDeviceProp;
+typedef struct cudaOccFuncAttributes cudaOccFuncAttributes;
+typedef struct cudaOccDeviceState    cudaOccDeviceState;
+
+/**
+ * The CUDA occupancy calculator computes the occupancy of the function
+ * described by attributes with the given block size (blockSize), static device
+ * properties (properties), dynamic device states (states) and per-block dynamic
+ * shared memory allocation (dynamicSMemSize) in bytes, and output it through
+ * result along with other useful information. The occupancy is computed in
+ * terms of the maximum number of active blocks per multiprocessor. The user can
+ * then convert it to other metrics, such as number of active warps.
+ *
+ * RETURN VALUE
+ *
+ * The occupancy and related information is returned through result.
+ *
+ * If result->activeBlocksPerMultiprocessor is 0, then the given parameter
+ * combination cannot run on the device.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ */
+static __OCC_INLINE
+cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
+    cudaOccResult               *result,           // out
+    const cudaOccDeviceProp     *properties,       // in
+    const cudaOccFuncAttributes *attributes,       // in
+    const cudaOccDeviceState    *state,            // in
+    int                          blockSize,        // in
+    size_t                       dynamicSmemSize); // in
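+
+/*
+ * A minimal calling sketch, assuming `prop`, `attr` and `state` are
+ * cudaOccDeviceProp / cudaOccFuncAttributes / cudaOccDeviceState values that
+ * have already been filled in (e.g. from cudaGetDeviceProperties and
+ * cudaFuncGetAttributes when the CUDA runtime is available):
+ *
+ *   cudaOccResult result;
+ *   cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
+ *       &result, &prop, &attr, &state, 256, 0); // blockSize 256, no dynamic smem
+ *   if (status == CUDA_OCC_SUCCESS) {
+ *       int activeWarps =
+ *           result.activeBlocksPerMultiprocessor * (256 / prop.warpSize);
+ *   }
+ */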
+
+/**
+ * The CUDA launch configurator C API suggests a grid / block size pair (in
+ * minGridSize and blockSize) that achieves the best potential occupancy
+ * (i.e. maximum number of active warps with the smallest number of blocks) for
+ * the given function described by attributes, on a device described by
+ * properties with settings in state.
+ *
+ * If per-block dynamic shared memory allocation is not needed, the user should
+ * leave both blockSizeToDynamicSMemSize and dynamicSMemSize as 0.
+ *
+ * If per-block dynamic shared memory allocation is needed, then if the dynamic
+ * shared memory size is constant regardless of block size, the size should be
+ * passed through dynamicSMemSize, and blockSizeToDynamicSMemSize should be
+ * NULL.
+ *
+ * Otherwise, if the per-block dynamic shared memory size varies with different
+ * block sizes, the user needs to provide a pointer to a unary function through
+ * blockSizeToDynamicSMemSize that computes the dynamic shared memory needed by
+ * a block of the function for any given block size. dynamicSMemSize is
+ * ignored. An example signature is:
+ *
+ *    // Take block size, returns dynamic shared memory needed
+ *    size_t blockToSmem(int blockSize);
+ *
+ * RETURN VALUE
+ *
+ * The suggested block size and the minimum number of blocks needed to achieve
+ * the maximum occupancy are returned through blockSize and minGridSize.
+ *
+ * If *blockSize is 0, then the given combination cannot run on the device.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ *
+ */
+static __OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,      // out
+    int                         *blockSize,        // out
+    const cudaOccDeviceProp     *properties,       // in
+    const cudaOccFuncAttributes *attributes,       // in
+    const cudaOccDeviceState    *state,            // in
+    size_t                     (*blockSizeToDynamicSMemSize)(int), // in
+    size_t                       dynamicSMemSize); // in
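+
+/*
+ * A minimal calling sketch for the C configurator, assuming a constant
+ * per-block dynamic shared memory requirement (pass a callback instead of
+ * NULL when the requirement varies with block size, as described above):
+ *
+ *   int minGridSize = 0, blockSize = 0;
+ *   cudaOccError status = cudaOccMaxPotentialOccupancyBlockSize(
+ *       &minGridSize, &blockSize, &prop, &attr, &state,
+ *       NULL,    // no block-size-dependent callback
+ *       1024);   // constant 1024 bytes of dynamic shared memory per block
+ *   // blockSize == 0 means the function cannot run on the described device.
+ */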
+
+/**
+ * The CUDA launch configurator C++ API suggests a grid / block size pair (in
+ * minGridSize and blockSize) that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number of blocks)
+ * for the given function described by attributes, on a device described by
+ * properties with settings in state.
+ *
+ * If per-block dynamic shared memory allocation is 0 or constant regardless of
+ * block size, the user can use cudaOccMaxPotentialOccupancyBlockSize to
+ * configure the launch. A constant dynamic shared memory allocation size in
+ * bytes can be passed through dynamicSMemSize.
+ *
+ * Otherwise, if the per-block dynamic shared memory size varies with different
+ * block sizes, the user needs to use
+ * cudaOccMaxPotentialOccupancyBlockSizeVariableSmem instead, and provide a
+ * functor / pointer to a unary function (blockSizeToDynamicSMemSize) that
+ * computes the dynamic shared memory needed by func for any given block
+ * size. An example signature is:
+ *
+ *  // Take block size, returns per-block dynamic shared memory needed
+ *  size_t blockToSmem(int blockSize);
+ *
+ * RETURN VALUE
+ *
+ * The suggested block size and the minimum number of blocks needed to achieve
+ * the maximum occupancy are returned through blockSize and minGridSize.
+ *
+ * If *blockSize is 0, then the given combination cannot run on the device.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ *
+ */
+
+#if defined(__cplusplus)
+namespace {
+
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,          // out
+    int                         *blockSize,            // out
+    const cudaOccDeviceProp     *properties,           // in
+    const cudaOccFuncAttributes *attributes,           // in
+    const cudaOccDeviceState    *state,                // in
+    size_t                       dynamicSMemSize = 0); // in
+
+template <typename UnaryFunction>
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
+    int                         *minGridSize,          // out
+    int                         *blockSize,            // out
+    const cudaOccDeviceProp     *properties,           // in
+    const cudaOccFuncAttributes *attributes,           // in
+    const cudaOccDeviceState    *state,                // in
+    UnaryFunction                blockSizeToDynamicSMemSize); // in
+
+} // namespace anonymous
+#endif // defined(__cplusplus)
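+
+/*
+ * Illustrative sketch (not part of the original header): using the C++
+ * variable-shared-memory overload with a unary functor. The functor below is
+ * an assumption and simply models one double of dynamic shared memory per
+ * thread; occProp, occAttr and occState are assumed to be initialized.
+ *
+ *    struct BlockToSmem {
+ *        size_t operator()(int blockSize) const {
+ *            return blockSize * sizeof(double);
+ *        }
+ *    };
+ *
+ *    {
+ *        int minGridSize = 0;
+ *        int blockSize   = 0;
+ *
+ *        cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
+ *            &minGridSize, &blockSize,
+ *            &occProp, &occAttr, &occState,
+ *            BlockToSmem());
+ *    }
+ */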
+
+/**
+ *
+ * The CUDA dynamic shared memory calculator computes the maximum size of 
+ * per-block dynamic shared memory if we want to place numBlocks blocks
+ * on an SM.
+ *
+ * RETURN VALUE
+ *
+ * Returns in *dynamicSmemSize the maximum size of dynamic shared memory to allow 
+ * numBlocks blocks per SM.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ *
+ */
+static __OCC_INLINE
+cudaOccError cudaOccAvailableDynamicSMemPerBlock(
+    size_t                      *dynamicSmemSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                         numBlocks,
+    int                         blockSize);
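+
+/*
+ * Illustrative sketch (not part of the original header): querying how much
+ * dynamic shared memory each block may use if, say, 4 blocks of 256 threads
+ * are to be resident per SM. The descriptor variables are assumed to be
+ * initialized as described below.
+ *
+ *    {
+ *        size_t dynSmemPerBlock = 0;
+ *
+ *        if (cudaOccAvailableDynamicSMemPerBlock(
+ *                &dynSmemPerBlock,
+ *                &occProp, &occAttr, &occState,
+ *                4,      // numBlocks per SM
+ *                256)    // blockSize
+ *            == CUDA_OCC_SUCCESS) {
+ *            // launch with up to dynSmemPerBlock bytes of dynamic shared memory
+ *        }
+ *    }
+ */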
+
+/**
+ * Data structures
+ *
+ * These structures are subject to change for future architecture and CUDA
+ * releases. C users should initialize the structure as {0}.
+ *
+ */
+
+/**
+ * Device descriptor
+ *
+ * This structure describes a device.
+ */
+struct cudaOccDeviceProp {
+    int    computeMajor;                // Compute capability major version
+    int    computeMinor;                // Compute capability minor
+                                        // version. An unsupported minor
+                                        // version may cause an error
+    int    maxThreadsPerBlock;          // Maximum number of threads per block
+    int    maxThreadsPerMultiprocessor; // Maximum number of threads per SM
+                                        // i.e. (Max. number of warps) x (warp
+                                        // size)
+    int    regsPerBlock;                // Maximum number of registers per block
+    int    regsPerMultiprocessor;       // Maximum number of registers per SM
+    int    warpSize;                    // Warp size
+    size_t sharedMemPerBlock;           // Maximum shared memory size per block
+    size_t sharedMemPerMultiprocessor;  // Maximum shared memory size per SM
+    int    numSms;                      // Number of SMs available
+    size_t sharedMemPerBlockOptin;      // Maximum optin shared memory size per block
+    size_t reservedSharedMemPerBlock;   // Shared memory per block reserved by driver
+
+#ifdef __cplusplus
+    // This structure can be converted from a cudaDeviceProp structure for users
+    // that use this header in their CUDA applications.
+    //
+    // If the application has access to the CUDA Runtime API, the application
+    // can obtain the device properties of a CUDA device through
+    // cudaGetDeviceProperties, and initialize a cudaOccDeviceProp with the
+    // cudaDeviceProp structure.
+    //
+    // Example:
+    /*
+     {
+         cudaDeviceProp prop;
+
+         cudaGetDeviceProperties(&prop, ...);
+
+         cudaOccDeviceProp occProp = prop;
+
+         ...
+
+         cudaOccMaxPotentialOccupancyBlockSize(..., &occProp, ...);
+     }
+     */
+    //
+    template<typename DeviceProp>
+    __OCC_INLINE
+    cudaOccDeviceProp(const DeviceProp &props)
+    :   computeMajor                (props.major),
+        computeMinor                (props.minor),
+        maxThreadsPerBlock          (props.maxThreadsPerBlock),
+        maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor),
+        regsPerBlock                (props.regsPerBlock),
+        regsPerMultiprocessor       (props.regsPerMultiprocessor),
+        warpSize                    (props.warpSize),
+        sharedMemPerBlock           (props.sharedMemPerBlock),
+        sharedMemPerMultiprocessor  (props.sharedMemPerMultiprocessor),
+        numSms                      (props.multiProcessorCount),
+        sharedMemPerBlockOptin      (props.sharedMemPerBlockOptin),
+        reservedSharedMemPerBlock   (props.reservedSharedMemPerBlock)
+    {}
+
+    __OCC_INLINE
+    cudaOccDeviceProp()
+    :   computeMajor                (0),
+        computeMinor                (0),
+        maxThreadsPerBlock          (0),
+        maxThreadsPerMultiprocessor (0),
+        regsPerBlock                (0),
+        regsPerMultiprocessor       (0),
+        warpSize                    (0),
+        sharedMemPerBlock           (0),
+        sharedMemPerMultiprocessor  (0),
+        numSms                      (0),
+        sharedMemPerBlockOptin      (0),
+        reservedSharedMemPerBlock   (0)
+    {}
+#endif // __cplusplus
+};
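+
+/*
+ * Illustrative sketch (not part of the original header): zero-initializing
+ * the descriptor from C and filling the fields by hand, e.g. from values
+ * queried through the driver API. The concrete numbers are placeholders.
+ *
+ *    {
+ *        struct cudaOccDeviceProp occProp = {0};
+ *
+ *        occProp.computeMajor                = 8;
+ *        occProp.computeMinor                = 6;
+ *        occProp.maxThreadsPerBlock          = 1024;
+ *        occProp.maxThreadsPerMultiprocessor = 1536;
+ *        occProp.regsPerBlock                = 65536;
+ *        occProp.regsPerMultiprocessor       = 65536;
+ *        occProp.warpSize                    = 32;
+ *        occProp.sharedMemPerBlock           = 48 * 1024;
+ *        occProp.sharedMemPerMultiprocessor  = 100 * 1024;
+ *        occProp.numSms                      = 28;
+ *        occProp.sharedMemPerBlockOptin      = 99 * 1024;
+ *        occProp.reservedSharedMemPerBlock   = 1024;
+ *    }
+ */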
+
+/**
+ * Partitioned global caching option
+ */
+typedef enum cudaOccPartitionedGCConfig_enum {
+    PARTITIONED_GC_OFF,        // Disable partitioned global caching
+    PARTITIONED_GC_ON,         // Prefer partitioned global caching
+    PARTITIONED_GC_ON_STRICT   // Force partitioned global caching
+} cudaOccPartitionedGCConfig;
+
+/**
+ * Per function opt in maximum dynamic shared memory limit
+ */
+typedef enum cudaOccFuncShmemConfig_enum {
+    FUNC_SHMEM_LIMIT_DEFAULT,   // Default shmem limit
+    FUNC_SHMEM_LIMIT_OPTIN,     // Use the optin shmem limit
+} cudaOccFuncShmemConfig;
+
+/**
+ * Function descriptor
+ *
+ * This structure describes a CUDA function.
+ */
+struct cudaOccFuncAttributes {
+    int maxThreadsPerBlock; // Maximum block size the function can work with. If
+                            // unlimited, use INT_MAX or any value greater than
+                            // or equal to maxThreadsPerBlock of the device
+    int numRegs;            // Number of registers used. When the function is
+                            // launched on device, the register count may change
+                            // due to internal tools requirements.
+    size_t sharedSizeBytes; // Size of static shared memory used, in bytes
+
+    cudaOccPartitionedGCConfig partitionedGCConfig; 
+                            // Partitioned global caching is required to enable
+                            // caching on certain chips, such as sm_52
+                            // devices. Partitioned global caching can be
+                            // automatically disabled if the occupancy
+                            // requirement of the launch cannot support caching.
+                            //
+                            // To override this behavior with caching on and
+                            // calculate occupancy strictly according to the
+                            // preference, set partitionedGCConfig to
+                            // PARTITIONED_GC_ON_STRICT. This is especially
+                            // useful for experimenting and finding launch
+                            // configurations (MaxPotentialOccupancyBlockSize)
+                            // that allow global caching to take effect.
+                            //
+                            // This flag only affects the occupancy calculation.
+
+    cudaOccFuncShmemConfig shmemLimitConfig;
+                            // Certain chips like sm_70 allow a user to opt into
+                            // a higher per-block limit of dynamic shared memory.
+                            // This opt-in is performed on a per-function basis
+                            // using the cuFuncSetAttribute function.
+
+    size_t maxDynamicSharedSizeBytes;
+                            // User-set limit on the maximum dynamic shared
+                            // memory usable by the kernel. This limit is set
+                            // using the cuFuncSetAttribute function.
+
+    int numBlockBarriers;   // Number of block barriers used (defaults to 1)
+#ifdef __cplusplus
+    // This structure can be converted from a cudaFuncAttributes structure for
+    // users that use this header in their CUDA applications.
+    //
+    // If the application has access to the CUDA Runtime API, the application
+    // can obtain the function attributes of a CUDA kernel function through
+    // cudaFuncGetAttributes, and initialize a cudaOccFuncAttributes with the
+    // cudaFuncAttributes structure.
+    //
+    // Example:
+    /*
+      __global__ void foo() {...}
+
+      ...
+
+      {
+          cudaFuncAttributes attr;
+
+          cudaFuncGetAttributes(&attr, foo);
+
+          cudaOccFuncAttributes occAttr = attr;
+
+          ...
+
+          cudaOccMaxPotentialOccupancyBlockSize(..., &occAttr, ...);
+      }
+     */
+    //
+    template<typename FuncAttributes>
+    __OCC_INLINE
+    cudaOccFuncAttributes(const FuncAttributes &attr)
+    :   maxThreadsPerBlock  (attr.maxThreadsPerBlock),
+        numRegs             (attr.numRegs),
+        sharedSizeBytes     (attr.sharedSizeBytes),
+        partitionedGCConfig (PARTITIONED_GC_OFF),
+        shmemLimitConfig    (FUNC_SHMEM_LIMIT_OPTIN),
+        maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes),
+        numBlockBarriers    (1)
+    {}
+
+    __OCC_INLINE
+    cudaOccFuncAttributes()
+    :   maxThreadsPerBlock  (0),
+        numRegs             (0),
+        sharedSizeBytes     (0),
+        partitionedGCConfig (PARTITIONED_GC_OFF),
+        shmemLimitConfig    (FUNC_SHMEM_LIMIT_DEFAULT),
+        maxDynamicSharedSizeBytes (0),
+        numBlockBarriers    (0)
+    {}
+#endif
+};
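+
+/*
+ * Illustrative sketch (not part of the original header): a C-style setup of
+ * the function descriptor for a hypothetical kernel that uses 32 registers
+ * per thread and 4 KB of static shared memory.
+ *
+ *    {
+ *        struct cudaOccFuncAttributes occAttr = {0};
+ *
+ *        occAttr.maxThreadsPerBlock = INT_MAX;   // no kernel-specific limit
+ *        occAttr.numRegs            = 32;
+ *        occAttr.sharedSizeBytes    = 4 * 1024;
+ *        occAttr.numBlockBarriers   = 1;
+ *    }
+ */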
+
+typedef enum cudaOccCacheConfig_enum {
+    CACHE_PREFER_NONE   = 0x00, // no preference for shared memory or L1 (default)
+    CACHE_PREFER_SHARED = 0x01, // prefer larger shared memory and smaller L1 cache
+    CACHE_PREFER_L1     = 0x02, // prefer larger L1 cache and smaller shared memory
+    CACHE_PREFER_EQUAL  = 0x03  // prefer equal sized L1 cache and shared memory
+} cudaOccCacheConfig;
+
+typedef enum cudaOccCarveoutConfig_enum {
+    SHAREDMEM_CARVEOUT_DEFAULT       = -1,  // no preference for shared memory or L1 (default)
+    SHAREDMEM_CARVEOUT_MAX_SHARED    = 100, // prefer maximum available shared memory, minimum L1 cache
+    SHAREDMEM_CARVEOUT_MAX_L1        = 0,   // prefer maximum available L1 cache, minimum shared memory
+    SHAREDMEM_CARVEOUT_HALF          = 50   // prefer half of maximum available shared memory, with the rest as L1 cache
+} cudaOccCarveoutConfig;
+
+/**
+ * Device state descriptor
+ *
+ * This structure describes device settings that affect occupancy calculation.
+ */
+struct cudaOccDeviceState
+{
+    // Cache / shared memory split preference. Deprecated on Volta 
+    cudaOccCacheConfig cacheConfig; 
+    // Shared memory / L1 split preference. Supported only on Volta and later
+    int carveoutConfig;
+
+#ifdef __cplusplus
+    __OCC_INLINE
+    cudaOccDeviceState()
+    :   cacheConfig     (CACHE_PREFER_NONE),
+        carveoutConfig  (SHAREDMEM_CARVEOUT_DEFAULT)
+    {}
+#endif
+};
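+
+/*
+ * Illustrative sketch (not part of the original header): a C-style setup of
+ * the device state. SHAREDMEM_CARVEOUT_DEFAULT is -1, so it is assigned
+ * explicitly here.
+ *
+ *    {
+ *        struct cudaOccDeviceState occState = {0};
+ *
+ *        occState.cacheConfig    = CACHE_PREFER_NONE;
+ *        occState.carveoutConfig = SHAREDMEM_CARVEOUT_DEFAULT;
+ *    }
+ */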
+
+typedef enum cudaOccLimitingFactor_enum {
+                                    // Occupancy limited due to:
+    OCC_LIMIT_WARPS         = 0x01, // - warps available
+    OCC_LIMIT_REGISTERS     = 0x02, // - registers available
+    OCC_LIMIT_SHARED_MEMORY = 0x04, // - shared memory available
+    OCC_LIMIT_BLOCKS        = 0x08, // - blocks available
+    OCC_LIMIT_BARRIERS      = 0x10  // - barriers available
+} cudaOccLimitingFactor;
+
+/**
+ * Occupancy output
+ *
+ * This structure contains occupancy calculator's output.
+ */
+struct cudaOccResult {
+    int activeBlocksPerMultiprocessor; // Occupancy
+    unsigned int limitingFactors;      // Factors that limited occupancy. A bit
+                                       // field that counts the limiting
+                                       // factors, see cudaOccLimitingFactor
+    int blockLimitRegs;                // Occupancy due to register
+                                       // usage, INT_MAX if the kernel does not
+                                       // use any register.
+    int blockLimitSharedMem;           // Occupancy due to shared memory
+                                       // usage, INT_MAX if the kernel does not
+                                       // use shared memory.
+    int blockLimitWarps;               // Occupancy due to block size limit
+    int blockLimitBlocks;              // Occupancy due to maximum number of blocks
+                                       // manageable per SM
+    int blockLimitBarriers;            // Occupancy due to block barrier usage
+    int allocatedRegistersPerBlock;    // Actual number of registers allocated per
+                                       // block
+    size_t allocatedSharedMemPerBlock; // Actual size of shared memory allocated
+                                       // per block
+    cudaOccPartitionedGCConfig partitionedGCConfig;
+                                       // Report if partitioned global caching
+                                       // is actually enabled.
+};
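+
+/*
+ * Illustrative sketch (not part of the original header): inspecting the
+ * limiting factors after an occupancy query
+ * (cudaOccMaxActiveBlocksPerMultiprocessor is implemented later in this
+ * header). The descriptor variables are assumptions.
+ *
+ *    {
+ *        struct cudaOccResult res;
+ *
+ *        if (cudaOccMaxActiveBlocksPerMultiprocessor(
+ *                &res, &occProp, &occAttr, &occState,
+ *                256,    // blockSize
+ *                0)      // dynamicSmemSize
+ *            == CUDA_OCC_SUCCESS) {
+ *            if (res.limitingFactors & OCC_LIMIT_REGISTERS) {
+ *                // register usage is one of the occupancy limiters
+ *            }
+ *        }
+ *    }
+ */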
+
+/**
+ * Partitioned global caching support
+ *
+ * See cudaOccPartitionedGlobalCachingModeSupport
+ */
+typedef enum cudaOccPartitionedGCSupport_enum {
+    PARTITIONED_GC_NOT_SUPPORTED,  // Partitioned global caching is not supported
+    PARTITIONED_GC_SUPPORTED,      // Partitioned global caching is supported
+} cudaOccPartitionedGCSupport;
+
+/**
+ * Implementation
+ */
+
+/**
+ * Max compute capability supported
+ */
+#define __CUDA_OCC_MAJOR__ 9
+#define __CUDA_OCC_MINOR__ 0
+
+//////////////////////////////////////////
+//    Mathematical Helper Functions     //
+//////////////////////////////////////////
+
+static __OCC_INLINE int __occMin(int lhs, int rhs)
+{
+    return rhs < lhs ? rhs : lhs;
+}
+
+static __OCC_INLINE int __occDivideRoundUp(int x, int y)
+{
+    return (x + (y - 1)) / y;
+}
+
+static __OCC_INLINE int __occRoundUp(int x, int y)
+{
+    return y * __occDivideRoundUp(x, y);
+}
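+
+/*
+ * Illustrative examples (not part of the original header):
+ *    __occMin(3, 5)               == 3
+ *    __occDivideRoundUp(1000, 32) == 32    // ceil(1000 / 32)
+ *    __occRoundUp(1000, 32)       == 1024  // next multiple of 32
+ */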
+
+//////////////////////////////////////////
+//      Architectural Properties        //
+//////////////////////////////////////////
+
+/**
+ * Granularity of shared memory allocation
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
+{
+    int value;
+
+    switch(properties->computeMajor) {
+        case 3:
+        case 5:
+        case 6:
+        case 7:
+            value = 256;
+            break;
+        case 8:
+        case 9:
+            value = 128;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = value;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Maximum number of registers per thread
+ */
+static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties)
+{
+    int value;
+
+    switch(properties->computeMajor) {
+        case 3:
+        case 5:
+        case 6:
+            value = 255;
+            break;
+        case 7:
+        case 8:
+        case 9:
+            value = 256;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = value;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Granularity of register allocation
+ */
+static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
+{
+    int value;
+
+    switch(properties->computeMajor) {
+        case 3:
+        case 5:
+        case 6:
+        case 7:
+        case 8:
+        case 9:
+            value = 256;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = value;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Number of sub-partitions
+ */
+static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties)
+{
+    int value;
+
+    switch(properties->computeMajor) {
+        case 3:
+        case 5:
+        case 7:
+        case 8:
+        case 9:
+            value = 4;
+            break;
+        case 6:
+            value = properties->computeMinor ? 4 : 2;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = value;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+
+/**
+ * Maximum number of blocks that can run simultaneously on a multiprocessor
+ */
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties)
+{
+    int value;
+
+    switch(properties->computeMajor) {
+        case 3:
+            value = 16;
+            break;
+        case 5:
+        case 6:
+            value = 32;
+            break;
+        case 7: {
+            int isTuring = properties->computeMinor == 5;
+            value = (isTuring) ? 16 : 32;
+            break;
+        }
+        case 8:
+            if (properties->computeMinor == 0) {
+                value = 32;
+            }
+            else if (properties->computeMinor == 9) {
+                value = 24;
+            }
+            else {
+                value = 16;
+            }
+            break;
+        case 9:
+            value = 32;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = value;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/** 
+ * Align up shared memory based on compute major configurations
+ */
+static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties)
+{
+    // Volta and Turing have shared L1 cache / shared memory, and support cache
+    // configuration to trade one for the other. These values are needed to
+    // map carveout config ratio to the next available architecture size
+    size_t size = *shMemSize;
+
+    switch (properties->computeMajor) {
+    case 7: {
+        // Turing supports 32KB and 64KB shared mem.
+        int isTuring = properties->computeMinor == 5;
+        if (isTuring) {
+            if      (size <= 32 * 1024) {
+                *shMemSize = 32 * 1024;
+            }
+            else if (size <= 64 * 1024) {
+                *shMemSize = 64 * 1024;
+            }
+            else {
+                return CUDA_OCC_ERROR_INVALID_INPUT;
+            }
+        }
+        // Volta supports 0KB, 8KB, 16KB, 32KB, 64KB, and 96KB shared mem.
+        else {
+            if      (size == 0) {
+                *shMemSize = 0;
+            }
+            else if (size <= 8 * 1024) {
+                *shMemSize = 8 * 1024;
+            }
+            else if (size <= 16 * 1024) {
+                *shMemSize = 16 * 1024;
+            }
+            else if (size <= 32 * 1024) {
+                *shMemSize = 32 * 1024;
+            }
+            else if (size <= 64 * 1024) {
+                *shMemSize = 64 * 1024;
+            }
+            else if (size <= 96 * 1024) {
+                *shMemSize = 96 * 1024;
+            }
+            else {
+                return CUDA_OCC_ERROR_INVALID_INPUT;
+            }
+        }
+        break;
+    }
+    case 8:
+        if (properties->computeMinor == 0 || properties->computeMinor == 7) {
+            if      (size == 0) {
+                *shMemSize = 0;
+            }
+            else if (size <= 8 * 1024) {
+                *shMemSize = 8 * 1024;
+            }
+            else if (size <= 16 * 1024) {
+                *shMemSize = 16 * 1024;
+            }
+            else if (size <= 32 * 1024) {
+                *shMemSize = 32 * 1024;
+            }
+            else if (size <= 64 * 1024) {
+                *shMemSize = 64 * 1024;
+            }
+            else if (size <= 100 * 1024) {
+                *shMemSize = 100 * 1024;
+            }
+            else if (size <= 132 * 1024) {
+                *shMemSize = 132 * 1024;
+            }
+            else if (size <= 164 * 1024) {
+                *shMemSize = 164 * 1024;
+            }
+            else {
+                return CUDA_OCC_ERROR_INVALID_INPUT;
+            }
+        }
+        else {
+            if      (size == 0) {
+                *shMemSize = 0;
+            }
+            else if (size <= 8 * 1024) {
+                *shMemSize = 8 * 1024;
+            }
+            else if (size <= 16 * 1024) {
+                *shMemSize = 16 * 1024;
+            }
+            else if (size <= 32 * 1024) {
+                *shMemSize = 32 * 1024;
+            }
+            else if (size <= 64 * 1024) {
+                *shMemSize = 64 * 1024;
+            }
+            else if (size <= 100 * 1024) {
+                *shMemSize = 100 * 1024;
+            }
+            else {
+                return CUDA_OCC_ERROR_INVALID_INPUT;
+            }
+        }
+        break;
+    case 9: {
+        if      (size == 0) {
+            *shMemSize = 0;
+        }
+        else if (size <= 8 * 1024) {
+            *shMemSize = 8 * 1024;
+        }
+        else if (size <= 16 * 1024) {
+            *shMemSize = 16 * 1024;
+        }
+        else if (size <= 32 * 1024) {
+            *shMemSize = 32 * 1024;
+        }
+        else if (size <= 64 * 1024) {
+            *shMemSize = 64 * 1024;
+        }
+        else if (size <= 100 * 1024) {
+            *shMemSize = 100 * 1024;
+        }
+        else if (size <= 132 * 1024) {
+            *shMemSize = 132 * 1024;
+        }
+        else if (size <= 164 * 1024) {
+            *shMemSize = 164 * 1024;
+        }
+        else if (size <= 196 * 1024) {
+            *shMemSize = 196 * 1024;
+        }
+        else if (size <= 228 * 1024) {
+            *shMemSize = 228 * 1024;
+        }
+        else {
+            return CUDA_OCC_ERROR_INVALID_INPUT;
+        }
+        break;
+    }
+    default:
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Shared memory based on the new carveoutConfig API introduced with Volta
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    size_t preferenceShmemSize;
+
+    // CUDA 9.0 introduces a new API to set the shared memory / L1 configuration on supported
+    // devices. This preference takes precedence over the older cacheConfig setting.
+    // Map cacheConfig to its effective preference value.
+    int effectivePreference = state->carveoutConfig;
+    if ((effectivePreference < SHAREDMEM_CARVEOUT_DEFAULT) || (effectivePreference > SHAREDMEM_CARVEOUT_MAX_SHARED)) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+    
+    if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
+        switch (state->cacheConfig)
+        {
+        case CACHE_PREFER_L1:
+            effectivePreference = SHAREDMEM_CARVEOUT_MAX_L1;
+            break;
+        case CACHE_PREFER_SHARED:
+            effectivePreference = SHAREDMEM_CARVEOUT_MAX_SHARED;
+            break;
+        case CACHE_PREFER_EQUAL:
+            effectivePreference = SHAREDMEM_CARVEOUT_HALF;
+            break;
+        default:
+            effectivePreference = SHAREDMEM_CARVEOUT_DEFAULT;
+            break;
+        }
+    }
+
+    if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
+        preferenceShmemSize = properties->sharedMemPerMultiprocessor;
+    }
+    else {
+        preferenceShmemSize = (size_t) (effectivePreference * properties->sharedMemPerMultiprocessor) / 100;
+    }
+
+    status = cudaOccAlignUpShmemSizeVoltaPlus(&preferenceShmemSize, properties);
+    *limit = preferenceShmemSize;
+    return status;
+}
+
+/**
+ * Shared memory based on the cacheConfig
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
+{
+    size_t bytes                          = 0;
+    size_t sharedMemPerMultiprocessorHigh = properties->sharedMemPerMultiprocessor;
+    cudaOccCacheConfig cacheConfig        = state->cacheConfig;
+
+    // Kepler has shared L1 cache / shared memory, and supports cache
+    // configuration to trade one for the other. These values are needed to
+    // calculate the correct shared memory size for user requested cache
+    // configuration.
+    //
+    size_t minCacheSize                   = 16384;
+    size_t maxCacheSize                   = 49152;
+    size_t cacheAndSharedTotal            = sharedMemPerMultiprocessorHigh + minCacheSize;
+    size_t sharedMemPerMultiprocessorLow  = cacheAndSharedTotal - maxCacheSize;
+
+    switch (properties->computeMajor) {
+        case 3:
+            // Kepler supports 16KB, 32KB, or 48KB partitions for L1. The rest
+            // is shared memory.
+            //
+            switch (cacheConfig) {
+                default :
+                case CACHE_PREFER_NONE:
+                case CACHE_PREFER_SHARED:
+                    bytes = sharedMemPerMultiprocessorHigh;
+                    break;
+                case CACHE_PREFER_L1:
+                    bytes = sharedMemPerMultiprocessorLow;
+                    break;
+                case CACHE_PREFER_EQUAL:
+                    // Equal is the mid-point between high and low. It should be
+                    // equivalent to low + 16KB.
+                    //
+                    bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2;
+                    break;
+            }
+            break;
+        case 5:
+        case 6:
+            // Maxwell and Pascal have dedicated shared memory.
+            //
+            bytes = sharedMemPerMultiprocessorHigh;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = bytes;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Shared memory based on config requested by User
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
+{
+    // Volta introduces a new API that allows for shared memory carveout preference. Because it is a shared memory preference,
+    // it is handled separately from the cache config preference.
+    if (properties->computeMajor >= 7) {
+        return cudaOccSMemPreferenceVoltaPlus(limit, properties, state);
+    }
+    return cudaOccSMemPreference(limit, properties, state);
+}
+
+/**
+ * Return the per block shared memory limit based on function config
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta)
+{
+    switch (properties->computeMajor) {
+        case 2:
+        case 3:
+        case 4:
+        case 5:
+        case 6:
+            *limit = properties->sharedMemPerBlock;
+            break;
+        case 7:
+        case 8:
+        case 9:
+            switch (shmemLimitConfig) {
+                default:
+                case FUNC_SHMEM_LIMIT_DEFAULT:
+                    *limit = properties->sharedMemPerBlock;
+                    break;
+                case FUNC_SHMEM_LIMIT_OPTIN:
+                    if (smemPerCta > properties->sharedMemPerBlock) {
+                        *limit = properties->sharedMemPerBlockOptin;
+                    }
+                    else {
+                        *limit = properties->sharedMemPerBlock;
+                    }
+                    break;
+            }
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    // Starting with Ampere, the CUDA driver reserves additional shared memory per block
+    if (properties->computeMajor >= 8) {
+        *limit += properties->reservedSharedMemPerBlock;
+    }
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Partitioned global caching mode support
+ */
+static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties)
+{
+    *limit = PARTITIONED_GC_NOT_SUPPORTED;
+
+    if ((properties->computeMajor == 5 && (properties->computeMinor == 2 || properties->computeMinor == 3)) ||
+        properties->computeMajor == 6) {
+        *limit = PARTITIONED_GC_SUPPORTED;
+    }
+
+    if (properties->computeMajor == 6 && properties->computeMinor == 0) {
+        *limit = PARTITIONED_GC_NOT_SUPPORTED;
+    }
+
+    return CUDA_OCC_SUCCESS;
+}
+
+///////////////////////////////////////////////
+//            User Input Sanity              //
+///////////////////////////////////////////////
+
+static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties)
+{
+    // Verify device properties
+    //
+    // Each of these limits must be a positive number.
+    //
+    // Compute capability is checked during the occupancy calculation
+    //
+    if (properties->maxThreadsPerBlock          <= 0 ||
+        properties->maxThreadsPerMultiprocessor <= 0 ||
+        properties->regsPerBlock                <= 0 ||
+        properties->regsPerMultiprocessor       <= 0 ||
+        properties->warpSize                    <= 0 ||
+        properties->sharedMemPerBlock           <= 0 ||
+        properties->sharedMemPerMultiprocessor  <= 0 ||
+        properties->numSms                      <= 0) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    return CUDA_OCC_SUCCESS;
+}
+
+static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes)
+{
+    // Verify function attributes
+    //
+    if (attributes->maxThreadsPerBlock <= 0 ||
+        attributes->numRegs < 0) {            // Compiler may choose not to use
+                                              // any register (empty kernels,
+                                              // etc.)
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    return CUDA_OCC_SUCCESS;
+}
+
+static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state)
+{
+    (void)state;   // silence unused-variable warning
+    // Placeholder
+    //
+
+    return CUDA_OCC_SUCCESS;
+}
+
+static __OCC_INLINE cudaOccError cudaOccInputCheck(
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+
+    status = cudaOccDevicePropCheck(properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    status = cudaOccFuncAttributesCheck(attributes);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    status = cudaOccDeviceStateCheck(state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    return status;
+}
+
+///////////////////////////////////////////////
+//    Occupancy calculation Functions        //
+///////////////////////////////////////////////
+
+static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected(
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes)
+{
+    cudaOccPartitionedGCSupport gcSupport;
+    cudaOccPartitionedGCConfig gcConfig;
+
+    cudaOccPartitionedGlobalCachingModeSupport(&gcSupport, properties);
+
+    gcConfig = attributes->partitionedGCConfig;
+
+    if (gcSupport == PARTITIONED_GC_NOT_SUPPORTED) {
+        gcConfig = PARTITIONED_GC_OFF;
+    }
+
+    return gcConfig;
+}
+
+// Warp limit
+//
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit(
+    int                         *limit,
+    cudaOccPartitionedGCConfig   gcConfig,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    int                          blockSize)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    int maxWarpsPerSm;
+    int warpsAllocatedPerCTA;
+    int maxBlocks;
+    (void)attributes;   // silence unused-variable warning
+
+    if (blockSize > properties->maxThreadsPerBlock) {
+        maxBlocks = 0;
+    }
+    else {
+        maxWarpsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize;
+        warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
+        maxBlocks = 0;
+
+        if (gcConfig != PARTITIONED_GC_OFF) {
+            int maxBlocksPerSmPartition;
+            int maxWarpsPerSmPartition;
+
+            // If partitioned global caching is on, then a CTA can only use an SM
+            // partition (a half SM), and thus a half of the warp slots
+            // available per SM
+            //
+            maxWarpsPerSmPartition  = maxWarpsPerSm / 2;
+            maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA;
+            maxBlocks               = maxBlocksPerSmPartition * 2;
+        }
+        // On hardware that supports partitioned global caching, each half SM is
+        // guaranteed to support at least 32 warps (maximum number of warps of a
+        // CTA), so caching will not cause 0 occupancy due to insufficient warp
+        // allocation slots.
+        //
+        else {
+            maxBlocks = maxWarpsPerSm / warpsAllocatedPerCTA;
+        }
+    }
+
+    *limit = maxBlocks;
+
+    return status;
+}
+
+// Shared memory limit
+//
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit(
+    int                         *limit,
+    cudaOccResult               *result,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                          blockSize,
+    size_t                       dynamicSmemSize)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    int allocationGranularity;
+    size_t userSmemPreference = 0;
+    size_t totalSmemUsagePerCTA;
+    size_t maxSmemUsagePerCTA;
+    size_t smemAllocatedPerCTA;
+    size_t staticSmemSize;
+    size_t sharedMemPerMultiprocessor;
+    size_t smemLimitPerCTA;
+    int maxBlocks;
+    int dynamicSmemSizeExceeded = 0;
+    int totalSmemSizeExceeded = 0;
+    (void)blockSize;   // silence unused-variable warning
+
+    status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // Obtain the user-preferred shared memory size. This setting is ignored if
+    // the user requests more shared memory than preferred.
+    //
+    status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock;
+    totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize;
+    smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);
+
+    maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes;
+
+    dynamicSmemSizeExceeded = 0;
+    totalSmemSizeExceeded   = 0;
+
+    // Obtain the user set maximum dynamic size if it exists
+    // If so, the current launch dynamic shared memory must not
+    // exceed the set limit
+    if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT &&
+        dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) {
+        dynamicSmemSizeExceeded = 1;
+    }
+
+    status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    if (smemAllocatedPerCTA > smemLimitPerCTA) {
+        totalSmemSizeExceeded = 1;
+    }
+
+    if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) {
+        maxBlocks = 0;
+    }
+    else {
+        // User requested shared memory limit is used as long as it is greater
+        // than the total shared memory used per CTA, i.e. as long as at least
+        // one CTA can be launched.
+        if (userSmemPreference >= smemAllocatedPerCTA) {
+            sharedMemPerMultiprocessor = userSmemPreference;
+        }
+        else {
+            // On Volta+, user requested shared memory will limit occupancy
+            // if it's less than shared memory per CTA. Otherwise, the
+            // maximum shared memory limit is used.
+            if (properties->computeMajor >= 7) {
+                sharedMemPerMultiprocessor = smemAllocatedPerCTA;
+                status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties);
+                if (status != CUDA_OCC_SUCCESS) {
+                    return status;
+                }
+            }
+            else {
+                sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor;
+            }
+        }
+
+        if (smemAllocatedPerCTA > 0) {
+            maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
+        }
+        else {
+            maxBlocks = INT_MAX;
+        }
+    }
+
+    result->allocatedSharedMemPerBlock = smemAllocatedPerCTA;
+
+    *limit = maxBlocks;
+
+    return status;
+}
+
+static __OCC_INLINE
+cudaOccError cudaOccMaxBlocksPerSMRegsLimit(
+    int                         *limit,
+    cudaOccPartitionedGCConfig  *gcConfig,
+    cudaOccResult               *result,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    int                          blockSize)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    int allocationGranularity;
+    int warpsAllocatedPerCTA;
+    int regsAllocatedPerCTA;
+    int regsAssumedPerCTA;
+    int regsPerWarp;
+    int regsAllocatedPerWarp;
+    int numSubPartitions;
+    int numRegsPerSubPartition;
+    int numWarpsPerSubPartition;
+    int numWarpsPerSM;
+    int maxBlocks;
+    int maxRegsPerThread;
+
+    status = cudaOccRegAllocationGranularity(
+        &allocationGranularity,
+        properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    status = cudaOccRegAllocationMaxPerThread(
+        &maxRegsPerThread,
+        properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
+
+    // GPUs of compute capability 2.x and higher allocate registers to warps
+    //
+    // Number of regs per warp is regs per thread x warp size, rounded up to
+    // register allocation granularity
+    //
+    regsPerWarp          = attributes->numRegs * properties->warpSize;
+    regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
+    regsAllocatedPerCTA  = regsAllocatedPerWarp * warpsAllocatedPerCTA;
+
+    // Hardware verifies if a launch fits the per-CTA register limit. For
+    // historical reasons, the verification logic assumes register
+    // allocations are made to all partitions simultaneously. Therefore, to
+    // simulate the hardware check, the warp allocation needs to be rounded
+    // up to the number of partitions.
+    //
+    regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);
+
+    if (properties->regsPerBlock < regsAssumedPerCTA ||   // Hardware check
+        properties->regsPerBlock < regsAllocatedPerCTA || // Software check
+        attributes->numRegs > maxRegsPerThread) {         // Per thread limit check
+        maxBlocks = 0;
+    }
+    else {
+        if (regsAllocatedPerWarp > 0) {
+            // Registers are allocated in each sub-partition. The max number
+            // of warps that can fit on an SM is equal to the max number of
+            // warps per sub-partition x number of sub-partitions.
+            //
+            numRegsPerSubPartition  = properties->regsPerMultiprocessor / numSubPartitions;
+            numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;
+
+            maxBlocks = 0;
+
+            if (*gcConfig != PARTITIONED_GC_OFF) {
+                int numSubPartitionsPerSmPartition;
+                int numWarpsPerSmPartition;
+                int maxBlocksPerSmPartition;
+
+                // If partitioned global caching is on, then a CTA can only
+                // use a half SM, and thus a half of the registers available
+                // per SM
+                //
+                numSubPartitionsPerSmPartition = numSubPartitions / 2;
+                numWarpsPerSmPartition         = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
+                maxBlocksPerSmPartition        = numWarpsPerSmPartition / warpsAllocatedPerCTA;
+                maxBlocks                      = maxBlocksPerSmPartition * 2;
+            }
+
+            // Try again if partitioned global caching is not enabled, or if
+            // the CTA cannot fit on the SM with caching on (maxBlocks == 0).  In the latter
+            // case, the device will automatically turn off caching, except
+            // if the user forces enablement via PARTITIONED_GC_ON_STRICT to calculate
+            // occupancy and launch configuration.
+            //
+            if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) {
+               // In case *gcConfig was PARTITIONED_GC_ON flip it OFF since
+               // this is what it will be if we spread CTA across partitions.
+               //
+               *gcConfig = PARTITIONED_GC_OFF;
+               numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
+               maxBlocks     = numWarpsPerSM / warpsAllocatedPerCTA;
+            }
+        }
+        else {
+            maxBlocks = INT_MAX;
+        }
+    }
+
+
+    result->allocatedRegistersPerBlock = regsAllocatedPerCTA;
+
+    *limit = maxBlocks;
+
+    return status;
+}
+
+// Barrier limit
+//
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit(
+    int                         *limit,
+    int                          ctaLimitBlocks,
+    const cudaOccFuncAttributes *attributes)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    int numBarriersAvailable = ctaLimitBlocks * 2;
+    int numBarriersUsed = attributes->numBlockBarriers;
+    int maxBlocks = INT_MAX;
+
+    if (numBarriersUsed) {
+        maxBlocks = numBarriersAvailable / numBarriersUsed;
+    }
+
+    *limit = maxBlocks;
+
+    return status;
+}
+
+///////////////////////////////////
+//      API Implementations      //
+///////////////////////////////////
+
+static __OCC_INLINE
+cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
+    cudaOccResult               *result,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                          blockSize,
+    size_t                       dynamicSmemSize)
+{
+    cudaOccError status          = CUDA_OCC_SUCCESS;
+    int          ctaLimitWarps   = 0;
+    int          ctaLimitBlocks  = 0;
+    int          ctaLimitSMem    = 0;
+    int          ctaLimitRegs    = 0;
+    int          ctaLimitBars    = 0;
+    int          ctaLimit        = 0;
+    unsigned int limitingFactors = 0;
+    
+    cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF;
+
+    if (!result || !properties || !attributes || !state || blockSize <= 0) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    ///////////////////////////
+    // Check user input
+    ///////////////////////////
+
+    status = cudaOccInputCheck(properties, attributes, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    ///////////////////////////
+    // Initialization
+    ///////////////////////////
+
+    gcConfig = cudaOccPartitionedGCExpected(properties, attributes);
+
+    ///////////////////////////
+    // Compute occupancy
+    ///////////////////////////
+
+    // Limits due to registers/SM
+    // Also compute if partitioned global caching has to be turned off
+    //
+    status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // SMs on GP100 (6.0) have 2 subpartitions, while those on GP10x have 4.
+    // As a result, an SM on GP100 may be able to run more CTAs than the one on GP10x.
+    // For forward compatibility within Pascal family, if a function cannot run on GP10x (maxBlock == 0),
+    // we do not let it run on any Pascal processor, even though it may be able to run on GP100.
+    // Therefore, we check the occupancy on GP10x when it can run on GP100
+    //
+    if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) {
+        cudaOccDeviceProp propertiesGP10x;
+        cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig;
+        int ctaLimitRegsGP10x = 0;
+
+        // Set up properties for GP10x
+        memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x));
+        propertiesGP10x.computeMinor = 1;
+
+        status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize);
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+
+        if (ctaLimitRegsGP10x == 0) {
+            ctaLimitRegs = 0;
+        }
+    }
+
+    // Limits due to warps/SM
+    //
+    status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // Limits due to blocks/SM
+    //
+    status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // Limits due to shared memory/SM
+    //
+    status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    ///////////////////////////
+    // Overall occupancy
+    ///////////////////////////
+
+    // Overall limit is min() of limits due to above reasons
+    //
+    ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));
+
+    // Determine occupancy limiting factors
+    //
+    if (ctaLimit == ctaLimitWarps) {
+        limitingFactors |= OCC_LIMIT_WARPS;
+    }
+    if (ctaLimit == ctaLimitRegs) {
+        limitingFactors |= OCC_LIMIT_REGISTERS;
+    }
+    if (ctaLimit == ctaLimitSMem) {
+        limitingFactors |= OCC_LIMIT_SHARED_MEMORY;
+    }
+    if (ctaLimit == ctaLimitBlocks) {
+        limitingFactors |= OCC_LIMIT_BLOCKS;
+    }
+
+    // For Hopper onwards, compute the limit on occupancy based on the block barrier count
+    //
+    if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) {
+        // Limits due to barrier/SM
+        //
+        status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, attributes);
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+
+        // Recompute overall limit based on barrier/SM
+        //
+        ctaLimit = __occMin(ctaLimitBars, ctaLimit);
+
+        // Determine if this is occupancy limiting factor
+        //
+        if (ctaLimit == ctaLimitBars) {
+            limitingFactors |= OCC_LIMIT_BARRIERS;
+        }
+    }
+    else {
+        ctaLimitBars = INT_MAX;
+    }
+
+    // Fill in the return values
+    //
+    result->limitingFactors = limitingFactors;
+
+    result->blockLimitRegs      = ctaLimitRegs;
+    result->blockLimitSharedMem = ctaLimitSMem;
+    result->blockLimitWarps     = ctaLimitWarps;
+    result->blockLimitBlocks    = ctaLimitBlocks;
+    result->blockLimitBarriers  = ctaLimitBars;
+    result->partitionedGCConfig = gcConfig;
+
+    // Final occupancy
+    result->activeBlocksPerMultiprocessor = ctaLimit;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+static __OCC_INLINE
+cudaOccError cudaOccAvailableDynamicSMemPerBlock(
+    size_t                      *bytesAvailable,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                         numBlocks,
+    int                         blockSize)
+{
+    int allocationGranularity;
+    size_t smemLimitPerBlock;
+    size_t smemAvailableForDynamic;
+    size_t userSmemPreference = 0;
+    size_t sharedMemPerMultiprocessor;
+    cudaOccResult result;
+    cudaOccError status = CUDA_OCC_SUCCESS;
+
+    if (numBlocks <= 0)
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+
+    // First compute occupancy of potential kernel launch.
+    //
+    status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    // Check if occupancy is achievable given the user-requested number of blocks.
+    //
+    if (result.activeBlocksPerMultiprocessor < numBlocks) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // Return the per block shared memory limit based on function config.
+    //
+    status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // If only a single block is needed per SM, the user preference can be ignored
+    // and the full software limit may be used as shared memory; otherwise, if more
+    // than one block is needed, the user preference sets the total limit of
+    // available shared memory.
+    //
+    cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
+    if (numBlocks == 1) {
+        sharedMemPerMultiprocessor = smemLimitPerBlock;
+    }
+    else {
+        if (!userSmemPreference) {
+            userSmemPreference = 1;
+            status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties);
+            if (status != CUDA_OCC_SUCCESS) {
+                return status;
+            }
+        }
+        sharedMemPerMultiprocessor = userSmemPreference;
+    }
+
+    // Compute total shared memory available per SM
+    //
+    smemAvailableForDynamic =  sharedMemPerMultiprocessor / numBlocks;
+    smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity;
+
+    // Cap shared memory
+    //
+    if (smemAvailableForDynamic > smemLimitPerBlock) {
+        smemAvailableForDynamic = smemLimitPerBlock;
+    }
+
+    // Now compute dynamic shared memory size
+    smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes; 
+
+    // Cap computed dynamic SM by user requested limit specified via cuFuncSetAttribute()
+    //
+    if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes)
+        smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes;
+
+    *bytesAvailable = smemAvailableForDynamic;
+    return CUDA_OCC_SUCCESS;
+}
+
+static __OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,
+    int                         *blockSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    size_t                     (*blockSizeToDynamicSMemSize)(int),
+    size_t                       dynamicSMemSize)
+{
+    cudaOccError  status = CUDA_OCC_SUCCESS;
+    cudaOccResult result;
+
+    // Limits
+    int occupancyLimit;
+    int granularity;
+    int blockSizeLimit;
+
+    // Recorded maximum
+    int maxBlockSize = 0;
+    int numBlocks    = 0;
+    int maxOccupancy = 0;
+
+    // Temporary
+    int blockSizeToTryAligned;
+    int blockSizeToTry;
+    int blockSizeLimitAligned;
+    int occupancyInBlocks;
+    int occupancyInThreads;
+
+    ///////////////////////////
+    // Check user input
+    ///////////////////////////
+
+    if (!minGridSize || !blockSize || !properties || !attributes || !state) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    status = cudaOccInputCheck(properties, attributes, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    /////////////////////////////////////////////////////////////////////////////////
+    // Try each block size, and pick the block size with maximum occupancy
+    /////////////////////////////////////////////////////////////////////////////////
+
+    occupancyLimit = properties->maxThreadsPerMultiprocessor;
+    granularity    = properties->warpSize;
+
+    blockSizeLimit        = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
+    blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
+
+    for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
+        blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
+
+        // Ignore dynamicSMemSize if the user provides a mapping
+        //
+        if (blockSizeToDynamicSMemSize) {
+            dynamicSMemSize = (*blockSizeToDynamicSMemSize)(blockSizeToTry);
+        }
+
+        status = cudaOccMaxActiveBlocksPerMultiprocessor(
+            &result,
+            properties,
+            attributes,
+            state,
+            blockSizeToTry,
+            dynamicSMemSize);
+
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+
+        occupancyInBlocks = result.activeBlocksPerMultiprocessor;
+        occupancyInThreads = blockSizeToTry * occupancyInBlocks;
+
+        if (occupancyInThreads > maxOccupancy) {
+            maxBlockSize = blockSizeToTry;
+            numBlocks    = occupancyInBlocks;
+            maxOccupancy = occupancyInThreads;
+        }
+
+        // Early out if we have reached the maximum
+        //
+        if (occupancyLimit == maxOccupancy) {
+            break;
+        }
+    }
+
+    ///////////////////////////
+    // Return best available
+    ///////////////////////////
+
+    // Suggested min grid size to achieve a full machine launch
+    //
+    *minGridSize = numBlocks * properties->numSms;
+    *blockSize = maxBlockSize;
+
+    return status;
+}
+
+
+#if defined(__cplusplus)
+
+namespace {
+
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,
+    int                         *blockSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    size_t                       dynamicSMemSize)
+{
+    return cudaOccMaxPotentialOccupancyBlockSize(
+        minGridSize,
+        blockSize,
+        properties,
+        attributes,
+        state,
+        NULL,
+        dynamicSMemSize);
+}
+
+template <typename UnaryFunction>
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
+    int                         *minGridSize,
+    int                         *blockSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    UnaryFunction                blockSizeToDynamicSMemSize)
+{
+    cudaOccError  status = CUDA_OCC_SUCCESS;
+    cudaOccResult result;
+
+    // Limits
+    int occupancyLimit;
+    int granularity;
+    int blockSizeLimit;
+
+    // Recorded maximum
+    int maxBlockSize = 0;
+    int numBlocks    = 0;
+    int maxOccupancy = 0;
+
+    // Temporary
+    int blockSizeToTryAligned;
+    int blockSizeToTry;
+    int blockSizeLimitAligned;
+    int occupancyInBlocks;
+    int occupancyInThreads;
+    size_t dynamicSMemSize;
+
+    ///////////////////////////
+    // Check user input
+    ///////////////////////////
+
+    if (!minGridSize || !blockSize || !properties || !attributes || !state) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    status = cudaOccInputCheck(properties, attributes, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    /////////////////////////////////////////////////////////////////////////////////
+    // Try each block size, and pick the block size with maximum occupancy
+    /////////////////////////////////////////////////////////////////////////////////
+
+    occupancyLimit = properties->maxThreadsPerMultiprocessor;
+    granularity    = properties->warpSize;
+    blockSizeLimit        = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
+    blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
+
+    for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
+        blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
+
+        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
+
+        status = cudaOccMaxActiveBlocksPerMultiprocessor(
+            &result,
+            properties,
+            attributes,
+            state,
+            blockSizeToTry,
+            dynamicSMemSize);
+
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+
+        occupancyInBlocks = result.activeBlocksPerMultiprocessor;
+
+        occupancyInThreads = blockSizeToTry * occupancyInBlocks;
+
+        if (occupancyInThreads > maxOccupancy) {
+            maxBlockSize = blockSizeToTry;
+            numBlocks    = occupancyInBlocks;
+            maxOccupancy = occupancyInThreads;
+        }
+
+        // Early out if we have reached the maximum
+        //
+        if (occupancyLimit == maxOccupancy) {
+            break;
+        }
+    }
+
+    ///////////////////////////
+    // Return best available
+    ///////////////////////////
+
+    // Suggested min grid size to achieve a full machine launch
+    //
+    *minGridSize = numBlocks * properties->numSms;
+    *blockSize = maxBlockSize;
+
+    return status;
+}
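+
+// Illustrative usage sketch for the variable-shared-memory variant above. The
+// lambda and its 4-bytes-per-thread mapping are assumptions for the example;
+// in practice, the properties/attributes/state structs are filled in from the
+// CUDA driver or runtime before calling the helper.
+//
+//     cudaOccDeviceProp     prop;       // populated from the device
+//     cudaOccFuncAttributes attr;       // populated from the kernel
+//     cudaOccDeviceState    state = {};
+//     int minGridSize = 0, blockSize = 0;
+//
+//     auto smemPerBlock = [](int block) { return size_t(block) * 4; };
+//
+//     cudaOccError err = cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
+//         &minGridSize, &blockSize, &prop, &attr, &state, smemPerBlock);
+//     // On CUDA_OCC_SUCCESS, blockSize maximizes occupancy and minGridSize
+//     // blocks suffice for a full machine launch.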
+
+} // namespace anonymous
+
+#endif /*__cplusplus */
+
+#undef __OCC_INLINE
+
+#endif /*__cuda_occupancy_h__*/
diff --git a/ext/cudart/include/cuda_pipeline.h b/ext/cudart/include/cuda_pipeline.h
new file mode 100644
index 0000000000000000000000000000000000000000..46bc89e4499576f1ae58848cd8684ba3e32420cf
--- /dev/null
+++ b/ext/cudart/include/cuda_pipeline.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_PIPELINE_H_
+# define _CUDA_PIPELINE_H_
+
+# include "cuda_pipeline_primitives.h"
+
+# if !defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
+#  error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+         -std=c++11 compiler option.
+# endif
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+#  include "cuda_awbarrier.h"
+# endif
+
+// Integration with libcu++'s cuda::barrier<cuda::thread_scope_block>.
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+#  if defined(_LIBCUDACXX_CUDA_ABI_VERSION)
+#   define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION _LIBCUDACXX_CUDA_ABI_VERSION
+#  else
+#   define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION 4
+#  endif
+
+#  define _LIBCUDACXX_PIPELINE_CONCAT(X, Y) X ## Y
+#  define _LIBCUDACXX_PIPELINE_CONCAT2(X, Y) _LIBCUDACXX_PIPELINE_CONCAT(X, Y)
+#  define _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE _LIBCUDACXX_PIPELINE_CONCAT2(__, _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION)
+
+namespace cuda { inline namespace _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE {
+    struct __block_scope_barrier_base;
+}}
+
+# endif
+
+_CUDA_PIPELINE_BEGIN_NAMESPACE
+
+template<size_t N, typename T>
+_CUDA_PIPELINE_QUALIFIER
+auto segment(T* ptr) -> T(*)[N];
+
+class pipeline {
+public:
+    pipeline(const pipeline&) = delete;
+    pipeline(pipeline&&) = delete;
+    pipeline& operator=(const pipeline&) = delete;
+    pipeline& operator=(pipeline&&) = delete;
+
+    _CUDA_PIPELINE_QUALIFIER pipeline();
+    _CUDA_PIPELINE_QUALIFIER size_t commit();
+    _CUDA_PIPELINE_QUALIFIER void commit_and_wait();
+    _CUDA_PIPELINE_QUALIFIER void wait(size_t batch);
+    template<unsigned N>
+    _CUDA_PIPELINE_QUALIFIER void wait_prior();
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+    _CUDA_PIPELINE_QUALIFIER void arrive_on(awbarrier& barrier);
+    _CUDA_PIPELINE_QUALIFIER void arrive_on(cuda::__block_scope_barrier_base& barrier);
+# endif
+
+private:
+    size_t current_batch;
+};
+
+template<class T>
+_CUDA_PIPELINE_QUALIFIER
+void memcpy_async(T& dst, const T& src, pipeline& pipe);
+
+template<class T, size_t DstN, size_t SrcN>
+_CUDA_PIPELINE_QUALIFIER
+void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe);
+
+template<size_t N, typename T>
+_CUDA_PIPELINE_QUALIFIER
+auto segment(T* ptr) -> T(*)[N]
+{
+    return (T(*)[N])ptr;
+}
+
+_CUDA_PIPELINE_QUALIFIER
+pipeline::pipeline()
+    : current_batch(0)
+{
+}
+
+_CUDA_PIPELINE_QUALIFIER
+size_t pipeline::commit()
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
+    return this->current_batch++;
+}
+
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::commit_and_wait()
+{
+    (void)pipeline::commit();
+    pipeline::wait_prior<0>();
+}
+
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::wait(size_t batch)
+{
+    const size_t prior = this->current_batch > batch ? this->current_batch - batch : 0;
+
+    switch (prior) {
+    case  0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); break;
+    case  1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); break;
+    case  2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); break;
+    case  3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); break;
+    case  4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); break;
+    case  5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); break;
+    case  6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); break;
+    case  7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); break;
+    default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); break;
+    }
+}
+
+template<unsigned N>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::wait_prior()
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<N>();
+}
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::arrive_on(awbarrier& barrier)
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(&barrier.barrier);
+}
+
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::arrive_on(cuda::__block_scope_barrier_base & barrier)
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(reinterpret_cast<uint64_t *>(&barrier));
+}
+# endif
+
+template<class T>
+_CUDA_PIPELINE_QUALIFIER
+void memcpy_async(T& dst, const T& src, pipeline& pipe)
+{
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&src) & (alignof(T) - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&dst) & (alignof(T) - 1)));
+
+    if (__is_trivially_copyable(T)) {
+        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_relaxed<sizeof(T), alignof(T)>(
+                reinterpret_cast<void*>(&dst), reinterpret_cast<const void*>(&src));
+    } else {
+        dst = src;
+    }
+}
+
+template<class T, size_t DstN, size_t SrcN>
+_CUDA_PIPELINE_QUALIFIER
+void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe)
+{
+    constexpr size_t dst_size = sizeof(*dst);
+    constexpr size_t src_size = sizeof(*src);
+    static_assert(dst_size == 4 || dst_size == 8 || dst_size == 16, "Unsupported copy size.");
+    static_assert(src_size <= dst_size, "Source size must be less than or equal to destination size.");
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (dst_size - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (dst_size - 1)));
+
+    if (__is_trivially_copyable(T)) {
+        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_strict<sizeof(*dst), sizeof(*src)>(
+                reinterpret_cast<void*>(*dst), reinterpret_cast<const void*>(*src));
+    } else {
+        for (size_t i = 0; i < DstN; ++i) {
+            (*dst)[i] = (i < SrcN) ? (*src)[i] : T();
+        }
+    }
+}
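+
+// Illustrative kernel sketch for the API above. It assumes a launch with
+// blockDim.x == 128 and a `global_in` pointer to at least 128 floats in
+// global memory; the identifiers are examples only.
+//
+//     __global__ void stage_and_use(const float* global_in) {
+//         __shared__ float staged[128];
+//         nvcuda::experimental::pipeline pipe;
+//
+//         // One element per thread, copied global -> shared asynchronously.
+//         nvcuda::experimental::memcpy_async(staged[threadIdx.x],
+//                                            global_in[threadIdx.x], pipe);
+//         pipe.commit_and_wait();  // commit the batch, wait for completion
+//         __syncthreads();         // staged[] is now visible to the block
+//     }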
+
+_CUDA_PIPELINE_END_NAMESPACE
+
+#endif /* !_CUDA_PIPELINE_H_ */
diff --git a/ext/cudart/include/cuda_pipeline_helpers.h b/ext/cudart/include/cuda_pipeline_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..488264076c2b1bd850a14a85871dc22b7f6d36ce
--- /dev/null
+++ b/ext/cudart/include/cuda_pipeline_helpers.h
@@ -0,0 +1,373 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_PIPELINE_HELPERS_H_
+# define _CUDA_PIPELINE_HELPERS_H_
+
+# define _CUDA_PIPELINE_NAMESPACE       nvcuda::experimental
+# define _CUDA_PIPELINE_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
+# define _CUDA_PIPELINE_END_NAMESPACE   } }
+
+# define _CUDA_PIPELINE_INTERNAL_NAMESPACE       _CUDA_PIPELINE_NAMESPACE::__pipeline_internal
+# define _CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE _CUDA_PIPELINE_BEGIN_NAMESPACE namespace __pipeline_internal {
+# define _CUDA_PIPELINE_END_INTERNAL_NAMESPACE   } _CUDA_PIPELINE_END_NAMESPACE
+
+# if !defined(_CUDA_PIPELINE_QUALIFIER)
+#  define _CUDA_PIPELINE_QUALIFIER inline __device__
+# endif
+# if !defined(_CUDA_PIPELINE_STATIC_QUALIFIER)
+#  define _CUDA_PIPELINE_STATIC_QUALIFIER static inline __device__
+# endif
+
+# if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+#  define _CUDA_PIPELINE_ARCH_700_OR_LATER
+# endif
+
+# if (__CUDA_ARCH__ >= 800)
+#  define _CUDA_PIPELINE_HAS_ASYNC_COPY 1
+# else
+#  define _CUDA_PIPELINE_HAS_ASYNC_COPY 0
+# endif
+
+# if !defined(_CUDA_PIPELINE_MAX_STAGES)
+#  define _CUDA_PIPELINE_MAX_STAGES 8
+# endif
+
+# if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
+#  define _CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER
+# endif
+
+# if !defined(_CUDA_PIPELINE_DEBUG)
+#  if defined(__CUDACC_DEBUG__)
+#   define _CUDA_PIPELINE_DEBUG 1
+#  else
+#   define _CUDA_PIPELINE_DEBUG 0
+#  endif
+# endif
+
+# if defined(_CUDA_PIPELINE_DEBUG) && (_CUDA_PIPELINE_DEBUG == 1) && !defined(NDEBUG)
+#  if !defined(__CUDACC_RTC__)
+#   include <cassert>
+#  endif
+#  define _CUDA_PIPELINE_ASSERT(x) assert((x));
+#  define _CUDA_PIPELINE_ABORT() assert(0);
+# else
+#  define _CUDA_PIPELINE_ASSERT(x)
+#  define _CUDA_PIPELINE_ABORT() __trap();
+# endif
+
+# if defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
+#  define _CUDA_PIPELINE_STATIC_ASSERT(c, m) static_assert(c, m)
+# else
+#  define _CUDA_PIPELINE_STATIC_ASSERT(c, m)
+# endif
+
+# if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
+#  define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "r"
+# else
+#  define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "l"
+# endif
+
+# if defined(__CUDACC_RTC__)
+typedef unsigned int       uint32_t;
+typedef unsigned long long uint64_t;
+typedef uint64_t           uintptr_t;
+# else
+#  include <stdint.h>
+# endif
+
+_CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE
+
+_CUDA_PIPELINE_STATIC_ASSERT(sizeof(short) ==  2, "Size mismatch for type 'short'");
+_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int)   ==  4, "Size mismatch for type 'int'");
+_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int2)  ==  8, "Size mismatch for type 'int2'");
+_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int4)  == 16, "Size mismatch for type 'int4'");
+
+extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
+
+template<size_t CopySize, size_t SourceSize>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_memcpy_sync(void* __restrict__ dst, const void* __restrict__ src)
+{
+    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+
+    char* const d = reinterpret_cast<char*>(dst);
+    const char* const s = reinterpret_cast<const char*>(src);
+
+    size_t copy_step_size;
+    if (SourceSize == 0) {
+        copy_step_size = CopySize;
+    } else if (SourceSize == 2 || SourceSize == 4 || SourceSize == 8 || SourceSize == 16) {
+        copy_step_size = SourceSize;
+    } else {
+        copy_step_size = 1;
+    }
+
+    for (size_t i = 0; i < CopySize; i += copy_step_size) {
+        const bool copy_source = SourceSize && (i < SourceSize);
+
+        switch (copy_step_size) {
+        case 1:
+            d[i] = copy_source ? s[i] : char();
+            break;
+        case 2:
+            *reinterpret_cast<short*>(d + i) = copy_source ? *reinterpret_cast<const short*>(s + i) : short();
+            break;
+        case 4:
+            *reinterpret_cast<int*>(d + i) = copy_source ? *reinterpret_cast<const int*>(s + i) : int();
+            break;
+        case 8:
+            *reinterpret_cast<int2*>(d + i) = copy_source ? *reinterpret_cast<const int2*>(s + i) : int2();
+            break;
+        case 16:
+            *reinterpret_cast<int4*>(d + i) = copy_source ? *reinterpret_cast<const int4*>(s + i) : int4();
+            break;
+        }
+    }
+}
+
+template<bool UseHwAsyncCopy>
+struct ImplementationChooser;
+
+template<>
+struct ImplementationChooser<true> {
+    template<size_t CopySize, size_t SourceSize>
+    struct CpAsyncChooser {
+        _CUDA_PIPELINE_STATIC_QUALIFIER
+        void cp_async(void* __restrict__ dst, const void* __restrict__ src)
+        {
+            asm volatile ("cp.async.ca.shared.global [%0], [%1], %2, %3;"
+                :
+                : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(CopySize),
+                  "n"(SourceSize)
+                : "memory");
+        }
+    };
+
+    template<size_t SourceSize>
+    struct CpAsyncChooser<16, SourceSize> {
+        _CUDA_PIPELINE_STATIC_QUALIFIER
+        void cp_async(void* __restrict__ dst, const void* __restrict__ src)
+        {
+            asm volatile ("cp.async.cg.shared.global [%0], [%1], %2, %3;"
+                :
+                : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(16), "n"(SourceSize)
+                : "memory");
+        }
+    };
+
+    template<size_t CopySize, size_t SourceSize>
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
+    {
+        _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+        _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
+        _CUDA_PIPELINE_ASSERT(__isShared(dst));
+        _CUDA_PIPELINE_ASSERT(__isGlobal(src));
+        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+
+        CpAsyncChooser<CopySize, SourceSize>::cp_async(dst, src);
+    }
+
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_commit()
+    {
+        asm volatile ("cp.async.commit_group;");
+    }
+
+    template<unsigned N>
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_wait_prior()
+    {
+        asm volatile ("cp.async.wait_group %0;"
+            :
+            : "n"(N < _CUDA_PIPELINE_MAX_STAGES ? N : _CUDA_PIPELINE_MAX_STAGES));
+    }
+
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_arrive_on(uint64_t* barrier)
+    {
+        _CUDA_PIPELINE_ASSERT(__isShared(barrier));
+
+        asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];"
+            :
+            : "r"(__nvvm_get_smem_pointer(barrier)));
+    }
+};
+
+template<>
+struct ImplementationChooser<false> {
+    template<size_t CopySize, size_t SourceSize>
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
+    {
+        _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+        _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
+        _CUDA_PIPELINE_ASSERT(__isShared(dst));
+        _CUDA_PIPELINE_ASSERT(__isGlobal(src));
+        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+        _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+
+        pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
+    }
+
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_commit()
+    {
+    }
+
+    template<unsigned N>
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_wait_prior()
+    {
+    }
+
+    _CUDA_PIPELINE_STATIC_QUALIFIER
+    void pipeline_arrive_on(uint64_t* barrier)
+    {
+    }
+};
+
+template<size_t CopySize, size_t SourceSize>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src)
+{
+    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size");
+    _CUDA_PIPELINE_ASSERT(__isShared(dst));
+    _CUDA_PIPELINE_ASSERT(__isGlobal(src));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+
+    ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
+}
+
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_commit()
+{
+    ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_commit();
+}
+
+template<unsigned N>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_wait_prior()
+{
+    ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_wait_prior<N>();
+}
+
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_arrive_on(uint64_t* barrier)
+{
+    ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_arrive_on(barrier);
+}
+
+template<size_t CopySize, size_t SourceSize>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_copy_strict(void* __restrict__ dst, const void* __restrict__ src)
+{
+    _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size.");
+    _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size.");
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1)));
+
+    if (__isGlobal(src) && __isShared(dst)) {
+        pipeline_memcpy_async<CopySize, SourceSize>(dst, src);
+    } else {
+        pipeline_memcpy_sync<CopySize, SourceSize>(dst, src);
+    }
+}
+
+template<size_t CopySize, size_t Align>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline_copy_relaxed(void* __restrict__ dst, const void* __restrict__ src)
+{
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (Align - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (Align - 1)));
+
+    const char* s = reinterpret_cast<const char*>(src);
+    char* d = reinterpret_cast<char*>(dst);
+    size_t remaining = CopySize;
+
+    while (remaining) {
+        if ((Align >= 16) && (remaining >= 16)) {
+            pipeline_copy_strict<16, 16>(reinterpret_cast<void*>(d), reinterpret_cast<const void*>(s));
+            d += 16;
+            s += 16;
+            remaining -= 16;
+        } else if ((Align >= 8) && (remaining >= 8)) {
+            pipeline_copy_strict<8, 8>(reinterpret_cast<void*>(d), reinterpret_cast<const void*>(s));
+            d += 8;
+            s += 8;
+            remaining -= 8;
+        } else if ((Align >= 4) && (remaining >= 4)) {
+            pipeline_copy_strict<4, 4>(reinterpret_cast<void*>(d), reinterpret_cast<const void*>(s));
+            d += 4;
+            s += 4;
+            remaining -= 4;
+        } else if ((Align >= 2) && (remaining >= 2)) {
+            *reinterpret_cast<short*>(d) = *reinterpret_cast<const short*>(s);
+            d += 2;
+            s += 2;
+            remaining -= 2;
+        } else {
+            *d = *s;
+            d += 1;
+            s += 1;
+            remaining -= 1;
+        }
+    }
+}
+
+_CUDA_PIPELINE_END_INTERNAL_NAMESPACE
+
+#endif /* !_CUDA_PIPELINE_HELPERS_H_ */
diff --git a/ext/cudart/include/cuda_pipeline_primitives.h b/ext/cudart/include/cuda_pipeline_primitives.h
new file mode 100644
index 0000000000000000000000000000000000000000..eaba0cfb5ac9184bec5e837d2ec2f9db11d873ae
--- /dev/null
+++ b/ext/cudart/include/cuda_pipeline_primitives.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_PIPELINE_PRIMITIVES_H_
+# define _CUDA_PIPELINE_PRIMITIVES_H_
+
+# include "cuda_pipeline_helpers.h"
+
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_memcpy_async(void* __restrict__ dst_shared, const void* __restrict__ src_global, size_t size_and_align,
+                             size_t zfill = 0)
+{
+    _CUDA_PIPELINE_ASSERT(size_and_align == 4 || size_and_align == 8 || size_and_align == 16);
+    _CUDA_PIPELINE_ASSERT(zfill <= size_and_align);
+    _CUDA_PIPELINE_ASSERT(__isShared(dst_shared));
+    _CUDA_PIPELINE_ASSERT(__isGlobal(src_global));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst_shared) & (size_and_align - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src_global) & (size_and_align - 1)));
+
+    switch (size_and_align) {
+    case 16:
+        switch (zfill) {
+        case  0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 16>(dst_shared, src_global); return;
+        case  1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 15>(dst_shared, src_global); return;
+        case  2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 14>(dst_shared, src_global); return;
+        case  3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 13>(dst_shared, src_global); return;
+        case  4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 12>(dst_shared, src_global); return;
+        case  5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 11>(dst_shared, src_global); return;
+        case  6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 10>(dst_shared, src_global); return;
+        case  7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  9>(dst_shared, src_global); return;
+        case  8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  8>(dst_shared, src_global); return;
+        case  9: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  7>(dst_shared, src_global); return;
+        case 10: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  6>(dst_shared, src_global); return;
+        case 11: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  5>(dst_shared, src_global); return;
+        case 12: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  4>(dst_shared, src_global); return;
+        case 13: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  3>(dst_shared, src_global); return;
+        case 14: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  2>(dst_shared, src_global); return;
+        case 15: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  1>(dst_shared, src_global); return;
+        case 16: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  0>(dst_shared, src_global); return;
+        default: _CUDA_PIPELINE_ABORT();                                                                   return;
+        }
+    case 8:
+        switch (zfill) {
+        case  0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  8>(dst_shared, src_global); return;
+        case  1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  7>(dst_shared, src_global); return;
+        case  2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  6>(dst_shared, src_global); return;
+        case  3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  5>(dst_shared, src_global); return;
+        case  4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  4>(dst_shared, src_global); return;
+        case  5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  3>(dst_shared, src_global); return;
+        case  6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  2>(dst_shared, src_global); return;
+        case  7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  1>(dst_shared, src_global); return;
+        case  8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  0>(dst_shared, src_global); return;
+        default: _CUDA_PIPELINE_ABORT();                                                                   return;
+        }
+    case 4:
+        switch (zfill) {
+        case  0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  4>(dst_shared, src_global); return;
+        case  1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  3>(dst_shared, src_global); return;
+        case  2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  2>(dst_shared, src_global); return;
+        case  3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  1>(dst_shared, src_global); return;
+        case  4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  0>(dst_shared, src_global); return;
+        default: _CUDA_PIPELINE_ABORT();                                                                   return;
+        }
+    default:
+        _CUDA_PIPELINE_ABORT();
+        return;
+    }
+}
+
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_commit()
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
+}
+
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_wait_prior(size_t prior)
+{
+    switch (prior) {
+    case  0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); return;
+    case  1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); return;
+    case  2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); return;
+    case  3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); return;
+    case  4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); return;
+    case  5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); return;
+    case  6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); return;
+    case  7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); return;
+    default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); return;
+    }
+}
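+
+// Illustrative device-code sketch combining the primitives above. It assumes
+// a block of 256 threads and a `src` pointer to at least 256 floats in global
+// memory; the identifiers are examples only.
+//
+//     __shared__ float buf[256];
+//     __pipeline_memcpy_async(&buf[threadIdx.x], &src[threadIdx.x],
+//                             sizeof(float));   // 4-byte copy, no zfill
+//     __pipeline_commit();                      // close the current batch
+//     __pipeline_wait_prior(0);                 // wait for all batches
+//     __syncthreads();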
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+#  include "cuda_awbarrier_primitives.h"
+
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_arrive_on(__mbarrier_t* barrier)
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(barrier);
+}
+# endif
+
+#endif /* !_CUDA_PIPELINE_PRIMITIVES_H_ */
diff --git a/ext/cudart/include/cuda_runtime.h b/ext/cudart/include/cuda_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d297bb6c4140f2056618cedd9b34bdc42cd6367
--- /dev/null
+++ b/ext/cudart/include/cuda_runtime.h
@@ -0,0 +1,2725 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_RUNTIME_H__)
+#define __CUDA_RUNTIME_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__
+#endif
+
+#if !defined(__CUDACC_RTC__)
+#if defined(__GNUC__)
+#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+#pragma GCC diagnostic push
+#endif
+#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+#elif defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable: 4820)
+#endif
+#endif
+
+#ifdef __QNX__
+#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)
+typedef unsigned size_t;
+#endif
+#endif
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "crt/host_config.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "library_types.h"
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "channel_descriptor.h"
+#include "cuda_runtime_api.h"
+#include "driver_functions.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "crt/host_defines.h"
+#include "vector_functions.h"
+
+#if defined(__CUDACC__)
+
+#if defined(__CUDACC_RTC__)
+#include "nvrtc_device_runtime.h"
+#include "crt/device_functions.h"
+#include "crt/common_functions.h"
+#include "cuda_surface_types.h"
+#include "cuda_texture_types.h"
+#include "device_launch_parameters.h"
+
+#else /* !__CUDACC_RTC__ */
+#define EXCLUDE_FROM_RTC
+#include "crt/common_functions.h"
+#include "cuda_surface_types.h"
+#include "cuda_texture_types.h"
+#include "crt/device_functions.h"
+#include "device_launch_parameters.h"
+
+#if defined(__CUDACC_EXTENDED_LAMBDA__)
+#include <functional>
+#include <utility>
+struct  __device_builtin__ __nv_lambda_preheader_injection { };
+#endif /* defined(__CUDACC_EXTENDED_LAMBDA__) */
+
+#undef EXCLUDE_FROM_RTC
+#endif /* __CUDACC_RTC__ */
+
+#endif /* __CUDACC__ */
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+/** \endcond impl_private */
+
+#if defined(__cplusplus) && !defined(__CUDACC_RTC__)
+
+#if __cplusplus >= 201103
+#include <utility>
+#endif
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * \addtogroup CUDART_HIGHLEVEL
+ * @{
+ */
+
+/**
+ *\brief Launches a device function
+ *
+ * The function invokes kernel \p func on \p gridDim (\p gridDim.x &times; \p gridDim.y
+ * &times; \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x &times;
+ * \p blockDim.y &times; \p blockDim.z) threads.
+ *
+ * If the kernel has N parameters, \p args should point to an array of N pointers.
+ * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
+ *
+ * \p stream specifies a stream the invocation is associated to.
+ *
+ * \param func        - Device function symbol
+ * \param gridDim     - Grid dimensions
+ * \param blockDim    - Block dimensions
+ * \param args        - Arguments
+ * \param sharedMem   - Shared memory (defaults to 0)
+ * \param stream      - Stream identifier (defaults to NULL)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaLaunchKernel(
+  const T *func,
+  dim3 gridDim,
+  dim3 blockDim,
+  void **args,
+  size_t sharedMem = 0,
+  cudaStream_t stream = 0
+)
+{
+    return ::cudaLaunchKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream);
+}
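+
+/*
+ * Illustrative usage sketch for the wrapper above. The kernel, its arguments,
+ * and the launch dimensions are examples only:
+ *
+ *     __global__ void scale(float* data, float factor);
+ *
+ *     float* d_data = ...;  // assumed allocated with cudaMalloc beforehand
+ *     float  factor = 2.0f;
+ *     void*  args[] = { &d_data, &factor };
+ *     cudaError_t err = cudaLaunchKernel(&scale, dim3(256), dim3(128), args);
+ */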
+
+
+#if __cplusplus >= 201103 || defined(__DOXYGEN_ONLY__)
+/**
+ * \brief Launches a CUDA function with launch-time configuration
+ *
+ * Invokes the kernel \p func on \p config->gridDim (\p config->gridDim.x
+ * &times; \p config->gridDim.y &times; \p config->gridDim.z) grid of blocks.
+ * Each block contains \p config->blockDim (\p config->blockDim.x &times;
+ * \p config->blockDim.y &times; \p config->blockDim.z) threads.
+ *
+ * \p config->dynamicSmemBytes sets the amount of dynamic shared memory that
+ * will be available to each thread block.
+ *
+ * \p config->stream specifies a stream the invocation is associated to.
+ *
+ * Configuration beyond grid and block dimensions, dynamic shared memory size,
+ * and stream can be provided with the following two fields of \p config:
+ *
+ * \p config->attrs is an array of \p config->numAttrs contiguous
+ * ::cudaLaunchAttribute elements. The value of this pointer is not considered
+ * if \p config->numAttrs is zero. However, in that case, it is recommended to
+ * set the pointer to NULL.
+ * \p config->numAttrs is the number of attributes populating the first
+ * \p config->numAttrs positions of the \p config->attrs array.
+ *
+ * The kernel arguments should be passed as arguments to this function via the
+ * \p args parameter pack.
+ *
+ * The C API version of this function, \p cudaLaunchKernelExC, is also available
+ * for pre-C++11 compilers and for use cases where the ability to pass kernel
+ * parameters via void* array is preferable.
+ *
+ * \param config - Launch configuration
+ * \param func   - Kernel to launch
+ * \param args   - Parameter pack of kernel parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args) "cudaLaunchKernelEx (C API)",
+ * ::cuLaunchKernelEx
+ */
+template<typename... ExpTypes, typename... ActTypes>
+static __inline__ __host__ cudaError_t cudaLaunchKernelEx(
+  const cudaLaunchConfig_t *config,
+  void (*kernel)(ExpTypes...),
+  ActTypes &&... args
+)
+{
+    return [&](ExpTypes... coercedArgs){
+        void *pArgs[] = { &coercedArgs... };
+        return ::cudaLaunchKernelExC(config, (const void *)kernel, pArgs);
+    }(std::forward<ActTypes>(args)...);
+}
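+
+/*
+ * Illustrative usage sketch; the kernel, the launch values, and d_data (a
+ * device pointer allocated beforehand) are examples only:
+ *
+ *     __global__ void scale(float* data, float factor);
+ *
+ *     cudaLaunchConfig_t cfg{};
+ *     cfg.gridDim  = dim3(256);
+ *     cfg.blockDim = dim3(128);
+ *     // attrs/numAttrs left zero: no extra launch attributes; the default
+ *     // (NULL) stream is used.
+ *     cudaError_t err = cudaLaunchKernelEx(&cfg, scale, d_data, 2.0f);
+ */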
+# endif
+
+/**
+ *\brief Launches a device function
+ *
+ * The function invokes kernel \p func on \p gridDim (\p gridDim.x &times; \p gridDim.y
+ * &times; \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x &times;
+ * \p blockDim.y &times; \p blockDim.z) threads.
+ *
+ * The device on which this kernel is invoked must have a non-zero value for
+ * the device attribute ::cudaDevAttrCooperativeLaunch.
+ *
+ * The total number of blocks launched cannot exceed the maximum number of blocks per
+ * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * If the kernel has N parameters, \p args should point to an array of N pointers.
+ * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
+ *
+ * \p stream specifies a stream the invocation is associated to.
+ *
+ * \param func        - Device function symbol
+ * \param gridDim     - Grid dimensions
+ * \param blockDim    - Block dimensions
+ * \param args        - Arguments
+ * \param sharedMem   - Shared memory (defaults to 0)
+ * \param stream      - Stream identifier (defaults to NULL)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \ref ::cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaLaunchCooperativeKernel(
+  const T *func,
+  dim3 gridDim,
+  dim3 blockDim,
+  void **args,
+  size_t sharedMem = 0,
+  cudaStream_t stream = 0
+)
+{
+    return ::cudaLaunchCooperativeKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream);
+}
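+
+/*
+ * Illustrative usage sketch; identifiers are examples only and d_data is a
+ * device pointer allocated beforehand. The launch shape matches
+ * cudaLaunchKernel above, but the grid is sized so that all blocks can be
+ * resident at once, as required for a cooperative launch:
+ *
+ *     __global__ void kernel(float* data);
+ *
+ *     int numSms = 0, blocksPerSm = 0;
+ *     cudaDeviceGetAttribute(&numSms, cudaDevAttrMultiProcessorCount, 0);
+ *     cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, kernel,
+ *                                                   128, 0);
+ *     void* args[] = { &d_data };
+ *     cudaLaunchCooperativeKernel(&kernel, dim3(blocksPerSm * numSms),
+ *                                 dim3(128), args);
+ */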
+
+/**
+ * \brief \hl Creates an event object with the specified flags
+ *
+ * Creates an event object with the specified flags. Valid flags include:
+ * - ::cudaEventDefault: Default event creation flag.
+ * - ::cudaEventBlockingSync: Specifies that the event should use blocking
+ *   synchronization. A host thread that uses ::cudaEventSynchronize() to wait
+ *   on an event created with this flag will block until the event actually
+ *   completes.
+ * - ::cudaEventDisableTiming: Specifies that the created event does not need
+ *   to record timing data.  Events created with this flag specified and
+ *   the ::cudaEventBlockingSync flag not specified will provide the best
+ *   performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery().
+ *
+ * \param event - Newly created event
+ * \param flags - Flags for new event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent
+ */
+static __inline__ __host__ cudaError_t cudaEventCreate(
+  cudaEvent_t  *event,
+  unsigned int  flags
+)
+{
+  return ::cudaEventCreateWithFlags(event, flags);
+}
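+
+/*
+ * Illustrative usage sketch:
+ *
+ *     cudaEvent_t ev;
+ *     cudaError_t err = cudaEventCreate(&ev, cudaEventDisableTiming);
+ *     // ... record with cudaEventRecord, wait with cudaStreamWaitEvent ...
+ *     cudaEventDestroy(ev);
+ */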
+
+/**
+ * \brief \hl Allocates page-locked memory on the host
+ *
+ * Allocates \p size bytes of host memory that is page-locked and accessible
+ * to the device. The driver tracks the virtual memory ranges allocated with
+ * this function and automatically accelerates calls to functions such as
+ * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it
+ * can be read or written with much higher bandwidth than pageable memory
+ * obtained with functions such as ::malloc(). Allocating excessive amounts of
+ * pinned memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to allocate staging areas for data exchange between host
+ * and device.
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaHostAllocDefault: This flag's value is defined to be 0.
+ * - ::cudaHostAllocPortable: The memory returned by this call will be
+ * considered as pinned memory by all CUDA contexts, not just the one that
+ * performed the allocation.
+ * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space.
+ * The device pointer to the memory may be obtained by calling
+ * ::cudaHostGetDevicePointer().
+ * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC).
+ * WC memory can be transferred across the PCI Express bus more quickly on some
+ * system configurations, but cannot be read efficiently by most CPUs.  WC
+ * memory is a good option for buffers that will be written by the CPU and read
+ * by the device via mapped pinned memory or host->device transfers.
+ *
+ * All of these flags are orthogonal to one another: a developer may allocate
+ * memory that is portable, mapped and/or write-combined with no restrictions.
+ *
+ * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost
+ * flag in order for the ::cudaHostAllocMapped flag to have any effect.
+ *
+ * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices
+ * that do not support mapped pinned memory. The failure is deferred to
+ * ::cudaHostGetDevicePointer() because the memory may be mapped into other
+ * CUDA contexts via the ::cudaHostAllocPortable flag.
+ *
+ * Memory allocated by this function must be freed with ::cudaFreeHost().
+ *
+ * \param ptr   - Device pointer to allocated memory
+ * \param size  - Requested allocation size in bytes
+ * \param flags - Requested properties of allocated memory
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaSetDeviceFlags,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc
+ */
+static __inline__ __host__ cudaError_t cudaMallocHost(
+  void         **ptr,
+  size_t         size,
+  unsigned int   flags
+)
+{
+  return ::cudaHostAlloc(ptr, size, flags);
+}
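+
+/*
+ * Illustrative usage sketch; the size is an example only:
+ *
+ *     float* h_staging = NULL;
+ *     cudaError_t err = cudaMallocHost((void**)&h_staging,
+ *                                      1024 * sizeof(float),
+ *                                      cudaHostAllocPortable);
+ *     // ... fill h_staging and copy with cudaMemcpyAsync ...
+ *     cudaFreeHost(h_staging);
+ */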
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaHostAlloc(
+  T            **ptr,
+  size_t         size,
+  unsigned int   flags
+)
+{
+  return ::cudaHostAlloc((void**)(void*)ptr, size, flags);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaHostGetDevicePointer(
+  T            **pDevice,
+  void          *pHost,
+  unsigned int   flags
+)
+{
+  return ::cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags);
+}
+
+/**
+ * \brief Allocates memory that will be automatically managed by the Unified Memory system
+ *
+ * Allocates \p size bytes of managed memory on the device and returns in
+ * \p *devPtr a pointer to the allocated memory. If the device doesn't support
+ * allocating managed memory, ::cudaErrorNotSupported is returned. Support
+ * for managed memory can be queried using the device attribute
+ * ::cudaDevAttrManagedMemory. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p size
+ * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer
+ * is valid on the CPU and on all GPUs in the system that support managed memory.
+ * All accesses to this pointer must obey the Unified Memory programming model.
+ *
+ * \p flags specifies the default stream association for this allocation.
+ * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The
+ * default value for \p flags is ::cudaMemAttachGlobal.
+ * If ::cudaMemAttachGlobal is specified, then this memory is accessible from
+ * any stream on any device. If ::cudaMemAttachHost is specified, then the
+ * allocation should not be accessed from devices that have a zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to
+ * ::cudaStreamAttachMemAsync will be required to enable access on such devices.
+ *
+ * If the association is later changed via ::cudaStreamAttachMemAsync to
+ * a single stream, the default association, as specified during ::cudaMallocManaged,
+ * is restored when that stream is destroyed. For __managed__ variables, the
+ * default association is always ::cudaMemAttachGlobal. Note that destroying a
+ * stream is an asynchronous operation, and as a result, the change to default
+ * association won't happen until all work in the stream has completed.
+ *
+ * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree.
+ *
+ * Device memory oversubscription is possible for GPUs that have a non-zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on
+ * such GPUs may be evicted from device memory to host memory at any time by the Unified
+ * Memory driver in order to make room for other allocations.
+ *
+ * In a multi-GPU system where all GPUs have a non-zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this
+ * API returns and instead may be populated on access. In such systems, managed memory can
+ * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
+ * maintain data locality and prevent excessive page faults to the extent possible. The application
+ * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application
+ * can also explicitly migrate memory to a desired processor's memory via
+ * ::cudaMemPrefetchAsync.
+ *
+ * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support
+ * with each other, the physical storage for managed memory is created on the GPU which is active
+ * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced
+ * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
+ * memory among such GPUs.
+ *
+ * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
+ * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess
+ * is zero for at least one of those GPUs, the location chosen for physical storage of managed
+ * memory is system-dependent.
+ * - On Linux, the location chosen will be device memory as long as the current set of active
+ * contexts are on devices that either have peer-to-peer support with each other or have a
+ * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If there is an active context on a GPU that does not have a non-zero value for that device
+ * attribute and it does not have peer-to-peer support with the other devices that have active
+ * contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
+ * Note that this means that managed memory that is located in device memory is migrated to
+ * host memory if a new context is created on a GPU that doesn't have a non-zero value for
+ * the device attribute and does not support peer-to-peer with at least one of the other devices
+ * that has an active context. This in turn implies that context creation may fail if there is
+ * insufficient host memory to migrate all managed allocations.
+ * - On Windows, the physical storage is always created in 'zero-copy' or host memory.
+ * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
+ * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
+ * restrict CUDA to only use those GPUs that have peer-to-peer support.
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero
+ * value to force the driver to always use device memory for physical storage.
+ * When this environment variable is set to a non-zero value, all devices used in
+ * that process that support managed memory have to be peer-to-peer compatible
+ * with each other. The error ::cudaErrorInvalidDevice will be returned if a device
+ * that supports managed memory is used and it is not peer-to-peer compatible with
+ * any of the other managed memory supporting devices that were previously used in
+ * that process, even if ::cudaDeviceReset has been called on those devices. These
+ * environment variables are described in the CUDA programming guide under the
+ * "CUDA environment variables" section.
+ * - On ARM, managed memory is not available on the discrete GPU with Drive PX-2.
+ *
+ * \param devPtr - Pointer to allocated device memory
+ * \param size   - Requested allocation size in bytes
+ * \param flags  - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
+ * ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocManaged(
+  T            **devPtr,
+  size_t         size,
+  unsigned int   flags = cudaMemAttachGlobal
+)
+{
+  return ::cudaMallocManaged((void**)(void*)devPtr, size, flags);
+}
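+
+/* Example: a minimal sketch of the typed cudaMallocManaged overload above.
+ * The kernel name `scale` and the element count are illustrative only and
+ * assumed to be defined elsewhere.
+ *
+ *   float *data = nullptr;
+ *   if (cudaMallocManaged(&data, 1024 * sizeof(float)) == cudaSuccess) {
+ *     for (int i = 0; i < 1024; ++i) data[i] = 1.0f;  // CPU writes the managed buffer
+ *     scale<<<4, 256>>>(data, 1024);                  // GPU reads/writes the same pointer
+ *     cudaDeviceSynchronize();                        // synchronize before touching data on the CPU again
+ *     cudaFree(data);
+ *   }
+ */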
+
+/**
+ * \brief Attach memory to a stream asynchronously
+ *
+ * Enqueues an operation in \p stream to specify stream association of
+ * \p length bytes of memory starting from \p devPtr. This function is a
+ * stream-ordered operation, meaning that it is dependent on, and will
+ * only take effect when, previous work in \p stream has completed. Any
+ * previous association is automatically replaced.
+ *
+ * \p devPtr must point to one of the following types of memory:
+ * - managed memory declared using the __managed__ keyword or allocated with
+ *   ::cudaMallocManaged.
+ * - a valid host-accessible region of system-allocated pageable memory. This
+ *   type of memory may only be specified if the device associated with the
+ *   stream reports a non-zero value for the device attribute
+ *   ::cudaDevAttrPageableMemoryAccess.
+ *
+ * For managed allocations, \p length must be either zero or the entire
+ * allocation's size. Both indicate that the entire allocation's stream
+ * association is being changed. Currently, it is not possible to change stream
+ * association for a portion of a managed allocation.
+ *
+ * For pageable allocations, \p length must be non-zero.
+ *
+ * The stream association is specified using \p flags, which must be
+ * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle.
+ * The default value for \p flags is ::cudaMemAttachSingle.
+ * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed
+ * by any stream on any device.
+ * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee
+ * that it won't access the memory on the device from any stream on a device that
+ * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with
+ * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess,
+ * the program makes a guarantee that it will only access the memory on the device
+ * from \p stream. It is illegal to attach singly to the NULL stream, because the
+ * NULL stream is a virtual global stream and not a specific stream. An error will
+ * be returned in this case.
+ *
+ * When memory is associated with a single stream, the Unified Memory system will
+ * allow CPU access to this memory region so long as all operations in \p stream
+ * have completed, regardless of whether other streams are active. In effect,
+ * this constrains exclusive ownership of the managed memory region by
+ * an active GPU to per-stream activity instead of whole-GPU activity.
+ *
+ * Accessing memory on the device from streams that are not associated with
+ * it will produce undefined results. No error checking is performed by the
+ * Unified Memory system to ensure that kernels launched into other streams
+ * do not access this region.
+ *
+ * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync
+ * via events, synchronization or other means to ensure legal access to memory
+ * at all times. Data visibility and coherency will be changed appropriately
+ * for all kernels which follow a stream-association change.
+ *
+ * If \p stream is destroyed while data is associated with it, the association is
+ * removed and the association reverts to the default visibility of the allocation
+ * as specified at ::cudaMallocManaged. For __managed__ variables, the default
+ * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an
+ * asynchronous operation, and as a result, the change to default association won't
+ * happen until all work in the stream has completed.
+ *
+ * \param stream  - Stream in which to enqueue the attach operation
+ * \param devPtr  - Pointer to memory (must be a pointer to managed memory or
+ *                  to a valid host-accessible region of system-allocated
+ *                  memory)
+ * \param length  - Length of memory (defaults to zero)
+ * \param flags   - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(
+  cudaStream_t   stream,
+  T              *devPtr,
+  size_t         length = 0,
+  unsigned int   flags  = cudaMemAttachSingle
+)
+{
+  return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
+}
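+
+/* Example: a minimal sketch of attaching a managed allocation to a single
+ * stream via the typed overload above. The kernel name and sizes are
+ * illustrative only.
+ *
+ *   int *buf = nullptr;
+ *   cudaStream_t stream;
+ *   cudaStreamCreate(&stream);
+ *   cudaMallocManaged(&buf, 256 * sizeof(int), cudaMemAttachHost);
+ *   cudaStreamAttachMemAsync(stream, buf);      // length 0 == whole allocation, cudaMemAttachSingle
+ *   kernel<<<1, 256, 0, stream>>>(buf);         // only this stream may touch buf on the device
+ *   cudaStreamSynchronize(stream);              // after this, the CPU may access buf again
+ *   cudaFree(buf);
+ *   cudaStreamDestroy(stream);
+ */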
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMalloc(
+  T      **devPtr,
+  size_t   size
+)
+{
+  return ::cudaMalloc((void**)(void*)devPtr, size);
+}
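+
+/* Example: the typed cudaMalloc overload above avoids the explicit void** cast
+ * required by the C API (the buffer size is illustrative).
+ *
+ *   double *d_values = nullptr;
+ *   cudaError_t err = cudaMalloc(&d_values, 4096 * sizeof(double));
+ *   if (err == cudaSuccess) {
+ *     // ... use d_values in kernels or cudaMemcpy calls ...
+ *     cudaFree(d_values);
+ *   }
+ */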
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocHost(
+  T            **ptr,
+  size_t         size,
+  unsigned int   flags = 0
+)
+{
+  return cudaMallocHost((void**)(void*)ptr, size, flags);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocPitch(
+  T      **devPtr,
+  size_t  *pitch,
+  size_t   width,
+  size_t   height
+)
+{
+  return ::cudaMallocPitch((void**)(void*)devPtr, pitch, width, height);
+}
+
+/**
+ * \brief Allocate from a pool
+ *
+ * This is an alternate spelling for ::cudaMallocFromPoolAsync,
+ * made available through function overloading.
+ *
+ * \sa ::cudaMallocFromPoolAsync,
+ * \ref ::cudaMallocAsync(void** ptr, size_t size, cudaStream_t hStream)  "cudaMallocAsync (C API)"
+ */
+static __inline__ __host__ cudaError_t cudaMallocAsync(
+  void        **ptr,
+  size_t        size,
+  cudaMemPool_t memPool,
+  cudaStream_t  stream
+)
+{
+  return ::cudaMallocFromPoolAsync(ptr, size, memPool, stream);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocAsync(
+  T           **ptr,
+  size_t        size,
+  cudaMemPool_t memPool,
+  cudaStream_t  stream
+)
+{
+  return ::cudaMallocFromPoolAsync((void**)(void*)ptr, size, memPool, stream);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocAsync(
+  T           **ptr,
+  size_t        size,
+  cudaStream_t  stream
+)
+{
+  return ::cudaMallocAsync((void**)(void*)ptr, size, stream);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocFromPoolAsync(
+  T           **ptr,
+  size_t        size,
+  cudaMemPool_t memPool,
+  cudaStream_t  stream
+)
+{
+  return ::cudaMallocFromPoolAsync((void**)(void*)ptr, size, memPool, stream);
+}
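+
+/* Example: a minimal sketch of a stream-ordered allocation from the device's
+ * default memory pool using the typed overloads above (device 0 and the size
+ * are illustrative).
+ *
+ *   cudaMemPool_t pool;
+ *   cudaStream_t  stream;
+ *   float        *scratch = nullptr;
+ *   cudaDeviceGetDefaultMemPool(&pool, 0);
+ *   cudaStreamCreate(&stream);
+ *   cudaMallocFromPoolAsync(&scratch, 1 << 20, pool, stream);  // allocation is ordered in `stream`
+ *   // ... launch work that uses `scratch` into `stream` ...
+ *   cudaFreeAsync(scratch, stream);                            // free is also stream-ordered
+ *   cudaStreamSynchronize(stream);
+ *   cudaStreamDestroy(stream);
+ */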
+
+#if defined(__CUDACC__)
+
+/**
+ * \brief \hl Copies data to the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src
+ * to the memory area \p offset bytes from the start of symbol
+ * \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
+ *
+ * \param symbol - Device symbol reference
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_sync
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemcpyToSymbol(
+  const T                   &symbol,
+  const void                *src,
+        size_t               count,
+        size_t               offset = 0,
+        enum cudaMemcpyKind  kind   = cudaMemcpyHostToDevice
+)
+{
+  return ::cudaMemcpyToSymbol((const void*)&symbol, src, count, offset, kind);
+}
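+
+/* Example: copying host data into a __constant__ symbol through the typed
+ * overload above (the symbol and array contents are illustrative).
+ *
+ *   __constant__ float c_coeffs[16];
+ *
+ *   void upload_coeffs(const float (&host_coeffs)[16])
+ *   {
+ *     // The symbol is passed by reference; no address-of or string name is needed.
+ *     cudaMemcpyToSymbol(c_coeffs, host_coeffs, sizeof(host_coeffs));
+ *   }
+ */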
+
+/**
+ * \brief \hl Copies data to the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src
+ * to the memory area \p offset bytes from the start of symbol
+ * \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
+ *
+ * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy
+ * may overlap with operations in other streams.
+ *
+ * \param symbol - Device symbol reference
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyFromSymbolAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync(
+  const T                   &symbol,
+  const void                *src,
+        size_t               count,
+        size_t               offset = 0,
+        enum cudaMemcpyKind  kind   = cudaMemcpyHostToDevice,
+        cudaStream_t         stream = 0
+)
+{
+  return ::cudaMemcpyToSymbolAsync((const void*)&symbol, src, count, offset, kind, stream);
+}
+
+/**
+ * \brief \hl Copies data from the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area \p offset bytes
+ * from the start of symbol \p symbol to the memory area pointed to by \p dst.
+ * The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
+ *
+ * \param dst    - Destination memory address
+ * \param symbol - Device symbol reference
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_sync
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemcpyFromSymbol(
+        void                *dst,
+  const T                   &symbol,
+        size_t               count,
+        size_t               offset = 0,
+        enum cudaMemcpyKind  kind   = cudaMemcpyDeviceToHost
+)
+{
+  return ::cudaMemcpyFromSymbol(dst, (const void*)&symbol, count, offset, kind);
+}
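+
+/* Example: reading a __device__ variable back to the host with the typed
+ * overload above (the symbol name is illustrative).
+ *
+ *   __device__ unsigned int d_error_count;
+ *
+ *   unsigned int read_error_count()
+ *   {
+ *     unsigned int host_count = 0;
+ *     cudaMemcpyFromSymbol(&host_count, d_error_count, sizeof(host_count));
+ *     return host_count;
+ *   }
+ */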
+
+/**
+ * \brief \hl Copies data from the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area \p offset bytes
+ * from the start of symbol \p symbol to the memory area pointed to by \p dst.
+ * The memory areas may not overlap. \p symbol is a variable that resides in
+ * global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
+ *
+ * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap
+ * with operations in other streams.
+ *
+ * \param dst    - Destination memory address
+ * \param symbol - Device symbol reference
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync(
+        void                *dst,
+  const T                   &symbol,
+        size_t               count,
+        size_t               offset = 0,
+        enum cudaMemcpyKind  kind   = cudaMemcpyDeviceToHost,
+        cudaStream_t         stream = 0
+)
+{
+  return ::cudaMemcpyFromSymbolAsync(dst, (const void*)&symbol, count, offset, kind, stream);
+}
+
+/**
+ * \brief Creates a memcpy node to copy to a symbol on the device and adds it to a graph
+ *
+ * Creates a new memcpy node to copy to \p symbol and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p src to the memory area \p offset bytes from the start
+ * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyToSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphAddMemcpyNodeToSymbol(
+    cudaGraphNode_t *pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies,
+    size_t numDependencies,
+    const T &symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  return ::cudaGraphAddMemcpyNodeToSymbol(pGraphNode, graph, pDependencies, numDependencies, (const void*)&symbol, src, count, offset, kind);
+}
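+
+/* Example: a minimal sketch of building a one-node graph that copies to a
+ * __constant__ symbol. The symbol, host buffer and `stream` (assumed to be a
+ * previously created stream) are illustrative.
+ *
+ *   __constant__ int c_table[64];
+ *   int h_table[64] = { 0 };
+ *
+ *   cudaGraph_t     graph;
+ *   cudaGraphNode_t copyNode;
+ *   cudaGraphExec_t graphExec;
+ *   cudaGraphCreate(&graph, 0);
+ *   cudaGraphAddMemcpyNodeToSymbol(&copyNode, graph, nullptr, 0,
+ *                                  c_table, h_table, sizeof(h_table), 0,
+ *                                  cudaMemcpyHostToDevice);
+ *   cudaGraphInstantiate(&graphExec, graph, nullptr, nullptr, 0);
+ *   cudaGraphLaunch(graphExec, stream);
+ */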
+
+/**
+ * \brief Creates a memcpy node to copy from a symbol on the device and adds it to a graph
+ *
+ * Creates a new memcpy node to copy from \p symbol and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory
+ * area \p offset bytes from the start of symbol \p symbol to the memory area
+ * pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
+ * that resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyFromSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphAddMemcpyNodeFromSymbol(
+    cudaGraphNode_t* pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t* pDependencies,
+    size_t numDependencies,
+    void* dst,
+    const T &symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  return ::cudaGraphAddMemcpyNodeFromSymbol(pGraphNode, graph, pDependencies, numDependencies, dst, (const void*)&symbol, count, offset, kind);
+}
+
+/**
+ * \brief Sets a memcpy node's parameters to copy to a symbol on the device
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p src to the memory area \p offset bytes from the start
+ * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * \param node            - Node to set the parameters for
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyToSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphMemcpyNodeSetParamsToSymbol(
+    cudaGraphNode_t node,
+    const T &symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  return ::cudaGraphMemcpyNodeSetParamsToSymbol(node, (const void*)&symbol, src, count, offset, kind);
+}
+
+/**
+ * \brief Sets a memcpy node's parameters to copy from a symbol on the device
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory
+ * area \p offset bytes from the start of symbol \p symbol to the memory area
+ * pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
+ * that resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param node            - Node to set the parameters for
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphMemcpyNodeSetParamsFromSymbol(
+    cudaGraphNode_t node,
+    void* dst,
+    const T &symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  return ::cudaGraphMemcpyNodeSetParamsFromSymbol(node, dst, (const void*)&symbol, count, offset, kind);
+}
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the device
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p src and \p symbol must be allocated from the same contexts as the original source and
+ * destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphExecMemcpyNodeSetParamsToSymbol(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    const T &symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+    return ::cudaGraphExecMemcpyNodeSetParamsToSymbol(hGraphExec, node, (const void*)&symbol, src, count, offset, kind);
+}
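+
+/* Example: a minimal sketch of retargeting the copy source of an already
+ * instantiated graph without rebuilding it. `graphExec`, `copyNode`, `c_table`,
+ * `h_table_updated` and `stream` are assumed to come from code like the sketch
+ * shown earlier for ::cudaGraphAddMemcpyNodeToSymbol.
+ *
+ *   cudaGraphExecMemcpyNodeSetParamsToSymbol(graphExec, copyNode,
+ *                                            c_table, h_table_updated,
+ *                                            sizeof(h_table_updated), 0,
+ *                                            cudaMemcpyHostToDevice);
+ *   cudaGraphLaunch(graphExec, stream);       // the next launch uses the new source
+ */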
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to copy from a symbol on the device
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p symbol and \p dst must be allocated from the same contexts as the original source and
+ * destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphExecMemcpyNodeSetParamsFromSymbol(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    void* dst,
+    const T &symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  return ::cudaGraphExecMemcpyNodeSetParamsFromSymbol(hGraphExec, node, dst, (const void*)&symbol, count, offset, kind);
+}
+
+#if __cplusplus >= 201103
+
+/**
+ * \brief Creates a user object by wrapping a C++ object
+ *
+ * Creates a user object that wraps \p objectToWrap. The destroy callback
+ * passed to ::cudaUserObjectCreate is a lambda that calls delete on
+ * \p objectToWrap, so the wrapped object must have been allocated with new.
+ *
+ * \param object_out      - Location to return the user object handle
+ * \param objectToWrap    - This becomes the \p ptr argument to ::cudaUserObjectCreate. A
+ *                          lambda will be passed for the \p destroy argument, which calls
+ *                          delete on this object pointer.
+ * \param initialRefcount - The initial refcount to create the object with, typically 1. The
+ *                          initial references are owned by the calling thread.
+ * \param flags           - Currently it is required to pass cudaUserObjectNoDestructorSync,
+ *                          which is the only defined flag. This indicates that the destroy
+ *                          callback cannot be waited on by any CUDA API. Users requiring
+ *                          synchronization of the callback should signal its completion
+ *                          manually.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaUserObjectCreate(
+    cudaUserObject_t *object_out,
+    T *objectToWrap,
+    unsigned int initialRefcount,
+    unsigned int flags)
+{
+    return ::cudaUserObjectCreate(
+            object_out,
+            objectToWrap,
+            [](void *vpObj) { delete reinterpret_cast<T *>(vpObj); },
+            initialRefcount,
+            flags);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaUserObjectCreate(
+    cudaUserObject_t *object_out,
+    T *objectToWrap,
+    unsigned int initialRefcount,
+    cudaUserObjectFlags flags)
+{
+    return cudaUserObjectCreate(object_out, objectToWrap, initialRefcount, (unsigned int)flags);
+}
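+
+/* Example: a minimal sketch of tying the lifetime of a heap-allocated helper
+ * object to a graph. `FrameState` is an illustrative user-defined type and
+ * `graph` is an existing cudaGraph_t.
+ *
+ *   struct FrameState { float *staging; size_t count; };
+ *
+ *   FrameState      *state = new FrameState();
+ *   cudaUserObject_t userObj;
+ *   cudaUserObjectCreate(&userObj, state, 1, cudaUserObjectNoDestructorSync);
+ *   // Move the single reference into the graph; `delete state` runs via the
+ *   // wrapping lambda once the graph releases the object.
+ *   cudaGraphRetainUserObject(graph, userObj, 1, cudaGraphUserObjectMove);
+ */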
+
+#endif
+
+/**
+ * \brief \hl Finds the address associated with a CUDA symbol
+ *
+ * Returns in \p *devPtr the address of symbol \p symbol on the device.
+ * \p symbol can either be a variable that resides in global or constant memory space.
+ * If \p symbol cannot be found, or if \p symbol is not declared
+ * in the global or constant memory space, \p *devPtr is unchanged and the error
+ * ::cudaErrorInvalidSymbol is returned.
+ *
+ * \param devPtr - Return device pointer associated with symbol
+ * \param symbol - Device symbol reference
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)",
+ * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGetSymbolAddress(
+        void **devPtr,
+  const T     &symbol
+)
+{
+  return ::cudaGetSymbolAddress(devPtr, (const void*)&symbol);
+}
+
+/**
+ * \brief \hl Finds the size of the object associated with a CUDA symbol
+ *
+ * Returns in \p *size the size of symbol \p symbol. \p symbol must be a
+ * variable that resides in global or constant memory space.
+ * If \p symbol cannot be found, or if \p symbol is not declared
+ * in global or constant memory space, \p *size is unchanged and the error
+ * ::cudaErrorInvalidSymbol is returned.
+ *
+ * \param size   - Size of object associated with symbol
+ * \param symbol - Device symbol reference
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)",
+ * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGetSymbolSize(
+        size_t *size,
+  const T      &symbol
+)
+{
+  return ::cudaGetSymbolSize(size, (const void*)&symbol);
+}
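+
+/* Example: a minimal sketch combining the two overloads above to zero a
+ * __device__ array without hard-coding its size (the symbol is illustrative).
+ *
+ *   __device__ float d_table[256];
+ *
+ *   cudaError_t clear_table()
+ *   {
+ *     void  *ptr  = nullptr;
+ *     size_t size = 0;
+ *     cudaError_t err = cudaGetSymbolAddress(&ptr, d_table);
+ *     if (err == cudaSuccess) err = cudaGetSymbolSize(&size, d_table);
+ *     if (err == cudaSuccess) err = cudaMemset(ptr, 0, size);
+ *     return err;
+ *   }
+ */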
+
+/**
+ * \brief \hl Binds a memory area to a texture
+ *
+ * Binds \p size bytes of the memory area pointed to by \p devPtr to texture
+ * reference \p tex. \p desc describes how the memory is interpreted when
+ * fetching values from the texture. The \p offset parameter is an optional
+ * byte offset as with the low-level
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()"
+ * function. Any memory previously bound to \p tex is unbound.
+ *
+ * \param offset - Offset in bytes
+ * \param tex    - Texture to bind
+ * \param devPtr - Memory area on device
+ * \param desc   - Channel format
+ * \param size   - Size of the memory area pointed to by devPtr
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture(
+        size_t                           *offset,
+  const struct texture<T, dim, readMode> &tex,
+  const void                             *devPtr,
+  const struct cudaChannelFormatDesc     &desc,
+        size_t                            size = UINT_MAX
+)
+{
+  return ::cudaBindTexture(offset, &tex, devPtr, &desc, size);
+}
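+
+/* Example: a minimal sketch of binding linear device memory to a (deprecated)
+ * texture reference using the overload above. Names and sizes are illustrative;
+ * texture objects are the non-deprecated alternative.
+ *
+ *   texture<float, 1, cudaReadModeElementType> texRef;
+ *
+ *   void bind_buffer(const float *d_buf, size_t count)
+ *   {
+ *     cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ *     size_t offset = 0;
+ *     cudaBindTexture(&offset, texRef, d_buf, desc, count * sizeof(float));
+ *   }
+ */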
+
+/**
+ * \brief \hl Binds a memory area to a texture
+ *
+ * Binds \p size bytes of the memory area pointed to by \p devPtr to texture
+ * reference \p tex. The channel descriptor is inherited from the texture
+ * reference type. The \p offset parameter is an optional byte offset as with
+ * the low-level
+ * ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t)
+ * function. Any memory previously bound to \p tex is unbound.
+ *
+ * \param offset - Offset in bytes
+ * \param tex    - Texture to bind
+ * \param devPtr - Memory area on device
+ * \param size   - Size of the memory area pointed to by devPtr
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture(
+        size_t                           *offset,
+  const struct texture<T, dim, readMode> &tex,
+  const void                             *devPtr,
+        size_t                            size = UINT_MAX
+)
+{
+  return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size);
+}
+
+/**
+ * \brief \hl Binds a 2D memory area to a texture
+ *
+ * Binds the 2D memory area pointed to by \p devPtr to the
+ * texture reference \p tex. The size of the area is constrained by
+ * \p width in texel units, \p height in texel units, and \p pitch in byte
+ * units. \p desc describes how the memory is interpreted when fetching values
+ * from the texture. Any memory previously bound to \p tex is unbound.
+ *
+ * Since the hardware enforces an alignment requirement on texture base
+ * addresses,
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D()"
+ * returns in \p *offset a byte offset that
+ * must be applied to texture fetches in order to read from the desired memory.
+ * This offset must be divided by the texel size and passed to kernels that
+ * read from the texture so it can be applied to the ::tex2D() function.
+ * If the device memory pointer was returned from ::cudaMalloc(), the offset is
+ * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
+ *
+ * \param offset - Offset in bytes
+ * \param tex    - Texture reference to bind
+ * \param devPtr - 2D memory area on device
+ * \param desc   - Channel format
+ * \param width  - Width in texel units
+ * \param height - Height in texel units
+ * \param pitch  - Pitch in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture2D(
+        size_t                           *offset,
+  const struct texture<T, dim, readMode> &tex,
+  const void                             *devPtr,
+  const struct cudaChannelFormatDesc     &desc,
+  size_t                                  width,
+  size_t                                  height,
+  size_t                                  pitch
+)
+{
+  return ::cudaBindTexture2D(offset, &tex, devPtr, &desc, width, height, pitch);
+}
+
+/**
+ * \brief \hl Binds a 2D memory area to a texture
+ *
+ * Binds the 2D memory area pointed to by \p devPtr to the
+ * texture reference \p tex. The size of the area is constrained by
+ * \p width in texel units, \p height in texel units, and \p pitch in byte
+ * units. The channel descriptor is inherited from the texture reference
+ * type. Any memory previously bound to \p tex is unbound.
+ *
+ * Since the hardware enforces an alignment requirement on texture base
+ * addresses,
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D()"
+ * returns in \p *offset a byte offset that
+ * must be applied to texture fetches in order to read from the desired memory.
+ * This offset must be divided by the texel size and passed to kernels that
+ * read from the texture so it can be applied to the ::tex2D() function.
+ * If the device memory pointer was returned from ::cudaMalloc(), the offset is
+ * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
+ *
+ * \param offset - Offset in bytes
+ * \param tex    - Texture reference to bind
+ * \param devPtr - 2D memory area on device
+ * \param width  - Width in texel units
+ * \param height - Height in texel units
+ * \param pitch  - Pitch in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture2D(
+        size_t                           *offset,
+  const struct texture<T, dim, readMode> &tex,
+  const void                             *devPtr,
+  size_t                                  width,
+  size_t                                  height,
+  size_t                                  pitch
+)
+{
+  return ::cudaBindTexture2D(offset, &tex, devPtr, &tex.channelDesc, width, height, pitch);
+}
+
+/**
+ * \brief \hl Binds an array to a texture
+ *
+ * Binds the CUDA array \p array to the texture reference \p tex.
+ * \p desc describes how the memory is interpreted when fetching values from
+ * the texture. Any CUDA array previously bound to \p tex is unbound.
+ *
+ * \param tex   - Texture to bind
+ * \param array - Memory array on device
+ * \param desc  - Channel format
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToArray(
+  const struct texture<T, dim, readMode> &tex,
+  cudaArray_const_t                       array,
+  const struct cudaChannelFormatDesc     &desc
+)
+{
+  return ::cudaBindTextureToArray(&tex, array, &desc);
+}
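+
+/* Example: a minimal sketch of binding a CUDA array to a (deprecated) texture
+ * reference with an explicit channel descriptor (the element type and the
+ * reference name are illustrative).
+ *
+ *   texture<uchar4, 2, cudaReadModeNormalizedFloat> texImage;
+ *
+ *   void bind_image(cudaArray_t array)
+ *   {
+ *     cudaChannelFormatDesc desc = cudaCreateChannelDesc<uchar4>();
+ *     cudaBindTextureToArray(texImage, array, desc);
+ *   }
+ */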
+
+/**
+ * \brief \hl Binds an array to a texture
+ *
+ * Binds the CUDA array \p array to the texture reference \p tex.
+ * The channel descriptor is inherited from the CUDA array. Any CUDA array
+ * previously bound to \p tex is unbound.
+ *
+ * \param tex   - Texture to bind
+ * \param array - Memory array on device
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToArray(
+  const struct texture<T, dim, readMode> &tex,
+  cudaArray_const_t                       array
+)
+{
+  struct cudaChannelFormatDesc desc;
+  cudaError_t                  err = ::cudaGetChannelDesc(&desc, array);
+
+  return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : err;
+}
+
+/**
+ * \brief \hl Binds a mipmapped array to a texture
+ *
+ * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex.
+ * \p desc describes how the memory is interpreted when fetching values from
+ * the texture. Any CUDA mipmapped array previously bound to \p tex is unbound.
+ *
+ * \param tex            - Texture to bind
+ * \param mipmappedArray - Memory mipmapped array on device
+ * \param desc           - Channel format
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray(
+  const struct texture<T, dim, readMode> &tex,
+  cudaMipmappedArray_const_t              mipmappedArray,
+  const struct cudaChannelFormatDesc     &desc
+)
+{
+  return ::cudaBindTextureToMipmappedArray(&tex, mipmappedArray, &desc);
+}
+
+/**
+ * \brief \hl Binds a mipmapped array to a texture
+ *
+ * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex.
+ * The channel descriptor is inherited from the CUDA array. Any CUDA mipmapped array
+ * previously bound to \p tex is unbound.
+ *
+ * \param tex            - Texture to bind
+ * \param mipmappedArray - Memory mipmapped array on device
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray(
+  const struct texture<T, dim, readMode> &tex,
+  cudaMipmappedArray_const_t              mipmappedArray
+)
+{
+  struct cudaChannelFormatDesc desc;
+  cudaArray_t                  levelArray;
+  cudaError_t                  err = ::cudaGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0);
+  
+  if (err != cudaSuccess) {
+      return err;
+  }
+  err = ::cudaGetChannelDesc(&desc, levelArray);
+
+  return err == cudaSuccess ? cudaBindTextureToMipmappedArray(tex, mipmappedArray, desc) : err;
+}
+
+/**
+ * \brief \hl Unbinds a texture
+ *
+ * Unbinds the texture bound to \p tex. If \p tex is not currently bound, no operation is performed.
+ *
+ * \param tex - Texture to unbind
+ *
+ * \return 
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaUnbindTexture(
+  const struct texture<T, dim, readMode> &tex
+)
+{
+  return ::cudaUnbindTexture(&tex);
+}
+
+/**
+ * \brief \hl Get the alignment offset of a texture
+ *
+ * Returns in \p *offset the offset that was returned when texture reference
+ * \p tex was bound.
+ *
+ * \param offset - Offset of texture reference in bytes
+ * \param tex    - Texture to get offset of
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidTexture,
+ * ::cudaErrorInvalidTextureBinding
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset(
+        size_t                           *offset,
+  const struct texture<T, dim, readMode> &tex
+)
+{
+  return ::cudaGetTextureAlignmentOffset(offset, &tex);
+}
+
+/**
+ * \brief \hl Sets the preferred cache configuration for a device function
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache configuration
+ * for the function specified via \p func. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute \p func.
+ *
+ * \p func must be a pointer to a function that executes on the device.
+ * The parameter specified by \p func must be declared as a \p __global__
+ * function. If the specified function does not exist,
+ * then ::cudaErrorInvalidDeviceFunction is returned.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ *
+ * \param func        - device function pointer
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
+ * ::cudaSetDoubleForDevice,
+ * ::cudaSetDoubleForHost,
+ * ::cudaThreadGetCacheConfig,
+ * ::cudaThreadSetCacheConfig
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaFuncSetCacheConfig(
+  T                  *func,
+  enum cudaFuncCache  cacheConfig
+)
+{
+  return ::cudaFuncSetCacheConfig((const void*)func, cacheConfig);
+}
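+
+// A minimal host-side sketch for the wrapper above, assuming a hypothetical
+// __global__ kernel named myKernel compiled with nvcc. The call only states
+// a preference; the runtime may still choose another configuration.
+#if 0  // illustrative sketch; excluded from compilation
+__global__ void myKernel(float *data) { data[threadIdx.x] *= 2.0f; }
+
+void configureCacheAndLaunch(float *devPtr)
+{
+    // Request a larger shared-memory carveout for this kernel (a hint only).
+    cudaError_t err = cudaFuncSetCacheConfig(myKernel, cudaFuncCachePreferShared);
+    if (err == cudaSuccess) {
+        myKernel<<<1, 128>>>(devPtr);
+    }
+}
+#endif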
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaFuncSetSharedMemConfig(
+  T                        *func,
+  enum cudaSharedMemConfig  config
+)
+{
+  return ::cudaFuncSetSharedMemConfig((const void*)func, config);
+}
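+
+// The undocumented wrapper above forwards to the C API in the same way; a
+// sketch that requests eight-byte shared-memory banks for a hypothetical
+// kernel myKernel (also just a hint, ignored where the bank size is fixed).
+#if 0  // illustrative sketch; excluded from compilation
+__global__ void myKernel(double *data) { data[threadIdx.x] += 1.0; }
+
+void configureSharedMemBanks()
+{
+    cudaFuncSetSharedMemConfig(myKernel, cudaSharedMemBankSizeEightByte);
+}
+#endif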
+
+#endif // __CUDACC__
+
+/**
+ * \brief Returns occupancy for a device function
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    int   *numBlocks,
+    T      func,
+    int    blockSize,
+    size_t dynamicSMemSize)
+{
+    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault);
+}
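+
+// A host-side sketch, assuming a hypothetical kernel myKernel, a block size
+// of 256 threads and no dynamic shared memory; the per-SM block count is
+// combined with the SM count to estimate a device-wide figure.
+#if 0  // illustrative sketch; excluded from compilation
+#include <cstdio>
+
+__global__ void myKernel(const float *in, float *out)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    out[i] = in[i];
+}
+
+void reportOccupancy()
+{
+    int numBlocks = 0;
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, myKernel, 256, 0);
+
+    cudaDeviceProp prop;
+    cudaGetDeviceProperties(&prop, 0);
+    printf("Up to %d blocks of 256 threads per SM (%d device-wide)\n",
+           numBlocks, numBlocks * prop.multiProcessorCount);
+}
+#endif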
+
+/**
+ * \brief Returns occupancy for a device function with the specified flags
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ *   ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ *
+ * - ::cudaOccupancyDisableCachingOverride: suppresses the default behavior
+ *   on platforms where global caching affects occupancy. On such platforms, if caching
+ *   is enabled, but per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching is disabled.
+ *   Setting this flag makes the occupancy calculator return 0 in such cases.
+ *   More information can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param flags           - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int         *numBlocks,
+    T            func,
+    int          blockSize,
+    size_t       dynamicSMemSize,
+    unsigned int flags)
+{
+    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, flags);
+}
+
+/**
+ * Helper functor for cudaOccupancyMaxPotentialBlockSize
+ */
+class __cudaOccupancyB2DHelper {
+  size_t n;
+public:
+  inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {}
+  inline __host__ CUDART_DEVICE size_t operator()(int)
+  {
+      return n;
+  }
+};
+
+/**
+ * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
+ *
+ * Returns in \p *minGridSize and \p *blockSize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ *   ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ *
+ * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
+ *   on platforms where global caching affects occupancy. On such platforms, if caching
+ *   is enabled, but per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching is disabled.
+ *   Setting this flag makes the occupancy calculator return 0 in such cases.
+ *   More information can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize   - Returned block size
+ * \param func        - Device function symbol
+ * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block
+ * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
+ * \param flags       - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+
+template<typename UnaryFunction, class T>
+static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
+    int           *minGridSize,
+    int           *blockSize,
+    T              func,
+    UnaryFunction  blockSizeToDynamicSMemSize,
+    int            blockSizeLimit = 0,
+    unsigned int   flags = 0)
+{
+    cudaError_t status;
+
+    // Device and function properties
+    int                       device;
+    struct cudaFuncAttributes attr;
+
+    // Limits
+    int maxThreadsPerMultiProcessor;
+    int warpSize;
+    int devMaxThreadsPerBlock;
+    int multiProcessorCount;
+    int funcMaxThreadsPerBlock;
+    int occupancyLimit;
+    int granularity;
+
+    // Recorded maximum
+    int maxBlockSize = 0;
+    int numBlocks    = 0;
+    int maxOccupancy = 0;
+
+    // Temporary
+    int blockSizeToTryAligned;
+    int blockSizeToTry;
+    int blockSizeLimitAligned;
+    int occupancyInBlocks;
+    int occupancyInThreads;
+    size_t dynamicSMemSize;
+
+    ///////////////////////////
+    // Check user input
+    ///////////////////////////
+
+    if (!minGridSize || !blockSize || !func) {
+        return cudaErrorInvalidValue;
+    }
+
+    //////////////////////////////////////////////
+    // Obtain device and function properties
+    //////////////////////////////////////////////
+
+    status = ::cudaGetDevice(&device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaDeviceGetAttribute(
+        &maxThreadsPerMultiProcessor,
+        cudaDevAttrMaxThreadsPerMultiProcessor,
+        device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaDeviceGetAttribute(
+        &warpSize,
+        cudaDevAttrWarpSize,
+        device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaDeviceGetAttribute(
+        &devMaxThreadsPerBlock,
+        cudaDevAttrMaxThreadsPerBlock,
+        device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaDeviceGetAttribute(
+        &multiProcessorCount,
+        cudaDevAttrMultiProcessorCount,
+        device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaFuncGetAttributes(&attr, func);
+    if (status != cudaSuccess) {
+        return status;
+    }
+    
+    funcMaxThreadsPerBlock = attr.maxThreadsPerBlock;
+
+    /////////////////////////////////////////////////////////////////////////////////
+    // Try each block size, and pick the block size with maximum occupancy
+    /////////////////////////////////////////////////////////////////////////////////
+
+    occupancyLimit = maxThreadsPerMultiProcessor;
+    granularity    = warpSize;
+
+    if (blockSizeLimit == 0) {
+        blockSizeLimit = devMaxThreadsPerBlock;
+    }
+
+    if (devMaxThreadsPerBlock < blockSizeLimit) {
+        blockSizeLimit = devMaxThreadsPerBlock;
+    }
+
+    if (funcMaxThreadsPerBlock < blockSizeLimit) {
+        blockSizeLimit = funcMaxThreadsPerBlock;
+    }
+
+    blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity;
+
+    for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
+        // This is needed for the first iteration, because
+        // blockSizeLimitAligned could be greater than blockSizeLimit
+        //
+        if (blockSizeLimit < blockSizeToTryAligned) {
+            blockSizeToTry = blockSizeLimit;
+        } else {
+            blockSizeToTry = blockSizeToTryAligned;
+        }
+        
+        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
+
+        status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+            &occupancyInBlocks,
+            func,
+            blockSizeToTry,
+            dynamicSMemSize,
+            flags);
+
+        if (status != cudaSuccess) {
+            return status;
+        }
+
+        occupancyInThreads = blockSizeToTry * occupancyInBlocks;
+
+        if (occupancyInThreads > maxOccupancy) {
+            maxBlockSize = blockSizeToTry;
+            numBlocks    = occupancyInBlocks;
+            maxOccupancy = occupancyInThreads;
+        }
+
+        // Early out if we have reached the maximum
+        //
+        if (occupancyLimit == maxOccupancy) {
+            break;
+        }
+    }
+
+    ///////////////////////////
+    // Return best available
+    ///////////////////////////
+
+    // Suggested min grid size to achieve a full machine launch
+    //
+    *minGridSize = numBlocks * multiProcessorCount;
+    *blockSize = maxBlockSize;
+
+    return status;
+}
+
+/**
+ * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
+ *
+ * Returns in \p *minGridSize and \p *blockSize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize   - Returned block size
+ * \param func        - Device function symbol
+ * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block
+ * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+
+template<typename UnaryFunction, class T>
+static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem(
+    int           *minGridSize,
+    int           *blockSize,
+    T              func,
+    UnaryFunction  blockSizeToDynamicSMemSize,
+    int            blockSizeLimit = 0)
+{
+    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, blockSizeLimit, cudaOccupancyDefault);
+}
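+
+// A sketch for kernels whose dynamic shared-memory need grows with the block
+// size, assuming a hypothetical kernel myKernel that uses one float of
+// dynamic shared memory per thread; the functor maps a candidate block size
+// to its shared-memory requirement in bytes.
+#if 0  // illustrative sketch; excluded from compilation
+__global__ void myKernel(float *data)
+{
+    extern __shared__ float scratch[];
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    scratch[threadIdx.x] = data[i];
+    data[i] = scratch[threadIdx.x] * 2.0f;
+}
+
+struct SmemPerBlock {
+    __host__ __device__ size_t operator()(int blockSize) const {
+        return blockSize * sizeof(float);
+    }
+};
+
+void launchWithVariableSmem(float *devPtr, int n)
+{
+    int minGridSize = 0;
+    int blockSize   = 0;
+    cudaOccupancyMaxPotentialBlockSizeVariableSMem(
+        &minGridSize, &blockSize, myKernel, SmemPerBlock());
+
+    int gridSize = (n + blockSize - 1) / blockSize;
+    myKernel<<<gridSize, blockSize, blockSize * sizeof(float)>>>(devPtr);
+}
+#endif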
+
+/**
+ * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
+ *
+ * Returns in \p *minGridSize and \p *blockSize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * Use ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the
+ * amount of per-block dynamic shared memory changes with different
+ * block sizes.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize   - Returned block size
+ * \param func        - Device function symbol
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+template<class T>
+static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize(
+    int    *minGridSize,
+    int    *blockSize,
+    T       func,
+    size_t  dynamicSMemSize = 0,
+    int     blockSizeLimit = 0)
+{
+  return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault);
+}
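+
+// The usual launch-configuration sketch, assuming a hypothetical kernel
+// myKernel with no dynamic shared memory: take the suggested block size and
+// derive the grid size from the problem size n.
+#if 0  // illustrative sketch; excluded from compilation
+__global__ void myKernel(float *data, int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) data[i] *= 2.0f;
+}
+
+void launchWithMaxOccupancy(float *devPtr, int n)
+{
+    int minGridSize = 0;  // smallest grid that can still fill the device
+    int blockSize   = 0;  // suggested block size
+    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, myKernel);
+
+    int gridSize = (n + blockSize - 1) / blockSize;
+    myKernel<<<gridSize, blockSize>>>(devPtr, n);
+}
+#endif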
+
+/**
+ * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM.
+ *
+ * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. 
+ *
+ * \param dynamicSmemSize - Returned maximum dynamic shared memory 
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param numBlocks       - Number of blocks to fit on SM 
+ * \param blockSize       - Size of the block
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyAvailableDynamicSMemPerBlock(
+    size_t *dynamicSmemSize,
+    T      func,
+    int    numBlocks,
+    int    blockSize)
+{
+    return ::cudaOccupancyAvailableDynamicSMemPerBlock(dynamicSmemSize, (const void*)func, numBlocks, blockSize);
+}
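+
+// A sketch that asks how much dynamic shared memory each block may use while
+// still fitting two 256-thread blocks per SM, for a hypothetical kernel
+// myKernel that would size its scratch buffer from that answer.
+#if 0  // illustrative sketch; excluded from compilation
+#include <cstdio>
+
+__global__ void myKernel(float *data)
+{
+    extern __shared__ float scratch[];
+    scratch[threadIdx.x] = data[blockIdx.x * blockDim.x + threadIdx.x];
+}
+
+void reportAvailableSmem()
+{
+    size_t smemPerBlock = 0;
+    cudaOccupancyAvailableDynamicSMemPerBlock(&smemPerBlock, myKernel, 2, 256);
+    printf("%zu bytes of dynamic shared memory available per block\n", smemPerBlock);
+}
+#endif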
+
+/**
+ * \brief Returns grid and block size that achieves maximum potential occupancy for a device function with the specified flags
+ *
+ * Returns in \p *minGridSize and \p *blockSize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ *   ::cudaOccupancyMaxPotentialBlockSize
+ *
+ * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
+ *   on platforms where global caching affects occupancy. On such platforms, if caching
+ *   is enabled, but per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching is disabled.
+ *   Setting this flag makes the occupancy calculator return 0 in such cases.
+ *   More information can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * Use ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the
+ * amount of per-block dynamic shared memory changes with different
+ * block sizes.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize   - Returned block size
+ * \param func        - Device function symbol
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
+ * \param flags       - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+template<class T>
+static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeWithFlags(
+    int    *minGridSize,
+    int    *blockSize,
+    T      func,
+    size_t dynamicSMemSize = 0,
+    int    blockSizeLimit = 0,
+    unsigned int flags = 0)
+{
+    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, flags);
+}
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum cluster size in \p *clusterSize.
+ *
+ * The cluster dimensions in \p config are ignored. If \p func has a required
+ * cluster size set (see ::cudaFuncGetAttributes), \p *clusterSize will reflect
+ * the required cluster size.
+ *
+ * By default this function will always return a value that's portable on
+ * future hardware. A higher value may be returned if the kernel function
+ * allows non-portable cluster sizes.
+ *
+ * This function will respect the compile time launch bounds.
+ *
+ * \param clusterSize - Returned maximum cluster size that can be launched
+ *                      for the given kernel function and launch configuration
+ * \param func        - Kernel function for which maximum cluster
+ *                      size is calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaFuncGetAttributes
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxPotentialClusterSize(
+    int *clusterSize,
+    T *func,
+    const cudaLaunchConfig_t *config)
+{
+    return ::cudaOccupancyMaxPotentialClusterSize(clusterSize, (const void*)func, config);
+}
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum number of clusters that could co-exist
+ * on the target device in \p *numClusters.
+ *
+ * If the function has a required cluster size already set (see
+ * ::cudaFuncGetAttributes), the cluster size in \p config must either be
+ * unspecified or match the required size.
+ * If no required cluster size is set, the cluster size must be specified
+ * in \p config, otherwise the function returns an error.
+ *
+ * Note that various attributes of the kernel function may affect occupancy
+ * calculation. The runtime environment may affect how the hardware schedules
+ * the clusters, so the calculated occupancy is not guaranteed to be achievable.
+ *
+ * \param numClusters - Returned maximum number of clusters that
+ *                      could co-exist on the target device
+ * \param func        - Kernel function for which maximum number
+ *                      of clusters are calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidClusterSize,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaFuncGetAttributes
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveClusters(
+    int *numClusters,
+    T *func,
+    const cudaLaunchConfig_t *config)
+{
+    return ::cudaOccupancyMaxActiveClusters(numClusters, (const void*)func, config);
+}
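+
+// A sketch covering the two cluster queries above, assuming a hypothetical
+// kernel myKernel and the ::cudaLaunchConfig_t / ::cudaLaunchAttribute layout
+// of this CUDA 11.8 header; thread block clusters require a device of
+// compute capability 9.0 or higher.
+#if 0  // illustrative sketch; excluded from compilation
+__global__ void myKernel(float *data)
+{
+    data[blockIdx.x * blockDim.x + threadIdx.x] = 0.0f;
+}
+
+void reportClusterOccupancy()
+{
+    cudaLaunchConfig_t config = {0};
+    config.gridDim  = dim3(1024);
+    config.blockDim = dim3(128);
+
+    // Largest portable cluster size for this kernel and configuration.
+    int clusterSize = 0;
+    cudaOccupancyMaxPotentialClusterSize(&clusterSize, myKernel, &config);
+
+    // How many clusters of that size could be resident at once.
+    cudaLaunchAttribute attribute;
+    attribute.id               = cudaLaunchAttributeClusterDimension;
+    attribute.val.clusterDim.x = clusterSize;
+    attribute.val.clusterDim.y = 1;
+    attribute.val.clusterDim.z = 1;
+    config.attrs    = &attribute;
+    config.numAttrs = 1;
+
+    int numClusters = 0;
+    cudaOccupancyMaxActiveClusters(&numClusters, myKernel, &config);
+}
+#endif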
+
+#if defined __CUDACC__
+
+/**
+ * \brief \hl Find out attributes for a given function
+ *
+ * This function obtains the attributes of a function specified via \p entry.
+ * The parameter \p entry must be a pointer to a function that executes
+ * on the device. The parameter specified by \p entry must be declared as a \p __global__
+ * function. The fetched attributes are placed in \p attr. If the specified
+ * function does not exist, then ::cudaErrorInvalidDeviceFunction is returned.
+ *
+ * Note that some function attributes such as
+ * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock"
+ * may vary based on the device that is currently being used.
+ *
+ * \param attr  - Return pointer to function's attributes
+ * \param entry - Function to get attributes of
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForDevice,
+ * ::cudaSetDoubleForHost
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaFuncGetAttributes(
+  struct cudaFuncAttributes *attr,
+  T                         *entry
+)
+{
+  return ::cudaFuncGetAttributes(attr, (const void*)entry);
+}
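+
+// A sketch that inspects a hypothetical kernel myKernel before picking a
+// launch configuration; maxThreadsPerBlock, numRegs and sharedSizeBytes are
+// fields of ::cudaFuncAttributes.
+#if 0  // illustrative sketch; excluded from compilation
+#include <cstdio>
+
+__global__ void myKernel(float *data) { data[threadIdx.x] += 1.0f; }
+
+void printKernelAttributes()
+{
+    cudaFuncAttributes attr;
+    if (cudaFuncGetAttributes(&attr, myKernel) == cudaSuccess) {
+        printf("maxThreadsPerBlock=%d registers/thread=%d static smem=%zu bytes\n",
+               attr.maxThreadsPerBlock, attr.numRegs, attr.sharedSizeBytes);
+    }
+}
+#endif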
+
+/**
+ * \brief \hl Set attributes for a given function
+ *
+ * This function sets the attributes of a function specified via \p entry.
+ * The parameter \p entry must be a pointer to a function that executes
+ * on the device. The parameter specified by \p entry must be declared as a \p __global__
+ * function. The enumeration defined by \p attr is set to the value defined by \p value.
+ * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned.
+ * If the specified attribute cannot be written, or if the value is incorrect, 
+ * then ::cudaErrorInvalidValue is returned.
+ *
+ * Valid values for \p attr are:
+ * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - The requested maximum size in bytes of dynamically-allocated shared memory. The sum of this value and the function attribute ::sharedSizeBytes
+ *   cannot exceed the device attribute ::cudaDevAttrMaxSharedMemoryPerBlockOptin. The maximal size of requestable dynamic shared memory may differ by GPU architecture.
+ * - ::cudaFuncAttributePreferredSharedMemoryCarveout - On devices where the L1 cache and shared memory use the same hardware resources, 
+ *   this sets the shared memory carveout preference, in percent of the total shared memory. See ::cudaDevAttrMaxSharedMemoryPerMultiprocessor.
+ *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ * - ::cudaFuncAttributeRequiredClusterWidth: The required cluster width in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeRequiredClusterHeight: The required cluster height in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeRequiredClusterDepth: The required cluster depth in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeClusterSchedulingPolicyPreference: The block
+ *   scheduling policy of a function. The value type is cudaClusterSchedulingPolicy.
+ *
+ * \param entry - Function to set attributes for
+ * \param attr  - Attribute to set
+ * \param value - Value to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForDevice,
+ * ::cudaSetDoubleForHost
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaFuncSetAttribute(
+  T                         *entry,
+  enum cudaFuncAttribute    attr,
+  int                       value
+)
+{
+  return ::cudaFuncSetAttribute((const void*)entry, attr, value);
+}
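+
+// A sketch that opts a hypothetical kernel myKernel into a larger dynamic
+// shared-memory allocation (96 KiB here); the opt-in ceiling depends on the
+// GPU architecture, so the return value is checked before launching.
+#if 0  // illustrative sketch; excluded from compilation
+__global__ void myKernel(float *data)
+{
+    extern __shared__ float scratch[];
+    scratch[threadIdx.x] = data[threadIdx.x];
+}
+
+void launchWithLargeSmem(float *devPtr)
+{
+    size_t smemBytes = 96 * 1024;
+    cudaError_t err  = cudaFuncSetAttribute(
+        myKernel, cudaFuncAttributeMaxDynamicSharedMemorySize, (int)smemBytes);
+    if (err == cudaSuccess) {
+        myKernel<<<1, 256, smemBytes>>>(devPtr);
+    }
+}
+#endif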
+
+/**
+ * \brief \hl Binds an array to a surface
+ *
+ * Binds the CUDA array \p array to the surface reference \p surf.
+ * \p desc describes how the memory is interpreted when dealing with
+ * the surface. Any CUDA array previously bound to \p surf is unbound.
+ *
+ * \param surf  - Surface to bind
+ * \param array - Memory array on device
+ * \param desc  - Channel format
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSurface
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)",
+ * \ref ::cudaBindSurfaceToArray(const struct surface<T, dim>&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)"
+ */
+template<class T, int dim>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindSurfaceToArray(
+  const struct surface<T, dim>       &surf,
+  cudaArray_const_t                   array,
+  const struct cudaChannelFormatDesc &desc
+)
+{
+  return ::cudaBindSurfaceToArray(&surf, array, &desc);
+}
+
+/**
+ * \brief \hl Binds an array to a surface
+ *
+ * Binds the CUDA array \p array to the surface reference \p surf.
+ * The channel descriptor is inherited from the CUDA array. Any CUDA array
+ * previously bound to \p surf is unbound.
+ *
+ * \param surf  - Surface to bind
+ * \param array - Memory array on device
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSurface
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)",
+ * \ref ::cudaBindSurfaceToArray(const struct surface<T, dim>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)"
+ */
+template<class T, int dim>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindSurfaceToArray(
+  const struct surface<T, dim> &surf,
+  cudaArray_const_t             array
+)
+{
+  struct cudaChannelFormatDesc desc;
+  cudaError_t                  err = ::cudaGetChannelDesc(&desc, array);
+
+  return err == cudaSuccess ? cudaBindSurfaceToArray(surf, array, desc) : err;
+}
+
+#endif /* __CUDACC__ */
+
+/** @} */ /* END CUDART_HIGHLEVEL */
+
+#endif /* __cplusplus && !__CUDACC_RTC__ */
+
+#if !defined(__CUDACC_RTC__)
+#if defined(__GNUC__)
+#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+#pragma GCC diagnostic pop
+#endif
+#elif defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+#endif
+
+#undef __CUDA_DEPRECATED
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__
+#endif
+
+#endif /* !__CUDA_RUNTIME_H__ */
diff --git a/ext/cudart/include/cuda_runtime_api.h b/ext/cudart/include/cuda_runtime_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..9e5501ac5586aa77e843c30b4825cc08c1aa4abe
--- /dev/null
+++ b/ext/cudart/include/cuda_runtime_api.h
@@ -0,0 +1,13380 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+
+
+#if !defined(__CUDA_RUNTIME_API_H__)
+#define __CUDA_RUNTIME_API_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_API_H__
+#endif
+
+/**
+ * \latexonly
+ * \page sync_async API synchronization behavior
+ *
+ * \section memcpy_sync_async_behavior Memcpy
+ * The API provides memcpy/memset functions in both synchronous and asynchronous forms,
+ * the latter having an \e "Async" suffix. This is a misnomer as each function
+ * may exhibit synchronous or asynchronous behavior depending on the arguments
+ * passed to the function. In the reference documentation, each memcpy function is
+ * categorized as \e synchronous or \e asynchronous, corresponding to the definitions
+ * below.
+ * 
+ * \subsection MemcpySynchronousBehavior Synchronous
+ * 
+ * <ol>
+ * <li> For transfers from pageable host memory to device memory, a stream sync is performed
+ * before the copy is initiated. The function will return once the pageable
+ * buffer has been copied to the staging memory for DMA transfer to device memory,
+ * but the DMA to final destination may not have completed.
+ * 
+ * <li> For transfers from pinned host memory to device memory, the function is synchronous
+ * with respect to the host.
+ *
+ * <li> For transfers from device to either pageable or pinned host memory, the function returns
+ * only once the copy has completed.
+ * 
+ * <li> For transfers from device memory to device memory, no host-side synchronization is
+ * performed.
+ *
+ * <li> For transfers from any host memory to any host memory, the function is fully
+ * synchronous with respect to the host.
+ * </ol>
+ * 
+ * \subsection MemcpyAsynchronousBehavior Asynchronous
+ *
+ * <ol>
+ * <li> For transfers from device memory to pageable host memory, the function
+ * will return only once the copy has completed.
+ *
+ * <li> For transfers from any host memory to any host memory, the function is fully
+ * synchronous with respect to the host.
+ * 
+ * <li> For all other transfers, the function is fully asynchronous. If pageable
+ * memory must first be staged to pinned memory, this will be handled
+ * asynchronously with a worker thread.
+ * </ol>
+ *
+ * \section memset_sync_async_behavior Memset
+ * The cudaMemset functions are asynchronous with respect to the host
+ * except when the target memory is pinned host memory. The \e Async
+ * versions are always asynchronous with respect to the host.
+ *
+ * \section kernel_launch_details Kernel Launches
+ * Kernel launches are asynchronous with respect to the host. Details of
+ * concurrent kernel execution and data transfers can be found in the CUDA
+ * Programmers Guide.
+ *
+ * \endlatexonly
+ */
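+
+/*
+ * A sketch of the distinction above, assuming a device buffer d_buf of
+ * `bytes` bytes: the async copy from pageable memory may be staged through
+ * an internal pinned buffer, while the copy from pinned memory is issued
+ * directly on the stream; both must be synchronized before the host buffers
+ * are reused or freed.
+ */
+#if 0  /* illustrative sketch; excluded from compilation */
+#include <cuda_runtime.h>
+#include <stdlib.h>
+
+void copyExamples(void *d_buf, size_t bytes)
+{
+    cudaStream_t stream;
+    cudaStreamCreate(&stream);
+
+    /* Pageable host memory: may be staged through an internal pinned buffer. */
+    void *pageable = malloc(bytes);
+    cudaMemcpyAsync(d_buf, pageable, bytes, cudaMemcpyHostToDevice, stream);
+
+    /* Pinned host memory: the copy is DMA'd directly and overlaps host work. */
+    void *pinned = NULL;
+    cudaMallocHost(&pinned, bytes);
+    cudaMemcpyAsync(d_buf, pinned, bytes, cudaMemcpyHostToDevice, stream);
+
+    cudaStreamSynchronize(stream);  /* both copies have completed here */
+    cudaFreeHost(pinned);
+    free(pageable);
+    cudaStreamDestroy(stream);
+}
+#endif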
+
+/**
+ * There are two levels for the runtime API.
+ *
+ * The C API (<i>cuda_runtime_api.h</i>) is
+ * a C-style interface that does not require compiling with \p nvcc.
+ *
+ * The \ref CUDART_HIGHLEVEL "C++ API" (<i>cuda_runtime.h</i>) is a
+ * C++-style interface built on top of the C API. It wraps some of the
+ * C API routines, using overloading, references and default arguments.
+ * These wrappers can be used from C++ code and can be compiled with any C++
+ * compiler. The C++ API also has some CUDA-specific wrappers that wrap
+ * C API routines that deal with symbols, textures, and device functions.
+ * These wrappers require the use of \p nvcc because they depend on code being
+ * generated by the compiler. For example, the execution configuration syntax
+ * to invoke kernels is only available in source code compiled with \p nvcc.
+ */
+
+/** CUDA Runtime API Version */
+#define CUDART_VERSION  11080
+
+#if defined(__CUDA_API_VER_MAJOR__) && defined(__CUDA_API_VER_MINOR__)
+# define __CUDART_API_VERSION ((__CUDA_API_VER_MAJOR__ * 1000) + (__CUDA_API_VER_MINOR__ * 10))
+#else
+# define __CUDART_API_VERSION CUDART_VERSION
+#endif
+
+#ifndef __DOXYGEN_ONLY__
+#include "crt/host_defines.h"
+#endif
+#include "builtin_types.h"
+
+#include "cuda_device_runtime_api.h"
+
+#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) || defined(__CUDA_API_VERSION_INTERNAL)
+    #define __CUDART_API_PER_THREAD_DEFAULT_STREAM
+    #define __CUDART_API_PTDS(api) api ## _ptds
+    #define __CUDART_API_PTSZ(api) api ## _ptsz
+#else
+    #define __CUDART_API_PTDS(api) api
+    #define __CUDART_API_PTSZ(api) api
+#endif
+
+#define cudaSignalExternalSemaphoresAsync  __CUDART_API_PTSZ(cudaSignalExternalSemaphoresAsync_v2)
+#define cudaWaitExternalSemaphoresAsync    __CUDART_API_PTSZ(cudaWaitExternalSemaphoresAsync_v2)
+
+#if defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM)
+    #define cudaMemcpy                     __CUDART_API_PTDS(cudaMemcpy)
+    #define cudaMemcpyToSymbol             __CUDART_API_PTDS(cudaMemcpyToSymbol)
+    #define cudaMemcpyFromSymbol           __CUDART_API_PTDS(cudaMemcpyFromSymbol)
+    #define cudaMemcpy2D                   __CUDART_API_PTDS(cudaMemcpy2D)
+    #define cudaMemcpyToArray              __CUDART_API_PTDS(cudaMemcpyToArray)
+    #define cudaMemcpy2DToArray            __CUDART_API_PTDS(cudaMemcpy2DToArray)
+    #define cudaMemcpyFromArray            __CUDART_API_PTDS(cudaMemcpyFromArray)
+    #define cudaMemcpy2DFromArray          __CUDART_API_PTDS(cudaMemcpy2DFromArray)
+    #define cudaMemcpyArrayToArray         __CUDART_API_PTDS(cudaMemcpyArrayToArray)
+    #define cudaMemcpy2DArrayToArray       __CUDART_API_PTDS(cudaMemcpy2DArrayToArray)
+    #define cudaMemcpy3D                   __CUDART_API_PTDS(cudaMemcpy3D)
+    #define cudaMemcpy3DPeer               __CUDART_API_PTDS(cudaMemcpy3DPeer)
+    #define cudaMemset                     __CUDART_API_PTDS(cudaMemset)
+    #define cudaMemset2D                   __CUDART_API_PTDS(cudaMemset2D)
+    #define cudaMemset3D                   __CUDART_API_PTDS(cudaMemset3D)
+    #define cudaGraphUpload                __CUDART_API_PTSZ(cudaGraphUpload)
+    #define cudaGraphLaunch                __CUDART_API_PTSZ(cudaGraphLaunch)
+    #define cudaStreamBeginCapture         __CUDART_API_PTSZ(cudaStreamBeginCapture)
+    #define cudaStreamEndCapture           __CUDART_API_PTSZ(cudaStreamEndCapture)
+    #define cudaStreamGetCaptureInfo       __CUDART_API_PTSZ(cudaStreamGetCaptureInfo)
+    #define cudaStreamGetCaptureInfo_v2    __CUDART_API_PTSZ(cudaStreamGetCaptureInfo_v2)
+    #define cudaStreamIsCapturing          __CUDART_API_PTSZ(cudaStreamIsCapturing)
+    #define cudaMemcpyAsync                __CUDART_API_PTSZ(cudaMemcpyAsync)
+    #define cudaMemcpyToSymbolAsync        __CUDART_API_PTSZ(cudaMemcpyToSymbolAsync)
+    #define cudaMemcpyFromSymbolAsync      __CUDART_API_PTSZ(cudaMemcpyFromSymbolAsync)
+    #define cudaMemcpy2DAsync              __CUDART_API_PTSZ(cudaMemcpy2DAsync)
+    #define cudaMemcpyToArrayAsync         __CUDART_API_PTSZ(cudaMemcpyToArrayAsync)
+    #define cudaMemcpy2DToArrayAsync       __CUDART_API_PTSZ(cudaMemcpy2DToArrayAsync)
+    #define cudaMemcpyFromArrayAsync       __CUDART_API_PTSZ(cudaMemcpyFromArrayAsync)
+    #define cudaMemcpy2DFromArrayAsync     __CUDART_API_PTSZ(cudaMemcpy2DFromArrayAsync)
+    #define cudaMemcpy3DAsync              __CUDART_API_PTSZ(cudaMemcpy3DAsync)
+    #define cudaMemcpy3DPeerAsync          __CUDART_API_PTSZ(cudaMemcpy3DPeerAsync)
+    #define cudaMemsetAsync                __CUDART_API_PTSZ(cudaMemsetAsync)
+    #define cudaMemset2DAsync              __CUDART_API_PTSZ(cudaMemset2DAsync)
+    #define cudaMemset3DAsync              __CUDART_API_PTSZ(cudaMemset3DAsync)
+    #define cudaStreamQuery                __CUDART_API_PTSZ(cudaStreamQuery)
+    #define cudaStreamGetFlags             __CUDART_API_PTSZ(cudaStreamGetFlags)
+    #define cudaStreamGetPriority          __CUDART_API_PTSZ(cudaStreamGetPriority)
+    #define cudaEventRecord                __CUDART_API_PTSZ(cudaEventRecord)
+    #define cudaEventRecordWithFlags       __CUDART_API_PTSZ(cudaEventRecordWithFlags)
+    #define cudaStreamWaitEvent            __CUDART_API_PTSZ(cudaStreamWaitEvent)
+    #define cudaStreamAddCallback          __CUDART_API_PTSZ(cudaStreamAddCallback)
+    #define cudaStreamAttachMemAsync       __CUDART_API_PTSZ(cudaStreamAttachMemAsync)
+    #define cudaStreamSynchronize          __CUDART_API_PTSZ(cudaStreamSynchronize)
+    #define cudaLaunchKernel               __CUDART_API_PTSZ(cudaLaunchKernel)
+    #define cudaLaunchKernelExC            __CUDART_API_PTSZ(cudaLaunchKernelExC)
+    #define cudaLaunchHostFunc             __CUDART_API_PTSZ(cudaLaunchHostFunc)
+    #define cudaMemPrefetchAsync           __CUDART_API_PTSZ(cudaMemPrefetchAsync)
+    #define cudaLaunchCooperativeKernel    __CUDART_API_PTSZ(cudaLaunchCooperativeKernel)
+    #define cudaStreamCopyAttributes       __CUDART_API_PTSZ(cudaStreamCopyAttributes)
+    #define cudaStreamGetAttribute         __CUDART_API_PTSZ(cudaStreamGetAttribute)
+    #define cudaStreamSetAttribute         __CUDART_API_PTSZ(cudaStreamSetAttribute)
+    #define cudaMallocAsync                __CUDART_API_PTSZ(cudaMallocAsync)
+    #define cudaFreeAsync                  __CUDART_API_PTSZ(cudaFreeAsync)
+    #define cudaMallocFromPoolAsync        __CUDART_API_PTSZ(cudaMallocFromPoolAsync)
+    #define cudaGetDriverEntryPoint        __CUDART_API_PTSZ(cudaGetDriverEntryPoint)
+#endif
+
+/** \cond impl_private */
+#if !defined(__dv)
+
+#if defined(__cplusplus)
+
+#define __dv(v) \
+        = v
+
+#else /* __cplusplus */
+
+#define __dv(v)
+
+#endif /* __cplusplus */
+
+#endif /* !__dv */
+/** \endcond impl_private */
+
+#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350))   /** Visible to SM>=3.5 and "__host__ __device__" only **/
+
+#define CUDART_DEVICE __device__ 
+
+#else
+
+#define CUDART_DEVICE
+
+#endif /** CUDART_DEVICE */
+
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+/** \endcond impl_private */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/**
+ * \defgroup CUDART_DEVICE Device Management
+ *
+ * ___MANBRIEF___ device management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the device management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Destroy all allocations and reset all state on the current device
+ * in the current process.
+ *
+ * Explicitly destroys and cleans up all resources associated with the current
+ * device in the current process. It is the caller's responsibility to ensure
+ * that the resources are not accessed or passed in subsequent API calls and
+ * doing so will result in undefined behavior. These resources include CUDA types
+ * such as ::cudaStream_t, ::cudaEvent_t, ::cudaArray_t, ::cudaMipmappedArray_t,
+ * ::cudaTextureObject_t, ::cudaSurfaceObject_t, ::textureReference, ::surfaceReference,
+ * ::cudaExternalMemory_t, ::cudaExternalSemaphore_t and ::cudaGraphicsResource_t.
+ * Any subsequent API call to this device will reinitialize the device.
+ *
+ * Note that this function will reset the device immediately.  It is the caller's
+ * responsibility to ensure that the device is not being accessed by any 
+ * other host threads from the process when this function is called.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSynchronize
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void);
+
+/**
+ * \brief Wait for compute device to finish
+ *
+ * Blocks until the device has completed all preceding requested tasks.
+ * ::cudaDeviceSynchronize() returns an error if one of the preceding tasks
+ * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for 
+ * this device, the host thread will block until the device has finished 
+ * its work.
+ *
+ * \return
+ * ::cudaSuccess
+ * \note_device_sync_deprecated
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceReset,
+ * ::cuCtxSynchronize
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void);
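+
+/*
+ * A common host-side pattern around ::cudaDeviceSynchronize, using a
+ * hypothetical kernel myKernel: check for launch-configuration errors first,
+ * then synchronize to surface errors raised during execution.
+ */
+#if 0  /* illustrative sketch; excluded from compilation */
+#include <stdio.h>
+
+__global__ void myKernel(float *data, int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n) data[i] = 0.0f;
+}
+
+void runAndCheck(float *devPtr, int n)
+{
+    myKernel<<<(n + 255) / 256, 256>>>(devPtr, n);
+
+    cudaError_t launchErr = cudaGetLastError();      /* launch-configuration errors */
+    cudaError_t syncErr   = cudaDeviceSynchronize(); /* errors raised while running  */
+    if (launchErr != cudaSuccess || syncErr != cudaSuccess) {
+        fprintf(stderr, "CUDA error: %s\n",
+                cudaGetErrorString(launchErr != cudaSuccess ? launchErr : syncErr));
+    }
+}
+#endif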
+
+/**
+ * \brief Set resource limits
+ *
+ * Setting \p limit to \p value is a request by the application to update
+ * the current limit maintained by the device.  The driver is free to
+ * modify the requested value to meet h/w requirements (this could be
+ * clamping to minimum or maximum values, rounding up to nearest element
+ * size, etc).  The application can use ::cudaDeviceGetLimit() to find out
+ * exactly what the limit has been set to.
+ *
+ * Setting each ::cudaLimit has its own specific restrictions, so each is
+ * discussed here.
+ *
+ * - ::cudaLimitStackSize controls the stack size in bytes of each GPU thread.
+ *
+ * - ::cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO
+ *   used by the ::printf() device system call. Setting
+ *   ::cudaLimitPrintfFifoSize must not be performed after launching any kernel
+ *   that uses the ::printf() device system call - in such case
+ *   ::cudaErrorInvalidValue will be returned.
+ *
+ * - ::cudaLimitMallocHeapSize controls the size in bytes of the heap used by
+ *   the ::malloc() and ::free() device system calls. Setting
+ *   ::cudaLimitMallocHeapSize must not be performed after launching any kernel
+ *   that uses the ::malloc() or ::free() device system calls - in such case
+ *   ::cudaErrorInvalidValue will be returned.
+ *
+ * - ::cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a
+ *   grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
+ *   this limit must be performed before any launch of a kernel that uses the
+ *   device runtime and calls ::cudaDeviceSynchronize() above the default sync
+ *   depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
+ *   with error code ::cudaErrorSyncDepthExceeded if the limitation is
+ *   violated. This limit can be set smaller than the default or up to the maximum
+ *   launch depth of 24. When setting this limit, keep in mind that additional
+ *   levels of sync depth require the runtime to reserve large amounts of
+ *   device memory which can no longer be used for user allocations. If these
+ *   reservations of device memory fail, ::cudaDeviceSetLimit will return
+ *   ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value.
+ *   This limit is only applicable to devices of compute capability 3.5 and
+ *   higher. Attempting to set this limit on devices of compute capability less
+ *   than 3.5 will result in the error ::cudaErrorUnsupportedLimit being
+ *   returned.
+ *
+ * - ::cudaLimitDevRuntimePendingLaunchCount controls the maximum number of
+ *   outstanding device runtime launches that can be made from the current
+ *   device. A grid is outstanding from the point of launch up until the grid
+ *   is known to have been completed. Device runtime launches which violate 
+ *   this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
+ *   ::cudaGetLastError() is called after launch. If more pending launches than
+ *   the default (2048 launches) are needed for a module using the device
+ *   runtime, this limit can be increased. Keep in mind that being able to
+ *   sustain additional pending launches will require the runtime to reserve
+ *   larger amounts of device memory upfront which can no longer be used for
+ *   allocations. If these reservations fail, ::cudaDeviceSetLimit will return
+ *   ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value.
+ *   This limit is only applicable to devices of compute capability 3.5 and
+ *   higher. Attempting to set this limit on devices of compute capability less
+ *   than 3.5 will result in the error ::cudaErrorUnsupportedLimit being
+ *   returned. 
+ *
+ * - ::cudaLimitMaxL2FetchGranularity controls the L2 cache fetch granularity.
+ *   Values can range from 0B to 128B. This is purely a performance hint and
+ *   it can be ignored or clamped depending on the platform.
+ *
+ * - ::cudaLimitPersistingL2CacheSize controls size in bytes available
+ *   for persisting L2 cache. This is purely a performance hint and it
+ *   can be ignored or clamped depending on the platform.
+ *
+ * \param limit - Limit to set
+ * \param value - Size of limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceGetLimit,
+ * ::cuCtxSetLimit
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value);
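+
+/*
+ * A sketch of the request/readback pattern described above, using the
+ * device-side malloc heap as the example limit: the value actually applied
+ * may differ from the request, so it is read back with ::cudaDeviceGetLimit.
+ */
+#if 0  /* illustrative sketch; excluded from compilation */
+#include <stdio.h>
+
+void growDeviceMallocHeap(void)
+{
+    size_t requested = 64 * 1024 * 1024;  /* 64 MiB for device-side malloc()/free() */
+    if (cudaDeviceSetLimit(cudaLimitMallocHeapSize, requested) == cudaSuccess) {
+        size_t actual = 0;
+        cudaDeviceGetLimit(&actual, cudaLimitMallocHeapSize);
+        printf("malloc heap limit: requested %zu, got %zu bytes\n", requested, actual);
+    }
+}
+#endif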
+
+/**
+ * \brief Returns resource limits
+ *
+ * Returns in \p *pValue the current size of \p limit. The following ::cudaLimit values are supported:
+ * - ::cudaLimitStackSize is the stack size in bytes of each GPU thread.
+ * - ::cudaLimitPrintfFifoSize is the size in bytes of the shared FIFO used by the
+ *   ::printf() device system call.
+ * - ::cudaLimitMallocHeapSize is the size in bytes of the heap used by the
+ *   ::malloc() and ::free() device system calls.
+ * - ::cudaLimitDevRuntimeSyncDepth is the maximum grid depth at which a
+ *   thread can issue the device runtime call ::cudaDeviceSynchronize()
+ *   to wait on child grid launches to complete.
+ * - ::cudaLimitDevRuntimePendingLaunchCount is the maximum number of outstanding
+ *   device runtime launches.
+ * - ::cudaLimitMaxL2FetchGranularity is the L2 cache fetch granularity.
+ * - ::cudaLimitPersistingL2CacheSize is the persisting L2 cache size in bytes.
+ *
+ * \param limit  - Limit to query
+ * \param pValue - Returned size of the limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceSetLimit,
+ * ::cuCtxGetLimit
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit);
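+
+/*
+ * Illustrative usage sketch for ::cudaDeviceSetLimit / ::cudaDeviceGetLimit:
+ * raise the device-side malloc() heap and read back the value actually
+ * applied. The helper name configureDeviceHeap is hypothetical; application
+ * code is assumed to include <cuda_runtime.h> and <stdio.h>.
+ *
+ * \code
+    void configureDeviceHeap(void)
+    {
+        size_t applied = 0;
+        cudaError_t err = cudaDeviceSetLimit(cudaLimitMallocHeapSize, 64u << 20);
+        if (err != cudaSuccess)
+            printf("cudaDeviceSetLimit failed: %s\n", cudaGetErrorString(err));
+
+        cudaDeviceGetLimit(&applied, cudaLimitMallocHeapSize);  // value in effect
+        printf("malloc heap limit: %zu bytes\n", applied);
+    }
+ * \endcode
+ */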
+
+/**
+ * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given element size.
+ *
+ * Returns in \p maxWidthInElements the maximum number of elements allocatable in a 1D linear texture
+ * for given format descriptor \p fmtDesc.
+ *
+ * \param maxWidthInElements    - Returns maximum number of texture elements allocatable for given \p fmtDesc.
+ * \param fmtDesc               - Texture format description.
+ * \param device                - Device to query.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuDeviceGetMaxTexture1DLinear,
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, const struct cudaChannelFormatDesc *fmtDesc, int device);
+#endif
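+
+/*
+ * Illustrative usage sketch: query how many float texels a 1D linear texture
+ * may hold on device 0, building the format descriptor with the C API
+ * ::cudaCreateChannelDesc. The helper name maxFloatTexels is hypothetical.
+ *
+ * \code
+    size_t maxFloatTexels(void)
+    {
+        struct cudaChannelFormatDesc desc =
+            cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);  // one 32-bit float channel
+        size_t maxWidth = 0;
+        cudaDeviceGetTexture1DLinearMaxWidth(&maxWidth, &desc, 0);  // device 0
+        return maxWidth;
+    }
+ * \endcode
+ */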
+
+/**
+ * \brief Returns the preferred cache configuration for the current device.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this returns through \p pCacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute functions.
+ *
+ * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices
+ * where the size of the L1 cache and shared memory are fixed.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
+ *
+ * \param pCacheConfig - Returned cache configuration
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetCacheConfig,
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * ::cuCtxGetCacheConfig
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+
+/**
+ * \brief Returns numerical values that correspond to the least and
+ * greatest stream priorities.
+ *
+ * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond
+ * to the least and greatest stream priorities respectively. Stream priorities
+ * follow a convention where lower numbers imply greater priorities. The range of
+ * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority].
+ * If the user attempts to create a stream with a priority value that is
+ * outside the meaningful range as specified by this API, the priority is
+ * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority
+ * respectively. See ::cudaStreamCreateWithPriority for details on creating a
+ * priority stream.
+ * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value
+ * is not desired.
+ *
+ * This function will return '0' in both \p *leastPriority and \p *greatestPriority if
+ * the current context's device does not support stream priorities
+ * (see ::cudaDeviceGetAttribute).
+ *
+ * \param leastPriority    - Pointer to an int in which the numerical value for least
+ *                           stream priority is returned
+ * \param greatestPriority - Pointer to an int in which the numerical value for greatest
+ *                           stream priority is returned
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaStreamGetPriority,
+ * ::cuCtxGetStreamPriorityRange
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority);
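+
+/*
+ * Illustrative usage sketch: query the priority range and create a
+ * non-blocking stream at the greatest (numerically smallest) priority.
+ * The helper name createHighPriorityStream is hypothetical.
+ *
+ * \code
+    cudaStream_t createHighPriorityStream(void)
+    {
+        int leastPriority = 0, greatestPriority = 0;
+        cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
+
+        cudaStream_t stream = NULL;
+        cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking, greatestPriority);
+        return stream;  // destroy later with cudaStreamDestroy()
+    }
+ * \endcode
+ */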
+
+/**
+ * \brief Sets the preferred cache configuration for the current device.
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute the function. Any
+ * function preference set via
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)"
+ * or
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)"
+ * will be preferred over this device-wide setting. Setting the device-wide
+ * cache configuration to ::cudaFuncCachePreferNone will cause subsequent
+ * kernel launches to prefer to not change the cache configuration unless
+ * required to launch the kernel.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
+ *
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceGetCacheConfig,
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * ::cuCtxSetCacheConfig
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig);
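+
+/*
+ * Illustrative usage sketch: request a larger L1 cache device-wide and
+ * confirm the stored preference. The helper name preferL1 is hypothetical.
+ *
+ * \code
+    void preferL1(void)
+    {
+        enum cudaFuncCache current = cudaFuncCachePreferNone;
+        cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);  // device-wide preference only
+        cudaDeviceGetCacheConfig(&current);               // current == cudaFuncCachePreferL1
+    }
+ * \endcode
+ */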
+
+/**
+ * \brief Returns the shared memory configuration for the current device.
+ *
+ * This function will return in \p pConfig the current size of shared memory banks
+ * on the current device. On devices with configurable shared memory banks, 
+ * ::cudaDeviceSetSharedMemConfig can be used to change this setting, so that all 
+ * subsequent kernel launches will by default use the new bank size. When 
+ * ::cudaDeviceGetSharedMemConfig is called on devices without configurable shared 
+ * memory, it will return the fixed bank size of the hardware.
+ *
+ * The returned bank configurations can be either:
+ * - ::cudaSharedMemBankSizeFourByte - shared memory bank width is four bytes.
+ * - ::cudaSharedMemBankSizeEightByte - shared memory bank width is eight bytes.
+ *
+ * \param pConfig - Returned cache configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetCacheConfig,
+ * ::cudaDeviceGetCacheConfig,
+ * ::cudaDeviceSetSharedMemConfig,
+ * ::cudaFuncSetCacheConfig,
+ * ::cuCtxGetSharedMemConfig
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig);
+
+/**
+ * \brief Sets the shared memory configuration for the current device.
+ *
+ * On devices with configurable shared memory banks, this function will set
+ * the shared memory bank size which is used for all subsequent kernel launches.
+ * Any per-function setting of shared memory set via ::cudaFuncSetSharedMemConfig
+ * will override the device wide setting.
+ *
+ * Changing the shared memory configuration between launches may introduce
+ * a device side synchronization point.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance. 
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank 
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * The supported bank configurations are:
+ * - ::cudaSharedMemBankSizeDefault: set bank width to the device default (currently,
+ *   four bytes)
+ * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be four bytes
+ *   natively.
+ * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight 
+ *   bytes natively.
+ *
+ * \param config - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetCacheConfig,
+ * ::cudaDeviceGetCacheConfig,
+ * ::cudaDeviceGetSharedMemConfig,
+ * ::cudaFuncSetCacheConfig,
+ * ::cuCtxSetSharedMemConfig
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config);
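+
+/*
+ * Illustrative usage sketch: request eight-byte shared memory banks (often
+ * useful for double-precision-heavy kernels) and read back the configuration
+ * in effect. The helper name useEightByteBanks is hypothetical.
+ *
+ * \code
+    void useEightByteBanks(void)
+    {
+        enum cudaSharedMemConfig cfg = cudaSharedMemBankSizeDefault;
+        cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
+        cudaDeviceGetSharedMemConfig(&cfg);  // no-op on fixed-bank-size devices
+    }
+ * \endcode
+ */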
+
+/**
+ * \brief Returns a handle to a compute device
+ *
+ * Returns in \p *device a device ordinal given a PCI bus ID string.
+ *
+ * \param device   - Returned device ordinal
+ *
+ * \param pciBusId - String in one of the following forms: 
+ * [domain]:[bus]:[device].[function]
+ * [domain]:[bus]:[device]
+ * [bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceGetPCIBusId,
+ * ::cuDeviceGetByPCIBusId
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetByPCIBusId(int *device, const char *pciBusId);
+
+/**
+ * \brief Returns a PCI Bus Id string for the device
+ *
+ * Returns an ASCII string identifying the device \p dev in the NULL-terminated
+ * string pointed to by \p pciBusId. \p len specifies the maximum length of the
+ * string that may be returned.
+ *
+ * \param pciBusId - Returned identifier string for the device in the following format
+ * [domain]:[bus]:[device].[function]
+ * where \p domain, \p bus, \p device, and \p function are all hexadecimal values.
+ * pciBusId should be large enough to store 13 characters including the NULL-terminator.
+ *
+ * \param len      - Maximum length of string to store in \p pciBusId
+ *
+ * \param device   - Device to get identifier string for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceGetByPCIBusId,
+ * ::cuDeviceGetPCIBusId
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId, int len, int device);
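+
+/*
+ * Illustrative usage sketch: round-trip device 0 through its PCI bus-id
+ * string. The helper name pciBusIdRoundTrip is hypothetical.
+ *
+ * \code
+    void pciBusIdRoundTrip(void)
+    {
+        char busId[16];  // at least 13 bytes including the NUL terminator
+        int  device = -1;
+
+        cudaDeviceGetPCIBusId(busId, (int)sizeof(busId), 0);
+        cudaDeviceGetByPCIBusId(&device, busId);  // device == 0 again
+    }
+ * \endcode
+ */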
+
+/**
+ * \brief Gets an interprocess handle for a previously allocated event
+ *
+ * Takes as input a previously allocated event. This event must have been 
+ * created with the ::cudaEventInterprocess and ::cudaEventDisableTiming
+ * flags set. This opaque handle may be copied into other processes and
+ * opened with ::cudaIpcOpenEventHandle to allow efficient hardware
+ * synchronization between GPU work in different processes.
+ *
+ * After the event has been opened in the importing process, 
+ * ::cudaEventRecord, ::cudaEventSynchronize, ::cudaStreamWaitEvent and 
+ * ::cudaEventQuery may be used in either process. Performing operations 
+ * on the imported event after the exported event has been freed 
+ * with ::cudaEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified 
+ * addressing on Linux operating systems. IPC functionality is not supported
+ * on Tegra platforms.
+ *
+ * \param handle - Pointer to a user allocated cudaIpcEventHandle
+ *                    in which to return the opaque event handle
+ * \param event   - Event allocated with ::cudaEventInterprocess and 
+ *                    ::cudaEventDisableTiming flags.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaEventCreate,
+ * ::cudaEventDestroy,
+ * ::cudaEventSynchronize,
+ * ::cudaEventQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cuIpcGetEventHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event);
+
+/**
+ * \brief Opens an interprocess event handle for use in the current process
+ *
+ * Opens an interprocess event handle exported from another process with 
+ * ::cudaIpcGetEventHandle. This function returns a ::cudaEvent_t that behaves like 
+ * a locally created event with the ::cudaEventDisableTiming flag specified. 
+ * This event must be freed with ::cudaEventDestroy.
+ *
+ * Performing operations on the imported event after the exported event has 
+ * been freed with ::cudaEventDestroy will result in undefined behavior.
+ *
+ * IPC functionality is restricted to devices with support for unified 
+ * addressing on Linux operating systems. IPC functionality is not supported
+ * on Tegra platforms.
+ *
+ * \param event - Returns the imported event
+ * \param handle  - Interprocess handle to open
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorDeviceUninitialized
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaEventCreate,
+ * ::cudaEventDestroy,
+ * ::cudaEventSynchronize,
+ * ::cudaEventQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cuIpcOpenEventHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle);
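+
+/*
+ * Illustrative usage sketch of the two halves of the event IPC flow. The
+ * helper names exportEvent and importAndWait are hypothetical, and shipping
+ * the handle between processes (pipe, socket, shared file, ...) is left to
+ * the application.
+ *
+ * \code
+    // In the exporting process: create an IPC-capable event and export it.
+    cudaIpcEventHandle_t exportEvent(cudaEvent_t *ev)
+    {
+        cudaIpcEventHandle_t handle;
+        cudaEventCreateWithFlags(ev, cudaEventDisableTiming | cudaEventInterprocess);
+        cudaIpcGetEventHandle(&handle, *ev);
+        return handle;  // send these bytes to the peer process
+    }
+
+    // In the importing process: open the handle and wait on it in a stream.
+    void importAndWait(cudaIpcEventHandle_t handle, cudaStream_t stream)
+    {
+        cudaEvent_t imported;
+        cudaIpcOpenEventHandle(&imported, handle);
+        cudaStreamWaitEvent(stream, imported, 0);
+    }
+ * \endcode
+ */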
+
+
+/**
+ * \brief Gets an interprocess memory handle for an existing device memory
+ *          allocation
+ *
+ * Takes a pointer to the base of an existing device memory allocation created 
+ * with ::cudaMalloc and exports it for use in another process. This is a 
+ * lightweight operation and may be called multiple times on an allocation
+ * without adverse effects. 
+ *
+ * If a region of memory is freed with ::cudaFree and a subsequent call
+ * to ::cudaMalloc returns memory with the same device address,
+ * ::cudaIpcGetMemHandle will return a unique handle for the
+ * new memory. 
+ *
+ * IPC functionality is restricted to devices with support for unified 
+ * addressing on Linux operating systems. IPC functionality is not supported
+ * on Tegra platforms.
+ *
+ * \param handle - Pointer to user allocated ::cudaIpcMemHandle to return
+ *                    the handle in.
+ * \param devPtr - Base pointer to previously allocated device memory 
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMalloc,
+ * ::cudaFree,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cuIpcGetMemHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr);
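+
+/*
+ * Illustrative usage sketch: the exporting side of the memory IPC flow. The
+ * helper name exportBuffer is hypothetical; the raw handle bytes must be sent
+ * to the importing process over an application-defined channel.
+ *
+ * \code
+    cudaIpcMemHandle_t exportBuffer(void **devPtr, size_t bytes)
+    {
+        cudaIpcMemHandle_t handle;
+        cudaMalloc(devPtr, bytes);             // allocation to share
+        cudaIpcGetMemHandle(&handle, *devPtr);
+        return handle;
+    }
+ * \endcode
+ */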
+
+/**
+ * \brief Opens an interprocess memory handle exported from another process
+ *          and returns a device pointer usable in the local process.
+ *
+ * Maps memory exported from another process with ::cudaIpcGetMemHandle into
+ * the current device address space. For contexts on different devices 
+ * ::cudaIpcOpenMemHandle can attempt to enable peer access between the
+ * devices as if the user called ::cudaDeviceEnablePeerAccess. This behavior is 
+ * controlled by the ::cudaIpcMemLazyEnablePeerAccess flag. 
+ * ::cudaDeviceCanAccessPeer can determine if a mapping is possible.
+ *
+ * ::cudaIpcOpenMemHandle can open handles to devices that may not be visible
+ * in the process calling the API.
+ *
+ * Contexts that may open ::cudaIpcMemHandles are restricted in the following way.
+ * ::cudaIpcMemHandles from each device in a given process may only be opened 
+ * by one context per device per other process.
+ *
+ * If the memory handle has already been opened by the current context, the
+ * reference count on the handle is incremented by 1 and the existing device pointer
+ * is returned.
+ *
+ * Memory returned from ::cudaIpcOpenMemHandle must be freed with
+ * ::cudaIpcCloseMemHandle.
+ *
+ * Calling ::cudaFree on an exported memory region before calling
+ * ::cudaIpcCloseMemHandle in the importing context will result in undefined
+ * behavior.
+ * 
+ * IPC functionality is restricted to devices with support for unified 
+ * addressing on Linux operating systems. IPC functionality is not supported
+ * on Tegra platforms.
+ *
+ * \param devPtr - Returned device pointer
+ * \param handle - ::cudaIpcMemHandle to open
+ * \param flags  - Flags for this operation. Must be specified as ::cudaIpcMemLazyEnablePeerAccess
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorDeviceUninitialized,
+ * ::cudaErrorTooManyPeers,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \note No guarantees are made about the address returned in \p *devPtr.  
+ * In particular, multiple processes may not receive the same address for the same \p handle.
+ *
+ * \sa
+ * ::cudaMalloc,
+ * ::cudaFree,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcCloseMemHandle,
+ * ::cudaDeviceEnablePeerAccess,
+ * ::cudaDeviceCanAccessPeer,
+ * ::cuIpcOpenMemHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags);
+
+/**
+ * \brief Attempts to close memory mapped with cudaIpcOpenMemHandle
+ * 
+ * Decrements the reference count of the memory returned by ::cudaIpcOpenMemHandle by 1.
+ * When the reference count reaches 0, this API unmaps the memory. The original allocation
+ * in the exporting process as well as imported mappings in other processes
+ * will be unaffected.
+ *
+ * Any resources used to enable peer access will be freed if this is the
+ * last mapping using them.
+ *
+ * IPC functionality is restricted to devices with support for unified 
+ * addressing on Linux operating systems. IPC functionality is not supported
+ * on Tegra platforms.
+ *
+ * \param devPtr - Device pointer returned by ::cudaIpcOpenMemHandle
+ * 
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorMapBufferObjectFailed,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMalloc,
+ * ::cudaFree,
+ * ::cudaIpcGetEventHandle,
+ * ::cudaIpcOpenEventHandle,
+ * ::cudaIpcGetMemHandle,
+ * ::cudaIpcOpenMemHandle,
+ * ::cuIpcCloseMemHandle
+ */
+extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr);
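+
+/*
+ * Illustrative usage sketch: the importing side of the memory IPC flow,
+ * matching the exporting sketch above. The helper name importUseClose is
+ * hypothetical.
+ *
+ * \code
+    void importUseClose(cudaIpcMemHandle_t handle, size_t bytes)
+    {
+        void *devPtr = NULL;
+        cudaIpcOpenMemHandle(&devPtr, handle, cudaIpcMemLazyEnablePeerAccess);
+        cudaMemset(devPtr, 0, bytes);   // any device work on the mapping
+        cudaIpcCloseMemHandle(devPtr);  // unmap before the exporter frees
+    }
+ * \endcode
+ */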
+
+/**
+ * \brief Blocks until remote writes are visible to the specified scope
+ *
+ * Blocks until remote writes to the target context via mappings created
+ * through GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see
+ * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are
+ * visible to the specified scope.
+ *
+ * If the scope equals or lies within the scope indicated by
+ * ::cudaDevAttrGPUDirectRDMAWritesOrdering, the call will be a no-op and
+ * can be safely omitted for performance. This can be determined by
+ * comparing the numerical values between the two enums, with smaller
+ * scopes having smaller values.
+ *
+ * Users may query support for this API via ::cudaDevAttrGPUDirectRDMAFlushWritesOptions.
+ *
+ * \param target - The target of the operation, see cudaFlushGPUDirectRDMAWritesTarget
+ * \param scope  - The scope of the operation, see cudaFlushGPUDirectRDMAWritesScope
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotSupported,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuFlushGPUDirectRDMAWrites
+ */
+#if __CUDART_API_VERSION >= 11030
+extern __host__ cudaError_t CUDARTAPI cudaDeviceFlushGPUDirectRDMAWrites(enum cudaFlushGPUDirectRDMAWritesTarget target, enum cudaFlushGPUDirectRDMAWritesScope scope);
+#endif
+
+/** @} */ /* END CUDART_DEVICE */
+
+/**
+ * \defgroup CUDART_THREAD_DEPRECATED Thread Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated thread management functions of the CUDA runtime
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated thread management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Exit and clean up from CUDA launches
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceReset(), which should be used
+ * instead.
+ *
+ * Explicitly destroys and cleans up all resources associated with the current
+ * device in the current process.  Any subsequent API call to this device will 
+ * reinitialize the device.  
+ *
+ * Note that this function will reset the device immediately.  It is the caller's
+ * responsibility to ensure that the device is not being accessed by any 
+ * other host threads from the process when this function is called.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceReset
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void);
+
+/**
+ * \brief Wait for compute device to finish
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is similar to the 
+ * non-deprecated function ::cudaDeviceSynchronize(), which should be used
+ * instead.
+ *
+ * Blocks until the device has completed all preceding requested tasks.
+ * ::cudaThreadSynchronize() returns an error if one of the preceding tasks
+ * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for 
+ * this device, the host thread will block until the device has finished 
+ * its work.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSynchronize
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void);
+
+/**
+ * \brief Set resource limits
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceSetLimit(), which should be used
+ * instead.
+ *
+ * Setting \p limit to \p value is a request by the application to update
+ * the current limit maintained by the device.  The driver is free to
+ * modify the requested value to meet h/w requirements (this could be
+ * clamping to minimum or maximum values, rounding up to nearest element
+ * size, etc).  The application can use ::cudaThreadGetLimit() to find out
+ * exactly what the limit has been set to.
+ *
+ * Setting each ::cudaLimit has its own specific restrictions, so each is
+ * discussed here.
+ *
+ * - ::cudaLimitStackSize controls the stack size of each GPU thread.
+ *
+ * - ::cudaLimitPrintfFifoSize controls the size of the shared FIFO
+ *   used by the ::printf() device system call.
+ *   Setting ::cudaLimitPrintfFifoSize must be performed before
+ *   launching any kernel that uses the ::printf() device
+ *   system call, otherwise ::cudaErrorInvalidValue will be returned.
+ *
+ * - ::cudaLimitMallocHeapSize controls the size of the heap used
+ *   by the ::malloc() and ::free() device system calls.  Setting
+ *   ::cudaLimitMallocHeapSize must be performed before launching
+ *   any kernel that uses the ::malloc() or ::free() device system calls,
+ *   otherwise ::cudaErrorInvalidValue will be returned.
+ *
+ * \param limit - Limit to set
+ * \param value - Size in bytes of limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetLimit
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit, size_t value);
+
+/**
+ * \brief Returns resource limits
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceGetLimit(), which should be used
+ * instead.
+ *
+ * Returns in \p *pValue the current size of \p limit.  The supported
+ * ::cudaLimit values are:
+ * - ::cudaLimitStackSize: stack size of each GPU thread;
+ * - ::cudaLimitPrintfFifoSize: size of the shared FIFO used by the
+ *   ::printf() device system call;
+ * - ::cudaLimitMallocHeapSize: size of the heap used by the
+ *   ::malloc() and ::free() device system calls.
+ *
+ * \param limit  - Limit to query
+ * \param pValue - Returned size in bytes of limit
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorUnsupportedLimit,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceGetLimit
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit);
+
+/**
+ * \brief Returns the preferred cache configuration for the current device.
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceGetCacheConfig(), which should be 
+ * used instead.
+ * 
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this returns through \p pCacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute functions.
+ *
+ * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices
+ * where the size of the L1 cache and shared memory are fixed.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ *
+ * \param pCacheConfig - Returned cache configuration
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceGetCacheConfig
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig);
+
+/**
+ * \brief Sets the preferred cache configuration for the current device.
+ *
+ * \deprecated
+ *
+ * Note that this function is deprecated because its name does not 
+ * reflect its behavior.  Its functionality is identical to the 
+ * non-deprecated function ::cudaDeviceSetCacheConfig(), which should be 
+ * used instead.
+ * 
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache
+ * configuration for the current device. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute the function. Any
+ * function preference set via
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)"
+ * or
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)"
+ * will be preferred over this device-wide setting. Setting the device-wide
+ * cache configuration to ::cudaFuncCachePreferNone will cause subsequent
+ * kernel launches to prefer to not change the cache configuration unless
+ * required to launch the kernel.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ *
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetCacheConfig
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig);
+
+/** @} */ /* END CUDART_THREAD_DEPRECATED */
+
+/**
+ * \defgroup CUDART_ERROR Error Handling
+ *
+ * ___MANBRIEF___ error handling functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the error handling functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the last error from a runtime call
+ *
+ * Returns the last error that has been produced by any of the runtime calls
+ * in the same host thread and resets it to ::cudaSuccess.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMissingConfiguration,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorUnmapBufferObjectFailed,
+ * ::cudaErrorInvalidDevicePointer,
+ * ::cudaErrorInvalidTexture,
+ * ::cudaErrorInvalidTextureBinding,
+ * ::cudaErrorInvalidChannelDescriptor,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorInvalidFilterSetting,
+ * ::cudaErrorInvalidNormSetting,
+ * ::cudaErrorUnknown,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInsufficientDriver,
+ * ::cudaErrorNoDevice,
+ * ::cudaErrorSetOnActiveProcess,
+ * ::cudaErrorStartupFailure,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaPeekAtLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void);
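+
+/*
+ * Illustrative usage sketch: a common error-checking pattern built on
+ * ::cudaGetLastError, ::cudaGetErrorName and ::cudaGetErrorString.
+ * CUDA_CHECK, myKernel, blocks, threads and args are hypothetical names;
+ * application code is assumed to include <stdio.h>.
+ *
+ * \code
+    #define CUDA_CHECK(call)                                               \
+        do {                                                               \
+            cudaError_t err_ = (call);                                     \
+            if (err_ != cudaSuccess)                                       \
+                fprintf(stderr, "%s:%d: %s: %s\n", __FILE__, __LINE__,     \
+                        cudaGetErrorName(err_), cudaGetErrorString(err_)); \
+        } while (0)
+
+    // A kernel launch itself returns no status, so query it afterwards:
+    myKernel<<<blocks, threads>>>(args);
+    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
+    CUDA_CHECK(cudaDeviceSynchronize());  // asynchronous execution errors
+ * \endcode
+ */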
+
+/**
+ * \brief Returns the last error from a runtime call
+ *
+ * Returns the last error that has been produced by any of the runtime calls
+ * in the same host thread. Note that this call does not reset the error to
+ * ::cudaSuccess like ::cudaGetLastError().
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMissingConfiguration,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorUnmapBufferObjectFailed,
+ * ::cudaErrorInvalidDevicePointer,
+ * ::cudaErrorInvalidTexture,
+ * ::cudaErrorInvalidTextureBinding,
+ * ::cudaErrorInvalidChannelDescriptor,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorInvalidFilterSetting,
+ * ::cudaErrorInvalidNormSetting,
+ * ::cudaErrorUnknown,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInsufficientDriver,
+ * ::cudaErrorNoDevice,
+ * ::cudaErrorSetOnActiveProcess,
+ * ::cudaErrorStartupFailure,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void);
+
+/**
+ * \brief Returns the string representation of an error code enum name
+ *
+ * Returns a string containing the name of an error code in the enum.  If the error
+ * code is not recognized, "unrecognized error code" is returned.
+ *
+ * \param error - Error code to convert to string
+ *
+ * \return
+ * \p char* pointer to a NULL-terminated string
+ *
+ * \sa ::cudaGetErrorString, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError,
+ * ::cuGetErrorName
+ */
+extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error);
+
+/**
+ * \brief Returns the description string for an error code
+ *
+ * Returns the description string for an error code.  If the error
+ * code is not recognized, "unrecognized error code" is returned.
+ *
+ * \param error - Error code to convert to string
+ *
+ * \return
+ * \p char* pointer to a NULL-terminated string
+ *
+ * \sa ::cudaGetErrorName, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError,
+ * ::cuGetErrorString
+ */
+extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error);
+/** @} */ /* END CUDART_ERROR */
+
+/**
+ * \addtogroup CUDART_DEVICE 
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the number of compute-capable devices
+ *
+ * Returns in \p *count the number of devices with compute capability greater
+ * than or equal to 2.0 that are available for execution.
+ *
+ * \param count - Returns the number of devices with compute capability
+ * greater than or equal to 2.0
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDevice, ::cudaSetDevice, ::cudaGetDeviceProperties,
+ * ::cudaChooseDevice,
+ * ::cuDeviceGetCount
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count);
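+
+/*
+ * Illustrative usage sketch: select the last compute-capable device, if any.
+ * The helper name selectLastDevice is hypothetical.
+ *
+ * \code
+    int selectLastDevice(void)
+    {
+        int count = 0;
+        cudaGetDeviceCount(&count);
+        if (count == 0)
+            return -1;             // no CUDA device present
+        cudaSetDevice(count - 1);
+        return count - 1;
+    }
+ * \endcode
+ */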
+
+/**
+ * \brief Returns information about the compute-device
+ *
+ * Returns in \p *prop the properties of device \p dev. The ::cudaDeviceProp
+ * structure is defined as:
+ * \code
+    struct cudaDeviceProp {
+        char name[256];
+        cudaUUID_t uuid;
+        size_t totalGlobalMem;
+        size_t sharedMemPerBlock;
+        int regsPerBlock;
+        int warpSize;
+        size_t memPitch;
+        int maxThreadsPerBlock;
+        int maxThreadsDim[3];
+        int maxGridSize[3];
+        int clockRate;
+        size_t totalConstMem;
+        int major;
+        int minor;
+        size_t textureAlignment;
+        size_t texturePitchAlignment;
+        int deviceOverlap;
+        int multiProcessorCount;
+        int kernelExecTimeoutEnabled;
+        int integrated;
+        int canMapHostMemory;
+        int computeMode;
+        int maxTexture1D;
+        int maxTexture1DMipmap;
+        int maxTexture1DLinear;
+        int maxTexture2D[2];
+        int maxTexture2DMipmap[2];
+        int maxTexture2DLinear[3];
+        int maxTexture2DGather[2];
+        int maxTexture3D[3];
+        int maxTexture3DAlt[3];
+        int maxTextureCubemap;
+        int maxTexture1DLayered[2];
+        int maxTexture2DLayered[3];
+        int maxTextureCubemapLayered[2];
+        int maxSurface1D;
+        int maxSurface2D[2];
+        int maxSurface3D[3];
+        int maxSurface1DLayered[2];
+        int maxSurface2DLayered[3];
+        int maxSurfaceCubemap;
+        int maxSurfaceCubemapLayered[2];
+        size_t surfaceAlignment;
+        int concurrentKernels;
+        int ECCEnabled;
+        int pciBusID;
+        int pciDeviceID;
+        int pciDomainID;
+        int tccDriver;
+        int asyncEngineCount;
+        int unifiedAddressing;
+        int memoryClockRate;
+        int memoryBusWidth;
+        int l2CacheSize;
+        int persistingL2CacheMaxSize;
+        int maxThreadsPerMultiProcessor;
+        int streamPrioritiesSupported;
+        int globalL1CacheSupported;
+        int localL1CacheSupported;
+        size_t sharedMemPerMultiprocessor;
+        int regsPerMultiprocessor;
+        int managedMemory;
+        int isMultiGpuBoard;
+        int multiGpuBoardGroupID;
+        int singleToDoublePrecisionPerfRatio;
+        int pageableMemoryAccess;
+        int concurrentManagedAccess;
+        int computePreemptionSupported;
+        int canUseHostPointerForRegisteredMem;
+        int cooperativeLaunch;
+        int cooperativeMultiDeviceLaunch;
+        int pageableMemoryAccessUsesHostPageTables;
+        int directManagedMemAccessFromHost;
+        int maxBlocksPerMultiProcessor;
+        int accessPolicyMaxWindowSize;
+    };
+ \endcode
+ * where:
+ * - \ref ::cudaDeviceProp::name "name[256]" is an ASCII string identifying
+ *   the device.
+ * - \ref ::cudaDeviceProp::uuid "uuid" is a 16-byte unique identifier.
+ * - \ref ::cudaDeviceProp::totalGlobalMem "totalGlobalMem" is the total
+ *   amount of global memory available on the device in bytes.
+ * - \ref ::cudaDeviceProp::sharedMemPerBlock "sharedMemPerBlock" is the
+ *   maximum amount of shared memory available to a thread block in bytes.
+ * - \ref ::cudaDeviceProp::regsPerBlock "regsPerBlock" is the maximum number
+ *   of 32-bit registers available to a thread block.
+ * - \ref ::cudaDeviceProp::warpSize "warpSize" is the warp size in threads.
+ * - \ref ::cudaDeviceProp::memPitch "memPitch" is the maximum pitch in
+ *   bytes allowed by the memory copy functions that involve memory regions
+ *   allocated through ::cudaMallocPitch().
+ * - \ref ::cudaDeviceProp::maxThreadsPerBlock "maxThreadsPerBlock" is the
+ *   maximum number of threads per block.
+ * - \ref ::cudaDeviceProp::maxThreadsDim "maxThreadsDim[3]" contains the
+ *   maximum size of each dimension of a block.
+ * - \ref ::cudaDeviceProp::maxGridSize "maxGridSize[3]" contains the
+ *   maximum size of each dimension of a grid.
+ * - \ref ::cudaDeviceProp::clockRate "clockRate" is the clock frequency in
+ *   kilohertz.
+ * - \ref ::cudaDeviceProp::totalConstMem "totalConstMem" is the total amount
+ *   of constant memory available on the device in bytes.
+ * - \ref ::cudaDeviceProp::major "major",
+ *   \ref ::cudaDeviceProp::minor "minor" are the major and minor revision
+ *   numbers defining the device's compute capability.
+ * - \ref ::cudaDeviceProp::textureAlignment "textureAlignment" is the
+ *   alignment requirement; texture base addresses that are aligned to
+ *   \ref ::cudaDeviceProp::textureAlignment "textureAlignment" bytes do not
+ *   need an offset applied to texture fetches.
+ * - \ref ::cudaDeviceProp::texturePitchAlignment "texturePitchAlignment" is the
+ *   pitch alignment requirement for 2D texture references that are bound to 
+ *   pitched memory.
+ * - \ref ::cudaDeviceProp::deviceOverlap "deviceOverlap" is 1 if the device
+ *   can concurrently copy memory between host and device while executing a
+ *   kernel, or 0 if not. Deprecated; use asyncEngineCount instead.
+ * - \ref ::cudaDeviceProp::multiProcessorCount "multiProcessorCount" is the
+ *   number of multiprocessors on the device.
+ * - \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"
+ *   is 1 if there is a run time limit for kernels executed on the device, or
+ *   0 if not.
+ * - \ref ::cudaDeviceProp::integrated "integrated" is 1 if the device is an
+ *   integrated (motherboard) GPU and 0 if it is a discrete (card) component.
+ * - \ref ::cudaDeviceProp::canMapHostMemory "canMapHostMemory" is 1 if the
+ *   device can map host memory into the CUDA address space for use with
+ *   ::cudaHostAlloc()/::cudaHostGetDevicePointer(), or 0 if not.
+ * - \ref ::cudaDeviceProp::computeMode "computeMode" is the compute mode
+ *   that the device is currently in. Available modes are as follows:
+ *   - cudaComputeModeDefault: Default mode - Device is not restricted and
+ *     multiple threads can use ::cudaSetDevice() with this device.
+ *   - cudaComputeModeProhibited: Compute-prohibited mode - No threads can use
+ *     ::cudaSetDevice() with this device.
+ *   - cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many 
+ *     threads in one process will be able to use ::cudaSetDevice() with this device.
+ *   <br> When an occupied exclusive mode device is chosen with ::cudaSetDevice,
+ *   all subsequent non-device management runtime functions will return
+ *   ::cudaErrorDevicesUnavailable.
+ * - \ref ::cudaDeviceProp::maxTexture1D "maxTexture1D" is the maximum 1D
+ *   texture size.
+ * - \ref ::cudaDeviceProp::maxTexture1DMipmap "maxTexture1DMipmap" is the maximum
+ *   1D mipmapped texture size.
+ * - \ref ::cudaDeviceProp::maxTexture1DLinear "maxTexture1DLinear" is the maximum
+ *   1D texture size for textures bound to linear memory.
+ * - \ref ::cudaDeviceProp::maxTexture2D "maxTexture2D[2]" contains the maximum
+ *   2D texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture2DMipmap "maxTexture2DMipmap[2]" contains the
+ *   maximum 2D mipmapped texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture2DLinear "maxTexture2DLinear[3]" contains the 
+ *   maximum 2D texture dimensions for 2D textures bound to pitch linear memory.
+ * - \ref ::cudaDeviceProp::maxTexture2DGather "maxTexture2DGather[2]" contains the 
+ *   maximum 2D texture dimensions if texture gather operations have to be performed.
+ * - \ref ::cudaDeviceProp::maxTexture3D "maxTexture3D[3]" contains the maximum
+ *   3D texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture3DAlt "maxTexture3DAlt[3]"
+ *   contains the maximum alternate 3D texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTextureCubemap "maxTextureCubemap" is the 
+ *   maximum cubemap texture width or height.
+ * - \ref ::cudaDeviceProp::maxTexture1DLayered "maxTexture1DLayered[2]" contains
+ *   the maximum 1D layered texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTexture2DLayered "maxTexture2DLayered[3]" contains
+ *   the maximum 2D layered texture dimensions.
+ * - \ref ::cudaDeviceProp::maxTextureCubemapLayered "maxTextureCubemapLayered[2]"
+ *   contains the maximum cubemap layered texture dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface1D "maxSurface1D" is the maximum 1D
+ *   surface size.
+ * - \ref ::cudaDeviceProp::maxSurface2D "maxSurface2D[2]" contains the maximum
+ *   2D surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface3D "maxSurface3D[3]" contains the maximum
+ *   3D surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface1DLayered "maxSurface1DLayered[2]" contains
+ *   the maximum 1D layered surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurface2DLayered "maxSurface2DLayered[3]" contains
+ *   the maximum 2D layered surface dimensions.
+ * - \ref ::cudaDeviceProp::maxSurfaceCubemap "maxSurfaceCubemap" is the maximum 
+ *   cubemap surface width or height.
+ * - \ref ::cudaDeviceProp::maxSurfaceCubemapLayered "maxSurfaceCubemapLayered[2]"
+ *   contains the maximum cubemap layered surface dimensions.
+ * - \ref ::cudaDeviceProp::surfaceAlignment "surfaceAlignment" specifies the
+ *   alignment requirements for surfaces.
+ * - \ref ::cudaDeviceProp::concurrentKernels "concurrentKernels" is 1 if the
+ *   device supports executing multiple kernels within the same context
+ *   simultaneously, or 0 if not. It is not guaranteed that multiple kernels
+ *   will be resident on the device concurrently so this feature should not be
+ *   relied upon for correctness.
+ * - \ref ::cudaDeviceProp::ECCEnabled "ECCEnabled" is 1 if the device has ECC
+ *   support turned on, or 0 if not.
+ * - \ref ::cudaDeviceProp::pciBusID "pciBusID" is the PCI bus identifier of
+ *   the device.
+ * - \ref ::cudaDeviceProp::pciDeviceID "pciDeviceID" is the PCI device
+ *   (sometimes called slot) identifier of the device.
+ * - \ref ::cudaDeviceProp::pciDomainID "pciDomainID" is the PCI domain identifier
+ *   of the device.
+ * - \ref ::cudaDeviceProp::tccDriver "tccDriver" is 1 if the device is using a
+ *   TCC driver or 0 if not.
+ * - \ref ::cudaDeviceProp::asyncEngineCount "asyncEngineCount" is 1 when the
+ *   device can concurrently copy memory between host and device while executing
+ *   a kernel. It is 2 when the device can concurrently copy memory between host
+ *   and device in both directions and execute a kernel at the same time. It is
+ *   0 if neither of these is supported.
+ * - \ref ::cudaDeviceProp::unifiedAddressing "unifiedAddressing" is 1 if the device 
+ *   shares a unified address space with the host and 0 otherwise.
+ * - \ref ::cudaDeviceProp::memoryClockRate "memoryClockRate" is the peak memory 
+ *   clock frequency in kilohertz.
+ * - \ref ::cudaDeviceProp::memoryBusWidth "memoryBusWidth" is the memory bus width  
+ *   in bits.
+ * - \ref ::cudaDeviceProp::l2CacheSize "l2CacheSize" is L2 cache size in bytes. 
+ * - \ref ::cudaDeviceProp::persistingL2CacheMaxSize "persistingL2CacheMaxSize" is L2 cache's maximum persisting lines size in bytes.
+ * - \ref ::cudaDeviceProp::maxThreadsPerMultiProcessor "maxThreadsPerMultiProcessor"  
+ *   is the number of maximum resident threads per multiprocessor.
+ * - \ref ::cudaDeviceProp::streamPrioritiesSupported "streamPrioritiesSupported"
+ *   is 1 if the device supports stream priorities, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::globalL1CacheSupported "globalL1CacheSupported"
+ *   is 1 if the device supports caching of globals in L1 cache, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::localL1CacheSupported "localL1CacheSupported"
+ *   is 1 if the device supports caching of locals in L1 cache, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::sharedMemPerMultiprocessor "sharedMemPerMultiprocessor" is the
+ *   maximum amount of shared memory available to a multiprocessor in bytes; this amount is
+ *   shared by all thread blocks simultaneously resident on a multiprocessor.
+ * - \ref ::cudaDeviceProp::regsPerMultiprocessor "regsPerMultiprocessor" is the maximum number
+ *   of 32-bit registers available to a multiprocessor; this number is shared
+ *   by all thread blocks simultaneously resident on a multiprocessor.
+ * - \ref ::cudaDeviceProp::managedMemory "managedMemory"
+ *   is 1 if the device supports allocating managed memory on this system, or 0 if it is not supported.
+ * - \ref ::cudaDeviceProp::isMultiGpuBoard "isMultiGpuBoard"
+ *   is 1 if the device is on a multi-GPU board (e.g. Gemini cards), and 0 if not;
+ * - \ref ::cudaDeviceProp::multiGpuBoardGroupID "multiGpuBoardGroupID" is a unique identifier
+ *   for a group of devices associated with the same board.
+ *   Devices on the same multi-GPU board will share the same identifier.
+ * - \ref ::cudaDeviceProp::singleToDoublePrecisionPerfRatio "singleToDoublePrecisionPerfRatio"  
+ *   is the ratio of single precision performance (in floating-point operations per second)
+ *   to double precision performance.
+ * - \ref ::cudaDeviceProp::pageableMemoryAccess "pageableMemoryAccess" is 1 if the device supports
+ *   coherently accessing pageable memory without calling cudaHostRegister on it, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::concurrentManagedAccess "concurrentManagedAccess" is 1 if the device can
+ *   coherently access managed memory concurrently with the CPU, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::computePreemptionSupported "computePreemptionSupported" is 1 if the device
+ *   supports Compute Preemption, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::canUseHostPointerForRegisteredMem "canUseHostPointerForRegisteredMem" is 1 if
+ *   the device can access host registered memory at the same virtual address as the CPU, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::cooperativeLaunch "cooperativeLaunch" is 1 if the device supports launching
+ *   cooperative kernels via ::cudaLaunchCooperativeKernel, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::cooperativeMultiDeviceLaunch "cooperativeMultiDeviceLaunch" is 1 if the device
+ *   supports launching cooperative kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::pageableMemoryAccessUsesHostPageTables "pageableMemoryAccessUsesHostPageTables" is 1 if the device accesses
+ *   pageable memory via the host's page tables, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::directManagedMemAccessFromHost "directManagedMemAccessFromHost" is 1 if the host can directly access managed
+ *   memory on the device without migration, and 0 otherwise.
+ * - \ref ::cudaDeviceProp::maxBlocksPerMultiProcessor "maxBlocksPerMultiProcessor" is the maximum number of thread blocks
+ *   that can reside on a multiprocessor.
+ * - \ref ::cudaDeviceProp::accessPolicyMaxWindowSize "accessPolicyMaxWindowSize" is
+ *   the maximum value of ::cudaAccessPolicyWindow::num_bytes.
+ *
+ * \param prop   - Properties for the specified device
+ * \param device - Device number to get properties for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice,
+ * ::cudaDeviceGetAttribute,
+ * ::cuDeviceGetAttribute,
+ * ::cuDeviceGetName
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device);
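+
+/*
+ * Illustrative usage sketch: print the name and compute capability of every
+ * visible device. The helper name listDevices is hypothetical; application
+ * code is assumed to include <stdio.h>.
+ *
+ * \code
+    void listDevices(void)
+    {
+        int count = 0;
+        cudaGetDeviceCount(&count);
+        for (int dev = 0; dev < count; ++dev) {
+            struct cudaDeviceProp prop;
+            cudaGetDeviceProperties(&prop, dev);
+            printf("device %d: %s (sm_%d%d)\n", dev, prop.name, prop.major, prop.minor);
+        }
+    }
+ * \endcode
+ */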
+
+/**
+ * \brief Returns information about the device
+ *
+ * Returns in \p *value the integer value of the attribute \p attr on device
+ * \p device. The supported attributes are:
+ * - ::cudaDevAttrMaxThreadsPerBlock: Maximum number of threads per block
+ * - ::cudaDevAttrMaxBlockDimX: Maximum x-dimension of a block
+ * - ::cudaDevAttrMaxBlockDimY: Maximum y-dimension of a block
+ * - ::cudaDevAttrMaxBlockDimZ: Maximum z-dimension of a block
+ * - ::cudaDevAttrMaxGridDimX: Maximum x-dimension of a grid
+ * - ::cudaDevAttrMaxGridDimY: Maximum y-dimension of a grid
+ * - ::cudaDevAttrMaxGridDimZ: Maximum z-dimension of a grid
+ * - ::cudaDevAttrMaxSharedMemoryPerBlock: Maximum amount of shared memory
+ *   available to a thread block in bytes
+ * - ::cudaDevAttrTotalConstantMemory: Memory available on device for
+ *   __constant__ variables in a CUDA C kernel in bytes
+ * - ::cudaDevAttrWarpSize: Warp size in threads
+ * - ::cudaDevAttrMaxPitch: Maximum pitch in bytes allowed by the memory copy
+ *   functions that involve memory regions allocated through ::cudaMallocPitch()
+ * - ::cudaDevAttrMaxTexture1DWidth: Maximum 1D texture width
+ * - ::cudaDevAttrMaxTexture1DLinearWidth: Maximum width for a 1D texture bound
+ *   to linear memory
+ * - ::cudaDevAttrMaxTexture1DMipmappedWidth: Maximum mipmapped 1D texture width
+ * - ::cudaDevAttrMaxTexture2DWidth: Maximum 2D texture width
+ * - ::cudaDevAttrMaxTexture2DHeight: Maximum 2D texture height
+ * - ::cudaDevAttrMaxTexture2DLinearWidth: Maximum width for a 2D texture
+ *   bound to linear memory
+ * - ::cudaDevAttrMaxTexture2DLinearHeight: Maximum height for a 2D texture
+ *   bound to linear memory
+ * - ::cudaDevAttrMaxTexture2DLinearPitch: Maximum pitch in bytes for a 2D
+ *   texture bound to linear memory
+ * - ::cudaDevAttrMaxTexture2DMipmappedWidth: Maximum mipmapped 2D texture
+ *   width
+ * - ::cudaDevAttrMaxTexture2DMipmappedHeight: Maximum mipmapped 2D texture
+ *   height
+ * - ::cudaDevAttrMaxTexture3DWidth: Maximum 3D texture width
+ * - ::cudaDevAttrMaxTexture3DHeight: Maximum 3D texture height
+ * - ::cudaDevAttrMaxTexture3DDepth: Maximum 3D texture depth
+ * - ::cudaDevAttrMaxTexture3DWidthAlt: Alternate maximum 3D texture width,
+ *   0 if no alternate maximum 3D texture size is supported
+ * - ::cudaDevAttrMaxTexture3DHeightAlt: Alternate maximum 3D texture height,
+ *   0 if no alternate maximum 3D texture size is supported
+ * - ::cudaDevAttrMaxTexture3DDepthAlt: Alternate maximum 3D texture depth,
+ *   0 if no alternate maximum 3D texture size is supported
+ * - ::cudaDevAttrMaxTextureCubemapWidth: Maximum cubemap texture width or
+ *   height
+ * - ::cudaDevAttrMaxTexture1DLayeredWidth: Maximum 1D layered texture width
+ * - ::cudaDevAttrMaxTexture1DLayeredLayers: Maximum layers in a 1D layered
+ *   texture
+ * - ::cudaDevAttrMaxTexture2DLayeredWidth: Maximum 2D layered texture width
+ * - ::cudaDevAttrMaxTexture2DLayeredHeight: Maximum 2D layered texture height
+ * - ::cudaDevAttrMaxTexture2DLayeredLayers: Maximum layers in a 2D layered
+ *   texture
+ * - ::cudaDevAttrMaxTextureCubemapLayeredWidth: Maximum cubemap layered
+ *   texture width or height
+ * - ::cudaDevAttrMaxTextureCubemapLayeredLayers: Maximum layers in a cubemap
+ *   layered texture
+ * - ::cudaDevAttrMaxSurface1DWidth: Maximum 1D surface width
+ * - ::cudaDevAttrMaxSurface2DWidth: Maximum 2D surface width
+ * - ::cudaDevAttrMaxSurface2DHeight: Maximum 2D surface height
+ * - ::cudaDevAttrMaxSurface3DWidth: Maximum 3D surface width
+ * - ::cudaDevAttrMaxSurface3DHeight: Maximum 3D surface height
+ * - ::cudaDevAttrMaxSurface3DDepth: Maximum 3D surface depth
+ * - ::cudaDevAttrMaxSurface1DLayeredWidth: Maximum 1D layered surface width
+ * - ::cudaDevAttrMaxSurface1DLayeredLayers: Maximum layers in a 1D layered
+ *   surface
+ * - ::cudaDevAttrMaxSurface2DLayeredWidth: Maximum 2D layered surface width
+ * - ::cudaDevAttrMaxSurface2DLayeredHeight: Maximum 2D layered surface height
+ * - ::cudaDevAttrMaxSurface2DLayeredLayers: Maximum layers in a 2D layered
+ *   surface
+ * - ::cudaDevAttrMaxSurfaceCubemapWidth: Maximum cubemap surface width
+ * - ::cudaDevAttrMaxSurfaceCubemapLayeredWidth: Maximum cubemap layered
+ *   surface width
+ * - ::cudaDevAttrMaxSurfaceCubemapLayeredLayers: Maximum layers in a cubemap
+ *   layered surface
+ * - ::cudaDevAttrMaxRegistersPerBlock: Maximum number of 32-bit registers 
+ *   available to a thread block
+ * - ::cudaDevAttrClockRate: Peak clock frequency in kilohertz
+ * - ::cudaDevAttrTextureAlignment: Alignment requirement; texture base
+ *   addresses aligned to ::textureAlign bytes do not need an offset applied
+ *   to texture fetches
+ * - ::cudaDevAttrTexturePitchAlignment: Pitch alignment requirement for 2D
+ *   texture references bound to pitched memory
+ * - ::cudaDevAttrGpuOverlap: 1 if the device can concurrently copy memory
+ *   between host and device while executing a kernel, or 0 if not
+ * - ::cudaDevAttrMultiProcessorCount: Number of multiprocessors on the device
+ * - ::cudaDevAttrKernelExecTimeout: 1 if there is a run time limit for kernels
+ *   executed on the device, or 0 if not
+ * - ::cudaDevAttrIntegrated: 1 if the device is integrated with the memory
+ *   subsystem, or 0 if not
+ * - ::cudaDevAttrCanMapHostMemory: 1 if the device can map host memory into
+ *   the CUDA address space, or 0 if not
+ * - ::cudaDevAttrComputeMode: Compute mode is the compute mode that the device
+ *   is currently in. Available modes are as follows:
+ *   - ::cudaComputeModeDefault: Default mode - Device is not restricted and
+ *     multiple threads can use ::cudaSetDevice() with this device.
+ *   - ::cudaComputeModeProhibited: Compute-prohibited mode - No threads can use
+ *     ::cudaSetDevice() with this device.
+ *   - ::cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many 
+ *     threads in one process will be able to use ::cudaSetDevice() with this
+ *     device.
+ * - ::cudaDevAttrConcurrentKernels: 1 if the device supports executing
+ *   multiple kernels within the same context simultaneously, or 0 if
+ *   not. It is not guaranteed that multiple kernels will be resident on the
+ *   device concurrently so this feature should not be relied upon for
+ *   correctness.
+ * - ::cudaDevAttrEccEnabled: 1 if error correction is enabled on the device,
+ *   0 if error correction is disabled or not supported by the device
+ * - ::cudaDevAttrPciBusId: PCI bus identifier of the device
+ * - ::cudaDevAttrPciDeviceId: PCI device (also known as slot) identifier of
+ *   the device
+ * - ::cudaDevAttrTccDriver: 1 if the device is using a TCC driver. TCC is only
+ *   available on Tesla hardware running Windows Vista or later.
+ * - ::cudaDevAttrMemoryClockRate: Peak memory clock frequency in kilohertz
+ * - ::cudaDevAttrGlobalMemoryBusWidth: Global memory bus width in bits
+ * - ::cudaDevAttrL2CacheSize: Size of L2 cache in bytes. 0 if the device
+ *   doesn't have L2 cache.
+ * - ::cudaDevAttrMaxThreadsPerMultiProcessor: Maximum resident threads per 
+ *   multiprocessor
+ * - ::cudaDevAttrUnifiedAddressing: 1 if the device shares a unified address
+ *   space with the host, or 0 if not
+ * - ::cudaDevAttrComputeCapabilityMajor: Major compute capability version
+ *   number
+ * - ::cudaDevAttrComputeCapabilityMinor: Minor compute capability version
+ *   number
+ * - ::cudaDevAttrStreamPrioritiesSupported: 1 if the device supports stream
+ *   priorities, or 0 if not
+ * - ::cudaDevAttrGlobalL1CacheSupported: 1 if device supports caching globals 
+ *    in L1 cache, 0 if not
+ * - ::cudaDevAttrLocalL1CacheSupported: 1 if device supports caching locals 
+ *    in L1 cache, 0 if not
+ * - ::cudaDevAttrMaxSharedMemoryPerMultiprocessor: Maximum amount of shared memory
+ *   available to a multiprocessor in bytes; this amount is shared by all 
+ *   thread blocks simultaneously resident on a multiprocessor
+ * - ::cudaDevAttrMaxRegistersPerMultiprocessor: Maximum number of 32-bit registers 
+ *   available to a multiprocessor; this number is shared by all thread blocks
+ *   simultaneously resident on a multiprocessor
+ * - ::cudaDevAttrManagedMemory: 1 if device supports allocating
+ *   managed memory, 0 if not
+ * - ::cudaDevAttrIsMultiGpuBoard: 1 if device is on a multi-GPU board, 0 if not
+ * - ::cudaDevAttrMultiGpuBoardGroupID: Unique identifier for a group of devices on the
+ *   same multi-GPU board
+ * - ::cudaDevAttrHostNativeAtomicSupported: 1 if the link between the device and the
+ *   host supports native atomic operations
+ * - ::cudaDevAttrSingleToDoublePrecisionPerfRatio: Ratio of single precision performance
+ *   (in floating-point operations per second) to double precision performance
+ * - ::cudaDevAttrPageableMemoryAccess: 1 if the device supports coherently accessing
+ *   pageable memory without calling cudaHostRegister on it, and 0 otherwise
+ * - ::cudaDevAttrConcurrentManagedAccess: 1 if the device can coherently access managed
+ *   memory concurrently with the CPU, and 0 otherwise
+ * - ::cudaDevAttrComputePreemptionSupported: 1 if the device supports
+ *   Compute Preemption, 0 if not
+ * - ::cudaDevAttrCanUseHostPointerForRegisteredMem: 1 if the device can access host
+ *   registered memory at the same virtual address as the CPU, and 0 otherwise
+ * - ::cudaDevAttrCooperativeLaunch: 1 if the device supports launching cooperative kernels
+ *   via ::cudaLaunchCooperativeKernel, and 0 otherwise
+ * - ::cudaDevAttrCooperativeMultiDeviceLaunch: 1 if the device supports launching cooperative
+ *   kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise
+ * - ::cudaDevAttrCanFlushRemoteWrites: 1 if the device supports flushing of outstanding 
+ *   remote writes, and 0 otherwise
+ * - ::cudaDevAttrHostRegisterSupported: 1 if the device supports host memory registration
+ *   via ::cudaHostRegister, and 0 otherwise
+ * - ::cudaDevAttrPageableMemoryAccessUsesHostPageTables: 1 if the device accesses pageable memory via the
+ *   host's page tables, and 0 otherwise
+ * - ::cudaDevAttrDirectManagedMemAccessFromHost: 1 if the host can directly access managed memory on the device
+ *   without migration, and 0 otherwise
+ * - ::cudaDevAttrMaxSharedMemoryPerBlockOptin: Maximum per block shared memory size on the device. This value can
+ *   be opted into when using ::cudaFuncSetAttribute
+ * - ::cudaDevAttrMaxBlocksPerMultiprocessor: Maximum number of thread blocks that can reside on a multiprocessor
+ * - ::cudaDevAttrMaxPersistingL2CacheSize: Maximum L2 persisting lines capacity setting in bytes
+ * - ::cudaDevAttrMaxAccessPolicyWindowSize: Maximum value of cudaAccessPolicyWindow::num_bytes
+ * - ::cudaDevAttrReservedSharedMemoryPerBlock: Shared memory reserved by CUDA driver per block in bytes
+ * - ::cudaDevAttrSparseCudaArraySupported: 1 if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.
+ * - ::cudaDevAttrHostRegisterReadOnlySupported: Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly
+ *   to register memory that must be mapped as read-only to the GPU
+ * - ::cudaDevAttrMemoryPoolsSupported: 1 if the device supports using the cudaMallocAsync and cudaMemPool family of APIs, and 0 otherwise
+ * - ::cudaDevAttrGPUDirectRDMASupported: 1 if the device supports GPUDirect RDMA APIs, and 0 otherwise
+ * - ::cudaDevAttrGPUDirectRDMAFlushWritesOptions: bitmask to be interpreted according to the ::cudaFlushGPUDirectRDMAWritesOptions enum 
+ * - ::cudaDevAttrGPUDirectRDMAWritesOrdering: see the ::cudaGPUDirectRDMAWritesOrdering enum for numerical values
+ * - ::cudaDevAttrMemoryPoolSupportedHandleTypes: Bitmask of handle types supported with mempool based IPC
+ * - ::cudaDevAttrDeferredMappingCudaArraySupported : 1 if the device supports deferred mapping CUDA arrays and CUDA mipmapped arrays.
+ *
+ * \param value  - Returned device attribute value
+ * \param attr   - Device attribute to query
+ * \param device - Device number to query 
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice,
+ * ::cudaGetDeviceProperties,
+ * ::cuDeviceGetAttribute
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device);
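+
+/*
+ * Informative usage sketch for ::cudaDeviceGetAttribute (illustrative only;
+ * assumes device 0 is a valid CUDA device, error checking omitted):
+ *
+ * \code
+ * int smCount = 0, maxThreadsPerSM = 0;
+ * // Number of multiprocessors on device 0
+ * cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, 0);
+ * // Maximum resident threads per multiprocessor on device 0
+ * cudaDeviceGetAttribute(&maxThreadsPerSM, cudaDevAttrMaxThreadsPerMultiProcessor, 0);
+ * \endcode
+ */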
+
+/**
+ * \brief Returns the default mempool of a device
+ *
+ * The default mempool of a device contains device memory from that device.
+ *
+ * \param memPool - Returned default memory pool of \p device
+ * \param device  - Device to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cuDeviceGetDefaultMemPool, ::cudaMallocAsync, ::cudaMemPoolTrimTo, ::cudaMemPoolGetAttribute, ::cudaDeviceSetMemPool, ::cudaMemPoolSetAttribute, ::cudaMemPoolSetAccess
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetDefaultMemPool(cudaMemPool_t *memPool, int device);
+
+
+/**
+ * \brief Sets the current memory pool of a device
+ *
+ * The memory pool must be local to the specified device.
+ * Unless a mempool is specified in the ::cudaMallocAsync call,
+ * ::cudaMallocAsync allocates from the current mempool of the provided stream's device.
+ * By default, a device's current memory pool is its default memory pool.
+ *
+ * \note Use ::cudaMallocFromPoolAsync to specify asynchronous allocations from a device different
+ * than the one the stream runs on.
+ *
+ * \param device  - Device whose current memory pool is set
+ * \param memPool - Memory pool to make current for \p device
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_callback
+ *
+ * \sa ::cuDeviceSetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaMemPoolCreate, ::cudaMemPoolDestroy, ::cudaMallocFromPoolAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetMemPool(int device, cudaMemPool_t memPool);
+
+/**
+ * \brief Gets the current mempool for a device
+ *
+ * Returns the last pool provided to ::cudaDeviceSetMemPool for this device
+ * or the device's default memory pool if ::cudaDeviceSetMemPool has never been called.
+ * By default the current mempool is the default mempool for a device,
+ * otherwise the returned pool must have been set with ::cuDeviceSetMemPool or ::cudaDeviceSetMemPool.
+ *
+ * \param memPool - Returned current memory pool of \p device
+ * \param device  - Device to query
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cuDeviceGetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceSetMemPool
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetMemPool(cudaMemPool_t *memPool, int device);
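+
+/*
+ * Informative usage sketch for the memory-pool queries above (illustrative
+ * only; assumes device 0 is valid and supports memory pools, error checking
+ * omitted):
+ *
+ * \code
+ * cudaMemPool_t defaultPool, currentPool;
+ * cudaDeviceGetDefaultMemPool(&defaultPool, 0); // pool used by cudaMallocAsync by default
+ * cudaDeviceSetMemPool(0, defaultPool);         // explicitly make it the current pool
+ * cudaDeviceGetMemPool(&currentPool, 0);        // currentPool == defaultPool here
+ * \endcode
+ */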
+
+/**
+ * \brief Return NvSciSync attributes that this device can support.
+ *
+ * Returns in \p nvSciSyncAttrList the properties of NvSciSync that
+ * this CUDA device, \p device, can support. The returned \p nvSciSyncAttrList
+ * can be used to create an NvSciSync that matches this device's capabilities.
+ *
+ * If the NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is
+ * already set, this API will return ::cudaErrorInvalidValue.
+ *
+ * The application should set \p nvSciSyncAttrList to a valid
+ * NvSciSyncAttrList; otherwise this API will return
+ * ::cudaErrorInvalidHandle.
+ *
+ * The \p flags parameter controls how the application intends to use
+ * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are:
+ * - ::cudaNvSciSyncAttrSignal, which specifies that the application intends to
+ *   signal an NvSciSync on this CUDA device.
+ * - ::cudaNvSciSyncAttrWait, which specifies that the application intends to
+ *   wait on an NvSciSync on this CUDA device.
+ *
+ * At least one of these flags must be set; otherwise the API
+ * returns ::cudaErrorInvalidValue. The two flags are orthogonal to one
+ * another: a developer may set both, which allows setting both wait- and
+ * signal-specific attributes in the same \p nvSciSyncAttrList.
+ *
+ * \param nvSciSyncAttrList     - Returned NvSciSync attributes supported.
+ * \param device                - Valid CUDA device to get NvSciSync attributes for.
+ * \param flags                 - Flags describing NvSciSync usage.
+ *
+ * \return
+ *
+ * ::cudaSuccess,
+ * ::cudaErrorDeviceUninitialized,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidHandle,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorMemoryAllocation
+ *
+ * \sa
+ * ::cudaImportExternalSemaphore,
+ * ::cudaDestroyExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, int device, int flags);
+
+/**
+ * \brief Queries attributes of the link between two devices.
+ *
+ * Returns in \p *value the value of the requested attribute \p attr of the
+ * link between \p srcDevice and \p dstDevice. The supported attributes are:
+ * - ::cudaDevP2PAttrPerformanceRank: A relative value indicating the
+ *   performance of the link between two devices. Lower value means better
+ *   performance (0 being the value used for most performant link).
+ * - ::cudaDevP2PAttrAccessSupported: 1 if peer access is enabled.
+ * - ::cudaDevP2PAttrNativeAtomicSupported: 1 if native atomic operations over
+ *   the link are supported.
+ * - ::cudaDevP2PAttrCudaArrayAccessSupported: 1 if accessing CUDA arrays over
+ *   the link is supported.
+ *
+ * Returns ::cudaErrorInvalidDevice if \p srcDevice or \p dstDevice are not valid
+ * or if they represent the same device.
+ *
+ * Returns ::cudaErrorInvalidValue if \p attr is not valid or if \p value is
+ * a null pointer.
+ *
+ * \param value         - Returned value of the requested attribute
+ * \param attr          - The requested attribute of the link between \p srcDevice and \p dstDevice.
+ * \param srcDevice     - The source device of the target link.
+ * \param dstDevice     - The destination device of the target link.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceEnablePeerAccess,
+ * ::cudaDeviceDisablePeerAccess,
+ * ::cudaDeviceCanAccessPeer,
+ * ::cuDeviceGetP2PAttribute
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice);
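+
+/*
+ * Informative usage sketch for ::cudaDeviceGetP2PAttribute (illustrative only;
+ * assumes at least two CUDA devices are present, error checking omitted):
+ *
+ * \code
+ * int accessSupported = 0, perfRank = 0;
+ * cudaDeviceGetP2PAttribute(&accessSupported, cudaDevP2PAttrAccessSupported, 0, 1);
+ * cudaDeviceGetP2PAttribute(&perfRank, cudaDevP2PAttrPerformanceRank, 0, 1);
+ * // accessSupported and perfRank now describe the link from device 0 to device 1.
+ * \endcode
+ */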
+
+/**
+ * \brief Select compute-device which best matches criteria
+ *
+ * Returns in \p *device the device which has properties that best match
+ * \p *prop.
+ *
+ * \param device - Device with best match
+ * \param prop   - Desired device properties
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice,
+ * ::cudaGetDeviceProperties
+ */
+extern __host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop);
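+
+/*
+ * Informative usage sketch for ::cudaChooseDevice (illustrative only; the
+ * requested properties are an arbitrary example, memset requires <string.h>,
+ * error checking omitted):
+ *
+ * \code
+ * struct cudaDeviceProp prop;
+ * int dev = 0;
+ * memset(&prop, 0, sizeof(prop));
+ * prop.major = 7;               // prefer a device with compute capability 7.x or better
+ * cudaChooseDevice(&dev, &prop);
+ * cudaSetDevice(dev);
+ * \endcode
+ */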
+
+/**
+ * \brief Set device to be used for GPU executions
+ *
+ * Sets \p device as the current device for the calling host thread.
+ * Valid device IDs are 0 to (::cudaGetDeviceCount() - 1).
+ *
+ * Any device memory subsequently allocated from this host thread
+ * using ::cudaMalloc(), ::cudaMallocPitch() or ::cudaMallocArray()
+ * will be physically resident on \p device.  Any host memory allocated
+ * from this host thread using ::cudaMallocHost() or ::cudaHostAlloc() 
+ * or ::cudaHostRegister() will have its lifetime associated  with
+ * \p device.  Any streams or events created from this host thread will 
+ * be associated with \p device.  Any kernels launched from this host
+ * thread using the <<<>>> operator or ::cudaLaunchKernel() will be executed
+ * on \p device.
+ *
+ * This call may be made from any host thread, to any device, and at 
+ * any time.  This function will do no synchronization with the previous 
+ * or new device, and should be considered a very low overhead call.
+ * If the current context bound to the calling thread is not the primary context,
+ * this call will bind the primary context to the calling thread and all the
+ * subsequent memory allocations, stream and event creations, and kernel launches
+ * will be associated with the primary context. This function will not initialize 
+ * the context until a runtime API requiring the context (such as ::cudaMalloc()) 
+ * is used. This function will not return an error if the device is in 
+ * ::cudaComputeModeExclusiveProcess and is occupied by another process or
+ * if the device is in ::cudaComputeModeProhibited.
+ *
+ * \param device - Device on which the active host thread should execute the
+ * device code.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties,
+ * ::cudaChooseDevice,
+ * ::cuCtxSetCurrent
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device);
+
+/**
+ * \brief Returns which device is currently being used
+ *
+ * Returns in \p *device the current device for the calling host thread.
+ *
+ * \param device - Returns the device on which the active host thread
+ * executes the device code.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties,
+ * ::cudaChooseDevice,
+ * ::cuCtxGetCurrent
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device);
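+
+/*
+ * Informative usage sketch combining ::cudaSetDevice and ::cudaGetDevice
+ * (illustrative only; assumes device 0 exists, error checking omitted):
+ *
+ * \code
+ * int current = -1;
+ * cudaSetDevice(0);        // make device 0 current for this host thread
+ * cudaGetDevice(&current); // current == 0 afterwards
+ * \endcode
+ */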
+
+/**
+ * \brief Set a list of devices that can be used for CUDA
+ *
+ * Sets a list of devices for CUDA execution in priority order using
+ * \p device_arr. The parameter \p len specifies the number of elements in the
+ * list.  CUDA will try devices from the list sequentially until it finds one
+ * that works.  If this function is not called, or if it is called with a \p len
+ * of 0, then CUDA will go back to its default behavior of trying devices
+ * sequentially from a default list containing all of the available CUDA
+ * devices in the system. If a specified device ID in the list does not exist,
+ * this function will return ::cudaErrorInvalidDevice. If \p len is not 0 and
+ * \p device_arr is NULL or if \p len exceeds the number of devices in
+ * the system, then ::cudaErrorInvalidValue is returned.
+ *
+ * \param device_arr - List of devices to try
+ * \param len        - Number of devices in specified list
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties,
+ * ::cudaSetDeviceFlags,
+ * ::cudaChooseDevice
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, int len);
+
+/**
+ * \brief Sets flags to be used for device executions
+ * 
+ * Records \p flags as the flags for the current device. If the current device
+ * has been set and that device has already been initialized, the previous flags
+ * are overwritten. If the current device has not been initialized, it is
+ * initialized with the provided flags. If no device has been made current to
+ * the calling thread, a default device is selected and initialized with the
+ * provided flags.
+ * 
+ * The two LSBs of the \p flags parameter can be used to control how the CPU
+ * thread interacts with the OS scheduler when waiting for results from the
+ * device.
+ *
+ * - ::cudaDeviceScheduleAuto: The default value if the \p flags parameter is
+ * zero. CUDA uses a heuristic based on the number of active CUDA contexts in the
+ * process \p C and the number of logical processors in the system \p P. If
+ * \p C \> \p P, then CUDA will yield to other OS threads when waiting for the
+ * device, otherwise CUDA will not yield while waiting for results and
+ * actively spin on the processor. Additionally, on Tegra devices,
+ * ::cudaDeviceScheduleAuto uses a heuristic based on the power profile of
+ * the platform and may choose ::cudaDeviceScheduleBlockingSync for low-powered
+ * devices.
+ * - ::cudaDeviceScheduleSpin: Instruct CUDA to actively spin when waiting for
+ * results from the device. This can decrease latency when waiting for the
+ * device, but may lower the performance of CPU threads if they are performing
+ * work in parallel with the CUDA thread.
+ * - ::cudaDeviceScheduleYield: Instruct CUDA to yield its thread when waiting
+ * for results from the device. This can increase latency when waiting for the
+ * device, but can increase the performance of CPU threads performing work in
+ * parallel with the device.
+ * - ::cudaDeviceScheduleBlockingSync: Instruct CUDA to block the CPU thread 
+ * on a synchronization primitive when waiting for the device to finish work.
+ * - ::cudaDeviceBlockingSync: Instruct CUDA to block the CPU thread on a 
+ * synchronization primitive when waiting for the device to finish work. <br>
+ * \ref deprecated "Deprecated:" This flag was deprecated as of CUDA 4.0 and
+ * replaced with ::cudaDeviceScheduleBlockingSync.
+ * - ::cudaDeviceMapHost: This flag enables allocating pinned
+ * host memory that is accessible to the device. It is implicit for the
+ * runtime but may be absent if a context is created using the driver API.
+ * If this flag is not set, ::cudaHostGetDevicePointer() will always return
+ * a failure code.
+ * - ::cudaDeviceLmemResizeToMax: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * \ref deprecated "Deprecated:" This flag is deprecated and the behavior enabled          
+ * by this flag is now the default and cannot be disabled.
+ *
+ * \param flags - Parameters for device operation
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceFlags, ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties,
+ * ::cudaSetDevice, ::cudaSetValidDevices,
+ * ::cudaChooseDevice,
+ * ::cuDevicePrimaryCtxSetFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags( unsigned int flags );
+
+/**
+ * \brief Gets the flags for the current device
+ *
+ * 
+ * Returns in \p flags the flags for the current device. If there is a current
+ * device for the calling thread, the flags for the device are returned. If
+ * there is no current device, the flags for the first device are returned,
+ * which may be the default flags.  Compare to the behavior of
+ * ::cudaSetDeviceFlags.
+ *
+ * Typically, the flags returned should match the behavior that will be seen
+ * if the calling thread uses a device after this call, without any change to
+ * the flags or current device in between by this or another thread.  Note that
+ * if the device is not initialized, it is possible for another thread to
+ * change the flags for the current device before it is initialized.
+ * Additionally, when using exclusive mode, if this thread has not requested a
+ * specific device, it may use a device other than the first device, contrary
+ * to the assumption made by this function.
+ *
+ * If a context has been created via the driver API and is current to the
+ * calling thread, the flags for that context are always returned.
+ *
+ * Flags returned by this function may specifically include ::cudaDeviceMapHost
+ * even though it is not accepted by ::cudaSetDeviceFlags because it is
+ * implicit in runtime API flags.  The reason for this is that the current
+ * context may have been created via the driver API in which case the flag is
+ * not implicit and may be unset.
+ *
+ * \param flags - Pointer to store the device flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDevice, ::cudaGetDeviceProperties,
+ * ::cudaSetDevice, ::cudaSetDeviceFlags,
+ * ::cuCtxGetFlags,
+ * ::cuDevicePrimaryCtxGetState
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags( unsigned int *flags );
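+
+/*
+ * Informative usage sketch for ::cudaSetDeviceFlags / ::cudaGetDeviceFlags
+ * (illustrative only, error checking omitted):
+ *
+ * \code
+ * unsigned int flags = 0;
+ * cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); // block the CPU thread when waiting
+ * cudaGetDeviceFlags(&flags);
+ * // (flags & cudaDeviceScheduleBlockingSync) is expected to be non-zero here
+ * \endcode
+ */
+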
+/** @} */ /* END CUDART_DEVICE */
+
+/**
+ * \defgroup CUDART_STREAM Stream Management
+ *
+ * ___MANBRIEF___ stream management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the stream management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Create an asynchronous stream
+ *
+ * Creates a new asynchronous stream.
+ *
+ * \param pStream - Pointer to new stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaStreamGetPriority,
+ * ::cudaStreamGetFlags,
+ * ::cudaStreamQuery,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamAddCallback,
+ * ::cudaStreamDestroy,
+ * ::cuStreamCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream);
+
+/**
+ * \brief Create an asynchronous stream
+ *
+ * Creates a new asynchronous stream.  The \p flags argument determines the 
+ * behaviors of the stream.  Valid values for \p flags are
+ * - ::cudaStreamDefault: Default stream creation flag.
+ * - ::cudaStreamNonBlocking: Specifies that work running in the created 
+ *   stream may run concurrently with work in stream 0 (the NULL stream), and that
+ *   the created stream should perform no implicit synchronization with stream 0.
+ *
+ * \param pStream - Pointer to new stream identifier
+ * \param flags   - Parameters for stream creation
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamCreateWithPriority,
+ * ::cudaStreamGetFlags,
+ * ::cudaStreamQuery,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamAddCallback,
+ * ::cudaStreamDestroy,
+ * ::cuStreamCreate
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags);
+
+/**
+ * \brief Create an asynchronous stream with the specified priority
+ *
+ * Creates a stream with the specified priority and returns a handle in \p pStream.
+ * This API alters the scheduler priority of work in the stream. Work in a higher
+ * priority stream may preempt work already executing in a low priority stream.
+ *
+ * \p priority follows a convention where lower numbers represent higher priorities.
+ * '0' represents default priority. The range of meaningful numerical priorities can
+ * be queried using ::cudaDeviceGetStreamPriorityRange. If the specified priority is
+ * outside the numerical range returned by ::cudaDeviceGetStreamPriorityRange,
+ * it will automatically be clamped to the lowest or the highest number in the range.
+ *
+ * \param pStream  - Pointer to new stream identifier
+ * \param flags    - Flags for stream creation. See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed
+ * \param priority - Priority of the stream. Lower numbers represent higher priorities.
+ *                   See ::cudaDeviceGetStreamPriorityRange for more information about
+ *                   the meaningful stream priorities that can be passed.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \note Stream priorities are supported only on GPUs
+ * with compute capability 3.5 or higher.
+ *
+ * \note In the current implementation, only compute kernels launched in
+ * priority streams are affected by the stream's priority. Stream priorities have
+ * no effect on host-to-device and device-to-host memory operations.
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaDeviceGetStreamPriorityRange,
+ * ::cudaStreamGetPriority,
+ * ::cudaStreamQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamAddCallback,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamDestroy,
+ * ::cuStreamCreateWithPriority
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, int priority);
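+
+/*
+ * Informative usage sketch for the stream-creation functions above
+ * (illustrative only, error checking omitted):
+ *
+ * \code
+ * int leastPriority = 0, greatestPriority = 0;
+ * cudaStream_t lowStream, highStream;
+ * cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
+ * cudaStreamCreateWithPriority(&lowStream,  cudaStreamNonBlocking, leastPriority);
+ * cudaStreamCreateWithPriority(&highStream, cudaStreamNonBlocking, greatestPriority);
+ * // ... submit work; kernels in highStream may preempt kernels in lowStream ...
+ * cudaStreamDestroy(lowStream);
+ * cudaStreamDestroy(highStream);
+ * \endcode
+ */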
+
+/**
+ * \brief Query the priority of a stream
+ *
+ * Query the priority of a stream. The priority is returned in \p priority.
+ * Note that if the stream was created with a priority outside the meaningful
+ * numerical range returned by ::cudaDeviceGetStreamPriorityRange,
+ * this function returns the clamped priority.
+ * See ::cudaStreamCreateWithPriority for details about priority clamping.
+ *
+ * \param hStream    - Handle to the stream to be queried
+ * \param priority   - Pointer to a signed integer in which the stream's priority is returned
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaDeviceGetStreamPriorityRange,
+ * ::cudaStreamGetFlags,
+ * ::cuStreamGetPriority
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority);
+
+/**
+ * \brief Query the flags of a stream
+ *
+ * Query the flags of a stream. The flags are returned in \p flags.
+ * See ::cudaStreamCreateWithFlags for a list of valid flags.
+ *
+ * \param hStream - Handle to the stream to be queried
+ * \param flags   - Pointer to an unsigned integer in which the stream's flags are returned
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreateWithPriority,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaStreamGetPriority,
+ * ::cuStreamGetFlags
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags);
+
+/**
+ * \brief Resets all persisting lines in cache to normal status.
+ *
+ * Resets all persisting lines in cache to normal status.
+ * Takes effect on function return.
+ *
+ * \return
+ * ::cudaSuccess,
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ cudaError_t CUDARTAPI cudaCtxResetPersistingL2Cache(void);
+
+/**
+ * \brief Copies attributes from source stream to destination stream.
+ *
+ * Copies attributes from source stream \p src to destination stream \p dst.
+ * Both streams must have the same context.
+ *
+ * \param[out] dst Destination stream
+ * \param[in] src Source stream
+ * For attributes see ::cudaStreamAttrID
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src);
+
+/**
+ * \brief Queries stream attribute.
+ *
+ * Queries attribute \p attr from \p hStream and stores it in corresponding
+ * member of \p value_out.
+ *
+ * \param[in] hStream
+ * \param[in] attr
+ * \param[out] value_out
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetAttribute(
+        cudaStream_t hStream, cudaStreamAttrID attr,
+        cudaStreamAttrValue *value_out);
+
+/**
+ * \brief Sets stream attribute.
+ *
+ * Sets attribute \p attr on \p hStream from corresponding attribute of
+ * \p value. The updated attribute will be applied to subsequent work
+ * submitted to the stream. It will not affect previously submitted work.
+ *
+ * \param[out] hStream
+ * \param[in] attr
+ * \param[in] value
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamSetAttribute(
+        cudaStream_t hStream, cudaStreamAttrID attr,
+        const cudaStreamAttrValue *value);
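+
+/*
+ * Informative usage sketch for ::cudaStreamSetAttribute (illustrative only;
+ * 'stream', the device allocation 'devPtr', and its size 'windowBytes' are
+ * assumed to exist, 'windowBytes' is assumed not to exceed
+ * ::cudaDevAttrMaxAccessPolicyWindowSize, error checking omitted):
+ *
+ * \code
+ * cudaStreamAttrValue attrValue;
+ * memset(&attrValue, 0, sizeof(attrValue));
+ * attrValue.accessPolicyWindow.base_ptr  = devPtr;
+ * attrValue.accessPolicyWindow.num_bytes = windowBytes;
+ * attrValue.accessPolicyWindow.hitRatio  = 0.6f;                        // fraction of the window to persist
+ * attrValue.accessPolicyWindow.hitProp   = cudaAccessPropertyPersisting;
+ * attrValue.accessPolicyWindow.missProp  = cudaAccessPropertyStreaming;
+ * cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attrValue);
+ * \endcode
+ */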
+
+/**
+ * \brief Destroys and cleans up an asynchronous stream
+ *
+ * Destroys and cleans up the asynchronous stream specified by \p stream.
+ *
+ * In case the device is still doing work in the stream \p stream
+ * when ::cudaStreamDestroy() is called, the function will return immediately 
+ * and the resources associated with \p stream will be released automatically 
+ * once the device has completed all work in \p stream.
+ *
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamCreateWithFlags,
+ * ::cudaStreamQuery,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamAddCallback,
+ * ::cuStreamDestroy
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream);
+
+/**
+ * \brief Make a compute stream wait on an event
+ *
+ * Makes all future work submitted to \p stream wait for all work captured in
+ * \p event.  See ::cudaEventRecord() for details on what is captured by an event.
+ * The synchronization will be performed efficiently on the device when applicable.
+ * \p event may be from a different device than \p stream.
+ *
+ * flags include:
+ * - ::cudaEventWaitDefault: Default event wait flag.
+ * - ::cudaEventWaitExternal: Event is captured in the graph as an external
+ *   event node when performing stream capture.
+ *
+ * \param stream - Stream to wait
+ * \param event  - Event to wait on
+ * \param flags  - Parameters for the operation(See above)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy,
+ * ::cuStreamWaitEvent
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags __dv(0));
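+
+/*
+ * Informative usage sketch: make one stream wait for work recorded on another
+ * (illustrative only; 'producerStream' and 'consumerStream' are placeholder
+ * streams assumed to exist, error checking omitted):
+ *
+ * \code
+ * cudaEvent_t ready;
+ * cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
+ * // ... enqueue producer work on producerStream ...
+ * cudaEventRecord(ready, producerStream);
+ * cudaStreamWaitEvent(consumerStream, ready, 0); // later consumer work waits for 'ready'
+ * // ... enqueue consumer work on consumerStream ...
+ * cudaEventDestroy(ready);
+ * \endcode
+ */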
+
+/**
+ * Type of stream callback functions.
+ * \param stream The stream as passed to ::cudaStreamAddCallback, may be NULL.
+ * \param status ::cudaSuccess or any persistent error on the stream.
+ * \param userData User parameter provided at registration.
+ */
+typedef void (CUDART_CB *cudaStreamCallback_t)(cudaStream_t stream, cudaError_t status, void *userData);
+
+/**
+ * \brief Add a callback to a compute stream
+ *
+ * \note This function is slated for eventual deprecation and removal. If
+ * you do not require the callback to execute in case of a device error,
+ * consider using ::cudaLaunchHostFunc. Additionally, this function is not
+ * supported with ::cudaStreamBeginCapture and ::cudaStreamEndCapture, unlike
+ * ::cudaLaunchHostFunc.
+ *
+ * Adds a callback to be called on the host after all currently enqueued
+ * items in the stream have completed.  For each 
+ * cudaStreamAddCallback call, a callback will be executed exactly once.
+ * The callback will block later work in the stream until it is finished.
+ *
+ * The callback may be passed ::cudaSuccess or an error code.  In the event
+ * of a device error, all subsequently executed callbacks will receive an
+ * appropriate ::cudaError_t.
+ *
+ * Callbacks must not make any CUDA API calls.  Attempting to use CUDA APIs
+ * may result in ::cudaErrorNotPermitted.  Callbacks must not perform any
+ * synchronization that may depend on outstanding device work or other callbacks
+ * that are not mandated to run earlier.  Callbacks without a mandated order
+ * (in independent streams) execute in undefined order and may be serialized.
+ *
+ * For the purposes of Unified Memory, callback execution makes a number of
+ * guarantees:
+ * <ul>
+ *   <li>The callback stream is considered idle for the duration of the
+ *   callback.  Thus, for example, a callback may always use memory attached
+ *   to the callback stream.</li>
+ *   <li>The start of execution of a callback has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the callback.  It thus synchronizes streams which have been "joined"
+ *   prior to the callback.</li>
+ *   <li>Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding callbacks have executed.  Thus, for
+ *   example, a callback might use global attached memory even if work has
+ *   been added to another stream, if it has been properly ordered with an
+ *   event.</li>
+ *   <li>Completion of a callback does not cause a stream to become
+ *   active except as described above.  The callback stream will remain idle
+ *   if no device work follows the callback, and will remain idle across
+ *   consecutive callbacks without device work in between.  Thus, for example,
+ *   stream synchronization can be done by signaling from a callback at the
+ *   end of the stream.</li>
+ * </ul>
+ *
+ * \param stream   - Stream to add callback to
+ * \param callback - The function to call once preceding stream operations are complete
+ * \param userData - User specified data to be passed to the callback function
+ * \param flags    - Reserved for future use, must be 0
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamWaitEvent, ::cudaStreamDestroy, ::cudaMallocManaged, ::cudaStreamAttachMemAsync,
+ * ::cudaLaunchHostFunc, ::cuStreamAddCallback
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream,
+        cudaStreamCallback_t callback, void *userData, unsigned int flags);
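+
+/*
+ * Informative usage sketch for ::cudaStreamAddCallback (illustrative only;
+ * the callback name and message are placeholders, printf requires <stdio.h>,
+ * error checking omitted):
+ *
+ * \code
+ * void CUDART_CB notifyDone(cudaStream_t stream, cudaError_t status, void *userData)
+ * {
+ *     // Must not call any CUDA API from inside the callback.
+ *     printf("stream work finished with status %d (%s)\n", (int)status, (const char *)userData);
+ * }
+ *
+ * // ... enqueue work on stream ...
+ * cudaStreamAddCallback(stream, notifyDone, (void *)"batch 1", 0);
+ * \endcode
+ */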
+
+/**
+ * \brief Waits for stream tasks to complete
+ *
+ * Blocks until \p stream has completed all operations. If the
+ * ::cudaDeviceScheduleBlockingSync flag was set for this device, 
+ * the host thread will block until the stream is finished with 
+ * all of its tasks.
+ *
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamWaitEvent, ::cudaStreamAddCallback, ::cudaStreamDestroy,
+ * ::cuStreamSynchronize
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream);
+
+/**
+ * \brief Queries an asynchronous stream for completion status
+ *
+ * Returns ::cudaSuccess if all operations in \p stream have
+ * completed, or ::cudaErrorNotReady if not.
+ *
+ * For the purposes of Unified Memory, a return value of ::cudaSuccess
+ * is equivalent to having called ::cudaStreamSynchronize().
+ *
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy,
+ * ::cuStreamQuery
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream);
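+
+/*
+ * Informative usage sketch: overlap CPU work with outstanding stream work by
+ * polling ::cudaStreamQuery (illustrative only, error checking omitted):
+ *
+ * \code
+ * while (cudaStreamQuery(stream) == cudaErrorNotReady) {
+ *     // ... do useful CPU-side work while the device is still busy ...
+ * }
+ * // All work previously submitted to 'stream' has now completed.
+ * \endcode
+ */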
+
+/**
+ * \brief Attach memory to a stream asynchronously
+ *
+ * Enqueues an operation in \p stream to specify stream association of
+ * \p length bytes of memory starting from \p devPtr. This function is a
+ * stream-ordered operation, meaning that it is dependent on, and will
+ * only take effect when, previous work in stream has completed. Any
+ * previous association is automatically replaced.
+ *
+ * \p devPtr must point to one of the following types of memory:
+ * - managed memory declared using the __managed__ keyword or allocated with
+ *   ::cudaMallocManaged.
+ * - a valid host-accessible region of system-allocated pageable memory. This
+ *   type of memory may only be specified if the device associated with the
+ *   stream reports a non-zero value for the device attribute
+ *   ::cudaDevAttrPageableMemoryAccess.
+ *
+ * For managed allocations, \p length must be either zero or the entire
+ * allocation's size. Both indicate that the entire allocation's stream
+ * association is being changed. Currently, it is not possible to change stream
+ * association for a portion of a managed allocation.
+ *
+ * For pageable allocations, \p length must be non-zero.
+ *
+ * The stream association is specified using \p flags which must be
+ * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle.
+ * The default value for \p flags is ::cudaMemAttachSingle.
+ * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed
+ * by any stream on any device.
+ * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee
+ * that it won't access the memory on the device from any stream on a device that
+ * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with
+ * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess,
+ * the program makes a guarantee that it will only access the memory on the device
+ * from \p stream. It is illegal to attach singly to the NULL stream, because the
+ * NULL stream is a virtual global stream and not a specific stream. An error will
+ * be returned in this case.
+ *
+ * When memory is associated with a single stream, the Unified Memory system will
+ * allow CPU access to this memory region so long as all operations in \p stream
+ * have completed, regardless of whether other streams are active. In effect,
+ * this constrains exclusive ownership of the managed memory region by
+ * an active GPU to per-stream activity instead of whole-GPU activity.
+ *
+ * Accessing memory on the device from streams that are not associated with
+ * it will produce undefined results. No error checking is performed by the
+ * Unified Memory system to ensure that kernels launched into other streams
+ * do not access this region. 
+ *
+ * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync
+ * via events, synchronization or other means to ensure legal access to memory
+ * at all times. Data visibility and coherency will be changed appropriately
+ * for all kernels which follow a stream-association change.
+ *
+ * If \p stream is destroyed while data is associated with it, the association is
+ * removed and the association reverts to the default visibility of the allocation
+ * as specified at ::cudaMallocManaged. For __managed__ variables, the default
+ * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an
+ * asynchronous operation, and as a result, the change to default association won't
+ * happen until all work in the stream has completed.
+ *
+ * \param stream  - Stream in which to enqueue the attach operation
+ * \param devPtr  - Pointer to memory (must be a pointer to managed memory or
+ *                  to a valid host-accessible region of system-allocated
+ *                  memory)
+ * \param length  - Length of memory (defaults to zero)
+ * \param flags   - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged,
+ * ::cuStreamAttachMemAsync
+ */
+#if defined(__cplusplus)
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags = cudaMemAttachSingle);
+#else
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags);
+#endif
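+
+/*
+ * Informative usage sketch for ::cudaStreamAttachMemAsync with managed memory
+ * (illustrative only; 'stream' is assumed to be a non-NULL stream, error
+ * checking omitted):
+ *
+ * \code
+ * float *data = NULL;
+ * cudaMallocManaged((void **)&data, 1024 * sizeof(float), cudaMemAttachGlobal);
+ * // Restrict the allocation to 'stream' so the CPU may touch it while other
+ * // streams keep the GPU busy.
+ * cudaStreamAttachMemAsync(stream, data, 0, cudaMemAttachSingle);
+ * cudaStreamSynchronize(stream); // the attachment takes effect once prior work completes
+ * \endcode
+ */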
+
+/**
+ * \brief Begins graph capture on a stream
+ *
+ * Begin graph capture on \p stream. When a stream is in capture mode, all operations
+ * pushed into the stream will not be executed, but will instead be captured into
+ * a graph, which will be returned via ::cudaStreamEndCapture. Capture may not be initiated
+ * if \p stream is ::cudaStreamLegacy. Capture must be ended on the same stream in which
+ * it was initiated, and it may only be initiated if the stream is not already in capture
+ * mode. The capture mode may be queried via ::cudaStreamIsCapturing. A unique id
+ * representing the capture sequence may be queried via ::cudaStreamGetCaptureInfo.
+ *
+ * If \p mode is not ::cudaStreamCaptureModeRelaxed, ::cudaStreamEndCapture must be
+ * called on this stream from the same thread.
+ *
+ * \note Kernels captured using this API must not use texture and surface references.
+ *       Reading or writing through any texture or surface reference is undefined
+ *       behavior. This restriction does not apply to texture and surface objects.
+ *
+ * \param stream - Stream in which to initiate capture
+ * \param mode    - Controls the interaction of this capture sequence with other API
+ *                  calls that are potentially unsafe. For more details see
+ *                  ::cudaThreadExchangeStreamCaptureMode.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamCreate,
+ * ::cudaStreamIsCapturing,
+ * ::cudaStreamEndCapture,
+ * ::cudaThreadExchangeStreamCaptureMode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode);
+
+/**
+ * \brief Swaps the stream capture interaction mode for a thread
+ *
+ * Sets the calling thread's stream capture interaction mode to the value contained
+ * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To
+ * facilitate deterministic behavior across function or module boundaries, callers
+ * are encouraged to use this API in a push-pop fashion: \code
+     cudaStreamCaptureMode mode = desiredMode;
+     cudaThreadExchangeStreamCaptureMode(&mode);
+     ...
+     cudaThreadExchangeStreamCaptureMode(&mode); // restore previous mode
+ * \endcode
+ *
+ * During stream capture (see ::cudaStreamBeginCapture), some actions, such as a call
+ * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is
+ * not enqueued asynchronously to a stream, and is not observed by stream capture.
+ * Therefore, if the sequence of operations captured via ::cudaStreamBeginCapture
+ * depended on the allocation being replayed whenever the graph is launched, the
+ * captured graph would be invalid.
+ *
+ * Therefore, stream capture places restrictions on API calls that can be made within
+ * or concurrently to a ::cudaStreamBeginCapture-::cudaStreamEndCapture sequence. This
+ * behavior can be controlled via this API and flags to ::cudaStreamBeginCapture.
+ *
+ * A thread's mode is one of the following:
+ * - \p cudaStreamCaptureModeGlobal: This is the default mode. If the local thread has
+ *   an ongoing capture sequence that was not initiated with
+ *   \p cudaStreamCaptureModeRelaxed at ::cudaStreamBeginCapture, or if any other thread
+ *   has a concurrent capture sequence initiated with \p cudaStreamCaptureModeGlobal,
+ *   this thread is prohibited from potentially unsafe API calls.
+ * - \p cudaStreamCaptureModeThreadLocal: If the local thread has an ongoing capture
+ *   sequence not initiated with \p cudaStreamCaptureModeRelaxed, it is prohibited
+ *   from potentially unsafe API calls. Concurrent capture sequences in other threads
+ *   are ignored.
+ * - \p cudaStreamCaptureModeRelaxed: The local thread is not prohibited from potentially
+ *   unsafe API calls. Note that the thread is still prohibited from API calls which
+ *   necessarily conflict with stream capture, for example, attempting ::cudaEventQuery
+ *   on an event that was last recorded inside a capture sequence.
+ *
+ * \param mode - Pointer to mode value to swap with the current mode
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamBeginCapture
+ */
+extern __host__ cudaError_t CUDARTAPI cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode);
+
+/**
+ * \brief Ends capture on a stream, returning the captured graph
+ *
+ * End capture on \p stream, returning the captured graph via \p pGraph.
+ * Capture must have been initiated on \p stream via a call to ::cudaStreamBeginCapture.
+ * If capture was invalidated, due to a violation of the rules of stream capture, then
+ * a NULL graph will be returned.
+ *
+ * If the \p mode argument to ::cudaStreamBeginCapture was not
+ * ::cudaStreamCaptureModeRelaxed, this call must be from the same thread as
+ * ::cudaStreamBeginCapture.
+ *
+ * \param stream - Stream to query
+ * \param pGraph - The captured graph
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorStreamCaptureWrongThread
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamCreate,
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamIsCapturing
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph);
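+
+/*
+ * Informative usage sketch: capture stream work into a graph and replay it
+ * (illustrative only; 'stream', the device buffer 'dDst', and the pinned host
+ * buffer 'hSrc' of 'bytes' bytes are assumed to exist, and
+ * ::cudaGraphInstantiate / ::cudaGraphLaunch are declared in the graph
+ * management section of this header; error checking omitted):
+ *
+ * \code
+ * cudaGraph_t graph;
+ * cudaGraphExec_t graphExec;
+ * cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
+ * cudaMemcpyAsync(dDst, hSrc, bytes, cudaMemcpyHostToDevice, stream); // captured, not executed
+ * cudaStreamEndCapture(stream, &graph);
+ * cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
+ * cudaGraphLaunch(graphExec, stream); // now the copy actually runs
+ * \endcode
+ */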
+
+/**
+ * \brief Returns a stream's capture status
+ *
+ * Return the capture status of \p stream via \p pCaptureStatus. After a successful
+ * call, \p *pCaptureStatus will contain one of the following:
+ * - ::cudaStreamCaptureStatusNone: The stream is not capturing.
+ * - ::cudaStreamCaptureStatusActive: The stream is capturing.
+ * - ::cudaStreamCaptureStatusInvalidated: The stream was capturing but an error
+ *   has invalidated the capture sequence. The capture sequence must be terminated
+ *   with ::cudaStreamEndCapture on the stream where it was initiated in order to
+ *   continue using \p stream.
+ *
+ * Note that, if this is called on ::cudaStreamLegacy (the "null stream") while
+ * a blocking stream on the same device is capturing, it will return
+ * ::cudaErrorStreamCaptureImplicit and \p *pCaptureStatus is unspecified
+ * after the call. The blocking stream capture is not invalidated.
+ *
+ * When a blocking stream is capturing, the legacy stream is in an
+ * unusable state until the blocking stream capture is terminated. The legacy
+ * stream is not supported for stream capture, but attempted use would have an
+ * implicit dependency on the capturing stream(s).
+ *
+ * \param stream         - Stream to query
+ * \param pCaptureStatus - Returns the stream's capture status
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorStreamCaptureImplicit
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamCreate,
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamEndCapture
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus);
+
+/**
+ * \brief Query capture status of a stream
+ *
+ * Note that there is a later version of this API, ::cudaStreamGetCaptureInfo_v2. It will
+ * supplant this version in 12.0; this version is retained for minor version compatibility.
+ *
+ * Query the capture status of a stream and get a unique id representing
+ * the capture sequence over the lifetime of the process.
+ *
+ * If called on ::cudaStreamLegacy (the "null stream") while a stream not created
+ * with ::cudaStreamNonBlocking is capturing, returns ::cudaErrorStreamCaptureImplicit.
+ *
+ * A valid id is returned only if both of the following are true:
+ * - the call returns ::cudaSuccess
+ * - captureStatus is set to ::cudaStreamCaptureStatusActive
+ *
+ * \param stream         - Stream to query
+ * \param pCaptureStatus - Returns the stream's capture status
+ * \param pId            - Returns the unique id of the capture sequence
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorStreamCaptureImplicit
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamGetCaptureInfo_v2,
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamIsCapturing
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus, unsigned long long *pId);
+
+/**
+ * \brief Query a stream's capture state (11.3+)
+ *
+ * Query stream state related to stream capture.
+ *
+ * If called on ::cudaStreamLegacy (the "null stream") while a stream not created 
+ * with ::cudaStreamNonBlocking is capturing, returns ::cudaErrorStreamCaptureImplicit.
+ *
+ * Valid data (other than capture status) is returned only if both of the following are true:
+ * - the call returns cudaSuccess
+ * - the returned capture status is ::cudaStreamCaptureStatusActive
+ *
+ * This version of cudaStreamGetCaptureInfo is introduced in CUDA 11.3 and will supplant the
+ * previous version ::cudaStreamGetCaptureInfo in 12.0. Developers requiring compatibility
+ * across minor versions to CUDA 11.0 (driver version 445) can do one of the following:
+ * - Use the older version of the API, ::cudaStreamGetCaptureInfo
+ * - Pass null for all of \p graph_out, \p dependencies_out, and \p numDependencies_out.
+ *
+ * \param stream - The stream to query
+ * \param captureStatus_out - Location to return the capture status of the stream; required
+ * \param id_out - Optional location to return an id for the capture sequence, which is
+ *           unique over the lifetime of the process
+ * \param graph_out - Optional location to return the graph being captured into. All
+ *           operations other than destroy and node removal are permitted on the graph
+ *           while the capture sequence is in progress. This API does not transfer
+ *           ownership of the graph, which is transferred or destroyed at
+ *           ::cudaStreamEndCapture. Note that the graph handle may be invalidated before
+ *           end of capture for certain errors. Nodes that are or become
+ *           unreachable from the original stream at ::cudaStreamEndCapture due to direct
+ *           actions on the graph do not trigger ::cudaErrorStreamCaptureUnjoined.
+ * \param dependencies_out - Optional location to store a pointer to an array of nodes.
+ *           The next node to be captured in the stream will depend on this set of nodes,
+ *           absent operations such as event wait which modify this set. The array pointer
+ *           is valid until the next API call which operates on the stream or until end of
+ *           capture. The node handles may be copied out and are valid until they or the
+ *           graph is destroyed. The driver-owned array may also be passed directly to
+ *           APIs that operate on the graph (not the stream) without copying.
+ * \param numDependencies_out - Optional location to store the size of the array
+ *           returned in dependencies_out.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorStreamCaptureImplicit
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamGetCaptureInfo,
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamIsCapturing,
+ * ::cudaStreamUpdateCaptureDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_v2(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0), const cudaGraphNode_t **dependencies_out __dv(0), size_t *numDependencies_out __dv(0));
+
+/**
+ * \brief Update the set of dependencies in a capturing stream (11.3+)
+ *
+ * Modifies the dependency set of a capturing stream. The dependency set is the set
+ * of nodes that the next captured node in the stream will depend on.
+ *
+ * Valid flags are ::cudaStreamAddCaptureDependencies and
+ * ::cudaStreamSetCaptureDependencies. These control whether the set passed to
+ * the API is added to the existing set or replaces it. A flags value of 0 defaults
+ * to ::cudaStreamAddCaptureDependencies.
+ *
+ * Nodes that are removed from the dependency set via this API do not result in
+ * ::cudaErrorStreamCaptureUnjoined if they are unreachable from the stream at
+ * ::cudaStreamEndCapture.
+ *
+ * Returns ::cudaErrorIllegalState if the stream is not capturing.
+ *
+ * This API is new in CUDA 11.3. Developers requiring compatibility across minor
+ * versions of the CUDA driver to 11.0 should not use this API or provide a fallback.
+ *
+ * \param stream          - The capturing stream to update
+ * \param dependencies    - The set of graph nodes to add to, or replace, the dependency set with
+ * \param numDependencies - The number of nodes in \p dependencies
+ * \param flags           - Either ::cudaStreamAddCaptureDependencies or ::cudaStreamSetCaptureDependencies (defaults to ::cudaStreamAddCaptureDependencies)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorIllegalState
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaStreamBeginCapture,
+ * ::cudaStreamGetCaptureInfo,
+ * ::cudaStreamGetCaptureInfo_v2
+ */
+extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t *dependencies, size_t numDependencies, unsigned int flags __dv(0));
+/** @} */ /* END CUDART_STREAM */
+
+/**
+ * \defgroup CUDART_EVENT Event Management
+ *
+ * ___MANBRIEF___ event management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the event management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates an event object
+ *
+ * Creates an event object for the current device using ::cudaEventDefault.
+ *
+ * \param event - Newly created event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*, unsigned int) "cudaEventCreate (C++ API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cuEventCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event);
+
+/**
+ * \brief Creates an event object with the specified flags
+ *
+ * Creates an event object for the current device with the specified flags. Valid
+ * flags include:
+ * - ::cudaEventDefault: Default event creation flag.
+ * - ::cudaEventBlockingSync: Specifies that event should use blocking
+ *   synchronization. A host thread that uses ::cudaEventSynchronize() to wait
+ *   on an event created with this flag will block until the event actually
+ *   completes.
+ * - ::cudaEventDisableTiming: Specifies that the created event does not need
+ *   to record timing data.  Events created with this flag specified and
+ *   the ::cudaEventBlockingSync flag not specified will provide the best
+ *   performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery().
+ * - ::cudaEventInterprocess: Specifies that the created event may be used as an
+ *   interprocess event by ::cudaIpcGetEventHandle(). ::cudaEventInterprocess must
+ *   be specified along with ::cudaEventDisableTiming.
+ *
+ * \param event - Newly created event
+ * \param flags - Flags for new event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cuEventCreate
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags);
+
+/**
+ * \brief Records an event
+ *
+ * Captures in \p event the contents of \p stream at the time of this call.
+ * \p event and \p stream must be on the same CUDA context.
+ * Calls such as ::cudaEventQuery() or ::cudaStreamWaitEvent() will then
+ * examine or wait for completion of the work that was captured. Uses of
+ * \p stream after this call do not modify \p event. See note on default
+ * stream behavior for what is captured in the default case.
+ *
+ * ::cudaEventRecord() can be called multiple times on the same event and
+ * will overwrite the previously captured state. Other APIs such as
+ * ::cudaStreamWaitEvent() use the most recently captured state at the time
+ * of the API call, and are not affected by later calls to
+ * ::cudaEventRecord(). Before the first call to ::cudaEventRecord(), an
+ * event represents an empty set of work, so for example ::cudaEventQuery()
+ * would return ::cudaSuccess.
+ *
+ * \param event  - Event to record
+ * \param stream - Stream in which to record event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cuEventRecord
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Records an event
+ *
+ * Captures in \p event the contents of \p stream at the time of this call.
+ * \p event and \p stream must be on the same CUDA context.
+ * Calls such as ::cudaEventQuery() or ::cudaStreamWaitEvent() will then
+ * examine or wait for completion of the work that was captured. Uses of
+ * \p stream after this call do not modify \p event. See note on default
+ * stream behavior for what is captured in the default case.
+ *
+ * ::cudaEventRecordWithFlags() can be called multiple times on the same event and
+ * will overwrite the previously captured state. Other APIs such as
+ * ::cudaStreamWaitEvent() use the most recently captured state at the time
+ * of the API call, and are not affected by later calls to
+ * ::cudaEventRecordWithFlags(). Before the first call to ::cudaEventRecordWithFlags(), an
+ * event represents an empty set of work, so for example ::cudaEventQuery()
+ * would return ::cudaSuccess.
+ *
+ * Valid flags include:
+ * - ::cudaEventRecordDefault: Default event creation flag.
+ * - ::cudaEventRecordExternal: Event is captured in the graph as an external
+ *   event node when performing stream capture.
+ *
+ * \param event  - Event to record
+ * \param stream - Stream in which to record event
+ * \param flags  - Parameters for the operation (see above)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent,
+ * ::cudaEventRecord,
+ * ::cuEventRecord
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream __dv(0), unsigned int flags __dv(0));
+#endif
+
+/**
+ * \brief Queries an event's status
+ *
+ * Queries the status of all work currently captured by \p event. See
+ * ::cudaEventRecord() for details on what is captured by an event.
+ *
+ * Returns ::cudaSuccess if all captured work has been completed, or
+ * ::cudaErrorNotReady if any captured work is incomplete.
+ *
+ * For the purposes of Unified Memory, a return value of ::cudaSuccess
+ * is equivalent to having called ::cudaEventSynchronize().
+ *
+ * \param event - Event to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventRecord,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cuEventQuery
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event);
+
+/**
+ * \brief Waits for an event to complete
+ *
+ * Waits until the completion of all work currently captured in \p event.
+ * See ::cudaEventRecord() for details on what is captured by an event.
+ *
+ * Waiting for an event that was created with the ::cudaEventBlockingSync
+ * flag will cause the calling CPU thread to block until the event has
+ * been completed by the device.  If the ::cudaEventBlockingSync flag has
+ * not been set, then the CPU thread will busy-wait until the event has
+ * been completed by the device.
+ *
+ * \param event - Event to wait for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventRecord,
+ * ::cudaEventQuery, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cuEventSynchronize
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event);
+
+/**
+ * \brief Destroys an event object
+ *
+ * Destroys the event specified by \p event.
+ *
+ * An event may be destroyed before it is complete (i.e., while
+ * ::cudaEventQuery() would return ::cudaErrorNotReady). In this case, the
+ * call does not block on completion of the event, and any associated
+ * resources will automatically be released asynchronously at completion.
+ *
+ * \param event - Event to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventRecord, ::cudaEventElapsedTime,
+ * ::cuEventDestroy
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event);
+
+/**
+ * \brief Computes the elapsed time between events
+ *
+ * Computes the elapsed time between two events (in milliseconds with a
+ * resolution of around 0.5 microseconds).
+ *
+ * If either event was last recorded in a non-NULL stream, the resulting time
+ * may be greater than expected (even if both used the same stream handle). This
+ * happens because the ::cudaEventRecord() operation takes place asynchronously
+ * and there is no guarantee that the measured latency is actually just between
+ * the two events. Any number of other different stream operations could execute
+ * in between the two measured events, thus altering the timing in a significant
+ * way.
+ *
+ * If ::cudaEventRecord() has not been called on either event, then
+ * ::cudaErrorInvalidResourceHandle is returned. If ::cudaEventRecord() has been
+ * called on both events but one or both of them has not yet been completed
+ * (that is, ::cudaEventQuery() would return ::cudaErrorNotReady on at least one
+ * of the events), ::cudaErrorNotReady is returned. If either event was created
+ * with the ::cudaEventDisableTiming flag, then this function will return
+ * ::cudaErrorInvalidResourceHandle.
+ *
+ * \param ms    - Time between \p start and \p end in ms
+ * \param start - Starting event
+ * \param end   - Ending event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventRecord,
+ * ::cuEventElapsedTime
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end);
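+
+/*
+ * A minimal host-side timing sketch using the event APIs above. It assumes an
+ * application-defined kernel `myKernel`, launch configuration `grid`/`block`,
+ * device pointer `d_data`, element count `n`, and stream `stream`; error
+ * checking is omitted for brevity.
+ *
+ * \code
+        cudaEvent_t start, stop;
+        cudaEventCreate(&start);
+        cudaEventCreate(&stop);
+
+        cudaEventRecord(start, stream);                  // capture work enqueued before the kernel
+        myKernel<<<grid, block, 0, stream>>>(d_data, n); // placeholder kernel launch
+        cudaEventRecord(stop, stream);                   // capture work enqueued after the kernel
+
+        cudaEventSynchronize(stop);                      // wait for the recorded work to finish
+        float ms = 0.0f;
+        cudaEventElapsedTime(&ms, start, stop);          // elapsed time in milliseconds
+
+        cudaEventDestroy(start);
+        cudaEventDestroy(stop);
+ * \endcode
+ */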
+
+/** @} */ /* END CUDART_EVENT */
+
+/**
+ * \defgroup CUDART_EXTRES_INTEROP External Resource Interoperability
+ *
+ * ___MANBRIEF___ External resource interoperability functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the external resource interoperability functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Imports an external memory object
+ *
+ * Imports an externally allocated memory object and returns
+ * a handle to that in \p extMem_out.
+ *
+ * The properties of the handle being imported must be described in
+ * \p memHandleDesc. The ::cudaExternalMemoryHandleDesc structure
+ * is defined as follows:
+ *
+ * \code
+        typedef struct cudaExternalMemoryHandleDesc_st {
+            cudaExternalMemoryHandleType type;
+            union {
+                int fd;
+                struct {
+                    void *handle;
+                    const void *name;
+                } win32;
+                const void *nvSciBufObject;
+            } handle;
+            unsigned long long size;
+            unsigned int flags;
+        } cudaExternalMemoryHandleDesc;
+ * \endcode
+ *
+ * where ::cudaExternalMemoryHandleDesc::type specifies the type
+ * of handle being imported. ::cudaExternalMemoryHandleType is
+ * defined as:
+ *
+ * \code
+        typedef enum cudaExternalMemoryHandleType_enum {
+            cudaExternalMemoryHandleTypeOpaqueFd         = 1,
+            cudaExternalMemoryHandleTypeOpaqueWin32      = 2,
+            cudaExternalMemoryHandleTypeOpaqueWin32Kmt   = 3,
+            cudaExternalMemoryHandleTypeD3D12Heap        = 4,
+            cudaExternalMemoryHandleTypeD3D12Resource    = 5,
+            cudaExternalMemoryHandleTypeD3D11Resource    = 6,
+            cudaExternalMemoryHandleTypeD3D11ResourceKmt = 7,
+            cudaExternalMemoryHandleTypeNvSciBuf         = 8
+        } cudaExternalMemoryHandleType;
+ * \endcode
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeOpaqueFd, then
+ * ::cudaExternalMemoryHandleDesc::handle::fd must be a valid
+ * file descriptor referencing a memory object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeOpaqueWin32, then exactly one
+ * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a memory object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a memory object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt, then
+ * ::cudaExternalMemoryHandleDesc::handle::win32::handle must
+ * be non-NULL and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * must be NULL. The handle specified must be a globally shared KMT
+ * handle. This handle does not hold a reference to the underlying
+ * object, and thus will be invalid when all references to the
+ * memory object are destroyed.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeD3D12Heap, then exactly one
+ * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Heap object. This handle holds a reference to the underlying
+ * object. If ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D12Heap object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeD3D12Resource, then exactly one
+ * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Resource object. This handle holds a reference to the
+ * underlying object. If
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D12Resource object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeD3D11Resource, then exactly one
+ * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle is
+ * not NULL, then it must represent a valid shared NT handle that is
+ * returned by IDXGIResource1::CreateSharedHandle when referring to a
+ * ID3D11Resource object. If
+ * ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * is not NULL, then it must point to a NULL-terminated array of
+ * UTF-16 characters that refers to a ID3D11Resource object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeD3D11ResourceKmt, then
+ * ::cudaExternalMemoryHandleDesc::handle::win32::handle must
+ * be non-NULL and ::cudaExternalMemoryHandleDesc::handle::win32::name
+ * must be NULL. The handle specified must be a valid shared KMT
+ * handle that is returned by IDXGIResource::GetSharedHandle when
+ * referring to a ID3D11Resource object.
+ *
+ * If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeNvSciBuf, then
+ * ::cudaExternalMemoryHandleDesc::handle::nvSciBufObject must be NON-NULL
+ * and reference a valid NvSciBuf object.
+ * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the
+ * application must use ::cudaWaitExternalSemaphoresAsync or ::cudaSignalExternalSemaphoresAsync
+ * as appropriate barriers to maintain coherence between CUDA and the other drivers.
+ * See ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync and ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync 
+ * for memory synchronization.
+ *
+ * The size of the memory object must be specified in
+ * ::cudaExternalMemoryHandleDesc::size.
+ *
+ * Specifying the flag ::cudaExternalMemoryDedicated in
+ * ::cudaExternalMemoryHandleDesc::flags indicates that the
+ * resource is a dedicated resource. What constitutes a dedicated
+ * resource is outside the scope of this extension.
+ * This flag must be set if ::cudaExternalMemoryHandleDesc::type
+ * is one of the following:
+ * ::cudaExternalMemoryHandleTypeD3D12Resource
+ * ::cudaExternalMemoryHandleTypeD3D11Resource
+ * ::cudaExternalMemoryHandleTypeD3D11ResourceKmt
+ *
+ * \param extMem_out    - Returned handle to an external memory object
+ * \param memHandleDesc - Memory import handle descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the
+ * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges
+ * as well as appropriate Vulkan pipeline barriers to maintain coherence between
+ * CPU and GPU. For more information on these APIs, please refer to "Synchronization
+ * and Cache Control" chapter from Vulkan specification.
+ *
+ *
+ * \sa ::cudaDestroyExternalMemory,
+ * ::cudaExternalMemoryGetMappedBuffer,
+ * ::cudaExternalMemoryGetMappedMipmappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory(cudaExternalMemory_t *extMem_out, const struct cudaExternalMemoryHandleDesc *memHandleDesc);
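+
+/*
+ * A minimal import sketch for an opaque POSIX file descriptor, assuming `fd`
+ * and `size` describe an allocation exported by another API (for example a
+ * Vulkan device-memory export); error checking is omitted.
+ *
+ * \code
+        cudaExternalMemory_t extMem = NULL;
+        struct cudaExternalMemoryHandleDesc memDesc;
+        memset(&memDesc, 0, sizeof(memDesc));            // requires <string.h>
+        memDesc.type      = cudaExternalMemoryHandleTypeOpaqueFd;
+        memDesc.handle.fd = fd;                          // ownership passes to CUDA on success
+        memDesc.size      = size;
+        cudaImportExternalMemory(&extMem, &memDesc);
+ * \endcode
+ */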
+
+/**
+ * \brief Maps a buffer onto an imported memory object
+ *
+ * Maps a buffer onto an imported memory object and returns a device
+ * pointer in \p devPtr.
+ *
+ * The properties of the buffer being mapped must be described in
+ * \p bufferDesc. The ::cudaExternalMemoryBufferDesc structure is
+ * defined as follows:
+ *
+ * \code
+        typedef struct cudaExternalMemoryBufferDesc_st {
+            unsigned long long offset;
+            unsigned long long size;
+            unsigned int flags;
+        } cudaExternalMemoryBufferDesc;
+ * \endcode
+ *
+ * where ::cudaExternalMemoryBufferDesc::offset is the offset in
+ * the memory object where the buffer's base address is.
+ * ::cudaExternalMemoryBufferDesc::size is the size of the buffer.
+ * ::cudaExternalMemoryBufferDesc::flags must be zero.
+ *
+ * The offset and size have to be suitably aligned to match the
+ * requirements of the external API. Mapping two buffers whose ranges
+ * overlap may or may not result in the same virtual address being
+ * returned for the overlapped portion. In such cases, the application
+ * must ensure that all accesses to that region from the GPU are
+ * volatile. Otherwise writes made via one address are not guaranteed
+ * to be visible via the other address, even if they're issued by the
+ * same thread. It is recommended that applications map the combined
+ * range instead of mapping separate buffers and then apply the
+ * appropriate offsets to the returned pointer to derive the
+ * individual buffers.
+ *
+ * The returned pointer \p devPtr must be freed using ::cudaFree.
+ *
+ * \param devPtr     - Returned device pointer to buffer
+ * \param extMem     - Handle to external memory object
+ * \param bufferDesc - Buffer descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaImportExternalMemory,
+ * ::cudaDestroyExternalMemory,
+ * ::cudaExternalMemoryGetMappedMipmappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer(void **devPtr, cudaExternalMemory_t extMem, const struct cudaExternalMemoryBufferDesc *bufferDesc);
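+
+/*
+ * A mapping sketch continuing the import example above: `extMem` is the handle
+ * returned by ::cudaImportExternalMemory and `size` is the size of the
+ * imported allocation.
+ *
+ * \code
+        void *devPtr = NULL;
+        struct cudaExternalMemoryBufferDesc bufDesc;
+        memset(&bufDesc, 0, sizeof(bufDesc));
+        bufDesc.offset = 0;                              // map from the start of the allocation
+        bufDesc.size   = size;
+        bufDesc.flags  = 0;
+        cudaExternalMemoryGetMappedBuffer(&devPtr, extMem, &bufDesc);
+        // ... use devPtr in kernels or copies ...
+        cudaFree(devPtr);                                // mapped buffers are freed with cudaFree
+ * \endcode
+ */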
+
+/**
+ * \brief Maps a CUDA mipmapped array onto an external memory object
+ *
+ * Maps a CUDA mipmapped array onto an external object and returns a
+ * handle to it in \p mipmap.
+ *
+ * The properties of the CUDA mipmapped array being mapped must be
+ * described in \p mipmapDesc. The structure
+ * ::cudaExternalMemoryMipmappedArrayDesc is defined as follows:
+ *
+ * \code
+        typedef struct cudaExternalMemoryMipmappedArrayDesc_st {
+            unsigned long long offset;
+            cudaChannelFormatDesc formatDesc;
+            cudaExtent extent;
+            unsigned int flags;
+            unsigned int numLevels;
+        } cudaExternalMemoryMipmappedArrayDesc;
+ * \endcode
+ *
+ * where ::cudaExternalMemoryMipmappedArrayDesc::offset is the
+ * offset in the memory object where the base level of the mipmap
+ * chain is.
+ * ::cudaExternalMemoryMipmappedArrayDesc::formatDesc describes the
+ * format of the data.
+ * ::cudaExternalMemoryMipmappedArrayDesc::extent specifies the
+ * dimensions of the base level of the mipmap chain.
+ * ::cudaExternalMemoryMipmappedArrayDesc::flags are flags associated
+ * with CUDA mipmapped arrays. For further details, please refer to
+ * the documentation for ::cudaMalloc3DArray. Note that if the mipmapped
+ * array is bound as a color target in the graphics API, then the flag
+ * ::cudaArrayColorAttachment must be specified in 
+ * ::cudaExternalMemoryMipmappedArrayDesc::flags.
+ * ::cudaExternalMemoryMipmappedArrayDesc::numLevels specifies
+ * the total number of levels in the mipmap chain.
+ *
+ * The returned CUDA mipmapped array must be freed using ::cudaFreeMipmappedArray.
+ *
+ * \param mipmap     - Returned CUDA mipmapped array
+ * \param extMem     - Handle to external memory object
+ * \param mipmapDesc - CUDA array descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note On Tegra devices, this API will always attempt to do a compressed mapping when the ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeOpaqueFd.
+ *
+ * \sa ::cudaImportExternalMemory,
+ * ::cudaDestroyExternalMemory,
+ * ::cudaExternalMemoryGetMappedBuffer
+ *
+ * \note If ::cudaExternalMemoryHandleDesc::type is
+ * ::cudaExternalMemoryHandleTypeNvSciBuf, then
+ * ::cudaExternalMemoryMipmappedArrayDesc::numLevels must not be greater than 1.
+ */
+extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray(cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem, const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc);
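+
+/*
+ * A mapping sketch for a single-level RGBA8 2D image of `width` x `height`
+ * texels at offset 0 of the imported object `extMem`. The channel format and
+ * extent here are assumptions and must match how the image was created by the
+ * exporting API.
+ *
+ * \code
+        struct cudaExternalMemoryMipmappedArrayDesc mmDesc;
+        memset(&mmDesc, 0, sizeof(mmDesc));
+        mmDesc.offset     = 0;
+        mmDesc.formatDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
+        mmDesc.extent     = make_cudaExtent(width, height, 0);
+        mmDesc.flags      = 0;
+        mmDesc.numLevels  = 1;
+
+        cudaMipmappedArray_t mipmap = NULL;
+        cudaExternalMemoryGetMappedMipmappedArray(&mipmap, extMem, &mmDesc);
+        // ... create surface/texture objects from the array levels ...
+        cudaFreeMipmappedArray(mipmap);
+ * \endcode
+ */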
+
+/**
+ * \brief Destroys an external memory object.
+ *
+ * Destroys the specified external memory object. Any existing buffers
+ * and CUDA mipmapped arrays mapped onto this object must no longer be
+ * used and must be explicitly freed using ::cudaFree and
+ * ::cudaFreeMipmappedArray respectively.
+ *
+ * \param extMem - External memory object to be destroyed
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa ::cudaImportExternalMemory,
+ * ::cudaExternalMemoryGetMappedBuffer,
+ * ::cudaExternalMemoryGetMappedMipmappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroyExternalMemory(cudaExternalMemory_t extMem);
+
+/**
+ * \brief Imports an external semaphore
+ *
+ * Imports an externally allocated synchronization object and returns
+ * a handle to that in \p extSem_out.
+ *
+ * The properties of the handle being imported must be described in
+ * \p semHandleDesc. The ::cudaExternalSemaphoreHandleDesc is defined
+ * as follows:
+ *
+ * \code
+        typedef struct cudaExternalSemaphoreHandleDesc_st {
+            cudaExternalSemaphoreHandleType type;
+            union {
+                int fd;
+                struct {
+                    void *handle;
+                    const void *name;
+                } win32;
+                const void* NvSciSyncObj;
+            } handle;
+            unsigned int flags;
+        } cudaExternalSemaphoreHandleDesc;
+ * \endcode
+ *
+ * where ::cudaExternalSemaphoreHandleDesc::type specifies the type of
+ * handle being imported. ::cudaExternalSemaphoreHandleType is defined
+ * as:
+ *
+ * \code
+        typedef enum cudaExternalSemaphoreHandleType_enum {
+            cudaExternalSemaphoreHandleTypeOpaqueFd                = 1,
+            cudaExternalSemaphoreHandleTypeOpaqueWin32             = 2,
+            cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt          = 3,
+            cudaExternalSemaphoreHandleTypeD3D12Fence              = 4,
+            cudaExternalSemaphoreHandleTypeD3D11Fence              = 5,
+            cudaExternalSemaphoreHandleTypeNvSciSync               = 6,
+            cudaExternalSemaphoreHandleTypeKeyedMutex              = 7,
+            cudaExternalSemaphoreHandleTypeKeyedMutexKmt           = 8,
+            cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd     = 9,
+            cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32  = 10
+        } cudaExternalSemaphoreHandleType;
+ * \endcode
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeOpaqueFd, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::fd must be a valid file
+ * descriptor referencing a synchronization object. Ownership of the
+ * file descriptor is transferred to the CUDA driver when the handle
+ * is imported successfully. Performing any operations on the file
+ * descriptor after it is imported results in undefined behavior.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::cudaExternalSemaphoreHandleDesc::handle::win32::name is
+ * not NULL, then it must name a valid synchronization object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle must be
+ * non-NULL and ::cudaExternalSemaphoreHandleDesc::handle::win32::name
+ * must be NULL. The handle specified must be a globally shared KMT
+ * handle. This handle does not hold a reference to the underlying
+ * object, and thus will be invalid when all references to the
+ * synchronization object are destroyed.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeD3D12Fence, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Fence object. This handle holds a reference to the underlying
+ * object. If ::cudaExternalSemaphoreHandleDesc::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D12Fence object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeD3D11Fence, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D11Fence::CreateSharedHandle. If 
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D11Fence object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeNvSciSync, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::nvSciSyncObj
+ * represents a valid NvSciSyncObj.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutex, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by IDXGIResource1::CreateSharedHandle when referring to
+ * a IDXGIKeyedMutex object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle must be
+ * non-NULL and ::cudaExternalSemaphoreHandleDesc::handle::win32::name
+ * must be NULL. The handle specified must represent a valid KMT
+ * handle that is returned by IDXGIResource::GetSharedHandle when
+ * referring to a IDXGIKeyedMutex object.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd, then
+ * ::cudaExternalSemaphoreHandleDesc::handle::fd must be a valid file
+ * descriptor referencing a synchronization object. Ownership of the
+ * file descriptor is transferred to the CUDA driver when the handle
+ * is imported successfully. Performing any operations on the file
+ * descriptor after it is imported results in undefined behavior.
+ *
+ * If ::cudaExternalSemaphoreHandleDesc::type is
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32, then exactly one of
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and
+ * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be
+ * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object. Ownership of this handle is
+ * not transferred to CUDA after the import operation, so the
+ * application must release the handle using the appropriate system
+ * call. If ::cudaExternalSemaphoreHandleDesc::handle::win32::name is
+ * not NULL, then it must name a valid synchronization object.
+ *
+ * \param extSem_out    - Returned handle to an external semaphore
+ * \param semHandleDesc - Semaphore import handle descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDestroyExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore(cudaExternalSemaphore_t *extSem_out, const struct cudaExternalSemaphoreHandleDesc *semHandleDesc);
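+
+/*
+ * An import sketch for an opaque POSIX file descriptor semaphore, assuming
+ * `semFd` was exported by another API (for example a Vulkan binary semaphore
+ * export); error checking is omitted.
+ *
+ * \code
+        cudaExternalSemaphore_t extSem = NULL;
+        struct cudaExternalSemaphoreHandleDesc semDesc;
+        memset(&semDesc, 0, sizeof(semDesc));
+        semDesc.type      = cudaExternalSemaphoreHandleTypeOpaqueFd;
+        semDesc.handle.fd = semFd;                       // ownership passes to CUDA on success
+        semDesc.flags     = 0;
+        cudaImportExternalSemaphore(&extSem, &semDesc);
+ * \endcode
+ */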
+
+/**
+ * \brief Signals a set of external semaphore objects
+ *
+ * Enqueues a signal operation on a set of externally allocated
+ * semaphore object in the specified stream. The operations will be
+ * executed when all prior operations in the stream complete.
+ *
+ * The exact semantics of signaling a semaphore depends on the type of
+ * the object.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeOpaqueFd,
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32,
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt
+ * then signaling the semaphore will set it to the signaled state.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeD3D12Fence,
+ * ::cudaExternalSemaphoreHandleTypeD3D11Fence,
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd,
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32
+ * then the semaphore will be set to the value specified in
+ * ::cudaExternalSemaphoreSignalParams::params::fence::value.
+ *
+ * If the semaphore object is of the type ::cudaExternalSemaphoreHandleTypeNvSciSync
+ * this API sets ::cudaExternalSemaphoreSignalParams::params::nvSciSync::fence to a
+ * value that can be used by subsequent waiters of the same NvSciSync object to
+ * order operations with those currently submitted in \p stream. Such an update
+ * will overwrite previous contents of
+ * ::cudaExternalSemaphoreSignalParams::params::nvSciSync::fence. By default,
+ * signaling such an external semaphore object causes appropriate memory synchronization
+ * operations to be performed over all the external memory objects that are imported as
+ * ::cudaExternalMemoryHandleTypeNvSciBuf. This ensures that any subsequent accesses
+ * made by other importers of the same set of NvSciBuf memory object(s) are coherent.
+ * These operations can be skipped by specifying the flag
+ * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for a semaphore object of the type ::cudaExternalSemaphoreHandleTypeNvSciSync,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cudaDeviceGetNvSciSyncAttributes to cudaNvSciSyncAttrSignal, this API will return
+ * cudaErrorNotSupported.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutex,
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt,
+ * then the keyed mutex will be released with the key specified in
+ * ::cudaExternalSemaphoreSignalParams::params::keyedmutex::key.
+ *
+ * \param extSemArray - Set of external semaphores to be signaled
+ * \param paramsArray - Array of semaphore parameters
+ * \param numExtSems  - Number of semaphores to signal
+ * \param stream     - Stream to enqueue the signal operations in
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaImportExternalSemaphore,
+ * ::cudaDestroyExternalSemaphore,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
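+
+/*
+ * A signal sketch for one imported semaphore `extSem` in stream `stream`. The
+ * fence value is only consulted for fence/timeline semaphore types and is
+ * ignored for binary (opaque) semaphores.
+ *
+ * \code
+        struct cudaExternalSemaphoreSignalParams sigParams;
+        memset(&sigParams, 0, sizeof(sigParams));
+        sigParams.params.fence.value = 1;                // target value for fence-type semaphores
+        sigParams.flags = 0;
+        cudaSignalExternalSemaphoresAsync(&extSem, &sigParams, 1, stream);
+ * \endcode
+ */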
+
+/**
+ * \brief Waits on a set of external semaphore objects
+ *
+ * Enqueues a wait operation on a set of externally allocated
+ * semaphore object in the specified stream. The operations will be
+ * executed when all prior operations in the stream complete.
+ *
+ * The exact semantics of waiting on a semaphore depends on the type
+ * of the object.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeOpaqueFd,
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32,
+ * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt
+ * then waiting on the semaphore will wait until the semaphore reaches
+ * the signaled state. The semaphore will then be reset to the
+ * unsignaled state. Therefore, for every signal operation, there can
+ * only be one wait operation.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeD3D12Fence,
+ * ::cudaExternalSemaphoreHandleTypeD3D11Fence,
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd,
+ * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32
+ * then waiting on the semaphore will wait until the value of the
+ * semaphore is greater than or equal to
+ * ::cudaExternalSemaphoreWaitParams::params::fence::value.
+ *
+ * If the semaphore object is of the type ::cudaExternalSemaphoreHandleTypeNvSciSync
+ * then, waiting on the semaphore will wait until the
+ * ::cudaExternalSemaphoreWaitParams::params::nvSciSync::fence is signaled by the
+ * signaler of the NvSciSyncObj that was associated with this semaphore object.
+ * By default, waiting on such an external semaphore object causes appropriate
+ * memory synchronization operations to be performed over all external memory objects
+ * that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf. This ensures that
+ * any subsequent accesses made by other importers of the same set of NvSciBuf memory
+ * object(s) are coherent. These operations can be skipped by specifying the flag
+ * ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for a semaphore object of the type ::cudaExternalSemaphoreHandleTypeNvSciSync,
+ * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in
+ * ::cudaDeviceGetNvSciSyncAttributes to cudaNvSciSyncAttrWait, this API will return
+ * cudaErrorNotSupported.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutex,
+ * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt,
+ * then the keyed mutex will be acquired when it is released with the key specified
+ * in ::cudaExternalSemaphoreWaitParams::params::keyedmutex::key or
+ * until the timeout specified by
+ * ::cudaExternalSemaphoreWaitParams::params::keyedmutex::timeoutMs
+ * has lapsed. The timeout interval can either be a finite value
+ * specified in milliseconds or an infinite value. In case an infinite
+ * value is specified, the timeout never elapses. The Windows INFINITE
+ * macro must be used to specify an infinite timeout.
+ *
+ * \param extSemArray - External semaphores to be waited on
+ * \param paramsArray - Array of semaphore parameters
+ * \param numExtSems  - Number of semaphores to wait on
+ * \param stream      - Stream to enqueue the wait operations in
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorTimeout
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaImportExternalSemaphore,
+ * ::cudaDestroyExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
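+
+/*
+ * A wait sketch matching the signal example above: wait in `stream` until the
+ * imported semaphore `extSem` is signaled (or, for fence/timeline types, until
+ * its value reaches 1).
+ *
+ * \code
+        struct cudaExternalSemaphoreWaitParams waitParams;
+        memset(&waitParams, 0, sizeof(waitParams));
+        waitParams.params.fence.value = 1;               // threshold for fence-type semaphores
+        waitParams.flags = 0;
+        cudaWaitExternalSemaphoresAsync(&extSem, &waitParams, 1, stream);
+ * \endcode
+ */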
+
+/**
+ * \brief Destroys an external semaphore
+ *
+ * Destroys an external semaphore object and releases any references
+ * to the underlying resource. Any outstanding signals or waits must
+ * have completed before the semaphore is destroyed.
+ *
+ * \param extSem - External semaphore to be destroyed
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem);
+
+/** @} */ /* END CUDART_EXTRES_INTEROP */
+
+/**
+ * \defgroup CUDART_EXECUTION Execution Control
+ *
+ * ___MANBRIEF___ execution control functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the execution control functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Launches a device function
+ *
+ * The function invokes kernel \p func on \p gridDim (\p gridDim.x &times; \p gridDim.y
+ * &times; \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x &times;
+ * \p blockDim.y &times; \p blockDim.z) threads.
+ *
+ * If the kernel has N parameters, then \p args should point to an array of N pointers.
+ * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * For templated functions, pass the function symbol as follows:
+ * func_name<template_arg_0,...,template_arg_N>
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
+ *
+ * \p stream specifies a stream the invocation is associated to.
+ *
+ * \param func        - Device function symbol
+ * \param gridDim     - Grid dimensions
+ * \param blockDim    - Block dimensions
+ * \param args        - Arguments
+ * \param sharedMem   - Shared memory
+ * \param stream      - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * ::cuLaunchKernel
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
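+
+/*
+ * A launch sketch showing how the \p args array is built: each entry holds the
+ * address of the corresponding kernel argument. The kernel `axpy` and its
+ * device pointers are placeholders; error checking is omitted.
+ *
+ * \code
+        __global__ void axpy(float a, float *x, float *y, int n);
+
+        void launch_axpy(float a, float *d_x, float *d_y, int n, cudaStream_t stream)
+        {
+            dim3 block(256);
+            dim3 grid((n + block.x - 1) / block.x);
+            void *args[] = { &a, &d_x, &d_y, &n };       // addresses of the actual arguments
+            cudaLaunchKernel((const void *)axpy, grid, block, args, 0, stream);
+        }
+ * \endcode
+ */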
+
+/**
+ * \brief Launches a CUDA function with launch-time configuration
+ *
+ * Note that the functionally equivalent variadic template ::cudaLaunchKernelEx
+ * is available for C++11 and newer.
+ *
+ * Invokes the kernel \p func on \p config->gridDim (\p config->gridDim.x
+ * &times; \p config->gridDim.y &times; \p config->gridDim.z) grid of blocks.
+ * Each block contains \p config->blockDim (\p config->blockDim.x &times;
+ * \p config->blockDim.y &times; \p config->blockDim.z) threads.
+ *
+ * \p config->dynamicSmemBytes sets the amount of dynamic shared memory that
+ * will be available to each thread block.
+ *
+ * \p config->stream specifies a stream the invocation is associated to.
+ *
+ * Configuration beyond grid and block dimensions, dynamic shared memory size,
+ * and stream can be provided with the following two fields of \p config:
+ *
+ * \p config->attrs is an array of \p config->numAttrs contiguous
+ * ::cudaLaunchAttribute elements. The value of this pointer is not considered
+ * if \p config->numAttrs is zero. However, in that case, it is recommended to
+ * set the pointer to NULL.
+ * \p config->numAttrs is the number of attributes populating the first
+ * \p config->numAttrs positions of the \p config->attrs array.
+ *
+ * If the kernel has N parameters, then \p args should point to an array of N
+ * pointers. Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points
+ * to the region of memory from which the actual parameter will be copied.
+ *
+ * N.B. This function is so named to avoid unintentionally invoking the
+ *      templated version, \p cudaLaunchKernelEx, for kernels taking a single
+ *      void** or void* parameter.
+ *
+ * \param config - Launch configuration
+ * \param func   - Kernel to launch
+ * \param args   - Array of pointers to kernel parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchKernelEx(const cudaLaunchConfig_t *config, void (*kernel)(ExpTypes...), ActTypes &&... args) "cudaLaunchKernelEx (C++ API)",
+ * ::cuLaunchKernelEx
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args);
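+
+/*
+ * A configuration sketch with no extra launch attributes; `axpy`, `grid`,
+ * `block`, `stream` and the argument variables are the same placeholders used
+ * in the ::cudaLaunchKernel example above.
+ *
+ * \code
+        cudaLaunchConfig_t config;
+        memset(&config, 0, sizeof(config));
+        config.gridDim          = grid;
+        config.blockDim         = block;
+        config.dynamicSmemBytes = 0;
+        config.stream           = stream;
+        config.attrs            = NULL;                  // no additional launch attributes
+        config.numAttrs         = 0;
+
+        void *args[] = { &a, &d_x, &d_y, &n };
+        cudaLaunchKernelExC(&config, (const void *)axpy, args);
+ * \endcode
+ */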
+
+/**
+ * \brief Launches a device function where thread blocks can cooperate and synchronize as they execute
+ *
+ * The function invokes kernel \p func on \p gridDim (\p gridDim.x &times; \p gridDim.y
+ * &times; \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x &times;
+ * \p blockDim.y &times; \p blockDim.z) threads.
+ *
+ * The device on which this kernel is invoked must have a non-zero value for
+ * the device attribute ::cudaDevAttrCooperativeLaunch.
+ *
+ * The total number of blocks launched cannot exceed the maximum number of blocks per
+ * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * If the kernel has N parameters, then \p args should point to an array of N pointers.
+ * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, points to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * For templated functions, pass the function symbol as follows:
+ * func_name<template_arg_0,...,template_arg_N>
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
+ *
+ * \p stream specifies a stream the invocation is associated to.
+ *
+ * \param func        - Device function symbol
+ * \param gridDim     - Grid dimensions
+ * \param blockDim    - Block dimensions
+ * \param args        - Arguments
+ * \param sharedMem   - Shared memory
+ * \param stream      - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorCooperativeLaunchTooLarge,
+ * ::cudaErrorSharedObjectInitFailed
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchCooperativeKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C++ API)",
+ * ::cudaLaunchCooperativeKernelMultiDevice,
+ * ::cuLaunchCooperativeKernel
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
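+
+/*
+ * A sizing sketch that keeps the grid within the co-residency limit described
+ * above before launching a cooperative kernel. `coopKernel`, `d_data`, `n` and
+ * `stream` are placeholders; error checking is omitted.
+ *
+ * \code
+        int device = 0, numSms = 0, blocksPerSm = 0;
+        cudaGetDevice(&device);
+        cudaDeviceGetAttribute(&numSms, cudaDevAttrMultiProcessorCount, device);
+        cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm,
+                                                      (const void *)coopKernel,
+                                                      256 /* block size */, 0);
+
+        dim3 block(256);
+        dim3 grid(blocksPerSm * numSms);                 // all blocks can be resident at once
+        void *args[] = { &d_data, &n };
+        cudaLaunchCooperativeKernel((const void *)coopKernel, grid, block, args, 0, stream);
+ * \endcode
+ */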
+
+/**
+ * \brief Launches device functions on multiple devices where thread blocks can cooperate and synchronize as they execute
+ *
+ * \deprecated This function is deprecated as of CUDA 11.3.
+ *
+ * Invokes kernels as specified in the \p launchParamsList array where each element
+ * of the array specifies all the parameters required to perform a single kernel launch.
+ * These kernels can cooperate and synchronize as they execute. The size of the array is
+ * specified by \p numDevices.
+ *
+ * No two kernels can be launched on the same device. All the devices targeted by this
+ * multi-device launch must be identical. All devices must have a non-zero value for the
+ * device attribute ::cudaDevAttrCooperativeMultiDeviceLaunch.
+ *
+ * The same kernel must be launched on all devices. Note that any __device__ or __constant__
+ * variables are independently instantiated on every device. It is the application's
+ * responsibility to ensure these variables are initialized and used appropriately.
+ *
+ * The size of the grids as specified in blocks, the size of the blocks themselves and the
+ * amount of shared memory used by each thread block must also match across all launched kernels.
+ *
+ * The streams used to launch these kernels must have been created via either ::cudaStreamCreate
+ * or ::cudaStreamCreateWithFlags or ::cudaStreamCreateWithPriority. The NULL stream or
+ * ::cudaStreamLegacy or ::cudaStreamPerThread cannot be used.
+ *
+ * The total number of blocks launched per kernel cannot exceed the maximum number of blocks
+ * per multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. Since the
+ * total number of blocks launched per device has to match across all devices, the maximum
+ * number of blocks that can be launched per device will be limited by the device with the
+ * least number of multiprocessors.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * The ::cudaLaunchParams structure is defined as:
+ * \code
+        struct cudaLaunchParams
+        {
+            void *func;
+            dim3 gridDim;
+            dim3 blockDim;
+            void **args;
+            size_t sharedMem;
+            cudaStream_t stream;
+        };
+ * \endcode
+ * where:
+ * - ::cudaLaunchParams::func specifies the kernel to be launched. The same function must
+ *   be launched on all devices. For templated functions, pass the function symbol as follows:
+ *   func_name<template_arg_0,...,template_arg_N>
+ * - ::cudaLaunchParams::gridDim specifies the width, height and depth of the grid in blocks.
+ *   This must match across all kernels launched.
+ * - ::cudaLaunchParams::blockDim is the width, height and depth of each thread block. This
+ *   must match across all kernels launched.
+ * - ::cudaLaunchParams::args specifies the arguments to the kernel. If the kernel has
+ *   N parameters, then ::cudaLaunchParams::args should point to an array of N pointers. Each
+ *   pointer, from <tt>::cudaLaunchParams::args[0]</tt> to <tt>::cudaLaunchParams::args[N - 1]</tt>,
+ *   points to the region of memory from which the actual parameter will be copied.
+ * - ::cudaLaunchParams::sharedMem is the dynamic shared-memory size per thread block in bytes.
+ *   This must match across all kernels launched.
+ * - ::cudaLaunchParams::stream is the handle to the stream to perform the launch in. This cannot
+ *   be the NULL stream or ::cudaStreamLegacy or ::cudaStreamPerThread.
+ *
+ * By default, the kernel won't begin execution on any GPU until all prior work in all the specified
+ * streams has completed. This behavior can be overridden by specifying the flag
+ * ::cudaCooperativeLaunchMultiDeviceNoPreSync. When this flag is specified, each kernel
+ * will only wait for prior work in the stream corresponding to that GPU to complete before it begins
+ * execution.
+ *
+ * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin
+ * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying
+ * the flag ::cudaCooperativeLaunchMultiDeviceNoPostSync. When this flag is specified,
+ * any subsequent work pushed in any of the specified streams will only wait for the kernel launched
+ * on the GPU corresponding to that stream to complete before it begins execution.
+ *
+ * \param launchParamsList - List of launch parameters, one per device
+ * \param numDevices       - Size of the \p launchParamsList array
+ * \param flags            - Flags to control launch behavior
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorCooperativeLaunchTooLarge,
+ * ::cudaErrorSharedObjectInitFailed
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchCooperativeKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C++ API)",
+ * ::cudaLaunchCooperativeKernel,
+ * ::cuLaunchCooperativeKernelMultiDevice
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags  __dv(0));
+
+/**
+ * \brief Sets the preferred cache configuration for a device function
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache configuration
+ * for the function specified via \p func. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute \p func.
+ *
+ * \p func is a device function symbol and must be declared as a
+ * \c __global__ function. If the specified function does not exist,
+ * then ::cudaErrorInvalidDeviceFunction is returned. For templated functions,
+ * pass the function symbol as follows: func_name<template_arg_0,...,template_arg_N>
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory
+ *
+ * \param func        - Device function symbol
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_string_api_deprecation2
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa 
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
+ * ::cudaThreadGetCacheConfig,
+ * ::cudaThreadSetCacheConfig,
+ * ::cuFuncSetCacheConfig
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig);
+
+/**
+ * \brief Sets the shared memory configuration for a device function
+ *
+ * On devices with configurable shared memory banks, this function will 
+ * force all subsequent launches of the specified device function to have
+ * the given shared memory bank size configuration. On any given launch of the
+ * function, the shared memory configuration of the device will be temporarily
+ * changed if needed to suit the function's preferred configuration. Changes in
+ * shared memory configuration between subsequent launches of functions
+ * may introduce a device-side synchronization point.
+ *
+ * Any per-function setting of shared memory bank size set via 
+ * ::cudaFuncSetSharedMemConfig will override the device wide setting set by
+ * ::cudaDeviceSetSharedMemConfig.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance. 
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank 
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * For templated functions, pass the function symbol as follows:
+ * func_name<template_arg_0,...,template_arg_N>
+ *
+ * The supported bank configurations are:
+ * - ::cudaSharedMemBankSizeDefault: use the device's shared memory configuration
+ *   when launching this function.
+ * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be 
+ *   four bytes natively when launching this function.
+ * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight 
+ *   bytes natively when launching this function.
+ *
+ * \param func   - Device function symbol
+ * \param config - Requested shared memory configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_string_api_deprecation2
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceSetSharedMemConfig,
+ * ::cudaDeviceGetSharedMemConfig,
+ * ::cudaDeviceSetCacheConfig,
+ * ::cudaDeviceGetCacheConfig,
+ * ::cudaFuncSetCacheConfig,
+ * ::cuFuncSetSharedMemConfig
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config);
+
+/**
+ * \brief Find out attributes for a given function
+ *
+ * This function obtains the attributes of a function specified via \p func.
+ * \p func is a device function symbol and must be declared as a
+ * \c __global__ function. The fetched attributes are placed in \p attr.
+ * If the specified function does not exist, then
+ * ::cudaErrorInvalidDeviceFunction is returned. For templated functions, pass
+ * the function symbol as follows: func_name<template_arg_0,...,template_arg_N>
+ *
+ * Note that some function attributes such as
+ * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock"
+ * may vary based on the device that is currently being used.
+ *
+ * \param attr - Return pointer to function's attributes
+ * \param func - Device function symbol
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_string_api_deprecation2
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa 
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
+ * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)",
+ * ::cuFuncGetAttribute
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func);
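+
+/*
+ * A minimal usage sketch for cudaFuncGetAttributes, assuming the placeholder
+ * kernel scaleKernel from the sketch above and <cstdio> for printf:
+ *
+ * \code
+ *   struct cudaFuncAttributes attr;
+ *   if (cudaFuncGetAttributes(&attr, (const void *)scaleKernel) == cudaSuccess) {
+ *       printf("maxThreadsPerBlock=%d numRegs=%d staticSmem=%zu\n",
+ *              attr.maxThreadsPerBlock, attr.numRegs, attr.sharedSizeBytes);
+ *   }
+ * \endcode
+ */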
+
+
+/**
+ * \brief Set attributes for a given function
+ *
+ * This function sets the attributes of a function specified via \p func.
+ * The parameter \p func must be a pointer to a function that executes
+ * on the device. The parameter specified by \p func must be declared as a \p __global__
+ * function. The enumeration defined by \p attr is set to the value defined by \p value.
+ * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned.
+ * If the specified attribute cannot be written, or if the value is incorrect, 
+ * then ::cudaErrorInvalidValue is returned.
+ *
+ * Valid values for \p attr are:
+ * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - The requested maximum size in bytes of dynamically-allocated shared memory. The sum of this value and the function attribute ::sharedSizeBytes
+ *   cannot exceed the device attribute ::cudaDevAttrMaxSharedMemoryPerBlockOptin. The maximal size of requestable dynamic shared memory may differ by GPU architecture.
+ * - ::cudaFuncAttributePreferredSharedMemoryCarveout - On devices where the L1 cache and shared memory use the same hardware resources, 
+ *   this sets the shared memory carveout preference, in percent of the total shared memory. See ::cudaDevAttrMaxSharedMemoryPerMultiprocessor.
+ *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ *
+ * \param func  - Function to set attributes for
+ * \param attr  - Attribute to set
+ * \param value - Value to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value);
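+
+/*
+ * A minimal usage sketch for cudaFuncSetAttribute: opting the placeholder
+ * kernel scaleKernel into a larger dynamic shared memory limit. The 64 KiB
+ * value is illustrative and must not exceed the device's
+ * ::cudaDevAttrMaxSharedMemoryPerBlockOptin limit:
+ *
+ * \code
+ *   cudaError_t err = cudaFuncSetAttribute((const void *)scaleKernel,
+ *                                          cudaFuncAttributeMaxDynamicSharedMemorySize,
+ *                                          64 * 1024);
+ *   // Subsequent launches of scaleKernel may now request up to 64 KiB of
+ *   // dynamic shared memory via the third launch-configuration argument.
+ * \endcode
+ */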
+
+/**
+ * \brief Converts a double argument to be executed on a device
+ *
+ * \param d - Double to convert
+ *
+ * \deprecated This function is deprecated as of CUDA 7.5
+ *
+ * Converts the double value of \p d to an internal float representation if
+ * the device does not support double arithmetic. If the device does natively
+ * support doubles, then this function does nothing.
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForHost
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaSetDoubleForDevice(double *d);
+
+/**
+ * \brief Converts a double argument after execution on a device
+ *
+ * \deprecated This function is deprecated as of CUDA 7.5
+ *
+ * Converts the double value of \p d from a potentially internal float
+ * representation if the device does not support double arithmetic. If the
+ * device does natively support doubles, then this function does nothing.
+ *
+ * \param d - Double to convert
+ *
+ * \return
+ * ::cudaSuccess
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForDevice
+ */
+extern __CUDA_DEPRECATED  __host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d);
+
+/**
+ * \brief Enqueues a host function call in a stream
+ *
+ * Enqueues a host function to run in a stream.  The function will be called
+ * after currently enqueued work and will block work added after it.
+ *
+ * The host function must not make any CUDA API calls.  Attempting to use a
+ * CUDA API may result in ::cudaErrorNotPermitted, but this is not required.
+ * The host function must not perform any synchronization that may depend on
+ * outstanding CUDA work not mandated to run earlier.  Host functions without a
+ * mandated order (such as in independent streams) execute in undefined order
+ * and may be serialized.
+ *
+ * For the purposes of Unified Memory, execution makes a number of guarantees:
+ * <ul>
+ *   <li>The stream is considered idle for the duration of the function's
+ *   execution.  Thus, for example, the function may always use memory attached
+ *   to the stream it was enqueued in.</li>
+ *   <li>The start of execution of the function has the same effect as
+ *   synchronizing an event recorded in the same stream immediately prior to
+ *   the function.  It thus synchronizes streams which have been "joined"
+ *   prior to the function.</li>
+ *   <li>Adding device work to any stream does not have the effect of making
+ *   the stream active until all preceding host functions and stream callbacks
+ *   have executed.  Thus, for
+ *   example, a function might use global attached memory even if work has
+ *   been added to another stream, if the work has been ordered behind the
+ *   function call with an event.</li>
+ *   <li>Completion of the function does not cause a stream to become
+ *   active except as described above.  The stream will remain idle
+ *   if no device work follows the function, and will remain idle across
+ *   consecutive host functions or stream callbacks without device work in
+ *   between.  Thus, for example,
+ *   stream synchronization can be done by signaling from a host function at the
+ *   end of the stream.</li>
+ * </ul>
+ *
+ * Note that, in contrast to ::cuStreamAddCallback, the function will not be
+ * called in the event of an error in the CUDA context.
+ *
+ * \param stream   - Stream to enqueue function call in
+ * \param fn       - The function to call once preceding stream operations are complete
+ * \param userData - User-specified data to be passed to the function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate,
+ * ::cudaStreamQuery,
+ * ::cudaStreamSynchronize,
+ * ::cudaStreamWaitEvent,
+ * ::cudaStreamDestroy,
+ * ::cudaMallocManaged,
+ * ::cudaStreamAttachMemAsync,
+ * ::cudaStreamAddCallback,
+ * ::cuLaunchHostFunc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream, cudaHostFn_t fn, void *userData);
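+
+/*
+ * A minimal usage sketch for cudaLaunchHostFunc, assuming placeholder
+ * variables d_buf, h_buf, bytes and stream. The host function only touches
+ * host state and makes no CUDA API calls, as required:
+ *
+ * \code
+ *   void CUDART_CB markDone(void *userData)
+ *   {
+ *       *(volatile int *)userData = 1;   // no CUDA calls allowed here
+ *   }
+ *
+ *   int done = 0;
+ *   cudaMemcpyAsync(d_buf, h_buf, bytes, cudaMemcpyHostToDevice, stream);
+ *   cudaLaunchHostFunc(stream, markDone, &done);  // runs after the copy completes
+ *   // The host can later poll 'done' without blocking the stream.
+ * \endcode
+ */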
+
+/** @} */ /* END CUDART_EXECUTION */
+
+/**
+ * \defgroup CUDART_OCCUPANCY Occupancy
+ *
+ * ___MANBRIEF___ occupancy calculation functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the occupancy calculation functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Besides the occupancy calculator functions
+ * (\ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessor and \ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags),
+ * there are also C++ only occupancy-based launch configuration functions documented in
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * See
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
+ * \ref ::cudaOccupancyAvailableDynamicSMemPerBlock(size_t*, T, int, int) "cudaOccupancyAvailableDynamicSMemPerBlock (C++ API)"
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns occupancy for a device function
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
+ * \ref ::cudaOccupancyAvailableDynamicSMemPerBlock(size_t*, T, int, int) "cudaOccupancyAvailableDynamicSMemPerBlock (C++ API)",
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessor
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize);
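+
+/*
+ * A minimal usage sketch for cudaOccupancyMaxActiveBlocksPerMultiprocessor,
+ * assuming the placeholder kernel scaleKernel is launched with 256-thread
+ * blocks and no dynamic shared memory:
+ *
+ * \code
+ *   int numBlocks = 0;
+ *   cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks,
+ *                                                 (const void *)scaleKernel,
+ *                                                 256,   // blockSize
+ *                                                 0);    // dynamicSMemSize
+ *   // numBlocks now holds the maximum resident blocks per SM for this config.
+ * \endcode
+ */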
+
+/**
+ * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM.
+ *
+ * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. 
+ *
+ * \param dynamicSmemSize - Returned maximum dynamic shared memory 
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param numBlocks       - Number of blocks to fit on SM 
+ * \param blockSize       - Size of the block
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags,
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
+ * ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, const void *func, int numBlocks, int blockSize);
+
+/**
+ * \brief Returns occupancy for a device function with the specified flags
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ *   ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ *
+ * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
+ *   on platforms where global caching affects occupancy. On such platforms, if caching
+ *   is enabled, but per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching is disabled.
+ *   Setting this flag makes the occupancy calculator return 0 in such cases.
+ *   More information can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param flags           - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor,
+ * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)",
+ * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)",
+ * \ref ::cudaOccupancyAvailableDynamicSMemPerBlock(size_t*, T, int, int) "cudaOccupancyAvailableDynamicSMemPerBlock (C++ API)",
+ * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags);
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum cluster size in \p *clusterSize.
+ *
+ * The cluster dimensions in \p config are ignored. If func has a required
+ * cluster size set (see ::cudaFuncGetAttributes), \p *clusterSize will reflect
+ * the required cluster size.
+ *
+ * By default this function will always return a value that's portable on
+ * future hardware. A higher value may be returned if the kernel function
+ * allows non-portable cluster sizes.
+ *
+ * This function will respect the compile time launch bounds.
+ *
+ * \param clusterSize - Returned maximum cluster size that can be launched
+ *                      for the given kernel function and launch configuration
+ * \param func        - Kernel function for which maximum cluster
+ *                      size is calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaFuncGetAttributes
+ * \ref ::cudaOccupancyMaxPotentialClusterSize(int*, T, const cudaLaunchConfig_t*) "cudaOccupancyMaxPotentialClusterSize (C++ API)",
+ * ::cuOccupancyMaxPotentialClusterSize
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *func, const cudaLaunchConfig_t *launchConfig);
+
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum number of clusters that could co-exist
+ * on the target device in \p *numClusters.
+ *
+ * If the function has required cluster size already set (see
+ * ::cudaFuncGetAttributes), the cluster size from config must either be
+ * unspecified or match the required size.
+ * Without required sizes, the cluster size must be specified in config,
+ * else the function will return an error.
+ *
+ * Note that various attributes of the kernel function may affect occupancy
+ * calculation. The runtime environment may affect how the hardware schedules
+ * the clusters, so the calculated occupancy is not guaranteed to be achievable.
+ *
+ * \param numClusters - Returned maximum number of clusters that
+ *                      could co-exist on the target device
+ * \param func        - Kernel function for which maximum number
+ *                      of clusters are calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidClusterSize,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaFuncGetAttributes
+ * \ref ::cudaOccupancyMaxActiveClusters(int*, T, const cudaLaunchConfig_t*) "cudaOccupancyMaxActiveClusters (C++ API)",
+ * ::cuOccupancyMaxActiveClusters
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveClusters(int *numClusters, const void *func, const cudaLaunchConfig_t *launchConfig);
+/** @} */ /* END CUDART_OCCUPANCY */
+
+/**
+ * \defgroup CUDART_MEMORY Memory Management
+ *
+ * ___MANBRIEF___ memory management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the memory management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Allocates memory that will be automatically managed by the Unified Memory system
+ *
+ * Allocates \p size bytes of managed memory on the device and returns in
+ * \p *devPtr a pointer to the allocated memory. If the device doesn't support
+ * allocating managed memory, ::cudaErrorNotSupported is returned. Support
+ * for managed memory can be queried using the device attribute
+ * ::cudaDevAttrManagedMemory. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p size
+ * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer
+ * is valid on the CPU and on all GPUs in the system that support managed memory.
+ * All accesses to this pointer must obey the Unified Memory programming model.
+ *
+ * \p flags specifies the default stream association for this allocation.
+ * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The
+ * default value for \p flags is ::cudaMemAttachGlobal.
+ * If ::cudaMemAttachGlobal is specified, then this memory is accessible from
+ * any stream on any device. If ::cudaMemAttachHost is specified, then the
+ * allocation should not be accessed from devices that have a zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to
+ * ::cudaStreamAttachMemAsync will be required to enable access on such devices.
+ *
+ * If the association is later changed via ::cudaStreamAttachMemAsync to
+ * a single stream, the default association, as specified during ::cudaMallocManaged,
+ * is restored when that stream is destroyed. For __managed__ variables, the
+ * default association is always ::cudaMemAttachGlobal. Note that destroying a
+ * stream is an asynchronous operation, and as a result, the change to default
+ * association won't happen until all work in the stream has completed.
+ *
+ * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree.
+ *
+ * Device memory oversubscription is possible for GPUs that have a non-zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on
+ * such GPUs may be evicted from device memory to host memory at any time by the Unified
+ * Memory driver in order to make room for other allocations.
+ *
+ * In a multi-GPU system where all GPUs have a non-zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this
+ * API returns and instead may be populated on access. In such systems, managed memory can
+ * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
+ * maintain data locality and prevent excessive page faults to the extent possible. The application
+ * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application
+ * can also explicitly migrate memory to a desired processor's memory via
+ * ::cudaMemPrefetchAsync.
+ *
+ * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support
+ * with each other, the physical storage for managed memory is created on the GPU which is active
+ * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced
+ * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
+ * memory among such GPUs.
+ *
+ * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
+ * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess
+ * is zero for at least one of those GPUs, the location chosen for physical storage of managed
+ * memory is system-dependent.
+ * - On Linux, the location chosen will be device memory as long as the current set of active
+ * contexts are on devices that either have peer-to-peer support with each other or have a
+ * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If there is an active context on a GPU that does not have a non-zero value for that device
+ * attribute and it does not have peer-to-peer support with the other devices that have active
+ * contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
+ * Note that this means that managed memory that is located in device memory is migrated to
+ * host memory if a new context is created on a GPU that doesn't have a non-zero value for
+ * the device attribute and does not support peer-to-peer with at least one of the other devices
+ * that has an active context. This in turn implies that context creation may fail if there is
+ * insufficient host memory to migrate all managed allocations.
+ * - On Windows, the physical storage is always created in 'zero-copy' or host memory.
+ * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
+ * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
+ * restrict CUDA to only use those GPUs that have peer-to-peer support.
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero
+ * value to force the driver to always use device memory for physical storage.
+ * When this environment variable is set to a non-zero value, all devices used in
+ * that process that support managed memory have to be peer-to-peer compatible
+ * with each other. The error ::cudaErrorInvalidDevice will be returned if a device
+ * that supports managed memory is used and it is not peer-to-peer compatible with
+ * any of the other managed memory supporting devices that were previously used in
+ * that process, even if ::cudaDeviceReset has been called on those devices. These
+ * environment variables are described in the CUDA programming guide under the
+ * "CUDA environment variables" section.
+ *
+ * \param devPtr - Pointer to allocated device memory
+ * \param size   - Requested allocation size in bytes
+ * \param flags  - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
+ * ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync,
+ * ::cuMemAllocManaged
+ */
+#if defined(__cplusplus)
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags = cudaMemAttachGlobal);
+#else
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags);
+#endif
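+
+/*
+ * A minimal usage sketch for cudaMallocManaged: one pointer is valid on both
+ * host and device (N and the placeholder kernel scaleKernel are assumptions):
+ *
+ * \code
+ *   float *data = NULL;
+ *   cudaMallocManaged((void **)&data, N * sizeof(float));   // default: cudaMemAttachGlobal
+ *   for (size_t i = 0; i < N; ++i) data[i] = 1.0f;          // host writes directly
+ *   scaleKernel<<<(N + 255) / 256, 256>>>(data, 2.0f);      // device uses the same pointer
+ *   cudaDeviceSynchronize();                                // wait before the host reads again
+ *   cudaFree(data);
+ * \endcode
+ */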
+
+/**
+ * \brief Allocate memory on the device
+ *
+ * Allocates \p size bytes of linear memory on the device and returns in
+ * \p *devPtr a pointer to the allocated memory. The allocated memory is
+ * suitably aligned for any kind of variable. The memory is not cleared.
+ * ::cudaMalloc() returns ::cudaErrorMemoryAllocation in case of failure.
+ *
+ * The device version of ::cudaFree cannot be used with a \p *devPtr
+ * allocated using the host API, and vice versa.
+ *
+ * \param devPtr - Pointer to allocated device memory
+ * \param size   - Requested allocation size in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
+ * ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::cuMemAlloc
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size);
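+
+/*
+ * A minimal usage sketch for cudaMalloc paired with cudaMemcpy and cudaFree
+ * (h_buf is a placeholder host array of at least 1024 floats):
+ *
+ * \code
+ *   const size_t bytes = 1024 * sizeof(float);
+ *   float *d_buf = NULL;
+ *   if (cudaMalloc((void **)&d_buf, bytes) == cudaSuccess) {
+ *       cudaMemcpy(d_buf, h_buf, bytes, cudaMemcpyHostToDevice);
+ *       // ... launch kernels that consume d_buf ...
+ *       cudaFree(d_buf);
+ *   }
+ * \endcode
+ */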
+
+/**
+ * \brief Allocates page-locked memory on the host
+ *
+ * Allocates \p size bytes of host memory that is page-locked and accessible
+ * to the device. The driver tracks the virtual memory ranges allocated with
+ * this function and automatically accelerates calls to functions such as
+ * ::cudaMemcpy*(). Since the memory can be accessed directly by the device,
+ * it can be read or written with much higher bandwidth than pageable memory
+ * obtained with functions such as ::malloc(). Allocating excessive amounts of
+ * memory with ::cudaMallocHost() may degrade system performance, since it
+ * reduces the amount of memory available to the system for paging. As a
+ * result, this function is best used sparingly to allocate staging areas for
+ * data exchange between host and device.
+ *
+ * \param ptr  - Pointer to allocated host memory
+ * \param size - Requested allocation size in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocArray, ::cudaMalloc3D,
+ * ::cudaMalloc3DArray, ::cudaHostAlloc, ::cudaFree, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t, unsigned int) "cudaMallocHost (C++ API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::cuMemAllocHost
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size);
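+
+/*
+ * A minimal usage sketch for cudaMallocHost: a page-locked staging buffer
+ * that lets cudaMemcpyAsync overlap with other work (d_buf, bytes and stream
+ * are placeholders):
+ *
+ * \code
+ *   float *h_staging = NULL;
+ *   cudaMallocHost((void **)&h_staging, bytes);       // pinned host memory
+ *   // ... fill h_staging on the host ...
+ *   cudaMemcpyAsync(d_buf, h_staging, bytes, cudaMemcpyHostToDevice, stream);
+ *   cudaStreamSynchronize(stream);                    // h_staging may be reused or freed after this
+ *   cudaFreeHost(h_staging);
+ * \endcode
+ */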
+
+/**
+ * \brief Allocates pitched memory on the device
+ *
+ * Allocates at least \p width (in bytes) * \p height bytes of linear memory
+ * on the device and returns in \p *devPtr a pointer to the allocated memory.
+ * The function may pad the allocation to ensure that corresponding pointers
+ * in any given row will continue to meet the alignment requirements for
+ * coalescing as the address is updated from row to row. The pitch returned in
+ * \p *pitch by ::cudaMallocPitch() is the width in bytes of the allocation.
+ * The intended usage of \p pitch is as a separate parameter of the allocation,
+ * used to compute addresses within the 2D array. Given the row and column of
+ * an array element of type \p T, the address is computed as:
+ * \code
+    T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column;
+   \endcode
+ *
+ * For allocations of 2D arrays, it is recommended that programmers consider
+ * performing pitch allocations using ::cudaMallocPitch(). Due to pitch
+ * alignment restrictions in the hardware, this is especially true if the
+ * application will be performing 2D memory copies between different regions
+ * of device memory (whether linear memory or CUDA arrays).
+ *
+ * \param devPtr - Pointer to allocated pitched device memory
+ * \param pitch  - Pitch for allocation
+ * \param width  - Requested pitched allocation width (in bytes)
+ * \param height - Requested pitched allocation height
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * ::cudaHostAlloc,
+ * ::cuMemAllocPitch
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height);
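+
+/*
+ * A minimal usage sketch for cudaMallocPitch, with width and height as
+ * placeholder element counts; the returned pitch is used for row addressing:
+ *
+ * \code
+ *   size_t pitch = 0;
+ *   float *d_img = NULL;
+ *   cudaMallocPitch((void **)&d_img, &pitch, width * sizeof(float), height);
+ *   // In a kernel, element (r, c) is addressed as:
+ *   //   float *row = (float *)((char *)d_img + r * pitch);
+ *   //   float v = row[c];
+ *   cudaFree(d_img);
+ * \endcode
+ */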
+
+/**
+ * \brief Allocate an array on the device
+ *
+ * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure
+ * \p desc and returns a handle to the new CUDA array in \p *array.
+ *
+ * The ::cudaChannelFormatDesc is defined as:
+ * \code
+    struct cudaChannelFormatDesc {
+        int x, y, z, w;
+        enum cudaChannelFormatKind f;
+    };
+    \endcode
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation
+ * - ::cudaArraySurfaceLoadStore: Allocates an array that can be read from or written to using a surface reference
+ * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the array.
+ * - ::cudaArraySparse: Allocates a CUDA array without physical backing memory. The subregions within this sparse array
+ *   can later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. 
+ *   The physical backing memory must be allocated via ::cuMemCreate.
+ * - ::cudaArrayDeferredMapping: Allocates a CUDA array without physical backing memory. The entire array can 
+ *   later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. 
+ *   The physical backing memory must be allocated via ::cuMemCreate.
+ *
+ * \p width and \p height must meet certain size requirements. See ::cudaMalloc3DArray() for more details.
+ *
+ * \param array  - Pointer to allocated array in device memory
+ * \param desc   - Requested channel format
+ * \param width  - Requested array allocation width
+ * \param height - Requested array allocation height
+ * \param flags  - Requested properties of allocated array
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * ::cudaHostAlloc,
+ * ::cuArrayCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(0), unsigned int flags __dv(0));
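+
+/*
+ * A minimal usage sketch for cudaMallocArray: a 2D array of single-channel
+ * floats filled from a placeholder host buffer h_src (width and height are
+ * placeholder element counts):
+ *
+ * \code
+ *   struct cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ *   cudaArray_t arr = NULL;
+ *   cudaMallocArray(&arr, &desc, width, height, cudaArrayDefault);
+ *   cudaMemcpy2DToArray(arr, 0, 0, h_src, width * sizeof(float),
+ *                       width * sizeof(float), height, cudaMemcpyHostToDevice);
+ *   cudaFreeArray(arr);
+ * \endcode
+ */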
+
+/**
+ * \brief Frees memory on the device
+ *
+ * Frees the memory space pointed to by \p devPtr, which must have been
+ * returned by a previous call to one of the following memory allocation APIs -
+ * ::cudaMalloc(), ::cudaMallocPitch(), ::cudaMallocManaged(), ::cudaMallocAsync(),
+ * ::cudaMallocFromPoolAsync().
+ * 
+ * Note - This API will not perform any implicit synchronization when the pointer was
+ * allocated with ::cudaMallocAsync or ::cudaMallocFromPoolAsync. Callers must ensure
+ * that all accesses to the pointer have completed before invoking ::cudaFree. For
+ * best performance and memory reuse, users should use ::cudaFreeAsync to free memory
+ * allocated via the stream ordered memory allocator.
+ * 
+ * If ::cudaFree(\p devPtr) has already been called before,
+ * an error is returned. If \p devPtr is 0, no operation is performed.
+ * ::cudaFree() returns ::cudaErrorInvalidValue in case of failure.
+ *
+ * The device version of ::cudaFree cannot be used with a \p *devPtr
+ * allocated using the host API, and vice versa.
+ *
+ * \param devPtr - Device pointer to memory to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocManaged, ::cudaMallocArray, ::cudaFreeArray, ::cudaMallocAsync, ::cudaMallocFromPoolAsync
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaFreeAsync
+ * ::cudaHostAlloc,
+ * ::cuMemFree
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr);
+
+/**
+ * \brief Frees page-locked memory
+ *
+ * Frees the memory space pointed to by \p ptr, which must have been
+ * returned by a previous call to ::cudaMallocHost() or ::cudaHostAlloc().
+ *
+ * \param ptr - Pointer to memory to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaHostAlloc,
+ * ::cuMemFreeHost
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr);
+
+/**
+ * \brief Frees an array on the device
+ *
+ * Frees the CUDA array \p array, which must have been returned by a
+ * previous call to ::cudaMallocArray(). If \p array is 0,
+ * no operation is performed.
+ *
+ * \param array - Pointer to array to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::cuArrayDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array);
+
+/**
+ * \brief Frees a mipmapped array on the device
+ *
+ * Frees the CUDA mipmapped array \p mipmappedArray, which must have been 
+ * returned by a previous call to ::cudaMallocMipmappedArray(). If \p mipmappedArray
+ * is 0, no operation is performed.
+ *
+ * \param mipmappedArray - Pointer to mipmapped array to free
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::cuMipmappedArrayDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray);
+
+
+/**
+ * \brief Allocates page-locked memory on the host
+ *
+ * Allocates \p size bytes of host memory that is page-locked and accessible
+ * to the device. The driver tracks the virtual memory ranges allocated with
+ * this function and automatically accelerates calls to functions such as
+ * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it
+ * can be read or written with much higher bandwidth than pageable memory
+ * obtained with functions such as ::malloc(). Allocating excessive amounts of
+ * pinned memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to allocate staging areas for data exchange between host
+ * and device.
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaHostAllocDefault: This flag's value is defined to be 0 and causes
+ * ::cudaHostAlloc() to emulate ::cudaMallocHost().
+ * - ::cudaHostAllocPortable: The memory returned by this call will be
+ * considered as pinned memory by all CUDA contexts, not just the one that
+ * performed the allocation.
+ * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space.
+ * The device pointer to the memory may be obtained by calling
+ * ::cudaHostGetDevicePointer().
+ * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC).
+ * WC memory can be transferred across the PCI Express bus more quickly on some
+ * system configurations, but cannot be read efficiently by most CPUs.  WC
+ * memory is a good option for buffers that will be written by the CPU and read
+ * by the device via mapped pinned memory or host->device transfers.
+ *
+ * All of these flags are orthogonal to one another: a developer may allocate
+ * memory that is portable, mapped and/or write-combined with no restrictions.
+ *
+ * In order for the ::cudaHostAllocMapped flag to have any effect, the CUDA context
+ * must support the ::cudaDeviceMapHost flag, which can be checked via
+ * ::cudaGetDeviceFlags(). The ::cudaDeviceMapHost flag is implicitly set for
+ * contexts created via the runtime API.
+ *
+ * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices
+ * that do not support mapped pinned memory. The failure is deferred to
+ * ::cudaHostGetDevicePointer() because the memory may be mapped into other
+ * CUDA contexts via the ::cudaHostAllocPortable flag.
+ *
+ * Memory allocated by this function must be freed with ::cudaFreeHost().
+ *
+ * \param pHost - Host pointer to allocated page-locked memory
+ * \param size  - Requested allocation size in bytes
+ * \param flags - Requested properties of allocated memory
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaSetDeviceFlags,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost,
+ * ::cudaGetDeviceFlags,
+ * ::cuMemHostAlloc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, unsigned int flags);
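+
+/*
+ * A minimal usage sketch for cudaHostAlloc with ::cudaHostAllocMapped,
+ * obtaining a device-visible alias via cudaHostGetDevicePointer (bytes is a
+ * placeholder size):
+ *
+ * \code
+ *   float *h_mapped = NULL, *d_alias = NULL;
+ *   cudaHostAlloc((void **)&h_mapped, bytes, cudaHostAllocMapped);
+ *   cudaHostGetDevicePointer((void **)&d_alias, h_mapped, 0);
+ *   // Kernels may read/write d_alias; the host sees the same bytes via h_mapped.
+ *   cudaFreeHost(h_mapped);
+ * \endcode
+ */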
+
+/**
+ * \brief Registers an existing host memory range for use by CUDA
+ *
+ * Page-locks the memory range specified by \p ptr and \p size and maps it
+ * for the device(s) as specified by \p flags. This memory range also is added
+ * to the same tracking mechanism as ::cudaHostAlloc() to automatically accelerate
+ * calls to functions such as ::cudaMemcpy(). Since the memory can be accessed 
+ * directly by the device, it can be read or written with much higher bandwidth 
+ * than pageable memory that has not been registered.  Page-locking excessive
+ * amounts of memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to register staging areas for data exchange between
+ * host and device.
+ *
+ * ::cudaHostRegister is supported only on I/O coherent devices that have a non-zero
+ * value for the device attribute ::cudaDevAttrHostRegisterSupported.
+ *
+ * The \p flags parameter enables different options to be specified that
+ * affect the allocation, as follows.
+ *
+ * - ::cudaHostRegisterDefault: On a system with unified virtual addressing,
+ *   the memory will be both mapped and portable.  On a system with no unified
+ *   virtual addressing, the memory will be neither mapped nor portable.
+ *
+ * - ::cudaHostRegisterPortable: The memory returned by this call will be
+ *   considered as pinned memory by all CUDA contexts, not just the one that
+ *   performed the allocation.
+ *
+ * - ::cudaHostRegisterMapped: Maps the allocation into the CUDA address
+ *   space. The device pointer to the memory may be obtained by calling
+ *   ::cudaHostGetDevicePointer().
+ *
+ * - ::cudaHostRegisterIoMemory: The passed memory pointer is treated as
+ *   pointing to some memory-mapped I/O space, e.g. belonging to a
+ *   third-party PCIe device, and it will be marked as non cache-coherent and
+ *   contiguous.
+ *
+ * - ::cudaHostRegisterReadOnly: The passed memory pointer is treated as
+ *   pointing to memory that is considered read-only by the device.  On
+ *   platforms without ::cudaDevAttrPageableMemoryAccessUsesHostPageTables, this
+ *   flag is required in order to register memory mapped to the CPU as
+ *   read-only.  Support for the use of this flag can be queried from the device
+ *   attribute cudaDeviceAttrReadOnlyHostRegisterSupported.  Using this flag with
+ *   a current context associated with a device that does not have this attribute
+ *   set will cause ::cudaHostRegister to error with cudaErrorNotSupported.
+ *
+ * All of these flags are orthogonal to one another: a developer may page-lock
+ * memory that is portable or mapped with no restrictions.
+ *
+ * The CUDA context must have been created with the ::cudaDeviceMapHost flag in
+ * order for the ::cudaHostRegisterMapped flag to have any effect.
+ *
+ * The ::cudaHostRegisterMapped flag may be specified on CUDA contexts for
+ * devices that do not support mapped pinned memory. The failure is deferred
+ * to ::cudaHostGetDevicePointer() because the memory may be mapped into
+ * other CUDA contexts via the ::cudaHostRegisterPortable flag.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory
+ * can also be accessed from the device using the host pointer \p ptr.
+ * The device pointer returned by ::cudaHostGetDevicePointer() may or may not
+ * match the original host pointer \p ptr and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cudaHostGetDevicePointer()
+ * will match the original pointer \p ptr. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cudaHostGetDevicePointer() will not match the original host pointer \p ptr,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * The memory page-locked by this function must be unregistered with ::cudaHostUnregister().
+ *
+ * \param ptr   - Host pointer to memory to page-lock
+ * \param size  - Size in bytes of the address range to page-lock
+ * \param flags - Flags for allocation request
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorHostMemoryAlreadyRegistered,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaHostUnregister, ::cudaHostGetFlags, ::cudaHostGetDevicePointer,
+ * ::cuMemHostRegister
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags);
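+
+/*
+ * A minimal usage sketch for cudaHostRegister: page-locking an existing
+ * pageable allocation so asynchronous copies from it can be accelerated
+ * (d_buf, bytes and stream are placeholders):
+ *
+ * \code
+ *   float *h_buf = (float *)malloc(bytes);            // ordinary pageable memory
+ *   cudaHostRegister(h_buf, bytes, cudaHostRegisterDefault);
+ *   cudaMemcpyAsync(d_buf, h_buf, bytes, cudaMemcpyHostToDevice, stream);
+ *   cudaStreamSynchronize(stream);
+ *   cudaHostUnregister(h_buf);
+ *   free(h_buf);
+ * \endcode
+ */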
+
+/**
+ * \brief Unregisters a memory range that was registered with cudaHostRegister
+ *
+ * Unmaps the memory range whose base address is specified by \p ptr, and makes
+ * it pageable again.
+ *
+ * The base address must be the same one specified to ::cudaHostRegister().
+ *
+ * \param ptr - Host pointer to memory to unregister
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorHostMemoryNotRegistered
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaHostUnregister,
+ * ::cuMemHostUnregister
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr);
+
+/**
+ * \brief Passes back device pointer of mapped host memory allocated by
+ * cudaHostAlloc or registered by cudaHostRegister
+ *
+ * Passes back the device pointer corresponding to the mapped, pinned host
+ * buffer allocated by ::cudaHostAlloc() or registered by ::cudaHostRegister().
+ *
+ * ::cudaHostGetDevicePointer() will fail if the ::cudaDeviceMapHost flag was
+ * not specified before deferred context creation occurred, or if called on a
+ * device that does not support mapped, pinned memory.
+ *
+ * For devices that have a non-zero value for the device attribute
+ * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory
+ * can also be accessed from the device using the host pointer \p pHost.
+ * The device pointer returned by ::cudaHostGetDevicePointer() may or may not
+ * match the original host pointer \p pHost and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cudaHostGetDevicePointer()
+ * will match the original pointer \p pHost. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cudaHostGetDevicePointer() will not match the original host pointer \p pHost,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * \p flags is provided for future releases. For now, it must be set to 0.
+ *
+ * \param pDevice - Returned device pointer for mapped memory
+ * \param pHost   - Requested host pointer mapping
+ * \param flags   - Flags for extensions (must be 0 for now)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaSetDeviceFlags, ::cudaHostAlloc,
+ * ::cuMemHostGetDevicePointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags);
+
+/**
+ * \brief Passes back flags used to allocate pinned host memory allocated by
+ * cudaHostAlloc
+ *
+ * ::cudaHostGetFlags() will fail if the input pointer does not
+ * reside in an address range allocated by ::cudaHostAlloc().
+ *
+ * \param pFlags - Returned flags word
+ * \param pHost - Host pointer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaHostAlloc,
+ * ::cuMemHostGetFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags, void *pHost);
+
+/**
+ * \brief Allocates logical 1D, 2D, or 3D memory objects on the device
+ *
+ * Allocates at least \p width * \p height * \p depth bytes of linear memory
+ * on the device and returns a ::cudaPitchedPtr in which \p ptr is a pointer
+ * to the allocated memory. The function may pad the allocation to ensure
+ * hardware alignment requirements are met. The pitch returned in the \p pitch
+ * field of \p pitchedDevPtr is the width in bytes of the allocation.
+ *
+ * The returned ::cudaPitchedPtr contains additional fields \p xsize and
+ * \p ysize, the logical width and height of the allocation, which are
+ * equivalent to the \p width and \p height \p extent parameters provided by
+ * the programmer during allocation.
+ *
+ * For allocations of 2D and 3D objects, it is highly recommended that
+ * programmers perform allocations using ::cudaMalloc3D() or
+ * ::cudaMallocPitch(). Due to alignment restrictions in the hardware, this is
+ * especially true if the application will be performing memory copies
+ * involving 2D or 3D objects (whether linear memory or CUDA arrays).
+ *
+ * \param pitchedDevPtr  - Pointer to allocated pitched device memory
+ * \param extent         - Requested allocation size (\p width field in bytes)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMemcpy3D, ::cudaMemset3D,
+ * ::cudaMalloc3DArray, ::cudaMallocArray, ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc, ::make_cudaPitchedPtr, ::make_cudaExtent,
+ * ::cuMemAllocPitch
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent);
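+
+/*
+ * A minimal usage sketch for cudaMalloc3D, with width, height and depth as
+ * placeholder element counts (the extent width is given in bytes):
+ *
+ * \code
+ *   struct cudaExtent ext = make_cudaExtent(width * sizeof(float), height, depth);
+ *   struct cudaPitchedPtr vol;
+ *   cudaMalloc3D(&vol, ext);
+ *   // vol.ptr is the base address and vol.pitch the row pitch in bytes;
+ *   // slice z begins at (char *)vol.ptr + z * vol.pitch * ext.height.
+ *   cudaFree(vol.ptr);
+ * \endcode
+ */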
+
+/**
+ * \brief Allocate an array on the device
+ *
+ * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure
+ * \p desc and returns a handle to the new CUDA array in \p *array.
+ *
+ * The ::cudaChannelFormatDesc is defined as:
+ * \code
+    struct cudaChannelFormatDesc {
+        int x, y, z, w;
+        enum cudaChannelFormatKind f;
+    };
+    \endcode
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * ::cudaMalloc3DArray() can allocate the following:
+ *
+ * - A 1D array is allocated if the height and depth extents are both zero.
+ * - A 2D array is allocated if only the depth extent is zero.
+ * - A 3D array is allocated if all three extents are non-zero.
+ * - A 1D layered CUDA array is allocated if only the height extent is zero and
+ * the cudaArrayLayered flag is set. Each layer is a 1D array. The number of layers is 
+ * determined by the depth extent.
+ * - A 2D layered CUDA array is allocated if all three extents are non-zero and 
+ * the cudaArrayLayered flag is set. Each layer is a 2D array. The number of layers is 
+ * determined by the depth extent.
+ * - A cubemap CUDA array is allocated if all three extents are non-zero and the
+ * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six. A cubemap is
+ * a special type of 2D layered CUDA array, where the six layers represent the six faces of a cube. 
+ * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace.
+ * - A cubemap layered CUDA array is allocated if all three extents are non-zero, and both,
+ * cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be 
+ * a multiple of six. A cubemap layered CUDA array is a special type of 2D layered CUDA array that consists 
+ * of a collection of cubemaps. The first six layers represent the first cubemap, the next six layers form 
+ * the second cubemap, and so on.
+ *
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation
+ * - ::cudaArrayLayered: Allocates a layered CUDA array, with the depth extent indicating the number of layers
+ * - ::cudaArrayCubemap: Allocates a cubemap CUDA array. Width must be equal to height, and depth must be six.
+ *   If the cudaArrayLayered flag is also set, depth must be a multiple of six.
+ * - ::cudaArraySurfaceLoadStore: Allocates a CUDA array that could be read from or written to using a surface
+ *   reference.
+ * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA 
+ *   array. Texture gather can only be performed on 2D CUDA arrays.
+ * - ::cudaArraySparse: Allocates a CUDA array without physical backing memory. The subregions within this sparse array 
+ *   can later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. This flag can only be used for 
+ *   creating 2D, 3D or 2D layered sparse CUDA arrays. The physical backing memory must be allocated via ::cuMemCreate.
+ * - ::cudaArrayDeferredMapping: Allocates a CUDA array without physical backing memory. The entire array can
+ *   later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. The physical backing memory must be allocated
+ *   via ::cuMemCreate.
+ *
+ * The width, height and depth extents must meet certain size requirements as listed in the following table.
+ * All values are specified in elements.
+ *
+ * Note that 2D CUDA arrays have different size requirements if the ::cudaArrayTextureGather flag is set. In that
+ * case, the valid range for (width, height, depth) is ((1,maxTexture2DGather[0]), (1,maxTexture2DGather[1]), 0).
+ *
+ * \xmlonly
+ * <table outputclass="xmlonly">
+ * <tgroup cols="3" colsep="1" rowsep="1">
+ * <colspec colname="c1" colwidth="1.0*"/>
+ * <colspec colname="c2" colwidth="3.0*"/>
+ * <colspec colname="c3" colwidth="3.0*"/>
+ * <thead>
+ * <row>
+ * <entry>CUDA array type</entry>
+ * <entry>Valid extents that must always be met {(width range in elements),
+ * (height range), (depth range)}</entry>
+ * <entry>Valid extents with cudaArraySurfaceLoadStore set {(width range in
+ * elements), (height range), (depth range)}</entry>
+ * </row>
+ * </thead>
+ * <tbody>
+ * <row>
+ * <entry>1D</entry>
+ * <entry>{ (1,maxTexture1D), 0, 0 }</entry>
+ * <entry>{ (1,maxSurface1D), 0, 0 }</entry>
+ * </row>
+ * <row>
+ * <entry>2D</entry>
+ * <entry>{ (1,maxTexture2D[0]), (1,maxTexture2D[1]), 0 }</entry>
+ * <entry>{ (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 }</entry>
+ * </row>
+ * <row>
+ * <entry>3D</entry>
+ * <entry>{ (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) }
+ * OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]),
+ * (1,maxTexture3DAlt[2]) }</entry>
+ * <entry>{ (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) }</entry>
+ * </row>
+ * <row>
+ * <entry>1D Layered</entry>
+ * <entry>{ (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) }</entry>
+ * <entry>{ (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) }</entry>
+ * </row>
+ * <row>
+ * <entry>2D Layered</entry>
+ * <entry>{ (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]),
+ * (1,maxTexture2DLayered[2]) }</entry>
+ * <entry>{ (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]),
+ * (1,maxSurface2DLayered[2]) }</entry>
+ * </row>
+ * <row>
+ * <entry>Cubemap</entry>
+ * <entry>{ (1,maxTextureCubemap), (1,maxTextureCubemap), 6 }</entry>
+ * <entry>{ (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 }</entry>
+ * </row>
+ * <row>
+ * <entry>Cubemap Layered</entry>
+ * <entry>{ (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]),
+ * (1,maxTextureCubemapLayered[1]) }</entry>
+ * <entry>{ (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]),
+ * (1,maxSurfaceCubemapLayered[1]) }</entry>
+ * </row>
+ * </tbody>
+ * </tgroup>
+ * </table>
+ * \endxmlonly
+ *
+ * \param array  - Pointer to allocated array in device memory
+ * \param desc   - Requested channel format
+ * \param extent - Requested allocation size (\p width field in elements)
+ * \param flags  - Flags for extensions
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::make_cudaExtent,
+ * ::cuArray3DCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags __dv(0));
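+
+/*
+ * Usage sketch (illustrative, not part of the CUDA toolkit header): allocate a
+ * 64 x 64 x 64 CUDA array of 32-bit floats and release it again. Variable
+ * names such as `cuArray` are placeholders.
+ *
+ *   struct cudaChannelFormatDesc desc =
+ *       cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+ *   struct cudaExtent extent = make_cudaExtent(64, 64, 64);   // width, height, depth in elements
+ *   cudaArray_t cuArray = NULL;
+ *   cudaError_t err = cudaMalloc3DArray(&cuArray, &desc, extent, cudaArrayDefault);
+ *   if (err == cudaSuccess) {
+ *       // ... use the array as a texture/surface resource ...
+ *       cudaFreeArray(cuArray);
+ *   }
+ */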
+
+/**
+ * \brief Allocate a mipmapped array on the device
+ *
+ * Allocates a CUDA mipmapped array according to the ::cudaChannelFormatDesc structure
+ * \p desc and returns a handle to the new CUDA mipmapped array in \p *mipmappedArray.
+ * \p numLevels specifies the number of mipmap levels to be allocated. This value is
+ * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))].
+ *
+ * The ::cudaChannelFormatDesc is defined as:
+ * \code
+    struct cudaChannelFormatDesc {
+        int x, y, z, w;
+        enum cudaChannelFormatKind f;
+    };
+    \endcode
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * ::cudaMallocMipmappedArray() can allocate the following:
+ *
+ * - A 1D mipmapped array is allocated if the height and depth extents are both zero.
+ * - A 2D mipmapped array is allocated if only the depth extent is zero.
+ * - A 3D mipmapped array is allocated if all three extents are non-zero.
+ * - A 1D layered CUDA mipmapped array is allocated if only the height extent is zero and
+ * the cudaArrayLayered flag is set. Each layer is a 1D mipmapped array. The number of layers is 
+ * determined by the depth extent.
+ * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and 
+ * the cudaArrayLayered flag is set. Each layer is a 2D mipmapped array. The number of layers is 
+ * determined by the depth extent.
+ * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the
+ * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six.
+ * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace.
+ * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, and both,
+ * cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be 
+ * a multiple of six. A cubemap layered CUDA mipmapped array is a special type of 2D layered CUDA mipmapped
+ * array that consists of a collection of cubemap mipmapped arrays. The first six layers represent the 
+ * first cubemap mipmapped array, the next six layers form the second cubemap mipmapped array, and so on.
+ *
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default mipmapped array allocation
+ * - ::cudaArrayLayered: Allocates a layered CUDA mipmapped array, with the depth extent indicating the number of layers
+ * - ::cudaArrayCubemap: Allocates a cubemap CUDA mipmapped array. Width must be equal to height, and depth must be six.
+ *   If the cudaArrayLayered flag is also set, depth must be a multiple of six.
+ * - ::cudaArraySurfaceLoadStore: This flag indicates that individual mipmap levels of the CUDA mipmapped array 
+ *   will be read from or written to using a surface reference.
+ * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA 
+ *   array. Texture gather can only be performed on 2D CUDA mipmapped arrays, and the gather operations are
+ *   performed only on the most detailed mipmap level.
+ * - ::cudaArraySparse: Allocates a CUDA mipmapped array without physical backing memory. The subregions within this sparse array
+ *   can later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. This flag can only be used for creating 
+ *   2D, 3D or 2D layered sparse CUDA mipmapped arrays. The physical backing memory must be allocated via ::cuMemCreate.
+ * - ::cudaArrayDeferredMapping: Allocates a CUDA mipmapped array without physical backing memory. The entire array can
+ *   later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. The physical backing memory must be allocated
+ *   via ::cuMemCreate.
+ *
+ * The width, height and depth extents must meet certain size requirements as listed in the following table.
+ * All values are specified in elements.
+ *
+ * \xmlonly
+ * <table outputclass="xmlonly">
+ * <tgroup cols="3" colsep="1" rowsep="1">
+ * <colspec colname="c1" colwidth="1.0*"/>
+ * <colspec colname="c2" colwidth="3.0*"/>
+ * <colspec colname="c3" colwidth="3.0*"/>
+ * <thead>
+ * <row>
+ * <entry>CUDA array type</entry>
+ * <entry>Valid extents that must always be met {(width range in elements),
+ * (height range), (depth range)}</entry>
+ * <entry>Valid extents with cudaArraySurfaceLoadStore set {(width range in
+ * elements), (height range), (depth range)}</entry>
+ * </row>
+ * </thead>
+ * <tbody>
+ * <row>
+ * <entry>1D</entry>
+ * <entry>{ (1,maxTexture1DMipmap), 0, 0 }</entry>
+ * <entry>{ (1,maxSurface1D), 0, 0 }</entry>
+ * </row>
+ * <row>
+ * <entry>2D</entry>
+ * <entry>{ (1,maxTexture2DMipmap[0]), (1,maxTexture2DMipmap[1]), 0 }</entry>
+ * <entry>{ (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 }</entry>
+ * </row>
+ * <row>
+ * <entry>3D</entry>
+ * <entry>{ (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) }
+ * OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]),
+ * (1,maxTexture3DAlt[2]) }</entry>
+ * <entry>{ (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) }</entry>
+ * </row>
+ * <row>
+ * <entry>1D Layered</entry>
+ * <entry>{ (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) }</entry>
+ * <entry>{ (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) }</entry>
+ * </row>
+ * <row>
+ * <entry>2D Layered</entry>
+ * <entry>{ (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]),
+ * (1,maxTexture2DLayered[2]) }</entry>
+ * <entry>{ (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]),
+ * (1,maxSurface2DLayered[2]) }</entry>
+ * </row>
+ * <row>
+ * <entry>Cubemap</entry>
+ * <entry>{ (1,maxTextureCubemap), (1,maxTextureCubemap), 6 }</entry>
+ * <entry>{ (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 }</entry>
+ * </row>
+ * <row>
+ * <entry>Cubemap Layered</entry>
+ * <entry>{ (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]),
+ * (1,maxTextureCubemapLayered[1]) }</entry>
+ * <entry>{ (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]),
+ * (1,maxSurfaceCubemapLayered[1]) }</entry>
+ * </row>
+ * </tbody>
+ * </tgroup>
+ * </table>
+ * \endxmlonly
+ *
+ * \param mipmappedArray  - Pointer to allocated mipmapped array in device memory
+ * \param desc            - Requested channel format
+ * \param extent          - Requested allocation size (\p width field in elements)
+ * \param numLevels       - Number of mipmap levels to allocate
+ * \param flags           - Flags for extensions
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::make_cudaExtent,
+ * ::cuMipmappedArrayCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags __dv(0));
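+
+/*
+ * Usage sketch (illustrative, not part of the CUDA toolkit header): allocate a
+ * 256 x 256 2D mipmapped array of uchar4 texels with a full mip chain. Names
+ * are placeholders.
+ *
+ *   struct cudaChannelFormatDesc desc =
+ *       cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
+ *   struct cudaExtent extent = make_cudaExtent(256, 256, 0);  // depth 0 -> 2D
+ *   unsigned int numLevels = 9;                               // 1 + log2(256)
+ *   cudaMipmappedArray_t mipArray = NULL;
+ *   cudaError_t err = cudaMallocMipmappedArray(&mipArray, &desc, extent,
+ *                                              numLevels, cudaArrayDefault);
+ *   if (err == cudaSuccess)
+ *       cudaFreeMipmappedArray(mipArray);
+ */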
+
+/**
+ * \brief Gets a mipmap level of a CUDA mipmapped array
+ *
+ * Returns in \p *levelArray a CUDA array that represents a single mipmap level
+ * of the CUDA mipmapped array \p mipmappedArray.
+ *
+ * If \p level is greater than the maximum number of levels in this mipmapped array,
+ * ::cudaErrorInvalidValue is returned.
+ *
+ * If \p mipmappedArray is NULL,
+ * ::cudaErrorInvalidResourceHandle is returned.
+ *
+ * \param levelArray     - Returned mipmap level CUDA array
+ * \param mipmappedArray - CUDA mipmapped array
+ * \param level          - Mipmap level
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree,
+ * ::cudaFreeArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc,
+ * ::make_cudaExtent,
+ * ::cuMipmappedArrayGetLevel
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level);
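+
+/*
+ * Usage sketch (illustrative, not part of the CUDA toolkit header): fetch the
+ * most detailed level of a mipmapped array. `mipArray` is assumed to come
+ * from a successful cudaMallocMipmappedArray() call as in the example above.
+ *
+ *   cudaArray_t level0 = NULL;
+ *   cudaError_t err = cudaGetMipmappedArrayLevel(&level0, mipArray, 0);
+ *   // level0 can now be used like any other cudaArray_t, e.g. as the
+ *   // destination of cudaMemcpy2DToArray().
+ */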
+
+/**
+ * \brief Copies data between 3D objects
+ *
+\code
+struct cudaExtent {
+  size_t width;
+  size_t height;
+  size_t depth;
+};
+struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);
+
+struct cudaPos {
+  size_t x;
+  size_t y;
+  size_t z;
+};
+struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);
+
+struct cudaMemcpy3DParms {
+  cudaArray_t           srcArray;
+  struct cudaPos        srcPos;
+  struct cudaPitchedPtr srcPtr;
+  cudaArray_t           dstArray;
+  struct cudaPos        dstPos;
+  struct cudaPitchedPtr dstPtr;
+  struct cudaExtent     extent;
+  enum cudaMemcpyKind   kind;
+};
+\endcode
+ *
+ * ::cudaMemcpy3D() copies data between two 3D objects. The source and
+ * destination objects may be in either host memory, device memory, or a CUDA
+ * array. The source, destination, extent, and kind of copy performed are
+ * specified by the ::cudaMemcpy3DParms struct, which should be initialized to
+ * zero before use:
+\code
+cudaMemcpy3DParms myParms = {0};
+\endcode
+ *
+ * The struct passed to ::cudaMemcpy3D() must specify one of \p srcArray or
+ * \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one
+ * non-zero source or destination will cause ::cudaMemcpy3D() to return an
+ * error.
+ *
+ * The \p srcPos and \p dstPos fields are optional offsets into the source and
+ * destination objects and are defined in units of each object's elements. The
+ * element for a host or device pointer is assumed to be <b>unsigned char</b>.
+ *
+ * The \p extent field defines the dimensions of the transferred area in
+ * elements. If a CUDA array is participating in the copy, the extent is
+ * defined in terms of that array's elements. If no CUDA array is
+ * participating in the copy then the extents are defined in elements of
+ * <b>unsigned char</b>.
+ *
+ * The \p kind field defines the direction of the copy. It must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * If \p kind is ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, or
+ * ::cudaMemcpyDeviceToHost and a cudaArray is passed as the source or
+ * destination, then, where the kind would imply that the cudaArray resides on
+ * the host, ::cudaMemcpy3D() disregards that implication and silently corrects
+ * the kind, since a cudaArray can only reside on the device.
+ *
+ * If the source and destination are both arrays, ::cudaMemcpy3D() will return
+ * an error if they do not have the same element size.
+ *
+ * The source and destination object may not overlap. If overlapping source
+ * and destination objects are specified, undefined behavior will result.
+ *
+ * The source object must entirely contain the region defined by \p srcPos
+ * and \p extent. The destination object must entirely contain the region
+ * defined by \p dstPos and \p extent.
+ *
+ * ::cudaMemcpy3D() returns an error if the pitch of \p srcPtr or \p dstPtr
+ * exceeds the maximum allowed. The pitch of a ::cudaPitchedPtr allocated
+ * with ::cudaMalloc3D() will always be valid.
+ *
+ * \param p - 3D memory copy parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3DAsync,
+ * ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::make_cudaExtent, ::make_cudaPos,
+ * ::cuMemcpy3D
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
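+
+/*
+ * Usage sketch (illustrative, not part of the CUDA toolkit header): copy a
+ * 64 x 64 x 64 volume of floats from pitched device memory into a CUDA array
+ * allocated as in the cudaMalloc3DArray() example above. `devPitched` and
+ * `cuArray` are placeholders.
+ *
+ *   // devPitched was allocated with:
+ *   //   cudaMalloc3D(&devPitched, make_cudaExtent(64 * sizeof(float), 64, 64));
+ *   struct cudaMemcpy3DParms p = {0};           // always zero-initialize
+ *   p.srcPtr   = devPitched;                    // cudaPitchedPtr source
+ *   p.dstArray = cuArray;                       // cudaArray_t destination
+ *   p.extent   = make_cudaExtent(64, 64, 64);   // in array elements (floats here)
+ *   p.kind     = cudaMemcpyDeviceToDevice;
+ *   cudaError_t err = cudaMemcpy3D(&p);
+ */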
+
+/**
+ * \brief Copies memory between devices
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p p.  See the definition of the ::cudaMemcpy3DPeerParms structure
+ * for documentation of its parameters.
+ *
+ * Note that this function is synchronous with respect to the host only if
+ * the source or destination of the transfer is host memory.  Note also 
+ * that this copy is serialized with respect to all pending and future 
+ * asynchronous work in the current device, the copy's source device,
+ * and the copy's destination device (use ::cudaMemcpy3DPeerAsync to avoid 
+ * this synchronization).
+ *
+ * \param p - Parameters for the memory copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
+ * ::cudaMemcpy3DPeerAsync,
+ * ::cuMemcpy3DPeer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p);
+
+/**
+ * \brief Copies data between 3D objects
+ *
+\code
+struct cudaExtent {
+  size_t width;
+  size_t height;
+  size_t depth;
+};
+struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d);
+
+struct cudaPos {
+  size_t x;
+  size_t y;
+  size_t z;
+};
+struct cudaPos make_cudaPos(size_t x, size_t y, size_t z);
+
+struct cudaMemcpy3DParms {
+  cudaArray_t           srcArray;
+  struct cudaPos        srcPos;
+  struct cudaPitchedPtr srcPtr;
+  cudaArray_t           dstArray;
+  struct cudaPos        dstPos;
+  struct cudaPitchedPtr dstPtr;
+  struct cudaExtent     extent;
+  enum cudaMemcpyKind   kind;
+};
+\endcode
+ *
+ * ::cudaMemcpy3DAsync() copies data between two 3D objects. The source and
+ * destination objects may be in either host memory, device memory, or a CUDA
+ * array. The source, destination, extent, and kind of copy performed are
+ * specified by the ::cudaMemcpy3DParms struct, which should be initialized to
+ * zero before use:
+\code
+cudaMemcpy3DParms myParms = {0};
+\endcode
+ *
+ * The struct passed to ::cudaMemcpy3DAsync() must specify one of \p srcArray
+ * or \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one
+ * non-zero source or destination will cause ::cudaMemcpy3DAsync() to return an
+ * error.
+ *
+ * The \p srcPos and \p dstPos fields are optional offsets into the source and
+ * destination objects and are defined in units of each object's elements. The
+ * element for a host or device pointer is assumed to be <b>unsigned char</b>.
+ * For CUDA arrays, positions must be in the range [0, 2048) for any
+ * dimension.
+ *
+ * The \p extent field defines the dimensions of the transferred area in
+ * elements. If a CUDA array is participating in the copy, the extent is
+ * defined in terms of that array's elements. If no CUDA array is
+ * participating in the copy then the extents are defined in elements of
+ * <b>unsigned char</b>.
+ *
+ * The \p kind field defines the direction of the copy. It must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * If \p kind is ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, or
+ * ::cudaMemcpyDeviceToHost and a cudaArray is passed as the source or
+ * destination, then, where the kind would imply that the cudaArray resides on
+ * the host, ::cudaMemcpy3DAsync() disregards that implication and silently
+ * corrects the kind, since a cudaArray can only reside on the device.
+ *
+ * If the source and destination are both arrays, ::cudaMemcpy3DAsync() will
+ * return an error if they do not have the same element size.
+ *
+ * The source and destination object may not overlap. If overlapping source
+ * and destination objects are specified, undefined behavior will result.
+ *
+ * The source object must entirely contain the region defined by \p srcPos
+ * and \p extent. The destination object must entirely contain the region
+ * defined by \p dstPos and \p extent.
+ *
+ * ::cudaMemcpy3DAsync() returns an error if the pitch of \p srcPtr or
+ * \p dstPtr exceeds the maximum allowed. The pitch of a
+ * ::cudaPitchedPtr allocated with ::cudaMalloc3D() will always be valid.
+ *
+ * ::cudaMemcpy3DAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
+ * is non-zero, the copy may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param p      - 3D memory copy parameters
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3D,
+ * ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::make_cudaExtent, ::make_cudaPos,
+ * ::cuMemcpy3DAsync
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0));
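+
+/*
+ * Usage sketch (illustrative, not part of the CUDA toolkit header): the same
+ * copy as the cudaMemcpy3D() example above, issued asynchronously on a user
+ * stream. `devPitched` and `cuArray` remain placeholders.
+ *
+ *   cudaStream_t stream;
+ *   cudaStreamCreate(&stream);
+ *   struct cudaMemcpy3DParms p = {0};
+ *   p.srcPtr   = devPitched;
+ *   p.dstArray = cuArray;
+ *   p.extent   = make_cudaExtent(64, 64, 64);
+ *   p.kind     = cudaMemcpyDeviceToDevice;
+ *   cudaMemcpy3DAsync(&p, stream);
+ *   cudaStreamSynchronize(stream);              // wait before reusing the buffers
+ *   cudaStreamDestroy(stream);
+ */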
+
+/**
+ * \brief Copies memory between devices asynchronously.
+ *
+ * Perform a 3D memory copy according to the parameters specified in
+ * \p p.  See the definition of the ::cudaMemcpy3DPeerParms structure
+ * for documentation of its parameters.
+ *
+ * \param p      - Parameters for the memory copy
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
+ * ::cudaMemcpy3DPeerAsync,
+ * ::cuMemcpy3DPeerAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Gets free and total device memory
+ *
+ * Returns in \p *total the total amount of memory available to the current context.
+ * Returns in \p *free the amount of memory on the device that is free according to the OS.
+ * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
+ * In a multi-tenant situation, the returned free estimate is prone to a race condition:
+ * an allocation or free performed by a different process, or by a different thread in the
+ * same process, between the time the free memory is estimated and the time it is reported
+ * will cause the reported free value to deviate from the actual free memory.
+ *
+ * The integrated GPU on Tegra shares memory with the CPU and other components
+ * of the SoC. The free and total values returned by the API exclude
+ * the SWAP memory space maintained by the OS on some platforms.
+ * The OS may move some of the memory pages into the swap area as the GPU or
+ * CPU allocates or accesses memory. See the Tegra app note on how to calculate
+ * total and free memory on Tegra.
+ *
+ * \param free  - Returned free memory in bytes
+ * \param total - Returned total memory in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuMemGetInfo
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total);
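+
+/*
+ * Usage sketch (illustrative, not part of the CUDA toolkit header): print the
+ * current free/total device memory; treat the free value as an estimate only.
+ *
+ *   size_t freeBytes = 0, totalBytes = 0;
+ *   if (cudaMemGetInfo(&freeBytes, &totalBytes) == cudaSuccess)
+ *       printf("free: %zu MiB / total: %zu MiB\n",
+ *              freeBytes >> 20, totalBytes >> 20);
+ */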
+
+/**
+ * \brief Gets info about the specified cudaArray
+ * 
+ * Returns in \p *desc, \p *extent and \p *flags respectively, the type, shape 
+ * and flags of \p array.
+ *
+ * Any of \p *desc, \p *extent and \p *flags may be specified as NULL.
+ *
+ * \param desc   - Returned array type
+ * \param extent - Returned array shape. 2D arrays will have depth of zero
+ * \param flags  - Returned array flags
+ * \param array  - The ::cudaArray to get info for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuArrayGetDescriptor,
+ * ::cuArray3DGetDescriptor
+ */
+extern __host__ cudaError_t CUDARTAPI cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array);
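+
+/*
+ * Usage sketch (illustrative, not part of the CUDA toolkit header): query the
+ * format, extent and flags of an existing array. `cuArray` is a placeholder.
+ *
+ *   struct cudaChannelFormatDesc desc;
+ *   struct cudaExtent extent;
+ *   unsigned int flags;
+ *   cudaError_t err = cudaArrayGetInfo(&desc, &extent, &flags, cuArray);
+ *   // Any of the first three arguments may also be passed as NULL if unneeded.
+ */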
+
+/**
+ * \brief Gets a CUDA array plane from a CUDA array
+ *
+ * Returns in \p pPlaneArray a CUDA array that represents a single format plane
+ * of the CUDA array \p hArray.
+ *
+ * If \p planeIdx is greater than the maximum number of planes in this array, or if the array does
+ * not have a multi-planar format (e.g. ::cudaChannelFormatKindNV12), then ::cudaErrorInvalidValue is returned.
+ *
+ * Note that if the \p hArray has format ::cudaChannelFormatKindNV12, then passing in 0 for \p planeIdx returns
+ * a CUDA array of the same size as \p hArray but with one 8-bit channel and ::cudaChannelFormatKindUnsigned as its format kind.
+ * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width
+ * of \p hArray with two 8-bit channels and ::cudaChannelFormatKindUnsigned as its format kind.
+ *
+ * \param pPlaneArray   - Returned CUDA array referenced by the \p planeIdx
+ * \param hArray        - CUDA array
+ * \param planeIdx      - Plane index
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cuArrayGetPlane
+ */
+extern __host__ cudaError_t CUDARTAPI cudaArrayGetPlane(cudaArray_t *pPlaneArray, cudaArray_t hArray, unsigned int planeIdx);
+
+/**
+ * \brief Returns the memory requirements of a CUDA array
+ *
+ * Returns the memory requirements of a CUDA array in \p memoryRequirements.
+ * If the CUDA array was not allocated with the flag ::cudaArrayDeferredMapping,
+ * ::cudaErrorInvalidValue will be returned.
+ *
+ * The returned value in ::cudaArrayMemoryRequirements::size
+ * represents the total size of the CUDA array.
+ * The returned value in ::cudaArrayMemoryRequirements::alignment
+ * represents the alignment necessary for mapping the CUDA array.
+ *
+ * \return
+ * ::cudaSuccess
+ * ::cudaErrorInvalidValue
+ *
+ * \param[out] memoryRequirements - Pointer to ::cudaArrayMemoryRequirements
+ * \param[in] array - CUDA array to get the memory requirements of
+ * \param[in] device - Device to get the memory requirements for
+ * \sa ::cudaMipmappedArrayGetMemoryRequirements
+ */
+extern __host__ cudaError_t CUDARTAPI cudaArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements  *memoryRequirements, cudaArray_t array, int device);
+
+/**
+ * \brief Returns the memory requirements of a CUDA mipmapped array
+ *
+ * Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements.
+ * If the CUDA mipmapped array was not allocated with the flag ::cudaArrayDeferredMapping,
+ * ::cudaErrorInvalidValue will be returned.
+ *
+ * The returned value in ::cudaArrayMemoryRequirements::size
+ * represents the total size of the CUDA mipmapped array.
+ * The returned value in ::cudaArrayMemoryRequirements::alignment
+ * represents the alignment necessary for mapping the CUDA mipmapped
+ * array.
+ *
+ * \return
+ * ::cudaSuccess
+ * ::cudaErrorInvalidValue
+ *
+ * \param[out] memoryRequirements - Pointer to ::cudaArrayMemoryRequirements
+ * \param[in] mipmap - CUDA mipmapped array to get the memory requirements of
+ * \param[in] device - Device to get the memory requirements for
+ * \sa ::cudaArrayGetMemoryRequirements
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMipmappedArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements *memoryRequirements, cudaMipmappedArray_t mipmap, int device);
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA array
+ *
+ * Returns the layout properties of a sparse CUDA array in \p sparseProperties.
+ * If the CUDA array was not allocated with the flag ::cudaArraySparse,
+ * ::cudaErrorInvalidValue will be returned.
+ *
+ * If the returned value in ::cudaArraySparseProperties::flags contains ::cudaArraySparsePropertiesSingleMipTail,
+ * then ::cudaArraySparseProperties::miptailSize represents the total size of the array. Otherwise, it will be zero.
+ * Also, the returned value in ::cudaArraySparseProperties::miptailFirstLevel is always zero.
+ * Note that the \p array must have been allocated using ::cudaMallocArray or ::cudaMalloc3DArray. For CUDA arrays obtained
+ * using ::cudaGetMipmappedArrayLevel, ::cudaErrorInvalidValue will be returned. Instead, ::cudaMipmappedArrayGetSparseProperties
+ * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs.
+ *
+ * \return
+ * ::cudaSuccess
+ * ::cudaErrorInvalidValue
+ *
+ * \param[out] sparseProperties - Pointer to return the ::cudaArraySparseProperties
+ * \param[in] array             - The CUDA array to get the sparse properties of 
+ *
+ * \sa
+ * ::cudaMipmappedArrayGetSparseProperties,
+ * ::cuMemMapArrayAsync
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaArrayGetSparseProperties(struct cudaArraySparseProperties *sparseProperties, cudaArray_t array);
+#endif
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA mipmapped array
+ *
+ * Returns the sparse array layout properties in \p sparseProperties.
+ * If the CUDA mipmapped array was not allocated with the flag ::cudaArraySparse,
+ * ::cudaErrorInvalidValue will be returned.
+ *
+ * For non-layered CUDA mipmapped arrays, ::cudaArraySparseProperties::miptailSize returns the
+ * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth
+ * is less than that of the tile.
+ * For layered CUDA mipmapped arrays, if ::cudaArraySparseProperties::flags contains ::cudaArraySparsePropertiesSingleMipTail,
+ * then ::cudaArraySparseProperties::miptailSize specifies the size of the mip tail of all layers combined.
+ * Otherwise, ::cudaArraySparseProperties::miptailSize specifies mip tail size per layer.
+ * The returned value of ::cudaArraySparseProperties::miptailFirstLevel is valid only if ::cudaArraySparseProperties::miptailSize is non-zero.
+ *
+ * \return
+ * ::cudaSuccess
+ * ::cudaErrorInvalidValue
+ *
+ * \param[out] sparseProperties - Pointer to return ::cudaArraySparseProperties
+ * \param[in] mipmap            - The CUDA mipmapped array to get the sparse properties of
+ *
+ * \sa
+ * ::cudaArrayGetSparseProperties,
+ * ::cuMemMapArrayAsync
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaMipmappedArrayGetSparseProperties(struct cudaArraySparseProperties *sparseProperties, cudaMipmappedArray_t mipmap);
+#endif
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src to the
+ * memory area pointed to by \p dst, where \p kind specifies the direction
+ * of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. Calling
+ * ::cudaMemcpy() with dst and src pointers that do not match the direction of
+ * the copy results in an undefined behavior.
+ *
+ * \param dst   - Destination memory address
+ * \param src   - Source memory address
+ * \param count - Size in bytes to copy
+ * \param kind  - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyDtoH,
+ * ::cuMemcpyHtoD,
+ * ::cuMemcpyDtoD,
+ * ::cuMemcpy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
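+
+/*
+ * Usage sketch (illustrative, not part of the CUDA toolkit header): round-trip
+ * N floats between host and device. `N` and the buffer names are placeholders.
+ *
+ *   float hostBuf[N];
+ *   float *devBuf = NULL;
+ *   cudaMalloc((void **)&devBuf, N * sizeof(float));
+ *   cudaMemcpy(devBuf, hostBuf, N * sizeof(float), cudaMemcpyHostToDevice);
+ *   // ... launch kernels operating on devBuf ...
+ *   cudaMemcpy(hostBuf, devBuf, N * sizeof(float), cudaMemcpyDeviceToHost);
+ *   cudaFree(devBuf);
+ */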
+
+/**
+ * \brief Copies memory between two devices
+ *
+ * Copies memory from one device to memory on another device.  \p dst is the 
+ * base device pointer of the destination memory and \p dstDevice is the 
+ * destination device.  \p src is the base device pointer of the source memory 
+ * and \p srcDevice is the source device.  \p count specifies the number of bytes 
+ * to copy.
+ *
+ * Note that this function is asynchronous with respect to the host, but
+ * serialized with respect to all pending and future asynchronous work in the
+ * current device, \p srcDevice, and \p dstDevice (use ::cudaMemcpyPeerAsync
+ * to avoid this synchronization).
+ *
+ * \param dst       - Destination device pointer
+ * \param dstDevice - Destination device
+ * \param src       - Source device pointer
+ * \param srcDevice - Source device
+ * \param count     - Size of memory copy in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync,
+ * ::cudaMemcpy3DPeerAsync,
+ * ::cuMemcpyPeer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t count);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the memory
+ * area pointed to by \p src to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. \p dpitch and
+ * \p spitch are the widths in memory in bytes of the 2D arrays pointed to by
+ * \p dst and \p src, including any padding added to the end of each row. The
+ * memory areas may not overlap. \p width must not exceed either \p dpitch or
+ * \p spitch. Calling ::cudaMemcpy2D() with \p dst and \p src pointers that do
+ * not match the direction of the copy results in an undefined behavior.
+ * ::cudaMemcpy2D() returns an error if \p dpitch or \p spitch exceeds
+ * the maximum allowed.
+ *
+ * \param dst    - Destination memory address
+ * \param dpitch - Pitch of destination memory
+ * \param src    - Source memory address
+ * \param spitch - Pitch of source memory
+ * \param width  - Width of matrix transfer (columns in bytes)
+ * \param height - Height of matrix transfer (rows)
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2D,
+ * ::cuMemcpy2DUnaligned
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
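+
+/*
+ * Usage sketch (illustrative, not part of the CUDA toolkit header): copy a
+ * width x height matrix of floats from a tightly packed host buffer into
+ * pitched device memory. `width`, `height` and `hostPtr` are placeholders.
+ *
+ *   size_t widthBytes = width * sizeof(float);
+ *   float *devPtr = NULL;
+ *   size_t devPitch = 0;
+ *   cudaMallocPitch((void **)&devPtr, &devPitch, widthBytes, height);
+ *   cudaMemcpy2D(devPtr, devPitch,                 // destination + its pitch
+ *                hostPtr, widthBytes,              // source + its pitch (packed)
+ *                widthBytes, height,               // transfer width in bytes, rows
+ *                cudaMemcpyHostToDevice);
+ */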
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the memory
+ * area pointed to by \p src to the CUDA array \p dst starting at
+ * \p hOffset rows and \p wOffset bytes from the upper left corner,
+ * where \p kind specifies the direction of the copy, and must be one
+ * of ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p spitch is the width in memory in bytes of the 2D array pointed to by
+ * \p src, including any padding added to the end of each row. \p wOffset +
+ * \p width must not exceed the width of the CUDA array \p dst. \p width must
+ * not exceed \p spitch. ::cudaMemcpy2DToArray() returns an error if \p spitch
+ * exceeds the maximum allowed.
+ *
+ * \param dst     - Destination memory address
+ * \param wOffset - Destination starting X offset (columns in bytes)
+ * \param hOffset - Destination starting Y offset (rows)
+ * \param src     - Source memory address
+ * \param spitch  - Pitch of source memory
+ * \param width   - Width of matrix transfer (columns in bytes)
+ * \param height  - Height of matrix transfer (rows)
+ * \param kind    - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2D,
+ * ::cuMemcpy2DUnaligned
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
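+
+/*
+ * Usage sketch (illustrative, not part of the CUDA toolkit header): upload a
+ * packed host image into a 2D CUDA array created with cudaMallocArray().
+ * `cuArray`, `hostPixels`, `width` and `height` are placeholders.
+ *
+ *   size_t widthBytes = width * sizeof(uchar4);
+ *   cudaMemcpy2DToArray(cuArray,                 // destination array
+ *                       0, 0,                    // wOffset (bytes), hOffset (rows)
+ *                       hostPixels, widthBytes,  // source + its pitch (packed)
+ *                       widthBytes, height,      // transfer width (bytes), rows
+ *                       cudaMemcpyHostToDevice);
+ */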
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the CUDA
+ * array \p src starting at \p hOffset rows and \p wOffset bytes from the
+ * upper left corner to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. \p dpitch is the
+ * width in memory in bytes of the 2D array pointed to by \p dst, including any
+ * padding added to the end of each row. \p wOffset + \p width must not exceed
+ * the width of the CUDA array \p src. \p width must not exceed \p dpitch.
+ * ::cudaMemcpy2DFromArray() returns an error if \p dpitch exceeds the maximum
+ * allowed.
+ *
+ * \param dst     - Destination memory address
+ * \param dpitch  - Pitch of destination memory
+ * \param src     - Source memory address
+ * \param wOffset - Source starting X offset (columns in bytes)
+ * \param hOffset - Source starting Y offset (rows)
+ * \param width   - Width of matrix transfer (columns in bytes)
+ * \param height  - Height of matrix transfer (rows)
+ * \param kind    - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2D,
+ * ::cuMemcpy2DUnaligned
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the CUDA
+ * array \p src starting at \p hOffsetSrc rows and \p wOffsetSrc bytes from the
+ * upper left corner to the CUDA array \p dst starting at \p hOffsetDst rows
+ * and \p wOffsetDst bytes from the upper left corner, where \p kind
+ * specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p wOffsetDst + \p width must not exceed the width of the CUDA array \p dst.
+ * \p wOffsetSrc + \p width must not exceed the width of the CUDA array \p src.
+ *
+ * \param dst        - Destination memory address
+ * \param wOffsetDst - Destination starting X offset (columns in bytes)
+ * \param hOffsetDst - Destination starting Y offset (rows)
+ * \param src        - Source memory address
+ * \param wOffsetSrc - Source starting X offset (columns in bytes)
+ * \param hOffsetSrc - Source starting Y offset (rows)
+ * \param width      - Width of matrix transfer (columns in bytes)
+ * \param height     - Height of matrix transfer (rows)
+ * \param kind       - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2D,
+ * ::cuMemcpy2DUnaligned
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
+
+/**
+ * \brief Copies data to the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src
+ * to the memory area pointed to by \p offset bytes from the start of symbol
+ * \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * \param symbol - Device symbol address
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_sync
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray,  ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy,
+ * ::cuMemcpyHtoD,
+ * ::cuMemcpyDtoD
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
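+
+/*
+ * Usage sketch (illustrative, not part of the CUDA toolkit header): upload a
+ * small host table into a __constant__ symbol. `kCoeffs` is a placeholder
+ * symbol assumed to be declared elsewhere in the translation unit.
+ *
+ *   // __constant__ float kCoeffs[16];           // device-side declaration
+ *   float hostCoeffs[16] = {0};
+ *   cudaMemcpyToSymbol(kCoeffs, hostCoeffs, sizeof(hostCoeffs), 0,
+ *                      cudaMemcpyHostToDevice);
+ */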
+
+/**
+ * \brief Copies data from the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p offset bytes
+ * from the start of symbol \p symbol to the memory area pointed to by \p dst.
+ * The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * \param dst    - Destination memory address
+ * \param symbol - Device symbol address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_sync
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy,
+ * ::cuMemcpyDtoH,
+ * ::cuMemcpyDtoD
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost));
+
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src to the
+ * memory area pointed to by \p dst, where \p kind specifies the
+ * direction of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * 
+ * The memory areas may not overlap. Calling ::cudaMemcpyAsync() with \p dst and
+ * \p src pointers that do not match the direction of the copy results in an
+ * undefined behavior.
+ *
+ * ::cudaMemcpyAsync() is asynchronous with respect to the host, so the call
+ * may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and the \p stream is
+ * non-zero, the copy may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param dst    - Destination memory address
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAsync,
+ * ::cuMemcpyDtoHAsync,
+ * ::cuMemcpyHtoDAsync,
+ * ::cuMemcpyDtoDAsync
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
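+
+/*
+ * Usage sketch (illustrative, not part of the CUDA toolkit header): overlap a
+ * host-to-device copy with other work by using pinned host memory and a
+ * non-default stream. `N` and the buffer names are placeholders.
+ *
+ *   float *hostPinned = NULL, *devBuf = NULL;
+ *   cudaMallocHost((void **)&hostPinned, N * sizeof(float));  // pinned => copy can overlap
+ *   cudaMalloc((void **)&devBuf, N * sizeof(float));
+ *   cudaStream_t stream;
+ *   cudaStreamCreate(&stream);
+ *   cudaMemcpyAsync(devBuf, hostPinned, N * sizeof(float),
+ *                   cudaMemcpyHostToDevice, stream);
+ *   // ... enqueue kernels on `stream` that consume devBuf ...
+ *   cudaStreamSynchronize(stream);
+ */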
+
+/**
+ * \brief Copies memory between two devices asynchronously.
+ *
+ * Copies memory from one device to memory on another device.  \p dst is the 
+ * base device pointer of the destination memory and \p dstDevice is the 
+ * destination device.  \p src is the base device pointer of the source memory 
+ * and \p srcDevice is the source device.  \p count specifies the number of bytes 
+ * to copy.
+ *
+ * Note that this function is asynchronous with respect to the host and all work
+ * on other devices.
+ *
+ * \param dst       - Destination device pointer
+ * \param dstDevice - Destination device
+ * \param src       - Source device pointer
+ * \param srcDevice - Source device
+ * \param count     - Size of memory copy in bytes
+ * \param stream    - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
+ * ::cudaMemcpy3DPeerAsync,
+ * ::cuMemcpyPeerAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the memory
+ * area pointed to by \p src to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p dpitch and \p spitch are the widths in memory in bytes of the 2D arrays
+ * pointed to by \p dst and \p src, including any padding added to the end of
+ * each row. The memory areas may not overlap. \p width must not exceed either
+ * \p dpitch or \p spitch.
+ *
+ * Calling ::cudaMemcpy2DAsync() with \p dst and \p src pointers that do not
+ * match the direction of the copy results in an undefined behavior.
+ * ::cudaMemcpy2DAsync() returns an error if \p dpitch or \p spitch is greater
+ * than the maximum allowed.
+ *
+ * ::cudaMemcpy2DAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and
+ * \p stream is non-zero, the copy may overlap with operations in other
+ * streams.
+ *
+ * The device version of this function only handles device to device copies and
+ * cannot be given local or shared pointers.
+ *
+ * \param dst    - Destination memory address
+ * \param dpitch - Pitch of destination memory
+ * \param src    - Source memory address
+ * \param spitch - Pitch of source memory
+ * \param width  - Width of matrix transfer (columns in bytes)
+ * \param height - Height of matrix transfer (rows)
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the memory
+ * area pointed to by \p src to the CUDA array \p dst starting at \p hOffset
+ * rows and \p wOffset bytes from the upper left corner, where \p kind specifies
+ * the direction of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p spitch is the width in memory in bytes of the 2D array pointed to by
+ * \p src, including any padding added to the end of each row. \p wOffset +
+ * \p width must not exceed the width of the CUDA array \p dst. \p width must
+ * not exceed \p spitch. ::cudaMemcpy2DToArrayAsync() returns an error if
+ * \p spitch exceeds the maximum allowed.
+ *
+ * ::cudaMemcpy2DToArrayAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and
+ * \p stream is non-zero, the copy may overlap with operations in other
+ * streams.
+ *
+ * \param dst     - Destination memory address
+ * \param wOffset - Destination starting X offset (columns in bytes)
+ * \param hOffset - Destination starting Y offset (rows)
+ * \param src     - Source memory address
+ * \param spitch  - Pitch of source memory
+ * \param width   - Width of matrix transfer (columns in bytes)
+ * \param height  - Height of matrix transfer (rows)
+ * \param kind    - Type of transfer
+ * \param stream  - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * Copies a matrix (\p height rows of \p width bytes each) from the CUDA
+ * array \p src starting at \p hOffset rows and \p wOffset bytes from the
+ * upper left corner to the memory area pointed to by \p dst,
+ * where \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ * \p dpitch is the width in memory in bytes of the 2D
+ * array pointed to by \p dst, including any padding added to the end of each
+ * row. \p wOffset + \p width must not exceed the width of the CUDA array
+ * \p src. \p width must not exceed \p dpitch. ::cudaMemcpy2DFromArrayAsync()
+ * returns an error if \p dpitch exceeds the maximum allowed.
+ *
+ * ::cudaMemcpy2DFromArrayAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream is
+ * non-zero, the copy may overlap with operations in other streams.
+ *
+ * \param dst     - Destination memory address
+ * \param dpitch  - Pitch of destination memory
+ * \param src     - Source memory address
+ * \param wOffset - Source starting X offset (columns in bytes)
+ * \param hOffset - Source starting Y offset (rows)
+ * \param width   - Width of matrix transfer (columns in bytes)
+ * \param height  - Height of matrix transfer (rows)
+ * \param kind    - Type of transfer
+ * \param stream  - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidPitchValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ * \note_memcpy
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data to the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src
+ * to the memory area pointed to by \p offset bytes from the start of symbol
+ * \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy
+ * may overlap with operations in other streams.
+ *
+ * \param symbol - Device symbol address
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAsync,
+ * ::cuMemcpyHtoDAsync,
+ * ::cuMemcpyDtoDAsync
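+ *
+ * A minimal usage sketch (the symbol \c d_coeffs and its contents are
+ * assumptions made only for this illustration):
+ * \code
+ * __constant__ float d_coeffs[4];                 // hypothetical device symbol
+ *
+ * float h_coeffs[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+ * cudaStream_t stream;
+ * cudaStreamCreate(&stream);
+ * // Enqueue the copy in `stream`; it may overlap with work in other streams.
+ * cudaMemcpyToSymbolAsync(d_coeffs, h_coeffs, sizeof(h_coeffs), 0,
+ *                         cudaMemcpyHostToDevice, stream);
+ * cudaStreamSynchronize(stream);                  // wait before reusing h_coeffs
+ * \endcode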
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data from the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p offset bytes
+ * from the start of symbol \p symbol to the memory area pointed to by \p dst.
+ * The memory areas may not overlap. \p symbol is a variable that resides in
+ * global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap
+ * with operations in other streams.
+ *
+ * \param dst    - Destination memory address
+ * \param symbol - Device symbol address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync,
+ * ::cuMemcpyAsync,
+ * ::cuMemcpyDtoHAsync,
+ * ::cuMemcpyDtoDAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Fills the first \p count bytes of the memory area pointed to by \p devPtr
+ * with the constant byte value \p value.
+ *
+ * Note that this function is asynchronous with respect to the host unless
+ * \p devPtr refers to pinned host memory.
+ *
+ * \param devPtr - Pointer to device memory
+ * \param value  - Value to set for each byte of specified memory
+ * \param count  - Size in bytes to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuMemsetD8,
+ * ::cuMemsetD16,
+ * ::cuMemsetD32
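+ *
+ * A minimal usage sketch (the buffer size is an illustrative assumption):
+ * \code
+ * int *d_buf = NULL;
+ * cudaMalloc((void **)&d_buf, 256 * sizeof(int));
+ * cudaMemset(d_buf, 0, 256 * sizeof(int));   // zero every byte of the allocation
+ * cudaFree(d_buf);
+ * \endcode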
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count);
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Sets each byte of a matrix (\p height rows of \p width bytes each) pointed
+ * to by \p devPtr to the specified value \p value. \p pitch is the width in bytes
+ * of the 2D array pointed to by \p devPtr, including any padding added to the end
+ * of each row. This function performs fastest when the pitch is one that has
+ * been passed back by ::cudaMallocPitch().
+ *
+ * Note that this function is asynchronous with respect to the host unless
+ * \p devPtr refers to pinned host memory.
+ *
+ * \param devPtr - Pointer to 2D device memory
+ * \param pitch  - Pitch in bytes of 2D device memory (unused if \p height is 1)
+ * \param value  - Value to set for each byte of specified memory
+ * \param width  - Width of matrix set (columns in bytes)
+ * \param height - Height of matrix set (rows)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset3D, ::cudaMemsetAsync,
+ * ::cudaMemset2DAsync, ::cudaMemset3DAsync,
+ * ::cuMemsetD2D8,
+ * ::cuMemsetD2D16,
+ * ::cuMemsetD2D32
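+ *
+ * A minimal usage sketch pairing the pitch returned by ::cudaMallocPitch with
+ * this call (the image dimensions are illustrative assumptions):
+ * \code
+ * unsigned char *d_img = NULL;
+ * size_t pitch = 0;
+ * size_t widthBytes = 640, height = 480;                  // hypothetical image size
+ * cudaMallocPitch((void **)&d_img, &pitch, widthBytes, height);
+ * cudaMemset2D(d_img, pitch, 0xFF, widthBytes, height);   // set every payload byte
+ * cudaFree(d_img);
+ * \endcode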
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height);
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Initializes each element of a 3D array to the specified value \p value.
+ * The object to initialize is defined by \p pitchedDevPtr. The \p pitch field
+ * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed
+ * to by \p pitchedDevPtr, including any padding added to the end of each row.
+ * The \p xsize field specifies the logical width of each row in bytes, while
+ * the \p ysize field specifies the height of each 2D slice in rows.
+ * The \p pitch field of \p pitchedDevPtr is ignored when \p height and \p depth 
+ * are both equal to 1. 
+ *
+ * The extents of the initialized region are specified as a \p width in bytes,
+ * a \p height in rows, and a \p depth in slices.
+ *
+ * Extents with \p width greater than or equal to the \p xsize of
+ * \p pitchedDevPtr may perform significantly faster than extents narrower
+ * than the \p xsize. Secondarily, extents with \p height equal to the
+ * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is
+ * shorter than the \p ysize.
+ *
+ * This function performs fastest when the \p pitchedDevPtr has been allocated
+ * by ::cudaMalloc3D().
+ *
+ * Note that this function is asynchronous with respect to the host unless
+ * \p pitchedDevPtr refers to pinned host memory.
+ *
+ * \param pitchedDevPtr - Pointer to pitched device memory
+ * \param value         - Value to set for each byte of specified memory
+ * \param extent        - Size parameters for where to set device memory (\p width field in bytes)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D,
+ * ::cudaMemsetAsync, ::cudaMemset2DAsync, ::cudaMemset3DAsync,
+ * ::cudaMalloc3D, ::make_cudaPitchedPtr,
+ * ::make_cudaExtent
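+ *
+ * A minimal usage sketch (the extent values are illustrative assumptions):
+ * \code
+ * struct cudaExtent extent = make_cudaExtent(256, 64, 8);  // width in bytes, height, depth
+ * struct cudaPitchedPtr d_vol;
+ * cudaMalloc3D(&d_vol, extent);
+ * cudaMemset3D(d_vol, 0, extent);                          // zero the whole padded volume
+ * cudaFree(d_vol.ptr);
+ * \endcode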
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent);
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Fills the first \p count bytes of the memory area pointed to by \p devPtr
+ * with the constant byte value \p value.
+ *
+ * ::cudaMemsetAsync() is asynchronous with respect to the host, so
+ * the call may return before the memset is complete. The operation can optionally
+ * be associated to a stream by passing a non-zero \p stream argument.
+ * If \p stream is non-zero, the operation may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device memory and
+ * cannot be given local or shared pointers.
+ *
+ * \param devPtr - Pointer to device memory
+ * \param value  - Value to set for each byte of specified memory
+ * \param count  - Size in bytes to set
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
+ * ::cudaMemset2DAsync, ::cudaMemset3DAsync,
+ * ::cuMemsetD8Async,
+ * ::cuMemsetD16Async,
+ * ::cuMemsetD32Async
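+ *
+ * A minimal usage sketch (stream and buffer size are illustrative assumptions):
+ * \code
+ * float *d_data = NULL;
+ * cudaStream_t stream;
+ * cudaStreamCreate(&stream);
+ * cudaMalloc((void **)&d_data, 1024 * sizeof(float));
+ * cudaMemsetAsync(d_data, 0, 1024 * sizeof(float), stream);  // enqueued, returns immediately
+ * cudaStreamSynchronize(stream);                             // wait for the memset to finish
+ * \endcode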
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Sets each byte of a matrix (\p height rows of \p width bytes each) pointed
+ * to by \p devPtr to the specified value \p value. \p pitch is the width in bytes
+ * of the 2D array pointed to by \p devPtr, including any padding added to the end
+ * of each row. This function performs fastest when the pitch is one that has
+ * been passed back by ::cudaMallocPitch().
+ *
+ * ::cudaMemset2DAsync() is asynchronous with respect to the host, so
+ * the call may return before the memset is complete. The operation can optionally
+ * be associated to a stream by passing a non-zero \p stream argument.
+ * If \p stream is non-zero, the operation may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device memory and
+ * cannot be given local or shared pointers.
+ *
+ * \param devPtr - Pointer to 2D device memory
+ * \param pitch  - Pitch in bytes of 2D device memory (unused if \p height is 1)
+ * \param value  - Value to set for each byte of specified memory
+ * \param width  - Width of matrix set (columns in bytes)
+ * \param height - Height of matrix set (rows)
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
+ * ::cudaMemsetAsync, ::cudaMemset3DAsync,
+ * ::cuMemsetD2D8Async,
+ * ::cuMemsetD2D16Async,
+ * ::cuMemsetD2D32Async
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Initializes or sets device memory to a value
+ *
+ * Initializes each element of a 3D array to the specified value \p value.
+ * The object to initialize is defined by \p pitchedDevPtr. The \p pitch field
+ * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed
+ * to by \p pitchedDevPtr, including any padding added to the end of each row.
+ * The \p xsize field specifies the logical width of each row in bytes, while
+ * the \p ysize field specifies the height of each 2D slice in rows.
+ * The \p pitch field of \p pitchedDevPtr is ignored when \p height and \p depth 
+ * are both equal to 1. 
+ *
+ * The extents of the initialized region are specified as a \p width in bytes,
+ * a \p height in rows, and a \p depth in slices.
+ *
+ * Extents with \p width greater than or equal to the \p xsize of
+ * \p pitchedDevPtr may perform significantly faster than extents narrower
+ * than the \p xsize. Secondarily, extents with \p height equal to the
+ * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is
+ * shorter than the \p ysize.
+ *
+ * This function performs fastest when the \p pitchedDevPtr has been allocated
+ * by ::cudaMalloc3D().
+ *
+ * ::cudaMemset3DAsync() is asynchronous with respect to the host, so
+ * the call may return before the memset is complete. The operation can optionally
+ * be associated to a stream by passing a non-zero \p stream argument.
+ * If \p stream is non-zero, the operation may overlap with operations in other streams.
+ *
+ * The device version of this function only handles device memory and
+ * cannot be given local or shared pointers.
+ *
+ * \param pitchedDevPtr - Pointer to pitched device memory
+ * \param value         - Value to set for each byte of specified memory
+ * \param extent        - Size parameters for where to set device memory (\p width field in bytes)
+ * \param stream        - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_memset
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D,
+ * ::cudaMemsetAsync, ::cudaMemset2DAsync,
+ * ::cudaMalloc3D, ::make_cudaPitchedPtr,
+ * ::make_cudaExtent
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Finds the address associated with a CUDA symbol
+ *
+ * Returns in \p *devPtr the address of symbol \p symbol on the device.
+ * \p symbol is a variable that resides in global or constant memory space.
+ * If \p symbol cannot be found, or if \p symbol is not declared in the
+ * global or constant memory space, \p *devPtr is unchanged and the error
+ * ::cudaErrorInvalidSymbol is returned.
+ *
+ * \param devPtr - Return device pointer associated with symbol
+ * \param symbol - Device symbol address
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)",
+ * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)",
+ * ::cuModuleGetGlobal
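+ *
+ * A minimal usage sketch (the __device__ variable \c d_gain is a hypothetical
+ * symbol introduced only for this example; passing the symbol directly relies
+ * on the C++ overload referenced above):
+ * \code
+ * __device__ float d_gain;
+ *
+ * void *d_gainPtr = NULL;
+ * cudaGetSymbolAddress(&d_gainPtr, d_gain);
+ * cudaMemset(d_gainPtr, 0, sizeof(float));   // the returned pointer is ordinary device memory
+ * \endcode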
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, const void *symbol);
+
+/**
+ * \brief Finds the size of the object associated with a CUDA symbol
+ *
+ * Returns in \p *size the size of symbol \p symbol. \p symbol is a variable that
+ * resides in global or constant memory space. If \p symbol cannot be found, or
+ * if \p symbol is not declared in global or constant memory space, \p *size is
+ * unchanged and the error ::cudaErrorInvalidSymbol is returned.
+ *
+ * \param size   - Size of object associated with symbol
+ * \param symbol - Device symbol address
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)",
+ * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)",
+ * ::cuModuleGetGlobal
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, const void *symbol);
+
+/**
+ * \brief Prefetches memory to the specified destination device
+ *
+ * Prefetches memory to the specified destination device.  \p devPtr is the 
+ * base device pointer of the memory to be prefetched and \p dstDevice is the 
+ * destination device. \p count specifies the number of bytes to copy. \p stream
+ * is the stream in which the operation is enqueued. The memory range must refer
+ * to managed memory allocated via ::cudaMallocManaged or declared via __managed__ variables.
+ *
+ * Passing in cudaCpuDeviceId for \p dstDevice will prefetch the data to host memory. If
+ * \p dstDevice is a GPU, then the device attribute ::cudaDevAttrConcurrentManagedAccess
+ * must be non-zero. Additionally, \p stream must be associated with a device that has a
+ * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * The start address and end address of the memory range will be rounded down and rounded up
+ * respectively to be aligned to CPU page size before the prefetch operation is enqueued
+ * in the stream.
+ *
+ * If no physical memory has been allocated for this region, then this memory region
+ * will be populated and mapped on the destination device. If there's insufficient
+ * memory to prefetch the desired region, the Unified Memory driver may evict pages from other
+ * ::cudaMallocManaged allocations to host memory in order to make room. Device memory
+ * allocated using ::cudaMalloc or ::cudaMallocArray will not be evicted.
+ *
+ * By default, any mappings to the previous location of the migrated pages are removed and
+ * mappings for the new location are only setup on \p dstDevice. The exact behavior however
+ * also depends on the settings applied to this memory range via ::cudaMemAdvise as described
+ * below:
+ *
+ * If ::cudaMemAdviseSetReadMostly was set on any subset of this memory range,
+ * then that subset will create a read-only copy of the pages on \p dstDevice.
+ *
+ * If ::cudaMemAdviseSetPreferredLocation was called on any subset of this memory
+ * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the
+ * preferred location of any pages in the memory range.
+ *
+ * If ::cudaMemAdviseSetAccessedBy was called on any subset of this memory range,
+ * then mappings to those pages from all the appropriate processors are updated to
+ * refer to the new location if establishing such a mapping is possible. Otherwise,
+ * those mappings are cleared.
+ *
+ * Note that this API is not required for functionality and only serves to improve performance
+ * by allowing the application to migrate data to a suitable location before it is accessed.
+ * Memory accesses to this range are always coherent and are allowed even when the data is
+ * actively being migrated.
+ *
+ * Note that this function is asynchronous with respect to the host and all work
+ * on other devices.
+ *
+ * \param devPtr    - Pointer to be prefetched
+ * \param count     - Size in bytes
+ * \param dstDevice - Destination device to prefetch to
+ * \param stream    - Stream to enqueue prefetch operation
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
+ * ::cudaMemcpy3DPeerAsync, ::cudaMemAdvise,
+ * ::cuMemPrefetchAsync
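+ *
+ * A minimal usage sketch (device 0 and the allocation size are illustrative
+ * assumptions):
+ * \code
+ * float *data = NULL;
+ * size_t bytes = 1 << 20;
+ * cudaMallocManaged((void **)&data, bytes);
+ * // ... initialize `data` on the host ...
+ * cudaMemPrefetchAsync(data, bytes, 0, 0);               // migrate to GPU 0 in the default stream
+ * // ... launch kernels on GPU 0 that read `data` ...
+ * cudaMemPrefetchAsync(data, bytes, cudaCpuDeviceId, 0); // bring it back to host memory
+ * \endcode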
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Advise about the usage of a given memory range
+ *
+ * Advise the Unified Memory subsystem about the usage pattern for the memory range
+ * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory
+ * range will be rounded down and rounded up respectively to be aligned to CPU page size before the
+ * advice is applied. The memory range must refer to managed memory allocated via ::cudaMallocManaged
+ * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable
+ * memory provided it represents a valid, host-accessible region of memory and all additional constraints
+ * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable
+ * memory range results in an error being returned.
+ *
+ * The \p advice parameter can take the following values:
+ * - ::cudaMemAdviseSetReadMostly: This implies that the data is mostly going to be read
+ * from and only occasionally written to. Any read accesses from any processor to this region will create a
+ * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cudaMemPrefetchAsync
+ * is called on this region, it will create a read-only copy of the data on the destination processor.
+ * If any processor writes to this region, all copies of the corresponding page will be invalidated
+ * except for the one where the write occurred. The \p device argument is ignored for this advice.
+ * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ * that has a non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * Also, if a context is created on a device that does not have the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess set, then read-duplication will not occur until
+ * all such contexts are destroyed.
+ * If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ * have a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccess for a read-only
+ * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ * device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables, then setting this advice
+ * will not create a read-only copy when that device accesses this memory region.
+ *
+ * - ::cudaMemAdviseUnsetReadMostly: Undoes the effect of ::cudaMemAdviseSetReadMostly and also prevents the
+ * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ * copies of the data will be collapsed into a single copy. The location for the collapsed
+ * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ * copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ *
+ * - ::cudaMemAdviseSetPreferredLocation: This advice sets the preferred location for the
+ * data to be the memory belonging to \p device. Passing in cudaCpuDeviceId for \p device sets the
+ * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess. Setting the preferred location
+ * does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ * when a fault occurs on that memory region. If the data is already in its preferred location and the
+ * faulting processor can establish a mapping without requiring the data to be migrated, then
+ * data migration will be avoided. On the other hand, if the data is not in its preferred location
+ * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ * it. It is important to note that setting the preferred location does not prevent data prefetching
+ * done using ::cudaMemPrefetchAsync.
+ * Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice, unless read accesses from
+ * \p device will not result in a read-only copy being created on that device as outlined in description for
+ * the advice ::cudaMemAdviseSetReadMostly.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::cudaDevAttrPageableMemoryAccess. Additionally, if \p device has
+ * a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables,
+ * then this call has no effect. Note however that this behavior may change in the future.
+ *
+ * - ::cudaMemAdviseUnsetPreferredLocation: Undoes the effect of ::cudaMemAdviseSetPreferredLocation
+ * and changes the preferred location to none.
+ *
+ * - ::cudaMemAdviseSetAccessedBy: This advice implies that the data will be accessed by \p device.
+ * Passing in ::cudaCpuDeviceId for \p device will set the advice for the CPU. If \p device is a GPU, then
+ * the device attribute ::cudaDevAttrConcurrentManagedAccess must be non-zero.
+ * This advice does not cause data migration and has no impact on the location of the data per se. Instead,
+ * it causes the data to always be mapped in the specified processor's page tables, as long as the
+ * location of the data permits a mapping to be established. If the data gets migrated for any reason,
+ * the mappings are updated accordingly.
+ * This advice is recommended in scenarios where data locality is not important, but avoiding faults is.
+ * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
+ * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data
+ * over to the other GPUs is not as important because the accesses are infrequent and the overhead of
+ * migration may be too high. But preventing faults can still help improve performance, and so having
+ * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
+ * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the
+ * ::cudaMemAdviseSetAccessedBy flag set for this data will now have its mapping updated to point to the
+ * page in host memory.
+ * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice. Additionally, if the
+ * preferred location of this memory region or any subset of it is also \p device, then the policies
+ * associated with ::cudaMemAdviseSetPreferredLocation will override the policies of this advice.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::cudaDevAttrPageableMemoryAccess. Additionally, if \p device has
+ * a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables,
+ * then this call has no effect.
+ *
+ * - ::cudaMemAdviseUnsetAccessedBy: Undoes the effect of ::cudaMemAdviseSetAccessedBy. Any mappings to
+ * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::cudaDevAttrPageableMemoryAccess. Additionally, if \p device has
+ * a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables,
+ * then this call has no effect.
+ *
+ * \param devPtr - Pointer to memory to set the advice for
+ * \param count  - Size in bytes of the memory range
+ * \param advice - Advice to be applied for the specified memory range
+ * \param device - Device to apply the advice for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync,
+ * ::cudaMemcpy3DPeerAsync, ::cudaMemPrefetchAsync,
+ * ::cuMemAdvise
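+ *
+ * A minimal usage sketch (a managed lookup table read by several processors;
+ * the size is an illustrative assumption):
+ * \code
+ * float *lut = NULL;
+ * size_t bytes = 4096;
+ * cudaMallocManaged((void **)&lut, bytes);
+ * // Keep a read-only copy on every accessing processor; the device argument is ignored.
+ * cudaMemAdvise(lut, bytes, cudaMemAdviseSetReadMostly, 0);
+ * // Additionally keep the range mapped for the CPU to avoid faults on host reads.
+ * cudaMemAdvise(lut, bytes, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId);
+ * \endcode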
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device);
+
+/**
+ * \brief Query an attribute of a given memory range
+ *
+ * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via
+ * __managed__ variables.
+ *
+ * The \p attribute parameter can take the following values:
+ * - ::cudaMemRangeAttributeReadMostly: If this attribute is specified, \p data will be interpreted
+ * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given
+ * memory range have read-duplication enabled, or 0 otherwise.
+ * - ::cudaMemRangeAttributePreferredLocation: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device
+ * id if all pages in the memory range have that GPU as their preferred location, or it will be cudaCpuDeviceId
+ * if all pages in the memory range have the CPU as their preferred location, or it will be cudaInvalidDeviceId
+ * if either all the pages don't have the same preferred location or some of the pages don't have a
+ * preferred location at all. Note that the actual location of the pages in the memory range at the time of
+ * the query may be different from the preferred location.
+ * - ::cudaMemRangeAttributeAccessedBy: If this attribute is specified, \p data will be interpreted
+ * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned
+ * will be a list of device ids that had ::cudaMemAdviseSetAccessedBy set for that entire memory range.
+ * If any device does not have that advice set for the entire memory range, that device will not be included.
+ * If \p data is larger than the number of devices that have that advice set for that memory range,
+ * cudaInvalidDeviceId will be returned in all the extra space provided. For example, if \p dataSize is 12
+ * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be
+ * { 0, cudaInvalidDeviceId, cudaInvalidDeviceId }. If \p data is smaller than the number of devices that have
+ * that advice set, then only as many devices will be returned as can fit in the array. There is no
+ * guarantee on which specific devices will be returned, however.
+ * - ::cudaMemRangeAttributeLastPrefetchLocation: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location
+ * to which all pages in the memory range were prefetched explicitly via ::cudaMemPrefetchAsync. This will either be
+ * a GPU id or cudaCpuDeviceId depending on whether the last location for prefetch was a GPU or the CPU
+ * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not
+ * prefetched to the same location, cudaInvalidDeviceId will be returned. Note that this simply returns the
+ * last location that the application requested to prefetch the memory range to. It gives no indication as to
+ * whether the prefetch operation to that location has completed or even begun.
+ *
+ * \param data      - A pointer to a memory location where the result
+ *                    of the attribute query will be written to.
+ * \param dataSize  - Size in bytes of the memory pointed to by \p data
+ * \param attribute - The attribute to query
+ * \param devPtr    - Start of the range to query
+ * \param count     - Size of the range to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemRangeGetAttributes, ::cudaMemPrefetchAsync,
+ * ::cudaMemAdvise,
+ * ::cuMemRangeGetAttribute
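+ *
+ * A minimal usage sketch querying whether read-duplication is enabled on a
+ * managed range (the allocation size is an illustrative assumption):
+ * \code
+ * float *data = NULL;
+ * size_t bytes = 4096;
+ * cudaMallocManaged((void **)&data, bytes);
+ * cudaMemAdvise(data, bytes, cudaMemAdviseSetReadMostly, 0);
+ *
+ * int readMostly = 0;   // 32-bit result, so dataSize must be 4
+ * cudaMemRangeGetAttribute(&readMostly, sizeof(readMostly),
+ *                          cudaMemRangeAttributeReadMostly, data, bytes);
+ * \endcode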
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count);
+
+/**
+ * \brief Query attributes of a given memory range.
+ *
+ * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via
+ * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes
+ * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries.
+ * The results of the query will be stored in \p data.
+ *
+ * The list of supported attributes are given below. Please refer to ::cudaMemRangeGetAttribute for
+ * attribute descriptions and restrictions.
+ *
+ * - ::cudaMemRangeAttributeReadMostly
+ * - ::cudaMemRangeAttributePreferredLocation
+ * - ::cudaMemRangeAttributeAccessedBy
+ * - ::cudaMemRangeAttributeLastPrefetchLocation
+ *
+ * \param data          - A two-dimensional array containing pointers to memory
+ *                        locations where the result of each attribute query will be written to.
+ * \param dataSizes     - Array containing the sizes of each result
+ * \param attributes    - An array of attributes to query
+ *                        (numAttributes and the number of attributes in this array should match)
+ * \param numAttributes - Number of attributes to query
+ * \param devPtr        - Start of the range to query
+ * \param count         - Size of the range to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemRangeGetAttribute, ::cudaMemAdvise,
+ * ::cudaMemPrefetchAsync,
+ * ::cuMemRangeGetAttributes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count);
+
+/** @} */ /* END CUDART_MEMORY */
+
+/**
+ * \defgroup CUDART_MEMORY_DEPRECATED Memory Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated memory management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated memory management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src to the
+ * CUDA array \p dst starting at \p hOffset rows and \p wOffset bytes from
+ * the upper left corner, where \p kind specifies the direction
+ * of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param dst     - Destination memory address
+ * \param wOffset - Destination starting X offset (columns in bytes)
+ * \param hOffset - Destination starting Y offset (rows)
+ * \param src     - Source memory address
+ * \param count   - Size in bytes to copy
+ * \param kind    - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyHtoA,
+ * ::cuMemcpyDtoA
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the CUDA array \p src starting at \p hOffset rows
+ * and \p wOffset bytes from the upper left corner to the memory area pointed to
+ * by \p dst, where \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param dst     - Destination memory address
+ * \param src     - Source memory address
+ * \param wOffset - Source starting X offset (columns in bytes)
+ * \param hOffset - Source starting Y offset (rows)
+ * \param count   - Size in bytes to copy
+ * \param kind    - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_sync
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAtoH,
+ * ::cuMemcpyAtoD
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind);
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the CUDA array \p src starting at \p hOffsetSrc
+ * rows and \p wOffsetSrc bytes from the upper left corner to the CUDA array
+ * \p dst starting at \p hOffsetDst rows and \p wOffsetDst bytes from the upper
+ * left corner, where \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param dst        - Destination memory address
+ * \param wOffsetDst - Destination starting X offset (columns in bytes)
+ * \param hOffsetDst - Destination starting Y offset (rows)
+ * \param src        - Source memory address
+ * \param wOffsetSrc - Source starting X offset (columns in bytes)
+ * \param hOffsetSrc - Source starting Y offset (rows)
+ * \param count      - Size in bytes to copy
+ * \param kind       - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAtoA
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src to the
+ * CUDA array \p dst starting at \p hOffset rows and \p wOffset bytes from
+ * the upper left corner, where \p kind specifies the
+ * direction of the copy, and must be one of ::cudaMemcpyHostToHost,
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyToArrayAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If \p
+ * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
+ * is non-zero, the copy may overlap with operations in other streams.
+ *
+ * \param dst     - Destination memory address
+ * \param wOffset - Destination starting X offset (columns in bytes)
+ * \param hOffset - Destination starting Y offset (rows)
+ * \param src     - Source memory address
+ * \param count   - Size in bytes to copy
+ * \param kind    - Type of transfer
+ * \param stream  - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyHtoAAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Copies data between host and device
+ *
+ * \deprecated
+ *
+ * Copies \p count bytes from the CUDA array \p src starting at \p hOffset rows
+ * and \p wOffset bytes from the upper left corner to the memory area pointed to
+ * by \p dst, where \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * ::cudaMemcpyFromArrayAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If \p
+ * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream
+ * is non-zero, the copy may overlap with operations in other streams.
+ *
+ * \param dst     - Destination memory address
+ * \param src     - Source memory address
+ * \param wOffset - Source starting X offset (columns in bytes)
+ * \param hOffset - Source starting Y offset (rows)
+ * \param count   - Size in bytes to copy
+ * \param kind    - Type of transfer
+ * \param stream  - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidMemcpyDirection
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync,
+ * ::cuMemcpyAtoHAsync,
+ * ::cuMemcpy2DAsync
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+
+/** @} */ /* END CUDART_MEMORY_DEPRECATED */
+
+/**
+ * \defgroup CUDART_MEMORY_POOLS Stream Ordered Memory Allocator 
+ *
+ * ___MANBRIEF___ Functions for performing allocation and free operations in stream order.
+ *                Functions for controlling the behavior of the underlying allocator.
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ * 
+ *
+ * @{
+ *
+ * \section CUDART_MEMORY_POOLS_overview overview
+ *
+ * The asynchronous allocator allows the user to allocate and free in stream order.
+ * All asynchronous accesses of the allocation must happen between
+ * the stream executions of the allocation and the free. If the memory is accessed
+ * outside of the promised stream order, the access is a use-before-allocation or
+ * use-after-free error and the behavior is undefined.
+ *
+ * The allocator is free to reallocate the memory as long as it can guarantee
+ * that compliant memory accesses will not overlap temporally.
+ * The allocator may refer to internal stream ordering as well as inter-stream dependencies
+ * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee.
+ * The allocator may also insert inter-stream dependencies to establish the temporal guarantee.
+ *
+ * \section CUDART_MEMORY_POOLS_support Supported Platforms
+ *
+ * Whether or not a device supports the integrated stream ordered memory allocator
+ * may be queried by calling ::cudaDeviceGetAttribute() with the device attribute
+ * ::cudaDevAttrMemoryPoolsSupported.
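+ *
+ * A minimal sketch of the intended usage pattern (stream and allocation size
+ * are illustrative assumptions):
+ * \code
+ * cudaStream_t stream;
+ * cudaStreamCreate(&stream);
+ *
+ * void *d_tmp = NULL;
+ * cudaMallocAsync(&d_tmp, 1 << 20, stream);   // allocation is ordered in `stream`
+ * // ... work submitted to `stream` here may use d_tmp ...
+ * cudaFreeAsync(d_tmp, stream);               // free is ordered after that work
+ * cudaStreamSynchronize(stream);
+ * \endcode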
+ */
+
+/**
+ * \brief Allocates memory with stream ordered semantics
+ *
+ * Inserts an allocation operation into \p hStream.
+ * A pointer to the allocated memory is returned immediately in \p *devPtr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the memory pool associated with the stream's device.
+ *
+ * \note The default memory pool of a device contains device memory from that device.
+ * \note Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs.
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] devPtr  - Returned device pointer
+ * \param[in] size     - Number of bytes to allocate
+ * \param[in] hStream  - The stream establishing the stream ordering contract and the memory pool to allocate from
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorOutOfMemory,
+ * \notefnerr
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cuMemAllocAsync,
+ * \ref ::cudaMallocAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream)  "cudaMallocAsync (C++ API)", 
+ * ::cudaMallocFromPoolAsync, ::cudaFreeAsync, ::cudaDeviceSetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolSetAccess, ::cudaMemPoolSetAttribute, ::cudaMemPoolGetAttribute
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocAsync(void **devPtr, size_t size, cudaStream_t hStream);
+
+/**
+ * \brief Frees memory with stream ordered semantics
+ *
+ * Inserts a free operation into \p hStream.
+ * The allocation must not be accessed after stream execution reaches the free.
+ * After this API returns, accessing the memory from any subsequent work launched on the GPU
+ * or querying its pointer attributes results in undefined behavior.
+ *
+ * \note During stream capture, this function results in the creation of a free node and
+ *       must therefore be passed the address of a graph allocation.
+ *
+ * \param devPtr  - Memory to free
+ * \param hStream - The stream establishing the stream ordering promise
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ * \notefnerr
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cuMemFreeAsync, ::cudaMallocAsync
+ */
+extern __host__ cudaError_t CUDARTAPI cudaFreeAsync(void *devPtr, cudaStream_t hStream);
+
+/**
+ * \brief Tries to release memory back to the OS
+ *
+ * Releases memory back to the OS until the pool contains fewer than minBytesToKeep
+ * reserved bytes, or there is no more memory that the allocator can safely release.
+ * The allocator cannot release OS allocations that back outstanding asynchronous allocations.
+ * The OS allocations may happen at different granularity from the user allocations.
+ *
+ * \note: Allocations that have not been freed count as outstanding.
+ * \note: Allocations that have been asynchronously freed but whose completion has
+ *        not been observed on the host (e.g., by a synchronize) can count as outstanding.
+ *
+ * \param[in] memPool        - The memory pool to trim
+ * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved,
+ * the TrimTo operation is a no-op.  Otherwise the pool will be guaranteed to have
+ * at least minBytesToKeep bytes reserved after the operation.
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_callback
+ *
+ * \sa ::cuMemPoolTrimTo, ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolTrimTo(cudaMemPool_t memPool, size_t minBytesToKeep);
+
+/**
+ * \brief Sets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::cudaMemPoolAttrReleaseThreshold: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::cudaMemPoolReuseFollowEventDependencies: (value type = int)
+ *                    Allow ::cudaMallocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    Cuda events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::cudaMemPoolReuseAllowOpportunistic: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::cudaMemPoolReuseAllowInternalDependencies: (value type = int)
+ *                    Allow ::cudaMallocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cudaFreeAsync (default enabled).
+ * - ::cudaMemPoolAttrReservedMemHigh: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of backing memory that was
+ *                    allocated for the memory pool. It is illegal to set this attribute to a non-zero value.
+ * - ::cudaMemPoolAttrUsedMemHigh: (value type = cuuint64_t)
+ *                    Reset the high watermark that tracks the amount of used memory that was
+ *                    allocated for the memory pool. It is illegal to set this attribute to a non-zero value.
+ *
+ * \param[in] memPool - The memory pool to modify
+ * \param[in] attr    - The attribute to modify
+ * \param[in] value   - Pointer to the value to assign
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_callback
+ *
+ * \sa ::cuMemPoolSetAttribute, ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate
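+ *
+ * A minimal sketch raising the release threshold of the default pool of
+ * device 0 (the device index is an illustrative assumption):
+ * \code
+ * cudaMemPool_t pool;
+ * cudaDeviceGetDefaultMemPool(&pool, 0);
+ * unsigned long long threshold = ~0ULL;   // 64-bit value; keep freed memory cached indefinitely
+ * cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
+ * \endcode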
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolSetAttribute(cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value );
+
+/**
+ * \brief Gets attributes of a memory pool
+ *
+ * Supported attributes are:
+ * - ::cudaMemPoolAttrReleaseThreshold: (value type = cuuint64_t)
+ *                    Amount of reserved memory in bytes to hold onto before trying
+ *                    to release memory back to the OS. When more than the release
+ *                    threshold bytes of memory are held by the memory pool, the
+ *                    allocator will try to release memory back to the OS on the
+ *                    next call to stream, event or context synchronize. (default 0)
+ * - ::cudaMemPoolReuseFollowEventDependencies: (value type = int)
+ *                    Allow ::cudaMallocAsync to use memory asynchronously freed
+ *                    in another stream as long as a stream ordering dependency
+ *                    of the allocating stream on the free action exists.
+ *                    Cuda events and null stream interactions can create the required
+ *                    stream ordered dependencies. (default enabled)
+ * - ::cudaMemPoolReuseAllowOpportunistic: (value type = int)
+ *                    Allow reuse of already completed frees when there is no dependency
+ *                    between the free and allocation. (default enabled)
+ * - ::cudaMemPoolReuseAllowInternalDependencies: (value type = int)
+ *                    Allow ::cudaMallocAsync to insert new stream dependencies
+ *                    in order to establish the stream ordering required to reuse
+ *                    a piece of memory released by ::cudaFreeAsync (default enabled).
+ * - ::cudaMemPoolAttrReservedMemCurrent: (value type = cuuint64_t)
+ *                    Amount of backing memory currently allocated for the mempool.
+ * - ::cudaMemPoolAttrReservedMemHigh: (value type = cuuint64_t)
+ *                    High watermark of backing memory allocated for the mempool since
+ *                    the last time it was reset.
+ * - ::cudaMemPoolAttrUsedMemCurrent: (value type = cuuint64_t)
+ *                    Amount of memory from the pool that is currently in use by the application.
+ * - ::cudaMemPoolAttrUsedMemHigh: (value type = cuuint64_t)
+ *                    High watermark of the amount of memory from the pool that was in use by the
+ *                    application since the last time it was reset.
+ *
+ * \param[in] memPool - The memory pool to get attributes of
+ * \param[in] attr    - The attribute to get
+ * \param[out] value  - Retrieved value
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_callback
+ *
+ * \sa ::cuMemPoolGetAttribute, ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolGetAttribute(cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value );
+
+/**
+ * \brief Controls visibility of pools between devices
+ *
+ * \param[in] memPool  - The pool being modified
+ * \param[in] descList - Array of access descriptors. Each descriptor instructs the access to enable for a single GPU
+ * \param[in] count    - Number of descriptors in the \p descList array.
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa ::cuMemPoolSetAccess, ::cudaMemPoolGetAccess, ::cudaMallocAsync, cudaFreeAsync
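+ *
+ * A minimal sketch granting a peer device read/write access to device 0's
+ * current pool (the device indices are illustrative assumptions):
+ * \code
+ * cudaMemPool_t pool;
+ * cudaDeviceGetMemPool(&pool, 0);
+ *
+ * struct cudaMemAccessDesc desc = {};
+ * desc.location.type = cudaMemLocationTypeDevice;
+ * desc.location.id   = 1;                       // hypothetical peer GPU
+ * desc.flags         = cudaMemAccessFlagsProtReadWrite;
+ * cudaMemPoolSetAccess(pool, &desc, 1);
+ * \endcode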
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolSetAccess(cudaMemPool_t memPool, const struct cudaMemAccessDesc *descList, size_t count);
+
+/**
+ * \brief Returns the accessibility of a pool from a device
+ *
+ * Returns the accessibility of the pool's memory from the specified location.
+ *
+ * \param[out] flags   - the accessibility of the pool from the specified location
+ * \param[in] memPool  - the pool being queried
+ * \param[in] location - the location accessing the pool
+ *
+ * \sa ::cuMemPoolGetAccess, ::cudaMemPoolSetAccess
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolGetAccess(enum cudaMemAccessFlags *flags, cudaMemPool_t memPool, struct cudaMemLocation *location);
+
+/**
+ * \brief Creates a memory pool
+ *
+ * Creates a CUDA memory pool and returns the handle in \p memPool. The \p poolProps determines
+ * the properties of the pool such as the backing device and IPC capabilities.
+ *
+ * By default, the pool's memory will be accessible from the device it is allocated on.
+ *
+ * \note Specifying cudaMemHandleTypeNone creates a memory pool that will not support IPC.
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported
+ *
+ * \sa ::cuMemPoolCreate, ::cudaDeviceSetMemPool, ::cudaMallocFromPoolAsync, ::cudaMemPoolExportToShareableHandle, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool
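+ *
+ * A minimal sketch creating a non-IPC pool backed by device 0 (the device
+ * index is an illustrative assumption):
+ * \code
+ * struct cudaMemPoolProps props = {};
+ * props.allocType     = cudaMemAllocationTypePinned;
+ * props.handleTypes   = cudaMemHandleTypeNone;      // no IPC support
+ * props.location.type = cudaMemLocationTypeDevice;
+ * props.location.id   = 0;
+ *
+ * cudaMemPool_t pool;
+ * cudaMemPoolCreate(&pool, &props);
+ * // ... allocate with cudaMallocFromPoolAsync and free with cudaFreeAsync ...
+ * cudaMemPoolDestroy(pool);
+ * \endcode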
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolCreate(cudaMemPool_t *memPool, const struct cudaMemPoolProps *poolProps);
+
+/**
+ * \brief Destroys the specified memory pool 
+ *
+ * If any pointers obtained from this pool haven't been freed or
+ * the pool has free operations that haven't completed
+ * when ::cudaMemPoolDestroy is invoked, the function will return immediately and the
+ * resources associated with the pool will be released automatically
+ * once there are no more outstanding allocations.
+ *
+ * Destroying a device's current mempool sets that device's default mempool as
+ * its current mempool.
+ *
+ * \note A device's default memory pool cannot be destroyed.
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa ::cuMemPoolDestroy, ::cudaFreeAsync, ::cudaDeviceSetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolDestroy(cudaMemPool_t memPool);
+
+/**
+ * \brief Allocates memory from a specified pool with stream ordered semantics.
+ *
+ * Inserts an allocation operation into \p stream.
+ * A pointer to the allocated memory is returned immediately in \p *ptr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the specified memory pool.
+ *
+ * \note
+ *    -  The specified memory pool may be from a device different from that of the specified \p stream.
+ *
+ *    -  Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ *       Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ *       operation completes before work submitted in a separate stream runs.
+ *
+ * \note During stream capture, this function results in the creation of an allocation node.  In this case,
+ *       the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ *       are used to set the node's creation parameters.
+ *
+ * \param[out] ptr     - Returned device pointer
+ * \param[in] size     - Number of bytes to allocate
+ * \param[in] memPool  - The pool to allocate from
+ * \param[in] stream   - The stream establishing the stream ordering semantic
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemAllocFromPoolAsync,
+ * \ref ::cudaMallocAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream)  "cudaMallocAsync (C++ API)", 
+ * ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaMemPoolCreate, ::cudaMemPoolSetAccess, ::cudaMemPoolSetAttribute
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMallocFromPoolAsync(void **ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream);
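+
+/*
+ * A minimal stream ordered sketch (assuming pool was created as above and stream is a valid
+ * stream on the pool's device): allocation, use, and free are all ordered by the same stream.
+ * \code
+    void *ptr = NULL;
+    cudaMallocFromPoolAsync(&ptr, 1 << 20, pool, stream);    // 1 MiB from the pool
+    cudaMemsetAsync(ptr, 0, 1 << 20, stream);                // stream ordered use of the allocation
+    cudaFreeAsync(ptr, stream);                              // stream ordered free back to the pool
+    cudaStreamSynchronize(stream);
+ * \endcode
+ */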
+
+/**
+ * \brief Exports a memory pool to the requested handle type.
+ *
+ * Given an IPC capable mempool, create an OS handle to share the pool with another process.
+ * A recipient process can convert the shareable handle into a mempool with ::cudaMemPoolImportFromShareableHandle.
+ * Individual pointers can then be shared with the ::cudaMemPoolExportPointer and ::cudaMemPoolImportPointer APIs.
+ * The implementation of what the shareable handle is and how it can be transferred is defined by the requested
+ * handle type.
+ *
+ * \note To create an IPC capable mempool, create a mempool with a ::cudaMemAllocationHandleType other than ::cudaMemHandleTypeNone.
+ *
+ * \param[out] shareableHandle - Pointer to the location in which to store the requested handle
+ * \param[in] memPool          - Pool to export
+ * \param[in] handleType       - The type of handle to create
+ * \param[in] flags            - Must be 0
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemPoolExportToShareableHandle, ::cudaMemPoolImportFromShareableHandle, ::cudaMemPoolExportPointer, ::cudaMemPoolImportPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportToShareableHandle(
+    void                            *shareableHandle,
+    cudaMemPool_t                    memPool,
+    enum cudaMemAllocationHandleType handleType,
+    unsigned int                     flags);
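+
+/*
+ * A minimal exporter-side sketch (Linux assumption: the pool was created with
+ * ::cudaMemHandleTypePosixFileDescriptor, so the shareable handle is a file descriptor;
+ * transferring that descriptor to the other process, e.g. over a UNIX domain socket, is omitted).
+ * \code
+    int fd = -1;
+    cudaMemPoolExportToShareableHandle(&fd, pool, cudaMemHandleTypePosixFileDescriptor, 0);
+    // ... send fd to the importing process via OS specific IPC ...
+ * \endcode
+ */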
+
+/**
+ * \brief Imports a memory pool from a shared handle.
+ *
+ * Specific allocations can be imported from the imported pool with ::cudaMemPoolImportPointer.
+ *
+ * \note Imported memory pools do not support creating new allocations.
+ *       As such imported memory pools may not be used in ::cudaDeviceSetMemPool
+ *       or ::cudaMallocFromPoolAsync calls.
+ *
+ * \param[out] memPool        - Returned memory pool
+ * \param[in] shareableHandle - OS handle of the pool to open
+ * \param[in] handleType      - The type of handle being imported
+ * \param[in] flags           - Must be 0
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemPoolImportFromShareableHandle, ::cudaMemPoolExportToShareableHandle, ::cudaMemPoolExportPointer, ::cudaMemPoolImportPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolImportFromShareableHandle(
+    cudaMemPool_t                   *memPool,
+    void                            *shareableHandle,
+    enum cudaMemAllocationHandleType handleType,
+    unsigned int                     flags);
+
+/**
+ * \brief Export data to share a memory pool allocation between processes.
+ *
+ * Constructs \p exportData for sharing a specific allocation from an already shared memory pool.
+ * The recipient process can import the allocation with the ::cudaMemPoolImportPointer API.
+ * The data is not a handle and may be shared through any IPC mechanism.
+ *
+ * \param[out] exportData - Returned export data
+ * \param[in] ptr         - Pointer to the memory being exported
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemPoolExportPointer, ::cudaMemPoolExportToShareableHandle, ::cudaMemPoolImportFromShareableHandle, ::cudaMemPoolImportPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportPointer(struct cudaMemPoolPtrExportData *exportData, void *ptr);
+
+/**
+ * \brief Import a memory pool allocation from another process.
+ *
+ * Returns in \p *ptr a pointer to the imported memory.
+ * The imported memory must not be accessed before the allocation operation completes
+ * in the exporting process. The imported memory must be freed from all importing processes before
+ * being freed in the exporting process. The pointer may be freed with ::cudaFree
+ * or ::cudaFreeAsync.  If ::cudaFreeAsync is used, the free must be completed
+ * on the importing process before the free operation on the exporting process.
+ *
+ * \note The ::cudaFreeAsync api may be used in the exporting process before
+ *       the ::cudaFreeAsync operation completes in its stream as long as the
+ *       ::cudaFreeAsync in the exporting process specifies a stream with
+ *       a stream dependency on the importing process's ::cudaFreeAsync.
+ *
+ * \param[out] ptr        - Pointer to imported memory
+ * \param[in] memPool     - Pool from which to import
+ * \param[in] exportData  - Data specifying the memory to import
+ *
+ * \returns
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorOutOfMemory
+ *
+ * \sa ::cuMemPoolImportPointer, ::cudaMemPoolExportToShareableHandle, ::cudaMemPoolImportFromShareableHandle, ::cudaMemPoolExportPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemPoolImportPointer(void **ptr, cudaMemPool_t memPool, struct cudaMemPoolPtrExportData *exportData);
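+
+/*
+ * A minimal importer-side sketch (assuming importedPool was already obtained with
+ * ::cudaMemPoolImportFromShareableHandle, and the exporting process sent a
+ * ::cudaMemPoolPtrExportData blob, shareData, over some IPC channel):
+ * \code
+    void *ptr = NULL;
+    cudaMemPoolImportPointer(&ptr, importedPool, &shareData);
+    // ... access the imported memory from this process ...
+    cudaFree(ptr);                                           // free here before the exporter frees
+ * \endcode
+ */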
+
+/** @} */ /* END CUDART_MEMORY_POOLS */
+
+/**
+ * \defgroup CUDART_UNIFIED Unified Addressing
+ *
+ * ___MANBRIEF___ unified addressing functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the unified addressing functions of the CUDA 
+ * runtime application programming interface.
+ *
+ * @{
+ *
+ * \section CUDART_UNIFIED_overview Overview
+ *
+ * CUDA devices can share a unified address space with the host.  
+ * For these devices there is no distinction between a device
+ * pointer and a host pointer -- the same pointer value may be 
+ * used to access memory from the host program and from a kernel 
+ * running on the device (with exceptions enumerated below).
+ *
+ * \section CUDART_UNIFIED_support Supported Platforms
+ * 
+ * Whether or not a device supports unified addressing may be 
+ * queried by calling ::cudaGetDeviceProperties() with the device 
+ * property ::cudaDeviceProp::unifiedAddressing.
+ *
+ * Unified addressing is automatically enabled in 64-bit processes.
+ *
+ * \section CUDART_UNIFIED_lookup Looking Up Information from Pointer Values
+ *
+ * It is possible to look up information about the memory which backs a 
+ * pointer value.  For instance, one may want to know if a pointer points
+ * to host or device memory.  As another example, in the case of device 
+ * memory, one may want to know on which CUDA device the memory 
+ * resides.  These properties may be queried using the function 
+ * ::cudaPointerGetAttributes().
+ *
+ * Since pointers are unique, it is not necessary to specify information
+ * about the pointers specified to ::cudaMemcpy() and other copy functions.  
+ * The copy direction ::cudaMemcpyDefault may be used to specify that the 
+ * CUDA runtime should infer the location of the pointer from its value.
+ *
+ * \section CUDART_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory
+ *
+ * All host memory allocated through all devices using ::cudaMallocHost() and
+ * ::cudaHostAlloc() is always directly accessible from all devices that 
+ * support unified addressing.  This is the case regardless of whether or 
+ * not the flags ::cudaHostAllocPortable and ::cudaHostAllocMapped are 
+ * specified.
+ *
+ * The pointer value through which allocated host memory may be accessed 
+ * in kernels on all devices that support unified addressing is the same 
+ * as the pointer value through which that memory is accessed on the host.
+ * It is not necessary to call ::cudaHostGetDevicePointer() to get the device 
+ * pointer for these allocations.  
+ *
+ * Note that this is not the case for memory allocated using the flag
+ * ::cudaHostAllocWriteCombined, as discussed below.
+ *
+ * \section CUDART_UNIFIED_autopeerregister Direct Access of Peer Memory
+ *
+ * Upon enabling direct access from a device that supports unified addressing 
+ * to another peer device that supports unified addressing using 
+ * ::cudaDeviceEnablePeerAccess() all memory allocated in the peer device using 
+ * ::cudaMalloc() and ::cudaMallocPitch() will immediately be accessible 
+ * by the current device.  The device pointer value through 
+ * which any peer's memory may be accessed in the current device 
+ * is the same pointer value through which that memory may be 
+ * accessed from the peer device. 
+ *
+ * \section CUDART_UNIFIED_exceptions Exceptions, Disjoint Addressing
+ * 
+ * Not all memory may be accessed on devices through the same pointer
+ * value through which it is accessed on the host.  These exceptions
+ * are host memory registered using ::cudaHostRegister() and host memory
+ * allocated using the flag ::cudaHostAllocWriteCombined.  For these 
+ * exceptions, there exists a distinct host and device address for the
+ * memory.  The device address is guaranteed to not overlap any valid host
+ * pointer range and is guaranteed to have the same value across all devices
+ * that support unified addressing.  
+ * 
+ * This device address may be queried using ::cudaHostGetDevicePointer() 
+ * when a device using unified addressing is current.  Either the host 
+ * or the unified device pointer value may be used to refer to this memory 
+ * in ::cudaMemcpy() and similar functions using the ::cudaMemcpyDefault 
+ * memory direction.
+ *
+ */
+
+/**
+ * \brief Returns attributes about a specified pointer
+ *
+ * Returns in \p *attributes the attributes of the pointer \p ptr.
+ * If the pointer was not allocated in, mapped by, or registered with a context
+ * supporting unified addressing, ::cudaErrorInvalidValue is returned.
+ *
+ * \note From CUDA 11.0 onward, passing a host pointer will return ::cudaMemoryTypeUnregistered
+ * in ::cudaPointerAttributes::type and the call will return ::cudaSuccess.
+ *
+ * The ::cudaPointerAttributes structure is defined as:
+ * \code
+    struct cudaPointerAttributes {
+        enum cudaMemoryType type;
+        int device;
+        void *devicePointer;
+        void *hostPointer;
+    }
+    \endcode
+ * In this structure, the individual fields mean
+ *
+ * - \ref ::cudaPointerAttributes::type identifies the type of memory. It can be
+ *    ::cudaMemoryTypeUnregistered for unregistered host memory,
+ *    ::cudaMemoryTypeHost for registered host memory, ::cudaMemoryTypeDevice for device
+ *    memory, or ::cudaMemoryTypeManaged for managed memory.
+ *
+ * - \ref ::cudaPointerAttributes::device "device" is the device against which
+ *   \p ptr was allocated.  If \p ptr has memory type ::cudaMemoryTypeDevice
+ *   then this identifies the device on which the memory referred to by \p ptr
+ *   physically resides.  If \p ptr has memory type ::cudaMemoryTypeHost then this
+ *   identifies the device which was current when the allocation was made
+ *   (and if that device is deinitialized then this allocation will vanish
+ *   with that device's state).
+ *
+ * - \ref ::cudaPointerAttributes::devicePointer "devicePointer" is
+ *   the device pointer alias through which the memory referred to by \p ptr
+ *   may be accessed on the current device.
+ *   If the memory referred to by \p ptr cannot be accessed directly by the 
+ *   current device then this is NULL.  
+ *
+ * - \ref ::cudaPointerAttributes::hostPointer "hostPointer" is
+ *   the host pointer alias through which the memory referred to by \p ptr
+ *   may be accessed on the host.
+ *   If the memory referred to by \p ptr cannot be accessed directly by the
+ *   host then this is NULL.
+ *
+ * \param attributes - Attributes for the specified pointer
+ * \param ptr        - Pointer to get attributes for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice,
+ * ::cudaChooseDevice,
+ * ::cuPointerGetAttributes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(struct cudaPointerAttributes *attributes, const void *ptr);
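+
+/*
+ * A minimal sketch (assuming p is any pointer value known to the process): classify the
+ * memory behind a pointer.  With unified addressing the same query works for device,
+ * registered host, managed, and ordinary host pointers.
+ * \code
+    struct cudaPointerAttributes attr;
+    if (cudaPointerGetAttributes(&attr, p) == cudaSuccess) {
+        switch (attr.type) {
+            case cudaMemoryTypeDevice:       break;          // device memory on attr.device
+            case cudaMemoryTypeHost:         break;          // registered / pinned host memory
+            case cudaMemoryTypeManaged:      break;          // managed (unified) memory
+            case cudaMemoryTypeUnregistered: break;          // ordinary host memory (CUDA 11.0+)
+        }
+    }
+ * \endcode
+ */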
+
+/** @} */ /* END CUDART_UNIFIED */
+
+/**
+ * \defgroup CUDART_PEER Peer Device Memory Access
+ *
+ * ___MANBRIEF___ peer device memory access functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the peer device memory access functions of the CUDA runtime
+ * application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Queries if a device may directly access a peer device's memory.
+ *
+ * Returns in \p *canAccessPeer a value of 1 if device \p device is capable of
+ * directly accessing memory from \p peerDevice and 0 otherwise.  If direct
+ * access of \p peerDevice from \p device is possible, then access may be
+ * enabled by calling ::cudaDeviceEnablePeerAccess().
+ *
+ * \param canAccessPeer - Returned access capability
+ * \param device        - Device from which allocations on \p peerDevice are to
+ *                        be directly accessed.
+ * \param peerDevice    - Device on which the allocations to be directly accessed 
+ *                        by \p device reside.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceEnablePeerAccess,
+ * ::cudaDeviceDisablePeerAccess,
+ * ::cuDeviceCanAccessPeer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice);
+
+/**
+ * \brief Enables direct access to memory allocations on a peer device.
+ *
+ * On success, all allocations from \p peerDevice will immediately be accessible by
+ * the current device.  They will remain accessible until access is explicitly
+ * disabled using ::cudaDeviceDisablePeerAccess() or either device is reset using
+ * ::cudaDeviceReset().
+ *
+ * Note that access granted by this call is unidirectional and that in order to access
+ * memory on the current device from \p peerDevice, a separate symmetric call 
+ * to ::cudaDeviceEnablePeerAccess() is required.
+ *
+ * Note that there are both device-wide and system-wide limitations per system
+ * configuration, as noted in the CUDA Programming Guide under the section
+ * "Peer-to-Peer Memory Access".
+ *
+ * Returns ::cudaErrorInvalidDevice if ::cudaDeviceCanAccessPeer() indicates
+ * that the current device cannot directly access memory from \p peerDevice.
+ *
+ * Returns ::cudaErrorPeerAccessAlreadyEnabled if direct access of
+ * \p peerDevice from the current device has already been enabled.
+ *
+ * Returns ::cudaErrorInvalidValue if \p flags is not 0.
+ *
+ * \param peerDevice  - Peer device to enable direct access to from the current device
+ * \param flags       - Reserved for future use and must be set to 0
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorPeerAccessAlreadyEnabled,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceCanAccessPeer,
+ * ::cudaDeviceDisablePeerAccess,
+ * ::cuCtxEnablePeerAccess
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags);
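+
+/*
+ * A minimal sketch (assuming devices 0 and 1): enable access from device 0 to device 1's
+ * allocations.  The grant is unidirectional; a mirrored call with the devices swapped is
+ * needed for the other direction.
+ * \code
+    int canAccess = 0;
+    cudaDeviceCanAccessPeer(&canAccess, 0, 1);
+    if (canAccess) {
+        cudaSetDevice(0);                                    // the accessing device must be current
+        cudaDeviceEnablePeerAccess(1, 0);                    // flags must be 0
+    }
+ * \endcode
+ */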
+
+/**
+ * \brief Disables direct access to memory allocations on a peer device.
+ *
+ * Returns ::cudaErrorPeerAccessNotEnabled if direct access to memory on
+ * \p peerDevice has not yet been enabled from the current device.
+ *
+ * \param peerDevice - Peer device to disable direct access to
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorPeerAccessNotEnabled,
+ * ::cudaErrorInvalidDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaDeviceCanAccessPeer,
+ * ::cudaDeviceEnablePeerAccess,
+ * ::cuCtxDisablePeerAccess
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice);
+
+/** @} */ /* END CUDART_PEER */
+
+/** \defgroup CUDART_OPENGL OpenGL Interoperability */
+
+/** \defgroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED] */
+
+/** \defgroup CUDART_D3D9 Direct3D 9 Interoperability */
+
+/** \defgroup CUDART_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED] */
+
+/** \defgroup CUDART_D3D10 Direct3D 10 Interoperability */
+
+/** \defgroup CUDART_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED] */
+
+/** \defgroup CUDART_D3D11 Direct3D 11 Interoperability */
+
+/** \defgroup CUDART_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED] */
+
+/** \defgroup CUDART_VDPAU VDPAU Interoperability */
+
+/** \defgroup CUDART_EGL EGL Interoperability */
+
+/**
+ * \defgroup CUDART_INTEROP Graphics Interoperability
+ *
+ * ___MANBRIEF___ graphics interoperability functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graphics interoperability functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Unregisters a graphics resource for access by CUDA
+ *
+ * Unregisters the graphics resource \p resource so it is not accessible by
+ * CUDA unless registered again.
+ *
+ * If \p resource is invalid then ::cudaErrorInvalidResourceHandle is
+ * returned.
+ *
+ * \param resource - Resource to unregister
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaGraphicsD3D9RegisterResource,
+ * ::cudaGraphicsD3D10RegisterResource,
+ * ::cudaGraphicsD3D11RegisterResource,
+ * ::cudaGraphicsGLRegisterBuffer,
+ * ::cudaGraphicsGLRegisterImage,
+ * ::cuGraphicsUnregisterResource
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource);
+
+/**
+ * \brief Set usage flags for mapping a graphics resource
+ *
+ * Set \p flags for mapping the graphics resource \p resource.
+ *
+ * Changes to \p flags will take effect the next time \p resource is mapped.
+ * The \p flags argument may be any of the following:
+ * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how \p resource will
+ *     be used. It is therefore assumed that CUDA may read from or write to \p resource.
+ * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA will not write to \p resource.
+ * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies CUDA will not read from \p resource and will
+ *   write over the entire contents of \p resource, so none of the data
+ *   previously stored in \p resource will be preserved.
+ *
+ * If \p resource is presently mapped for access by CUDA then ::cudaErrorUnknown is returned.
+ * If \p flags is not one of the above values then ::cudaErrorInvalidValue is returned.
+ *
+ * \param resource - Registered resource to set flags for
+ * \param flags    - Parameters for resource mapping
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsMapResources,
+ * ::cuGraphicsResourceSetMapFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags);
+
+/**
+ * \brief Map graphics resources for access by CUDA
+ *
+ * Maps the \p count graphics resources in \p resources for access by CUDA.
+ *
+ * The resources in \p resources may be accessed by CUDA until they
+ * are unmapped. The graphics API from which \p resources were registered
+ * should not access any resources while they are mapped by CUDA. If an
+ * application does so, the results are undefined.
+ *
+ * This function provides the synchronization guarantee that any graphics calls
+ * issued before ::cudaGraphicsMapResources() will complete before any subsequent CUDA
+ * work issued in \p stream begins.
+ *
+ * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
+ * is returned. If any of \p resources are presently mapped for access by
+ * CUDA then ::cudaErrorUnknown is returned.
+ *
+ * \param count     - Number of resources to map
+ * \param resources - Resources to map for CUDA
+ * \param stream    - Stream for synchronization
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsUnmapResources,
+ * ::cuGraphicsMapResources
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Unmap graphics resources.
+ *
+ * Unmaps the \p count graphics resources in \p resources.
+ *
+ * Once unmapped, the resources in \p resources may not be accessed by CUDA
+ * until they are mapped again.
+ *
+ * This function provides the synchronization guarantee that any CUDA work issued
+ * in \p stream before ::cudaGraphicsUnmapResources() will complete before any
+ * subsequently issued graphics work begins.
+ *
+ * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle
+ * is returned. If any of \p resources are not presently mapped for access by
+ * CUDA then ::cudaErrorUnknown is returned.
+ *
+ * \param count     - Number of resources to unmap
+ * \param resources - Resources to unmap
+ * \param stream    - Stream for synchronization
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsMapResources,
+ * ::cuGraphicsUnmapResources
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0));
+
+/**
+ * \brief Get a device pointer through which to access a mapped graphics resource.
+ *
+ * Returns in \p *devPtr a pointer through which the mapped graphics resource
+ * \p resource may be accessed.
+ * Returns in \p *size the size of the memory in bytes which may be accessed from that pointer.
+ * The value set in \p devPtr may change every time that \p resource is mapped.
+ *
+ * If \p resource is not a buffer then it cannot be accessed via a pointer and
+ * ::cudaErrorUnknown is returned.
+ * If \p resource is not mapped then ::cudaErrorUnknown is returned.
+ *
+ * \param devPtr     - Returned pointer through which \p resource may be accessed
+ * \param size       - Returned size of the buffer accessible starting at \p *devPtr
+ * \param resource   - Mapped resource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsMapResources,
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cuGraphicsResourceGetMappedPointer
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(void **devPtr, size_t *size, cudaGraphicsResource_t resource);
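+
+/*
+ * A minimal sketch (assuming resource is a buffer previously registered through one of the
+ * graphics registration APIs, e.g. ::cudaGraphicsGLRegisterBuffer, and stream is a valid
+ * stream): map, obtain the device pointer, use it, and unmap so the graphics API may touch
+ * the buffer again.
+ * \code
+    void  *devPtr = NULL;
+    size_t size   = 0;
+    cudaGraphicsMapResources(1, &resource, stream);
+    cudaGraphicsResourceGetMappedPointer(&devPtr, &size, resource);
+    cudaMemsetAsync(devPtr, 0, size, stream);                // any CUDA work on the mapped buffer
+    cudaGraphicsUnmapResources(1, &resource, stream);
+ * \endcode
+ */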
+
+/**
+ * \brief Get an array through which to access a subresource of a mapped graphics resource.
+ *
+ * Returns in \p *array an array through which the subresource of the mapped
+ * graphics resource \p resource which corresponds to array index \p arrayIndex
+ * and mipmap level \p mipLevel may be accessed.  The value set in \p array may
+ * change every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via an array and
+ * ::cudaErrorUnknown is returned.
+ * If \p arrayIndex is not a valid array index for \p resource then
+ * ::cudaErrorInvalidValue is returned.
+ * If \p mipLevel is not a valid mipmap level for \p resource then
+ * ::cudaErrorInvalidValue is returned.
+ * If \p resource is not mapped then ::cudaErrorUnknown is returned.
+ *
+ * \param array       - Returned array through which a subresource of \p resource may be accessed
+ * \param resource    - Mapped resource to access
+ * \param arrayIndex  - Array index for array textures or cubemap face
+ *                      index as defined by ::cudaGraphicsCubeFace for
+ *                      cubemap textures for the subresource to access
+ * \param mipLevel    - Mipmap level for the subresource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsSubResourceGetMappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel);
+
+/**
+ * \brief Get a mipmapped array through which to access a mapped graphics resource.
+ *
+ * Returns in \p *mipmappedArray a mipmapped array through which the mapped
+ * graphics resource \p resource may be accessed. The value set in \p mipmappedArray may
+ * change every time that \p resource is mapped.
+ *
+ * If \p resource is not a texture then it cannot be accessed via an array and
+ * ::cudaErrorUnknown is returned.
+ * If \p resource is not mapped then ::cudaErrorUnknown is returned.
+ *
+ * \param mipmappedArray - Returned mipmapped array through which \p resource may be accessed
+ * \param resource       - Mapped resource to access
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorUnknown
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsResourceGetMappedMipmappedArray
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource);
+
+/** @} */ /* END CUDART_INTEROP */
+
+/**
+ * \defgroup CUDART_TEXTURE Texture Reference Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ texture reference management functions of the CUDA runtime
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the low level texture reference management functions
+ * of the CUDA runtime application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Binds a memory area to a texture
+ *
+ * \deprecated
+ *
+ * Binds \p size bytes of the memory area pointed to by \p devPtr to the
+ * texture reference \p texref. \p desc describes how the memory is interpreted
+ * when fetching values from the texture. Any memory previously bound to
+ * \p texref is unbound.
+ *
+ * Since the hardware enforces an alignment requirement on texture base
+ * addresses,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()"
+ * returns in \p *offset a byte offset that
+ * must be applied to texture fetches in order to read from the desired memory.
+ * This offset must be divided by the texel size and passed to kernels that
+ * read from the texture so it can be applied to the ::tex1Dfetch() function.
+ * If the device memory pointer was returned from ::cudaMalloc(), the offset is
+ * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
+ *
+ * The total number of elements (or texels) in the linear address range
+ * cannot exceed ::cudaDeviceProp::maxTexture1DLinear[0].
+ * The number of elements is computed as (\p size / elementSize),
+ * where elementSize is determined from \p desc.
+ *
+ * \param offset - Offset in bytes
+ * \param texref - Texture to bind
+ * \param devPtr - Memory area on device
+ * \param desc   - Channel format
+ * \param size   - Size of the memory area pointed to by devPtr
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
+ * ::cuTexRefSetAddress,
+ * ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetFormat,
+ * ::cuTexRefSetFlags,
+ * ::cuTexRefSetBorderColor
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX));
+
+/**
+ * \brief Binds a 2D memory area to a texture
+ *
+ * \deprecated
+ *
+ * Binds the 2D memory area pointed to by \p devPtr to the
+ * texture reference \p texref. The size of the area is constrained by
+ * \p width in texel units, \p height in texel units, and \p pitch in byte
+ * units. \p desc describes how the memory is interpreted when fetching values
+ * from the texture. Any memory previously bound to \p texref is unbound.
+ *
+ * Since the hardware enforces an alignment requirement on texture base
+ * addresses, ::cudaBindTexture2D() returns in \p *offset a byte offset that
+ * must be applied to texture fetches in order to read from the desired memory.
+ * This offset must be divided by the texel size and passed to kernels that
+ * read from the texture so it can be applied to the ::tex2D() function.
+ * If the device memory pointer was returned from ::cudaMalloc(), the offset is
+ * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
+ *
+ * \p width and \p height, which are specified in elements (or texels), cannot
+ * exceed ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1]
+ * respectively. \p pitch, which is specified in bytes, cannot exceed
+ * ::cudaDeviceProp::maxTexture2DLinear[2].
+ *
+ * The driver returns ::cudaErrorInvalidValue if \p pitch is not a multiple of
+ * ::cudaDeviceProp::texturePitchAlignment.
+ *
+ * \param offset - Offset in bytes
+ * \param texref - Texture reference to bind
+ * \param devPtr - 2D memory area on device
+ * \param desc   - Channel format
+ * \param width  - Width in texel units
+ * \param height - Height in texel units
+ * \param pitch  - Pitch in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
+ * ::cuTexRefSetAddress2D,
+ * ::cuTexRefSetFormat,
+ * ::cuTexRefSetFlags,
+ * ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetBorderColor
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTexture2D(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch);
+
+/**
+ * \brief Binds an array to a texture
+ *
+ * \deprecated
+ *
+ * Binds the CUDA array \p array to the texture reference \p texref.
+ * \p desc describes how the memory is interpreted when fetching values from
+ * the texture. Any CUDA array previously bound to \p texref is unbound.
+ *
+ * \param texref - Texture to bind
+ * \param array  - Memory array on device
+ * \param desc   - Channel format
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
+ * ::cuTexRefSetArray,
+ * ::cuTexRefSetFormat,
+ * ::cuTexRefSetFlags,
+ * ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetFilterMode,
+ * ::cuTexRefSetBorderColor,
+ * ::cuTexRefSetMaxAnisotropy
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);
+
+/**
+ * \brief Binds a mipmapped array to a texture
+ *
+ * \deprecated
+ *
+ * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p texref.
+ * \p desc describes how the memory is interpreted when fetching values from
+ * the texture. Any CUDA mipmapped array previously bound to \p texref is unbound.
+ *
+ * \param texref         - Texture to bind
+ * \param mipmappedArray - Memory mipmapped array on device
+ * \param desc           - Channel format
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
+ * ::cuTexRefSetMipmappedArray,
+ * ::cuTexRefSetMipmapFilterMode,
+ * ::cuTexRefSetMipmapLevelClamp,
+ * ::cuTexRefSetMipmapLevelBias,
+ * ::cuTexRefSetFormat,
+ * ::cuTexRefSetFlags,
+ * ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetBorderColor,
+ * ::cuTexRefSetMaxAnisotropy
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTextureToMipmappedArray(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct cudaChannelFormatDesc *desc);
+
+/**
+ * \brief Unbinds a texture
+ *
+ * \deprecated
+ *
+ * Unbinds the texture bound to \p texref. If \p texref is not currently bound, no operation is performed.
+ *
+ * \param texref - Texture to unbind
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref);
+
+/**
+ * \brief Get the alignment offset of a texture
+ *
+ * \deprecated
+ *
+ * Returns in \p *offset the offset that was returned when texture reference
+ * \p texref was bound.
+ *
+ * \param offset - Offset of texture reference in bytes
+ * \param texref - Texture to get offset of
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidTexture,
+ * ::cudaErrorInvalidTextureBinding
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref);
+
+/**
+ * \brief Get the texture reference associated with a symbol
+ *
+ * \deprecated
+ *
+ * Returns in \p *texref the structure associated with the texture reference
+ * defined by symbol \p symbol.
+ *
+ * \param texref - Texture reference associated with symbol
+ * \param symbol - Texture to get reference for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_string_api_deprecation_50
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc,
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * ::cuModuleGetTexRef
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const void *symbol);
+
+/** @} */ /* END CUDART_TEXTURE */
+
+/**
+ * \defgroup CUDART_SURFACE Surface Reference Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ surface reference management functions of the CUDA runtime
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the low level surface reference management functions
+ * of the CUDA runtime application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
+ *
+ * @{
+ */
+
+/**
+ * \brief Binds an array to a surface
+ *
+ * \deprecated
+ *
+ * Binds the CUDA array \p array to the surface reference \p surfref.
+ * \p desc describes how the memory is interpreted when fetching values from
+ * the surface. Any CUDA array previously bound to \p surfref is unbound.
+ *
+ * \param surfref - Surface to bind
+ * \param array  - Memory array on device
+ * \param desc   - Channel format
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSurface
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)",
+ * \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)",
+ * ::cudaGetSurfaceReference,
+ * ::cuSurfRefSetArray
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc);
+
+/**
+ * \brief Get the surface reference associated with a symbol
+ *
+ * \deprecated
+ *
+ * Returns in \p *surfref the structure associated with the surface reference
+ * defined by symbol \p symbol.
+ *
+ * \param surfref - Surface reference associated with symbol
+ * \param symbol - Surface to get reference for
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSurface
+ * \notefnerr
+ * \note_string_api_deprecation_50
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)",
+ * ::cuModuleGetSurfRef
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(const struct surfaceReference **surfref, const void *symbol);
+
+/** @} */ /* END CUDART_SURFACE */
+
+/**
+ * \defgroup CUDART_TEXTURE_OBJECT Texture Object Management
+ *
+ * ___MANBRIEF___ texture object management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the low level texture object management functions
+ * of the CUDA runtime application programming interface. The texture
+ * object API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Get the channel descriptor of an array
+ *
+ * Returns in \p *desc the channel descriptor of the CUDA array \p array.
+ *
+ * \param desc  - Channel format
+ * \param array - Memory array on device
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaCreateTextureObject, ::cudaCreateSurfaceObject
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, cudaArray_const_t array);
+
+/**
+ * \brief Returns a channel descriptor using the specified format
+ *
+ * Returns a channel descriptor with format \p f and number of bits of each
+ * component \p x, \p y, \p z, and \p w.  The ::cudaChannelFormatDesc is
+ * defined as:
+ * \code
+  struct cudaChannelFormatDesc {
+    int x, y, z, w;
+    enum cudaChannelFormatKind f;
+  };
+ * \endcode
+ *
+ * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned,
+ * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat.
+ *
+ * \param x - X component
+ * \param y - Y component
+ * \param z - Z component
+ * \param w - W component
+ * \param f - Channel format
+ *
+ * \return
+ * Channel descriptor with format \p f
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaCreateTextureObject, ::cudaCreateSurfaceObject
+ */
+extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f);
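+
+/*
+ * A minimal sketch (width and height are assumed placeholders): a single component 32-bit
+ * float channel descriptor, used to allocate a 2D CUDA array.
+ * \code
+    struct cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+    cudaArray_t array;
+    cudaMallocArray(&array, &desc, width, height);
+    // ... use the array, e.g. as the backing resource of a texture or surface object ...
+    cudaFreeArray(array);
+ * \endcode
+ */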
+
+/**
+ * \brief Creates a texture object
+ *
+ * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes
+ * the data to texture from. \p pTexDesc describes how the data should be sampled.
+ * \p pResViewDesc is an optional argument that specifies an alternate format for
+ * the data described by \p pResDesc, and also describes the subresource region
+ * to restrict access to when texturing. \p pResViewDesc can only be specified if
+ * the type of resource is a CUDA array or a CUDA mipmapped array.
+ *
+ * Texture objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a texture object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * The ::cudaResourceDesc structure is defined as:
+ * \code
+        struct cudaResourceDesc {
+            enum cudaResourceType resType;
+            
+            union {
+                struct {
+                    cudaArray_t array;
+                } array;
+                struct {
+                    cudaMipmappedArray_t mipmap;
+                } mipmap;
+                struct {
+                    void *devPtr;
+                    struct cudaChannelFormatDesc desc;
+                    size_t sizeInBytes;
+                } linear;
+                struct {
+                    void *devPtr;
+                    struct cudaChannelFormatDesc desc;
+                    size_t width;
+                    size_t height;
+                    size_t pitchInBytes;
+                } pitch2D;
+            } res;
+        };
+ * \endcode
+ * where:
+ * - ::cudaResourceDesc::resType specifies the type of resource to texture from.
+ * CUresourceType is defined as:
+ * \code
+        enum cudaResourceType {
+            cudaResourceTypeArray          = 0x00,
+            cudaResourceTypeMipmappedArray = 0x01,
+            cudaResourceTypeLinear         = 0x02,
+            cudaResourceTypePitch2D        = 0x03
+        };
+ * \endcode
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeArray, ::cudaResourceDesc::res::array::array
+ * must be set to a valid CUDA array handle.
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeMipmappedArray, ::cudaResourceDesc::res::mipmap::mipmap
+ * must be set to a valid CUDA mipmapped array handle and ::cudaTextureDesc::normalizedCoords must be set to true.
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeLinear, ::cudaResourceDesc::res::linear::devPtr
+ * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment.
+ * ::cudaResourceDesc::res::linear::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::linear::sizeInBytes
+ * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed 
+ * ::cudaDeviceProp::maxTexture1DLinear. The number of elements is computed as (sizeInBytes / elementSize), where elementSize is determined from ::cudaResourceDesc::res::linear::desc.
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypePitch2D, ::cudaResourceDesc::res::pitch2D::devPtr
+ * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment.
+ * ::cudaResourceDesc::res::pitch2D::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::pitch2D::width
+ * and ::cudaResourceDesc::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
+ * ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1] respectively.
+ * ::cudaResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to 
+ * ::cudaDeviceProp::texturePitchAlignment. Pitch cannot exceed ::cudaDeviceProp::maxTexture2DLinear[2].
+ *
+ *
+ * The ::cudaTextureDesc struct is defined as
+ * \code
+        struct cudaTextureDesc {
+            enum cudaTextureAddressMode addressMode[3];
+            enum cudaTextureFilterMode  filterMode;
+            enum cudaTextureReadMode    readMode;
+            int                         sRGB;
+            float                       borderColor[4];
+            int                         normalizedCoords;
+            unsigned int                maxAnisotropy;
+            enum cudaTextureFilterMode  mipmapFilterMode;
+            float                       mipmapLevelBias;
+            float                       minMipmapLevelClamp;
+            float                       maxMipmapLevelClamp;
+            int                         disableTrilinearOptimization;
+        };
+ * \endcode
+ * where
+ * - ::cudaTextureDesc::addressMode specifies the addressing mode for each dimension of the texture data. ::cudaTextureAddressMode is defined as:
+ *   \code
+        enum cudaTextureAddressMode {
+            cudaAddressModeWrap   = 0,
+            cudaAddressModeClamp  = 1,
+            cudaAddressModeMirror = 2,
+            cudaAddressModeBorder = 3
+        };
+ *   \endcode
+ *   This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear. Also, if ::cudaTextureDesc::normalizedCoords
+ *   is set to zero, ::cudaAddressModeWrap and ::cudaAddressModeMirror won't be supported and will be switched to ::cudaAddressModeClamp.
+ *
+ * - ::cudaTextureDesc::filterMode specifies the filtering mode to be used when fetching from the texture. ::cudaTextureFilterMode is defined as:
+ *   \code
+        enum cudaTextureFilterMode {
+            cudaFilterModePoint  = 0,
+            cudaFilterModeLinear = 1
+        };
+ *   \endcode
+ *   This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear.
+ *
+ * - ::cudaTextureDesc::readMode specifies whether integer data should be converted to floating point or not. ::cudaTextureReadMode is defined as:
+ *   \code
+        enum cudaTextureReadMode {
+            cudaReadModeElementType     = 0,
+            cudaReadModeNormalizedFloat = 1
+        };
+ *   \endcode
+ *   Note that this applies only to 8-bit and 16-bit integer formats. 32-bit integer formats are not promoted, regardless of
+ *   whether ::cudaTextureDesc::readMode is set to ::cudaReadModeNormalizedFloat.
+ *
+ * - ::cudaTextureDesc::sRGB specifies whether sRGB to linear conversion should be performed during texture fetch.
+ *
+ * - ::cudaTextureDesc::borderColor specifies the float values of the border color, where:
+ *   ::cudaTextureDesc::borderColor[0] contains the value of 'R',
+ *   ::cudaTextureDesc::borderColor[1] contains the value of 'G',
+ *   ::cudaTextureDesc::borderColor[2] contains the value of 'B', and
+ *   ::cudaTextureDesc::borderColor[3] contains the value of 'A'.
+ *   Note that applications using integer border color values will need to reinterpret_cast these values to float.
+ *   The values are used only when the addressing mode specified by ::cudaTextureDesc::addressMode is ::cudaAddressModeBorder.
+ *
+ * - ::cudaTextureDesc::normalizedCoords specifies whether the texture coordinates will be normalized or not.
+ *
+ * - ::cudaTextureDesc::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
+ *   clamped to the range [1,16].
+ *
+ * - ::cudaTextureDesc::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
+ *
+ * - ::cudaTextureDesc::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level.
+ *
+ * - ::cudaTextureDesc::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to.
+ *
+ * - ::cudaTextureDesc::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to.
+ *
+ * - ::cudaTextureDesc::disableTrilinearOptimization specifies whether the trilinear filtering optimizations will be disabled.
+ *
+ * The ::cudaResourceViewDesc struct is defined as
+ * \code
+        struct cudaResourceViewDesc {
+            enum cudaResourceViewFormat format;
+            size_t                      width;
+            size_t                      height;
+            size_t                      depth;
+            unsigned int                firstMipmapLevel;
+            unsigned int                lastMipmapLevel;
+            unsigned int                firstLayer;
+            unsigned int                lastLayer;
+        };
+ * \endcode
+ * where:
+ * - ::cudaResourceViewDesc::format specifies how the data contained in the CUDA array or CUDA mipmapped array should
+ *   be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block
+ *   compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a 32-bit unsigned integer format
+ *   with 2 or 4 channels, depending on the block compressed format. For example, BC1 and BC4 require the underlying CUDA array to have
+ *   a 32-bit unsigned int with 2 channels. The other BC formats require the underlying resource to have the same 32-bit unsigned int
+ *   format but with 4 channels.
+ *
+ * - ::cudaResourceViewDesc::width specifies the new width of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::cudaResourceViewDesc::height specifies the new height of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::cudaResourceViewDesc::depth specifies the new depth of the texture data. This value has to be equal to that of the
+ *   original resource.
+ *
+ * - ::cudaResourceViewDesc::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
+ *   For non-mipmapped resources, this value has to be zero. ::cudaTextureDesc::minMipmapLevelClamp and ::cudaTextureDesc::maxMipmapLevelClamp
+ *   will be relative to this value. For example, if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
+ *   then the actual minimum mipmap level clamp will be 3.2.
+ *
+ * - ::cudaResourceViewDesc::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value
+ *   has to be zero.
+ *
+ * - ::cudaResourceViewDesc::firstLayer specifies the first layer index for layered textures. This will be the new layer zero.
+ *   For non-layered resources, this value has to be zero.
+ *
+ * - ::cudaResourceViewDesc::lastLayer specifies the last layer index for layered textures. For non-layered resources, 
+ *   this value has to be zero.
+ *
+ *
+ * \param pTexObject   - Texture object to create
+ * \param pResDesc     - Resource descriptor
+ * \param pTexDesc     - Texture descriptor
+ * \param pResViewDesc - Resource view descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDestroyTextureObject,
+ * ::cuTexObjectCreate
+ */
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc);
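+
+/*
+ * Usage sketch (illustrative, not part of the original header): creating a
+ * texture object over a linear device buffer. The device pointer \p d_buf and
+ * the element count \p N are assumed to be provided by the caller.
+ * \code
+        struct cudaResourceDesc resDesc;
+        memset(&resDesc, 0, sizeof(resDesc));
+        resDesc.resType                = cudaResourceTypeLinear;
+        resDesc.res.linear.devPtr      = d_buf;                // assumed device buffer
+        resDesc.res.linear.desc        = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+        resDesc.res.linear.sizeInBytes = N * sizeof(float);
+
+        struct cudaTextureDesc texDesc;
+        memset(&texDesc, 0, sizeof(texDesc));
+        texDesc.readMode = cudaReadModeElementType;
+
+        cudaTextureObject_t tex = 0;
+        cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);   // pResViewDesc is optional
+ * \endcode
+ */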
+
+/**
+ * \brief Creates a texture object
+ *
+ * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes
+ * the data to texture from. \p pTexDesc describes how the data should be sampled.
+ * \p pResViewDesc is an optional argument that specifies an alternate format for
+ * the data described by \p pResDesc, and also describes the subresource region
+ * to restrict access to when texturing. \p pResViewDesc can only be specified if
+ * the type of resource is a CUDA array or a CUDA mipmapped array.
+ *
+ * Texture objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a texture object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * The ::cudaResourceDesc structure is defined as:
+ * \code
+        struct cudaResourceDesc {
+            enum cudaResourceType resType;
+            
+            union {
+                struct {
+                    cudaArray_t array;
+                } array;
+                struct {
+                    cudaMipmappedArray_t mipmap;
+                } mipmap;
+                struct {
+                    void *devPtr;
+                    struct cudaChannelFormatDesc desc;
+                    size_t sizeInBytes;
+                } linear;
+                struct {
+                    void *devPtr;
+                    struct cudaChannelFormatDesc desc;
+                    size_t width;
+                    size_t height;
+                    size_t pitchInBytes;
+                } pitch2D;
+            } res;
+        };
+ * \endcode
+ * where:
+ * - ::cudaResourceDesc::resType specifies the type of resource to texture from.
+ * ::cudaResourceType is defined as:
+ * \code
+        enum cudaResourceType {
+            cudaResourceTypeArray          = 0x00,
+            cudaResourceTypeMipmappedArray = 0x01,
+            cudaResourceTypeLinear         = 0x02,
+            cudaResourceTypePitch2D        = 0x03
+        };
+ * \endcode
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeArray, ::cudaResourceDesc::res::array::array
+ * must be set to a valid CUDA array handle.
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeMipmappedArray, ::cudaResourceDesc::res::mipmap::mipmap
+ * must be set to a valid CUDA mipmapped array handle and ::cudaTextureDesc_v2::normalizedCoords must be set to true.
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeLinear, ::cudaResourceDesc::res::linear::devPtr
+ * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment.
+ * ::cudaResourceDesc::res::linear::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::linear::sizeInBytes
+ * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed 
+ * ::cudaDeviceProp::maxTexture1DLinear. The number of elements is computed as (sizeInBytes / sizeof(desc)).
+ *
+ * \par
+ * If ::cudaResourceDesc::resType is set to ::cudaResourceTypePitch2D, ::cudaResourceDesc::res::pitch2D::devPtr
+ * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment.
+ * ::cudaResourceDesc::res::pitch2D::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::pitch2D::width
+ * and ::cudaResourceDesc::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
+ * ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1] respectively.
+ * ::cudaResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to 
+ * ::cudaDeviceProp::texturePitchAlignment. Pitch cannot exceed ::cudaDeviceProp::maxTexture2DLinear[2].
+ *
+ *
+ * The ::cudaTextureDesc_v2 struct is defined as
+ * \code
+        struct cudaTextureDesc_v2 {
+            enum cudaTextureAddressMode addressMode[3];
+            enum cudaTextureFilterMode  filterMode;
+            enum cudaTextureReadMode    readMode;
+            int                         sRGB;
+            float                       borderColor[4];
+            int                         normalizedCoords;
+            unsigned int                maxAnisotropy;
+            enum cudaTextureFilterMode  mipmapFilterMode;
+            float                       mipmapLevelBias;
+            float                       minMipmapLevelClamp;
+            float                       maxMipmapLevelClamp;
+            int                         disableTrilinearOptimization;
+            int                         seamlessCubemap;
+        };
+ * \endcode
+ * where
+ * - ::cudaTextureDesc_v2::addressMode specifies the addressing mode for each dimension of the texture data. ::cudaTextureAddressMode is defined as:
+ *   \code
+        enum cudaTextureAddressMode {
+            cudaAddressModeWrap   = 0,
+            cudaAddressModeClamp  = 1,
+            cudaAddressModeMirror = 2,
+            cudaAddressModeBorder = 3
+        };
+ *   \endcode
+ *   This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear. Also, if ::cudaTextureDesc_v2::normalizedCoords
+ *   is set to zero, ::cudaAddressModeWrap and ::cudaAddressModeMirror won't be supported and will be switched to ::cudaAddressModeClamp.
+ *
+ * - ::cudaTextureDesc_v2::filterMode specifies the filtering mode to be used when fetching from the texture. ::cudaTextureFilterMode is defined as:
+ *   \code
+        enum cudaTextureFilterMode {
+            cudaFilterModePoint  = 0,
+            cudaFilterModeLinear = 1
+        };
+ *   \endcode
+ *   This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear.
+ *
+ * - ::cudaTextureDesc_v2::readMode specifies whether integer data should be converted to floating point or not. ::cudaTextureReadMode is defined as:
+ *   \code
+        enum cudaTextureReadMode {
+            cudaReadModeElementType     = 0,
+            cudaReadModeNormalizedFloat = 1
+        };
+ *   \endcode
+ *   Note that this applies only to 8-bit and 16-bit integer formats; 32-bit integer formats are not promoted,
+ *   regardless of whether ::cudaTextureDesc_v2::readMode is set to ::cudaReadModeNormalizedFloat.
+ *
+ * - ::cudaTextureDesc_v2::sRGB specifies whether sRGB to linear conversion should be performed during texture fetch.
+ *
+ * - ::cudaTextureDesc_v2::borderColor specifies the float values of the border color, where:
+ *   ::cudaTextureDesc_v2::borderColor[0] contains the value of 'R',
+ *   ::cudaTextureDesc_v2::borderColor[1] contains the value of 'G',
+ *   ::cudaTextureDesc_v2::borderColor[2] contains the value of 'B',
+ *   ::cudaTextureDesc_v2::borderColor[3] contains the value of 'A'.
+ *   Note that applications using integer border color values need to reinterpret_cast those values to float.
+ *   The values are applied only when the addressing mode specified by ::cudaTextureDesc_v2::addressMode is ::cudaAddressModeBorder.
+ *
+ * - ::cudaTextureDesc_v2::normalizedCoords specifies whether the texture coordinates will be normalized or not.
+ *
+ * - ::cudaTextureDesc_v2::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
+ *   clamped to the range [1,16].
+ *
+ * - ::cudaTextureDesc_v2::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
+ *
+ * - ::cudaTextureDesc_v2::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level.
+ *
+ * - ::cudaTextureDesc_v2::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to.
+ *
+ * - ::cudaTextureDesc_v2::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to.
+ *
+ * - ::cudaTextureDesc_v2::disableTrilinearOptimization specifies whether the trilinear filtering optimizations will be disabled.
+ *
+ * - ::cudaTextureDesc_v2::seamlessCubemap specifies whether seamless cube map filtering is enabled. This flag can only be specified if the 
+ *   underlying resource is a CUDA array or a CUDA mipmapped array that was created with the flag ::cudaArrayCubemap.
+ *   When seamless cube map filtering is enabled, texture address modes specified by ::cudaTextureDesc_v2::addressMode are ignored.
+ *   Instead, if the ::cudaTextureDesc_v2::filterMode is set to ::cudaFilterModePoint the address mode ::cudaAddressModeClamp will be applied for all dimensions.
+ *   If the ::cudaTextureDesc_v2::filterMode is set to ::cudaFilterModeLinear seamless cube map filtering will be performed when sampling along the cube face borders.
+ *
+ * The ::cudaResourceViewDesc struct is defined as
+ * \code
+        struct cudaResourceViewDesc {
+            enum cudaResourceViewFormat format;
+            size_t                      width;
+            size_t                      height;
+            size_t                      depth;
+            unsigned int                firstMipmapLevel;
+            unsigned int                lastMipmapLevel;
+            unsigned int                firstLayer;
+            unsigned int                lastLayer;
+        };
+ * \endcode
+ * where:
+ * - ::cudaResourceViewDesc::format specifies how the data contained in the CUDA array or CUDA mipmapped array should
+ *   be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block
+ *   compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a 32-bit unsigned integer format
+ *   with 2 or 4 channels, depending on the block compressed format. For example, BC1 and BC4 require the underlying CUDA array to have
+ *   a 32-bit unsigned int with 2 channels. The other BC formats require the underlying resource to have the same 32-bit unsigned int
+ *   format but with 4 channels.
+ *
+ * - ::cudaResourceViewDesc::width specifies the new width of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::cudaResourceViewDesc::height specifies the new height of the texture data. If the resource view format is a block
+ *   compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats,
+ *   this value has to be equal to that of the original resource.
+ *
+ * - ::cudaResourceViewDesc::depth specifies the new depth of the texture data. This value has to be equal to that of the
+ *   original resource.
+ *
+ * - ::cudaResourceViewDesc::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero.
+ *   For non-mipmapped resources, this value has to be zero. ::cudaTextureDesc_v2::minMipmapLevelClamp and ::cudaTextureDesc_v2::maxMipmapLevelClamp
+ *   will be relative to this value. For example, if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified,
+ *   then the actual minimum mipmap level clamp will be 3.2.
+ *
+ * - ::cudaResourceViewDesc::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value
+ *   has to be zero.
+ *
+ * - ::cudaResourceViewDesc::firstLayer specifies the first layer index for layered textures. This will be the new layer zero.
+ *   For non-layered resources, this value has to be zero.
+ *
+ * - ::cudaResourceViewDesc::lastLayer specifies the last layer index for layered textures. For non-layered resources, 
+ *   this value has to be zero.
+ *
+ *
+ * \param pTexObject   - Texture object to create
+ * \param pResDesc     - Resource descriptor
+ * \param pTexDesc     - Texture descriptor
+ * \param pResViewDesc - Resource view descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDestroyTextureObject,
+ * ::cuTexObjectCreate
+ */
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject_v2(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc_v2 *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc);
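+
+/*
+ * Usage sketch (illustrative, not part of the original header): filling a
+ * ::cudaTextureDesc_v2 with seamless cube map filtering enabled. \p resDesc is
+ * assumed to reference a CUDA array created with ::cudaArrayCubemap.
+ * \code
+        struct cudaTextureDesc_v2 texDesc;
+        memset(&texDesc, 0, sizeof(texDesc));
+        texDesc.filterMode       = cudaFilterModeLinear;       // filter across cube face borders
+        texDesc.readMode         = cudaReadModeNormalizedFloat;
+        texDesc.normalizedCoords = 1;
+        texDesc.seamlessCubemap  = 1;
+
+        cudaTextureObject_t tex = 0;
+        cudaCreateTextureObject_v2(&tex, &resDesc, &texDesc, NULL);
+ * \endcode
+ */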
+
+/**
+ * \brief Destroys a texture object
+ *
+ * Destroys the texture object specified by \p texObject.
+ *
+ * \param texObject - Texture object to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroyTextureObject(cudaTextureObject_t texObject);
+
+/**
+ * \brief Returns a texture object's resource descriptor
+ *
+ * Returns the resource descriptor for the texture object specified by \p texObject.
+ *
+ * \param pResDesc  - Resource descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetResourceDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject);
+
+/**
+ * \brief Returns a texture object's texture descriptor
+ *
+ * Returns the texture descriptor for the texture object specified by \p texObject.
+ *
+ * \param pTexDesc  - Texture descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetTextureDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject);
+
+/**
+ * \brief Returns a texture object's texture descriptor
+ *
+ * Returns the texture descriptor for the texture object specified by \p texObject.
+ *
+ * \param pTexDesc  - Texture descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetTextureDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc_v2(struct cudaTextureDesc_v2 *pTexDesc, cudaTextureObject_t texObject);
+
+/**
+ * \brief Returns a texture object's resource view descriptor
+ *
+ * Returns the resource view descriptor for the texture object specified by \p texObject.
+ * If no resource view was specified, ::cudaErrorInvalidValue is returned.
+ *
+ * \param pResViewDesc - Resource view descriptor
+ * \param texObject    - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetResourceViewDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject);
+
+/** @} */ /* END CUDART_TEXTURE_OBJECT */
+
+/**
+ * \defgroup CUDART_SURFACE_OBJECT Surface Object Management
+ *
+ * ___MANBRIEF___ surface object management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the low level surface object management functions
+ * of the CUDA runtime application programming interface. The surface object
+ * API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a surface object
+ *
+ * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes
+ * the data to perform surface load/stores on. ::cudaResourceDesc::resType must be 
+ * ::cudaResourceTypeArray and  ::cudaResourceDesc::res::array::array
+ * must be set to a valid CUDA array handle.
+ *
+ * Surface objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a surface object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * \param pSurfObject - Surface object to create
+ * \param pResDesc    - Resource descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidChannelDescriptor,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDestroySurfaceObject,
+ * ::cuSurfObjectCreate
+ */
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc);
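+
+/*
+ * Usage sketch (illustrative, not part of the original header): \p cuArray is
+ * assumed to be a CUDA array created with the ::cudaArraySurfaceLoadStore flag.
+ * \code
+        struct cudaResourceDesc resDesc;
+        memset(&resDesc, 0, sizeof(resDesc));
+        resDesc.resType         = cudaResourceTypeArray;
+        resDesc.res.array.array = cuArray;                     // assumed CUDA array handle
+
+        cudaSurfaceObject_t surf = 0;
+        cudaCreateSurfaceObject(&surf, &resDesc);
+        // ... use surf in kernels via surf2Dread()/surf2Dwrite() ...
+        cudaDestroySurfaceObject(surf);
+ * \endcode
+ */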
+
+/**
+ * \brief Destroys a surface object
+ *
+ * Destroys the surface object specified by \p surfObject.
+ *
+ * \param surfObject - Surface object to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaCreateSurfaceObject,
+ * ::cuSurfObjectDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject);
+
+/**
+ * \brief Returns a surface object's resource descriptor
+ *
+ * Returns the resource descriptor for the surface object specified by \p surfObject.
+ *
+ * \param pResDesc   - Resource descriptor
+ * \param surfObject - Surface object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateSurfaceObject,
+ * ::cuSurfObjectGetResourceDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject);
+
+/** @} */ /* END CUDART_SURFACE_OBJECT */
+
+/**
+ * \defgroup CUDART__VERSION Version Management
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the latest version of CUDA supported by the driver
+ *
+ * Returns in \p *driverVersion the latest version of CUDA supported by
+ * the driver. The version is returned as (1000 &times; major + 10 &times; minor).
+ * For example, CUDA 9.2 would be represented by 9020. If no driver is installed,
+ * then 0 is returned as the driver version.
+ *
+ * This function automatically returns ::cudaErrorInvalidValue
+ * if \p driverVersion is NULL.
+ *
+ * \param driverVersion - Returns the CUDA driver version.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaRuntimeGetVersion,
+ * ::cuDriverGetVersion
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion);
+
+/**
+ * \brief Returns the CUDA Runtime version
+ *
+ * Returns in \p *runtimeVersion the version number of the current CUDA
+ * Runtime instance. The version is returned as
+ * (1000 &times; major + 10 &times; minor). For example,
+ * CUDA 9.2 would be represented by 9020.
+ *
+ * This function automatically returns ::cudaErrorInvalidValue if
+ * the \p runtimeVersion argument is NULL.
+ *
+ * \param runtimeVersion - Returns the CUDA Runtime version.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDriverGetVersion,
+ * ::cuDriverGetVersion
+ */
+extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion);
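+
+/*
+ * Usage sketch (illustrative, not part of the original header): querying and
+ * decoding the driver and runtime versions from the (1000 * major + 10 * minor)
+ * encoding described above.
+ * \code
+        int driverVersion = 0, runtimeVersion = 0;
+        cudaDriverGetVersion(&driverVersion);                  // 0 if no driver is installed
+        cudaRuntimeGetVersion(&runtimeVersion);
+
+        int rtMajor = runtimeVersion / 1000;                   // e.g. 11020 -> 11
+        int rtMinor = (runtimeVersion % 1000) / 10;            // e.g. 11020 -> 2
+ * \endcode
+ */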
+
+/** @} */ /* END CUDART__VERSION */
+
+/**
+ * \defgroup CUDART_GRAPH Graph Management
+ *
+ * ___MANBRIEF___ graph management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the graph management functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a graph
+ *
+ * Creates an empty graph, which is returned via \p pGraph.
+ *
+ * \param pGraph - Returns newly created graph
+ * \param flags  - Graph creation flags, must be 0
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphDestroy,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph, unsigned int flags);
+
+/**
+ * \brief Creates a kernel execution node and adds it to a graph
+ *
+ * Creates a new kernel execution node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and arguments specified in \p pNodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * The cudaKernelNodeParams structure is defined as:
+ *
+ * \code
+ *  struct cudaKernelNodeParams
+ *  {
+ *      void* func;
+ *      dim3 gridDim;
+ *      dim3 blockDim;
+ *      unsigned int sharedMemBytes;
+ *      void **kernelParams;
+ *      void **extra;
+ *  };
+ * \endcode
+ *
+ * When the graph is launched, the node will invoke kernel \p func on a (\p gridDim.x x
+ * \p gridDim.y x \p gridDim.z) grid of blocks. Each block contains
+ * (\p blockDim.x x \p blockDim.y x \p blockDim.z) threads.
+ *
+ * \p sharedMemBytes sets the amount of dynamic shared memory that will be
+ * available to each thread block.
+ *
+ * Kernel parameters to \p func can be specified in one of two ways:
+ *
+ * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N
+ * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer,
+ * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual
+ * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need
+ * to be specified as that information is retrieved directly from the kernel's image.
+ *
+ * 2) Kernel parameters can also be packaged by the application into a single buffer that is passed in
+ * via \p extra. This places the burden on the application of knowing each kernel
+ * parameter's size and alignment/padding within the buffer. The \p extra parameter exists
+ * to allow this function to take additional less commonly used arguments. \p extra specifies
+ * a list of names of extra settings and their corresponding values. Each extra setting name is
+ * immediately followed by the corresponding value. The list must be terminated with either NULL or
+ * CU_LAUNCH_PARAM_END.
+ *
+ * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra
+ *   array;
+ * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next
+ *   value in \p extra will be a pointer to a buffer
+ *   containing all the kernel parameters for launching kernel
+ *   \p func;
+ * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next
+ *   value in \p extra will be a pointer to a size_t
+ *   containing the size of the buffer specified with
+ *   ::CU_LAUNCH_PARAM_BUFFER_POINTER;
+ *
+ * The error ::cudaErrorInvalidValue will be returned if kernel parameters are specified with both
+ * \p kernelParams and \p extra (i.e. both \p kernelParams and
+ * \p extra are non-NULL).
+ *
+ * The \p kernelParams or \p extra array, as well as the argument values it points to,
+ * are copied during this call.
+ *
+ * \note Kernels launched using graphs must not use texture and surface references. Reading or
+ *       writing through any texture or surface reference is undefined behavior.
+ *       This restriction does not apply to texture and surface objects.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param pNodeParams     - Parameters for the GPU execution node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchKernel,
+ * ::cudaGraphKernelNodeGetParams,
+ * ::cudaGraphKernelNodeSetParams,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaKernelNodeParams *pNodeParams);
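+
+/*
+ * Usage sketch (illustrative, not part of the original header): adding a kernel
+ * node that passes its arguments via \p kernelParams. The kernel \p scale and
+ * the device buffer \p d_data are assumed to exist.
+ * \code
+        // __global__ void scale(float *data, float factor);  // assumed kernel
+
+        cudaGraph_t graph;
+        cudaGraphCreate(&graph, 0);
+
+        float factor = 2.0f;
+        void *kernelArgs[] = { &d_data, &factor };             // one pointer per kernel parameter
+
+        struct cudaKernelNodeParams params;
+        memset(&params, 0, sizeof(params));
+        params.func           = (void *)scale;
+        params.gridDim        = dim3(256, 1, 1);
+        params.blockDim       = dim3(128, 1, 1);
+        params.sharedMemBytes = 0;
+        params.kernelParams   = kernelArgs;
+        params.extra          = NULL;                          // kernelParams and extra are mutually exclusive
+
+        cudaGraphNode_t kernelNode;
+        cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &params);
+ * \endcode
+ */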
+
+/**
+ * \brief Returns a kernel node's parameters
+ *
+ * Returns the parameters of kernel node \p node in \p pNodeParams.
+ * The \p kernelParams or \p extra array returned in \p pNodeParams,
+ * as well as the argument values it points to, are owned by the node.
+ * This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cudaGraphKernelNodeSetParams to update the
+ * parameters of this node.
+ *
+ * The params will contain either \p kernelParams or \p extra,
+ * according to which of these was most recently set on the node.
+ *
+ * \param node        - Node to get the parameters for
+ * \param pNodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDeviceFunction
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchKernel,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphKernelNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams);
+
+/**
+ * \brief Sets a kernel node's parameters
+ *
+ * Sets the parameters of kernel node \p node to \p pNodeParams.
+ *
+ * \param node        - Node to set the parameters for
+ * \param pNodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorMemoryAllocation
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchKernel,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphKernelNodeGetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams);
+
+/**
+ * \brief Copies attributes from source node to destination node.
+ *
+ * Copies attributes from source node \p hSrc to destination node \p hDst.
+ * Both nodes must belong to the same context.
+ *
+ * \param[out] hDst Destination node
+ * \param[in]  hSrc Source node
+ * For a list of attributes see ::cudaKernelNodeAttrID
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidContext
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeCopyAttributes(
+        cudaGraphNode_t hSrc,
+        cudaGraphNode_t hDst);
+
+/**
+ * \brief Queries node attribute.
+ *
+ * Queries attribute \p attr from node \p hNode and stores it in the corresponding
+ * member of \p value_out.
+ *
+ * \param[in] hNode
+ * \param[in] attr
+ * \param[out] value_out
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetAttribute(
+    cudaGraphNode_t hNode,
+    cudaKernelNodeAttrID attr,
+    cudaKernelNodeAttrValue *value_out);
+
+/**
+ * \brief Sets node attribute.
+ *
+ * Sets attribute \p attr on node \p hNode from the corresponding member of
+ * \p value.
+ *
+ * \param[in] hNode
+ * \param[in] attr
+ * \param[in] value
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaAccessPolicyWindow
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetAttribute(
+    cudaGraphNode_t hNode,
+    cudaKernelNodeAttrID attr,
+    const cudaKernelNodeAttrValue *value);
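+
+/*
+ * Usage sketch (illustrative, not part of the original header): setting an
+ * access policy window on a kernel node. \p kernelNode, \p d_data and
+ * \p windowBytes are assumed to be provided by the caller, and the attribute
+ * and property names below are taken from the CUDA 11 runtime headers.
+ * \code
+        cudaKernelNodeAttrValue attr;
+        memset(&attr, 0, sizeof(attr));
+        attr.accessPolicyWindow.base_ptr  = d_data;            // assumed device buffer
+        attr.accessPolicyWindow.num_bytes = windowBytes;       // assumed window size
+        attr.accessPolicyWindow.hitRatio  = 0.6f;
+        attr.accessPolicyWindow.hitProp   = cudaAccessPropertyPersisting;
+        attr.accessPolicyWindow.missProp  = cudaAccessPropertyStreaming;
+
+        cudaGraphKernelNodeSetAttribute(kernelNode, cudaKernelNodeAttributeAccessPolicyWindow, &attr);
+ * \endcode
+ */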
+
+/**
+ * \brief Creates a memcpy node and adds it to a graph
+ *
+ * Creates a new memcpy node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will perform the memcpy described by \p pCopyParams.
+ * See ::cudaMemcpy3D() for a description of the structure and its restrictions.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param pCopyParams     - Parameters for the memory copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpy3D,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphAddMemcpyNode1D,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemcpy3DParms *pCopyParams);
+
+/**
+ * \brief Creates a memcpy node to copy to a symbol on the device and adds it to a graph
+ *
+ * Creates a new memcpy node to copy to \p symbol and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p src to the memory area pointed to by \p offset bytes from the start
+ * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyToSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeToSymbol(
+    cudaGraphNode_t *pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies,
+    size_t numDependencies,
+    const void* symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Creates a memcpy node to copy from a symbol on the device and adds it to a graph
+ *
+ * Creates a new memcpy node to copy from \p symbol and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area
+ *  pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
+ *  that resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyFromSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeFromSymbol(
+    cudaGraphNode_t* pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t* pDependencies,
+    size_t numDependencies,
+    void* dst,
+    const void* symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Creates a 1D memcpy node and adds it to a graph
+ *
+ * Creates a new 1D memcpy node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory
+ * area pointed to by \p src to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. Launching a
+ * memcpy node with dst and src pointers that do not match the direction of
+ * the copy results in undefined behavior.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dst             - Destination memory address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpy,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParams1D,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode1D(
+    cudaGraphNode_t *pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies,
+    size_t numDependencies,
+    void* dst,
+    const void* src,
+    size_t count,
+    enum cudaMemcpyKind kind);
+#endif
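+
+/*
+ * Usage sketch (illustrative, not part of the original header): copying results
+ * back to the host after a kernel node. \p graph, \p kernelNode, \p h_result,
+ * \p d_data and \p N are assumed to be provided by the caller.
+ * \code
+        cudaGraphNode_t copyNode;
+        cudaGraphAddMemcpyNode1D(&copyNode, graph,
+                                 &kernelNode, 1,               // run after the kernel node
+                                 h_result, d_data,             // dst, src
+                                 N * sizeof(float),
+                                 cudaMemcpyDeviceToHost);
+ * \endcode
+ */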
+
+/**
+ * \brief Returns a memcpy node's parameters
+ *
+ * Returns the parameters of memcpy node \p node in \p pNodeParams.
+ *
+ * \param node        - Node to get the parameters for
+ * \param pNodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpy3D,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams);
+
+/**
+ * \brief Sets a memcpy node's parameters
+ *
+ * Sets the parameters of memcpy node \p node to \p pNodeParams.
+ *
+ * \param node        - Node to set the parameters for
+ * \param pNodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpy3D,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams1D,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams);
+
+/**
+ * \brief Sets a memcpy node's parameters to copy to a symbol on the device
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p src to the memory area pointed to by \p offset bytes from the start
+ * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * \param node            - Node to set the parameters for
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyToSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsToSymbol(
+    cudaGraphNode_t node,
+    const void* symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets a memcpy node's parameters to copy from a symbol on the device
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area
+ *  pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
+ *  that resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param node            - Node to set the parameters for
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsFromSymbol(
+    cudaGraphNode_t node,
+    void* dst,
+    const void* symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets a memcpy node's parameters to perform a 1-dimensional copy
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory
+ * area pointed to by \p src to the memory area pointed to by \p dst, where
+ * \p kind specifies the direction of the copy, and must be one of
+ * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost,
+ * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing
+ * ::cudaMemcpyDefault is recommended, in which case the type of transfer is
+ * inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing. Launching a
+ * memcpy node with dst and src pointers that do not match the direction of
+ * the copy results in undefined behavior.
+ *
+ * \param node            - Node to set the parameters for
+ * \param dst             - Destination memory address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpy,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams1D(
+    cudaGraphNode_t node,
+    void* dst,
+    const void* src,
+    size_t count,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Creates a memset node and adds it to a graph
+ *
+ * Creates a new memset node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * The element size must be 1, 2, or 4 bytes.
+ * When the graph is launched, the node will perform the memset described by \p pMemsetParams.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param pMemsetParams   - Parameters for the memory set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidDevice
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemset2D,
+ * ::cudaGraphMemsetNodeGetParams,
+ * ::cudaGraphMemsetNodeSetParams,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemsetParams *pMemsetParams);
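+
+/*
+ * Usage sketch (illustrative, not part of the original header): zero-filling a
+ * device buffer of \p N floats as a root node. \p graph and \p d_data are
+ * assumed to be provided by the caller.
+ * \code
+        struct cudaMemsetParams msParams;
+        memset(&msParams, 0, sizeof(msParams));
+        msParams.dst         = d_data;                         // assumed device buffer
+        msParams.value       = 0;
+        msParams.elementSize = sizeof(float);                  // must be 1, 2 or 4
+        msParams.width       = N;                              // elements per row
+        msParams.height      = 1;
+        msParams.pitch       = 0;                              // unused when height is 1
+
+        cudaGraphNode_t memsetNode;
+        cudaGraphAddMemsetNode(&memsetNode, graph, NULL, 0, &msParams);
+ * \endcode
+ */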
+
+/**
+ * \brief Returns a memset node's parameters
+ *
+ * Returns the parameters of memset node \p node in \p pNodeParams.
+ *
+ * \param node        - Node to get the parameters for
+ * \param pNodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemset2D,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphMemsetNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams);
+
+/**
+ * \brief Sets a memset node's parameters
+ *
+ * Sets the parameters of memset node \p node to \p pNodeParams.
+ *
+ * \param node        - Node to set the parameters for
+ * \param pNodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemset2D,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphMemsetNodeGetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams);
+
+/**
+ * \brief Creates a host execution node and adds it to a graph
+ *
+ * Creates a new CPU execution node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and arguments specified in \p pNodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will invoke the specified CPU function.
+ * Host nodes are not supported under MPS with pre-Volta GPUs.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param pNodeParams     - Parameters for the host node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchHostFunc,
+ * ::cudaGraphHostNodeGetParams,
+ * ::cudaGraphHostNodeSetParams,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaHostNodeParams *pNodeParams);
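+
+/*
+ * Usage sketch (illustrative, not part of the original header): running a host
+ * callback after a memcpy node. \p graph, \p copyNode and \p myHostCallback
+ * (a function with the ::cudaHostFn_t signature) are assumed to exist.
+ * \code
+        // void CUDART_CB myHostCallback(void *userData);      // assumed host function
+
+        struct cudaHostNodeParams hostParams;
+        hostParams.fn       = myHostCallback;
+        hostParams.userData = NULL;
+
+        cudaGraphNode_t hostNode;
+        cudaGraphAddHostNode(&hostNode, graph, &copyNode, 1, &hostParams);
+ * \endcode
+ */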
+
+/**
+ * \brief Returns a host node's parameters
+ *
+ * Returns the parameters of host node \p node in \p pNodeParams.
+ *
+ * \param node        - Node to get the parameters for
+ * \param pNodeParams - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchHostFunc,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphHostNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams);
+
+/**
+ * \brief Sets a host node's parameters
+ *
+ * Sets the parameters of host node \p node to \p pNodeParams.
+ *
+ * \param node        - Node to set the parameters for
+ * \param pNodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchHostFunc,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphHostNodeGetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams);
+
+/**
+ * \brief Creates a child graph node and adds it to a graph
+ *
+ * Creates a new node which executes an embedded graph, and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * If \p childGraph contains allocation or free nodes, this call will return an error.
+ *
+ * The node executes an embedded child graph. The child graph is cloned in this call.
+ *
+ * \param pGraphNode     - Returns newly created node
+ * \param graph          - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param childGraph      - The graph to clone into this node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphChildGraphNodeGetGraph,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaGraph_t childGraph);
+
+/**
+ * \brief Gets a handle to the embedded graph of a child graph node
+ *
+ * Gets a handle to the embedded graph in a child graph node. This call
+ * does not clone the graph. Changes to the graph will be reflected in
+ * the node, and the node retains ownership of the graph.
+ *
+ * Allocation and free nodes cannot be added to the returned graph.
+ * Attempting to do so will return an error.
+ *
+ * \param node   - Node to get the embedded graph for
+ * \param pGraph - Location to store a handle to the graph
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphNodeFindInClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph);
+
+/**
+ * \brief Creates an empty node and adds it to a graph
+ *
+ * Creates a new node which performs no operation, and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * An empty node performs no operation during execution, but can be used for
+ * transitive ordering. For example, a phased execution graph with 2 groups of n
+ * nodes with a barrier between them can be represented using an empty node and
+ * 2*n dependency edges, rather than no empty node and n^2 dependency edges.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies);
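+
+/*
+ * Usage sketch (illustrative, not part of the original header): using an empty
+ * node as a barrier between two groups of nodes, as described above. \p graph,
+ * \p groupA, \p groupB and \p n are assumed to be provided by the caller.
+ * \code
+        cudaGraphNode_t barrier;
+        cudaGraphAddEmptyNode(&barrier, graph, groupA, n);     // n edges into the barrier
+
+        for (size_t i = 0; i < n; ++i) {
+            // n edges out of the barrier; alternatively pass &barrier as the
+            // dependency list when the group-B nodes are created
+            cudaGraphAddDependencies(graph, &barrier, &groupB[i], 1);
+        }
+ * \endcode
+ */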
+
+/**
+ * \brief Creates an event record node and adds it to a graph
+ *
+ * Creates a new event record node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * Each launch of the graph will record \p event to capture execution of the
+ * node's dependencies.
+ *
+ * These nodes may not be used in loops or conditionals.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode,
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddEventRecordNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaEvent_t event);
+#endif
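+
+/*
+ * Illustrative sketch: records an event after a group of graph nodes so that
+ * work outside the graph can wait on it. `graph` and `lastNode` are
+ * hypothetical handles created by the caller; error checking is omitted.
+ * Requires CUDA 11.1 or newer.
+ *
+ *   cudaEvent_t ev;
+ *   cudaEventCreate(&ev);
+ *
+ *   cudaGraphNode_t recordNode;
+ *   // Each launch of the graph records `ev` once lastNode has executed.
+ *   cudaGraphAddEventRecordNode(&recordNode, graph, &lastNode, 1, ev);
+ */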
+
+/**
+ * \brief Returns the event associated with an event record node
+ *
+ * Returns the event of event record node \p node in \p event_out.
+ *
+ * \param node      - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphEventRecordNodeSetEvent,
+ * ::cudaGraphEventWaitNodeGetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphEventRecordNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out);
+#endif
+
+/**
+ * \brief Sets an event record node's event
+ *
+ * Sets the event of event record node \p node to \p event.
+ *
+ * \param node  - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphEventRecordNodeGetEvent,
+ * ::cudaGraphEventWaitNodeSetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphEventRecordNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Creates an event wait node and adds it to a graph
+ *
+ * Creates a new event wait node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and event specified in \p event.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * The graph node will wait for all work captured in \p event.  See ::cudaEventRecord()
+ * for details on what is captured by an event.  The synchronization will be performed
+ * efficiently on the device when applicable.  \p event may be from a different context
+ * or device than the launch stream.
+ *
+ * These nodes may not be used in loops or conditionals.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param event           - Event for the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode,
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphAddEventWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaEvent_t event);
+#endif
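+
+/*
+ * Illustrative sketch: makes graph execution wait on an event recorded
+ * elsewhere (for example in another stream). `graph`, `ev` and `firstNode`
+ * are hypothetical names; error checking is omitted. Requires CUDA 11.1 or
+ * newer.
+ *
+ *   cudaGraphNode_t waitNode;
+ *   // Root-level wait: downstream nodes run only after `ev` has been reached.
+ *   cudaGraphAddEventWaitNode(&waitNode, graph, NULL, 0, ev);
+ *   cudaGraphAddDependencies(graph, &waitNode, &firstNode, 1);
+ */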
+
+/**
+ * \brief Returns the event associated with an event wait node
+ *
+ * Returns the event of event wait node \p hNode in \p event_out.
+ *
+ * \param hNode     - Node to get the event for
+ * \param event_out - Pointer to return the event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphEventWaitNodeSetEvent,
+ * ::cudaGraphEventRecordNodeGetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphEventWaitNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out);
+#endif
+
+/**
+ * \brief Sets an event wait node's event
+ *
+ * Sets the event of event wait node \p hNode to \p event.
+ *
+ * \param hNode - Node to set the event for
+ * \param event - Event to use
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphEventWaitNodeGetEvent,
+ * ::cudaGraphEventRecordNodeSetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphEventWaitNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Creates an external semaphore signal node and adds it to a graph
+ *
+ * Creates a new external semaphore signal node and adds it to \p graph with \p
+ * numDependencies dependencies specified via \p dependencies and arguments specified
+ * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the
+ * node will be placed at the root of the graph. \p dependencies may not have any
+ * duplicate entries. A handle to the new node will be returned in \p pGraphNode.
+ *
+ * Performs a signal operation on a set of externally allocated semaphore objects
+ * when the node is launched.  The operation(s) will occur after all of the node's
+ * dependencies have completed.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExternalSemaphoresSignalNodeGetParams,
+ * ::cudaGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode,
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresSignalNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Returns an external semaphore signal node's parameters
+ *
+ * Returns the parameters of an external semaphore signal node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cudaGraphExternalSemaphoresSignalNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchKernel,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresSignalNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreSignalNodeParams *params_out);
+#endif
+
+/**
+ * \brief Sets an external semaphore signal node's parameters
+ *
+ * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaGraphExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresSignalNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Creates an external semaphore wait node and adds it to a graph
+ *
+ * Creates a new external semaphore wait node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p dependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p pGraphNode.
+ *
+ * Performs a wait operation on a set of externally allocated semaphore objects
+ * when the node is launched.  The node's dependencies will not be launched until
+ * the wait operation has completed.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cudaGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode,
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Returns an external semaphore wait node's parameters
+ *
+ * Returns the parameters of an external semaphore wait node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out
+ * are owned by the node.  This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cudaGraphExternalSemaphoresWaitNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode      - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchKernel,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresWaitNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreWaitNodeParams *params_out);
+#endif
+
+/**
+ * \brief Sets an external semaphore wait node's parameters
+ *
+ * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams.
+ *
+ * \param hNode      - Node to set the parameters for
+ * \param nodeParams - Parameters to copy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresWaitNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Creates an allocation node and adds it to a graph
+ *
+ * Creates a new allocation node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and arguments specified in \p nodeParams.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p pGraphNode.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams      - Parameters for the node
+ *
+ * When ::cudaGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in
+ * \p nodeParams.dptr.  The allocation's address remains fixed across instantiations and launches.
+ *
+ * If the allocation is freed in the same graph, by creating a free node using ::cudaGraphAddMemFreeNode,
+ * the allocation can be accessed by nodes ordered after the allocation node but before the free node.
+ * These allocations cannot be freed outside the owning graph, and they can only be freed once in the
+ * owning graph.
+ *
+ * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the
+ * graph which are ordered after the allocation node, but also by stream operations ordered after the
+ * graph's execution but before the allocation is freed.
+ *
+ * Allocations which are not freed in the same graph can be freed by:
+ * - passing the allocation to ::cudaMemFreeAsync or ::cudaMemFree;
+ * - launching a graph with a free node for that allocation; or
+ * - specifying ::cudaGraphInstantiateFlagAutoFreeOnLaunch during instantiation, which makes
+ *   each launch behave as though it called ::cudaMemFreeAsync for every unfreed allocation.
+ *
+ * It is not possible to free an allocation in both the owning graph and another graph.  If the allocation
+ * is freed in the same graph, a free node cannot be added to another graph.  If the allocation is freed
+ * in another graph, a free node can no longer be added to the owning graph.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaGraphMemAllocNodeGetParams,
+ * ::cudaDeviceGraphMemTrim,
+ * ::cudaDeviceGetGraphMemAttribute,
+ * ::cudaDeviceSetGraphMemAttribute,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemAllocNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, struct cudaMemAllocNodeParams *nodeParams);
+#endif
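+
+/*
+ * Illustrative sketch of an allocation node that reserves 1 MiB of device
+ * memory on device 0. `graph` is a hypothetical handle created by the caller;
+ * error checking is omitted. Requires CUDA 11.4 or newer.
+ *
+ *   struct cudaMemAllocNodeParams allocParams;
+ *   memset(&allocParams, 0, sizeof(allocParams));
+ *   allocParams.poolProps.allocType     = cudaMemAllocationTypePinned;
+ *   allocParams.poolProps.location.type = cudaMemLocationTypeDevice;
+ *   allocParams.poolProps.location.id   = 0;
+ *   allocParams.bytesize                = 1 << 20;   // 1 MiB
+ *
+ *   cudaGraphNode_t allocNode;
+ *   cudaGraphAddMemAllocNode(&allocNode, graph, NULL, 0, &allocParams);
+ *   void *dptr = allocParams.dptr;   // fixed across instantiations and launches
+ */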
+
+/**
+ * \brief Returns a memory alloc node's parameters
+ *
+ * Returns the parameters of a memory alloc node \p node in \p params_out.
+ * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the
+ * node.  This memory remains valid until the node is destroyed.  The returned
+ * parameters must not be modified.
+ *
+ * \param node       - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphMemFreeNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemAllocNodeGetParams(cudaGraphNode_t node, struct cudaMemAllocNodeParams *params_out);
+#endif
+
+/**
+ * \brief Creates a memory free node and adds it to a graph
+ *
+ * Creates a new memory free node and adds it to \p graph with \p numDependencies
+ * dependencies specified via \p pDependencies and address specified in \p dptr.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p pGraphNode.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dptr            - Address of memory to free
+ *
+ * ::cudaGraphAddMemFreeNode will return ::cudaErrorInvalidValue if the user attempts to free:
+ * - an allocation twice in the same graph.
+ * - an address that was not returned by an allocation node.
+ * - an invalid address.
+ *
+ * The following restrictions apply to graphs which contain allocation and/or memory free nodes:
+ * - Nodes and edges of the graph cannot be deleted.
+ * - The graph cannot be used in a child node.
+ * - Only one instantiation of the graph may exist at any point in time.
+ * - The graph cannot be cloned.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorCudartUnloading,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOutOfMemory
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphMemFreeNodeGetParams,
+ * ::cudaDeviceGraphMemTrim,
+ * ::cudaDeviceGetGraphMemAttribute,
+ * ::cudaDeviceSetGraphMemAttribute,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemFreeNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, void *dptr);
+#endif
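+
+/*
+ * Illustrative sketch: frees, inside the same graph, the allocation made by a
+ * preceding allocation node. `graph`, `useNode` and `allocParams` are
+ * hypothetical names following on from the sketch after
+ * ::cudaGraphAddMemAllocNode above; error checking is omitted.
+ *
+ *   cudaGraphNode_t freeNode;
+ *   // Free after the node that consumes the allocation has run.
+ *   cudaGraphAddMemFreeNode(&freeNode, graph, &useNode, 1, allocParams.dptr);
+ */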
+
+/**
+ * \brief Returns a memory free node's parameters
+ *
+ * Returns the address of a memory free node \p node in \p dptr_out.
+ *
+ * \param node     - Node to get the parameters for
+ * \param dptr_out - Pointer to return the device address
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaGraphMemAllocNodeGetParams
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphMemFreeNodeGetParams(cudaGraphNode_t node, void *dptr_out);
+#endif
+
+/**
+ * \brief Frees back to the OS any unused memory that was cached on the specified device for use with graphs.
+ *
+ * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are
+ * freed back to the operating system.
+ *
+ * \param device - The device for which cached memory should be freed.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaDeviceGetGraphMemAttribute,
+ * ::cudaDeviceSetGraphMemAttribute,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync,
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGraphMemTrim(int device);
+#endif
+
+/**
+ * \brief Queries asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::cudaGraphMemAttrUsedMemCurrent: Amount of memory, in bytes, currently associated with graphs
+ * - ::cudaGraphMemAttrUsedMemHigh: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::cudaGraphMemAttrReservedMemCurrent: Amount of memory, in bytes, currently allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ * - ::cudaGraphMemAttrReservedMemHigh: High watermark of memory, in bytes, allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the scope of the query
+ * \param attr - attribute to get
+ * \param value - retrieved value
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceSetGraphMemAttribute,
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaDeviceGraphMemTrim,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync,
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaDeviceGetGraphMemAttribute(int device, enum cudaGraphMemAttributeType attr, void* value);
+#endif
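+
+/*
+ * Illustrative sketch: queries the amount of graph-owned memory currently in
+ * use on device 0, assuming the attribute value is reported as a 64-bit
+ * unsigned integer; error checking is omitted. Requires CUDA 11.4 or newer.
+ *
+ *   unsigned long long usedBytes = 0;
+ *   cudaDeviceGetGraphMemAttribute(0, cudaGraphMemAttrUsedMemCurrent, &usedBytes);
+ */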
+
+/**
+ * \brief Sets asynchronous allocation attributes related to graphs
+ *
+ * Valid attributes are:
+ *
+ * - ::cudaGraphMemAttrUsedMemHigh: High watermark of memory, in bytes, associated with graphs since the
+ *   last time it was reset.  High watermark can only be reset to zero.
+ * - ::cudaGraphMemAttrReservedMemHigh: High watermark of memory, in bytes, allocated for use by
+ *   the CUDA graphs asynchronous allocator.
+ *
+ * \param device - Specifies the scope of the query
+ * \param attr - attribute to set
+ * \param value - pointer to value to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDeviceGetGraphMemAttribute,
+ * ::cudaGraphAddMemAllocNode,
+ * ::cudaGraphAddMemFreeNode,
+ * ::cudaDeviceGraphMemTrim,
+ * ::cudaMallocAsync,
+ * ::cudaFreeAsync,
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaDeviceSetGraphMemAttribute(int device, enum cudaGraphMemAttributeType attr, void* value);
+#endif
+
+/**
+ * \brief Clones a graph
+ *
+ * This function creates a copy of \p originalGraph and returns it in \p pGraphClone.
+ * All parameters are copied into the cloned graph. The original graph may be modified 
+ * after this call without affecting the clone.
+ *
+ * Child graph nodes in the original graph are recursively copied into the clone.
+ *
+ * \param pGraphClone  - Returns newly created cloned graph
+ * \param originalGraph - Graph to clone
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorMemoryAllocation
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphNodeFindInClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph);
+
+/**
+ * \brief Finds a cloned version of a node
+ *
+ * This function returns the node in \p clonedGraph corresponding to \p originalNode 
+ * in the original graph.
+ *
+ * \p clonedGraph must have been cloned from \p originalGraph via ::cudaGraphClone. 
+ * \p originalNode must have been in \p originalGraph at the time of the call to 
+ * ::cudaGraphClone, and the corresponding cloned node in \p clonedGraph must not have 
+ * been removed. The cloned node is then returned via \p pNode.
+ *
+ * \param pNode  - Returns handle to the cloned node
+ * \param originalNode - Handle to the original node
+ * \param clonedGraph - Cloned graph to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphClone
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode, cudaGraph_t clonedGraph);
+
+/**
+ * \brief Returns a node's type
+ *
+ * Returns the node type of \p node in \p pType.
+ *
+ * \param node - Node to query
+ * \param pType  - Pointer to return the node type
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphChildGraphNodeGetGraph,
+ * ::cudaGraphKernelNodeGetParams,
+ * ::cudaGraphKernelNodeSetParams,
+ * ::cudaGraphHostNodeGetParams,
+ * ::cudaGraphHostNodeSetParams,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemsetNodeGetParams,
+ * ::cudaGraphMemsetNodeSetParams
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType);
+
+/**
+ * \brief Returns a graph's nodes
+ *
+ * Returns a list of \p graph's nodes. \p nodes may be NULL, in which case this
+ * function will return the number of nodes in \p numNodes. Otherwise,
+ * \p numNodes entries will be filled in. If \p numNodes is higher than the actual
+ * number of nodes, the remaining entries in \p nodes will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p numNodes.
+ *
+ * \param graph    - Graph to query
+ * \param nodes    - Pointer to return the nodes
+ * \param numNodes - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetType,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph, cudaGraphNode_t *nodes, size_t *numNodes);
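+
+/*
+ * Illustrative sketch of the usual two-call pattern: query the count first,
+ * then fetch the node handles. `graph` is a hypothetical handle; error
+ * checking is omitted.
+ *
+ *   size_t numNodes = 0;
+ *   cudaGraphGetNodes(graph, NULL, &numNodes);          // first call: count only
+ *
+ *   cudaGraphNode_t *nodes =
+ *       (cudaGraphNode_t *)malloc(numNodes * sizeof(cudaGraphNode_t));
+ *   cudaGraphGetNodes(graph, nodes, &numNodes);         // second call: fill the array
+ *   // ... inspect nodes[0..numNodes-1], e.g. with cudaGraphNodeGetType ...
+ *   free(nodes);
+ */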
+
+/**
+ * \brief Returns a graph's root nodes
+ *
+ * Returns a list of \p graph's root nodes. \p pRootNodes may be NULL, in which case this
+ * function will return the number of root nodes in \p pNumRootNodes. Otherwise,
+ * \p pNumRootNodes entries will be filled in. If \p pNumRootNodes is higher than the actual
+ * number of root nodes, the remaining entries in \p pRootNodes will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p pNumRootNodes.
+ *
+ * \param graph         - Graph to query
+ * \param pRootNodes    - Pointer to return the root nodes
+ * \param pNumRootNodes - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphCreate,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetType,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes);
+
+/**
+ * \brief Returns a graph's dependency edges
+ *
+ * Returns a list of \p graph's dependency edges. Edges are returned via corresponding
+ * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the
+ * node in \p from[i]. \p from and \p to may both be NULL, in which
+ * case this function only returns the number of edges in \p numEdges. Otherwise,
+ * \p numEdges entries will be filled in. If \p numEdges is higher than the actual
+ * number of edges, the remaining entries in \p from and \p to will be set to NULL, and
+ * the number of edges actually returned will be written to \p numEdges.
+ *
+ * \param graph    - Graph to get the edges from
+ * \param from     - Location to return edge endpoints
+ * \param to       - Location to return edge endpoints
+ * \param numEdges - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t *from, cudaGraphNode_t *to, size_t *numEdges);
+
+/**
+ * \brief Returns a node's dependencies
+ *
+ * Returns a list of \p node's dependencies. \p pDependencies may be NULL, in which case this
+ * function will return the number of dependencies in \p pNumDependencies. Otherwise,
+ * \p pNumDependencies entries will be filled in. If \p pNumDependencies is higher than the actual
+ * number of dependencies, the remaining entries in \p pDependencies will be set to NULL, and the
+ * number of nodes actually obtained will be returned in \p pNumDependencies.
+ *
+ * \param node           - Node to query
+ * \param pDependencies    - Pointer to return the dependencies
+ * \param pNumDependencies - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeGetDependentNodes,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t *pDependencies, size_t *pNumDependencies);
+
+/**
+ * \brief Returns a node's dependent nodes
+ *
+ * Returns a list of \p node's dependent nodes. \p pDependentNodes may be NULL, in which
+ * case this function will return the number of dependent nodes in \p pNumDependentNodes.
+ * Otherwise, \p pNumDependentNodes entries will be filled in. If \p pNumDependentNodes is
+ * higher than the actual number of dependent nodes, the remaining entries in
+ * \p pDependentNodes will be set to NULL, and the number of nodes actually obtained will
+ * be returned in \p pNumDependentNodes.
+ *
+ * \param node             - Node to query
+ * \param pDependentNodes    - Pointer to return the dependent nodes
+ * \param pNumDependentNodes - See description
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphGetNodes,
+ * ::cudaGraphGetRootNodes,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphRemoveDependencies
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes, size_t *pNumDependentNodes);
+
+/**
+ * \brief Adds dependency edges to a graph.
+ *
+ * The number of dependencies to be added is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p graph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying an existing dependency will return an error.
+ *
+ * \param graph - Graph to which dependencies are added
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param numDependencies - Number of dependencies to be added
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphRemoveDependencies,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies);
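+
+/*
+ * Illustrative sketch: adds a single edge so that node `b` runs after node
+ * `a`. Both handles are hypothetical and assumed to have been added to
+ * `graph` already; error checking is omitted.
+ *
+ *   cudaGraphNode_t from[] = { a };
+ *   cudaGraphNode_t to[]   = { b };
+ *   cudaGraphAddDependencies(graph, from, to, 1);
+ */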
+
+/**
+ * \brief Removes dependency edges from a graph.
+ *
+ * The number of dependencies to be removed is defined by \p numDependencies.
+ * Elements in \p from and \p to at corresponding indices define a dependency.
+ * Each node in \p from and \p to must belong to \p graph.
+ *
+ * If \p numDependencies is 0, elements in \p from and \p to will be ignored.
+ * Specifying a non-existing dependency will return an error.
+ *
+ * \param graph - Graph from which to remove dependencies
+ * \param from - Array of nodes that provide the dependencies
+ * \param to - Array of dependent nodes
+ * \param numDependencies - Number of dependencies to be removed
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddDependencies,
+ * ::cudaGraphGetEdges,
+ * ::cudaGraphNodeGetDependencies,
+ * ::cudaGraphNodeGetDependentNodes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies);
+
+/**
+ * \brief Removes a node from the graph
+ *
+ * Removes \p node from its graph. This operation also severs any dependencies of other nodes 
+ * on \p node and vice versa.
+ *
+ * Nodes cannot be removed from graphs which contain allocation or free nodes.
+ * Any attempt to do so will return an error.
+ *
+ * \param node  - Node to remove
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphDestroyNode(cudaGraphNode_t node);
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p graph as an executable graph. The graph is validated for any
+ * structural constraints or intra-node constraints which were not previously
+ * validated. If instantiation is successful, a handle to the instantiated graph
+ * is returned in \p pGraphExec.
+ *
+ * If there are any errors, diagnostic information may be returned in \p pErrorNode and
+ * \p pLogBuffer. This is the primary way to inspect instantiation errors. The output
+ * will be null terminated unless the diagnostics overflow
+ * the buffer. In this case, they will be truncated, and the last byte can be
+ * inspected to determine if truncation occurred.
+ *
+ * \param pGraphExec - Returns instantiated graph
+ * \param graph      - Graph to instantiate
+ * \param pErrorNode - In case of an instantiation error, this may be modified to
+ *                      indicate a node contributing to the error
+ * \param pLogBuffer   - A character buffer to store diagnostic messages
+ * \param bufferSize  - Size of the log buffer in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiateWithFlags,
+ * ::cudaGraphCreate,
+ * ::cudaGraphUpload,
+ * ::cudaGraphLaunch,
+ * ::cudaGraphExecDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode, char *pLogBuffer, size_t bufferSize);
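+
+/*
+ * Illustrative sketch: instantiates a graph, prints the diagnostic log on
+ * failure and otherwise launches on the default stream. `graph` is a
+ * hypothetical handle; cleanup is omitted.
+ *
+ *   cudaGraphExec_t exec;
+ *   cudaGraphNode_t errNode = NULL;
+ *   char log[256] = { 0 };
+ *
+ *   if (cudaGraphInstantiate(&exec, graph, &errNode, log, sizeof(log)) != cudaSuccess) {
+ *       printf("instantiation failed: %s\n", log);   // log may be truncated
+ *   } else {
+ *       cudaGraphLaunch(exec, 0);                    // default stream
+ *       cudaStreamSynchronize(0);
+ *   }
+ */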
+
+/**
+ * \brief Creates an executable graph from a graph
+ *
+ * Instantiates \p graph as an executable graph. The graph is validated for any
+ * structural constraints or intra-node constraints which were not previously
+ * validated. If instantiation is successful, a handle to the instantiated graph
+ * is returned in \p pGraphExec.
+ *
+ * The \p flags parameter controls the behavior of instantiation and subsequent
+ * graph launches.  Valid flags are:
+ *
+ * - ::cudaGraphInstantiateFlagAutoFreeOnLaunch, which configures a
+ * graph containing memory allocation nodes to automatically free any
+ * unfreed memory allocations before the graph is relaunched.
+ *
+ * - ::cudaGraphInstantiateFlagUseNodePriority, which causes the graph
+ * to use the priorities from the per-node attributes rather than the priority
+ * of the launch stream during execution. Note that priorities are only available
+ * on kernel nodes, and are copied from stream priority during stream capture.
+ *
+ * If \p graph contains any allocation or free nodes, there can be at most one
+ * executable graph in existence for that graph at a time.
+ *
+ * An attempt to instantiate a second executable graph before destroying the first
+ * with ::cudaGraphExecDestroy will result in an error.
+ *
+ * \param pGraphExec - Returns instantiated graph
+ * \param graph      - Graph to instantiate
+ * \param flags      - Flags to control instantiation.  See ::cudaGraphInstantiateFlags.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphCreate,
+ * ::cudaGraphUpload,
+ * ::cudaGraphLaunch,
+ * ::cudaGraphExecDestroy
+ */
+#if __CUDART_API_VERSION >= 11040
+extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiateWithFlags(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, unsigned long long flags);
+#endif
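+
+/*
+ * Illustrative sketch: instantiates a graph that contains allocation nodes so
+ * that each launch first frees any still-unfreed allocations from the
+ * previous launch. `graph` is a hypothetical handle; error checking is
+ * omitted. Requires CUDA 11.4 or newer.
+ *
+ *   cudaGraphExec_t exec;
+ *   cudaGraphInstantiateWithFlags(&exec, graph,
+ *                                 cudaGraphInstantiateFlagAutoFreeOnLaunch);
+ */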
+
+/**
+ * \brief Sets the parameters for a kernel node in the given graphExec
+ *
+ * Sets the parameters of a kernel node in an executable graph \p hGraphExec. 
+ * The node is identified by the corresponding node \p node in the 
+ * non-executable graph, from which the executable graph was instantiated. 
+ *
+ * \p node must not have been removed from the original graph. The \p func field 
+ * of \p pNodeParams cannot be modified and must match the original value.
+ * All other values can be modified. 
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already 
+ * enqueued or running launches of \p hGraphExec are not affected by this call. 
+ * \p node is also not modified by this call.
+ *
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param node        - Kernel node from the graph from which graphExec was instantiated
+ * \param pNodeParams - Updated parameters to set
+ * 
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams);
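+
+/*
+ * Illustrative sketch: updates the launch configuration of one kernel node in
+ * an already-instantiated graph without rebuilding it. `exec`, `kernelNode`
+ * and `originalParams` are hypothetical names assumed to come from graph
+ * construction; error checking is omitted.
+ *
+ *   struct cudaKernelNodeParams params = originalParams;  // saved when the node was added
+ *   params.gridDim.x = 256;       // change the launch geometry; params.func must not change
+ *   cudaGraphExecKernelNodeSetParams(exec, kernelNode, &params);
+ */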
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec.
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained \p pNodeParams at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * The source and destination memory in \p pNodeParams must be allocated from the same 
+ * contexts as the original source and destination memory.  Both the instantiation-time 
+ * memory operands and the memory operands in \p pNodeParams must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * either the original or new memory operands are multidimensional.
+ *
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param node        - Memcpy node from the graph which was used to instantiate graphExec
+ * \param pNodeParams - Updated parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphExecMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphExecMemcpyNodeSetParams1D,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams);
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the device
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p src and \p symbol must be allocated from the same contexts as the original source and
+ * destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParamsToSymbol(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    const void* symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to copy from a symbol on the device
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p symbol and \p dst must be allocated from the same contexts as the original source and
+ * destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParamsFromSymbol(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    void* dst,
+    const void* symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind);
+#endif
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to perform a 1-dimensional copy
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p src and \p dst must be allocated from the same contexts as the original source
+ * and destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param dst             - Destination memory address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNode1D,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParams1D,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams1D(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    void* dst,
+    const void* src,
+    size_t count,
+    enum cudaMemcpyKind kind);
+#endif
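+
+/*
+ * Illustrative sketch: repoints an instantiated 1-D memcpy node at new buffers
+ * of the same size. `exec`, `memcpyNode`, `newDst`, `newSrc` and `bytes` are
+ * hypothetical names; both buffers must come from the same contexts as the
+ * originals; error checking is omitted. Requires CUDA 11.1 or newer.
+ *
+ *   cudaGraphExecMemcpyNodeSetParams1D(exec, memcpyNode, newDst, newSrc,
+ *                                      bytes, cudaMemcpyDeviceToDevice);
+ */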
+
+/**
+ * \brief Sets the parameters for a memset node in the given graphExec.
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained \p pNodeParams at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * The destination memory in \p pNodeParams must be allocated from the same 
+ * context as the original destination memory.  Both the instantiation-time 
+ * memory operand and the memory operand in \p pNodeParams must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns cudaErrorInvalidValue if the memory operand's mappings changed or
+ * either the original or new memory operand are multidimensional.
+ *
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param node        - Memset node from the graph which was used to instantiate graphExec
+ * \param pNodeParams - Updated parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemsetNode,
+ * ::cudaGraphMemsetNodeSetParams,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemsetNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams);
+
+/**
+ * \brief Sets the parameters for a host node in the given graphExec.
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained \p pNodeParams at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * \param hGraphExec  - The executable graph in which to set the specified node
+ * \param node        - Host node from the graph which was used to instantiate graphExec
+ * \param pNodeParams - Updated parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphHostNodeSetParams,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams);
+
+/**
+ * \brief Updates node parameters in the child graph node in the given graphExec.
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though the nodes contained
+ * in \p node's graph had the parameters contained in \p childGraph's nodes at instantiation.
+ * \p node must remain in the graph which was used to instantiate \p hGraphExec.
+ * Changed edges to and from \p node are ignored.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also
+ * not modified by this call.
+ *
+ * The topology of \p childGraph, as well as the node insertion order, must match that
+ * of the graph contained in \p node.  See ::cudaGraphExecUpdate() for a list of restrictions
+ * on what can be updated in an instantiated graph.  The update is recursive, so child graph
+ * nodes contained within the top level child graph will also be updated.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param node       - Child graph node from the graph which was used to instantiate graphExec
+ * \param childGraph - The graph supplying the updated parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphChildGraphNodeGetGraph,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecChildGraphNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, cudaGraph_t childGraph);
+#endif
+
+/**
+ * \brief Sets the event for an event record node in the given graphExec
+ *
+ * Sets the event of an event record node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Event record node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphEventRecordNodeGetEvent,
+ * ::cudaGraphEventWaitNodeSetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventRecordNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Sets the event for an event wait node in the given graphExec
+ *
+ * Sets the event of an event wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Event wait node from the graph from which graphExec was instantiated
+ * \param event      - Updated event to use
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphEventWaitNodeGetEvent,
+ * ::cudaGraphEventRecordNodeSetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventWaitNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event);
+#endif
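+
+/*
+ * A minimal usage sketch, assuming hypothetical handles graphExec, recordNode
+ * and waitNode (event record/wait nodes of the graph graphExec was
+ * instantiated from): retarget both nodes to a freshly created event.
+ *
+ *   cudaEvent_t newEvent;
+ *   cudaEventCreateWithFlags(&newEvent, cudaEventDisableTiming);
+ *   cudaGraphExecEventRecordNodeSetEvent(graphExec, recordNode, newEvent);
+ *   cudaGraphExecEventWaitNodeSetEvent(graphExec, waitNode, newEvent);
+ */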
+
+/**
+ * \brief Sets the parameters for an external semaphore signal node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore signal node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecExternalSemaphoresSignalNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Sets the parameters for an external semaphore wait node in the given graphExec
+ *
+ * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * Changing \p nodeParams->numExtSems is not supported.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - semaphore wait node from the graph from which graphExec was instantiated
+ * \param nodeParams - Updated Parameters to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecExternalSemaphoresWaitNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Enables or disables the specified node in the given graphExec
+ *
+ * Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent 
+ * to empty nodes until they are reenabled. Existing node parameters are not affected by 
+ * disabling/enabling the node.
+ *  
+ * The node is identified by the corresponding node \p hNode in the non-executable 
+ * graph, from which the executable graph was instantiated.   
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \note Currently only kernel, memset and memcpy nodes are supported. 
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode      - Node from the graph from which graphExec was instantiated
+ * \param isEnabled  - Node is enabled if != 0, otherwise the node is disabled
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeGetEnabled,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphLaunch
+ */
+#if __CUDART_API_VERSION >= 11060
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeSetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int isEnabled);
+#endif
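+
+/*
+ * A minimal usage sketch, assuming hypothetical handles graphExec, kernelNode
+ * and stream: skip the node for one launch, then restore it.
+ *
+ *   cudaGraphNodeSetEnabled(graphExec, kernelNode, 0);  // node now behaves like an empty node
+ *   cudaGraphLaunch(graphExec, stream);                 // launches without the kernel
+ *   cudaGraphNodeSetEnabled(graphExec, kernelNode, 1);  // original parameters are untouched
+ */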
+
+/**
+ * \brief Query whether a node in the given graphExec is enabled
+ *
+ * Sets \p isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled.
+ *
+ * The node is identified by the corresponding node \p hNode in the non-executable 
+ * graph, from which the executable graph was instantiated.   
+ *
+ * \p hNode must not have been removed from the original graph.
+ *
+ * \note Currently only kernel, memset and memcpy nodes are supported. 
+ *
+ * \param hGraphExec - The executable graph in which to query the specified node
+ * \param hNode      - Node from the graph from which graphExec was instantiated
+ * \param isEnabled  - Location to return the enabled status of the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetEnabled,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphLaunch
+ */
+#if __CUDART_API_VERSION >= 11060
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int *isEnabled);
+#endif
+
+/**
+ * \brief Check whether an executable graph can be updated with a graph and perform the update if possible
+ *
+ * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the
+ * node parameters in a topologically identical graph specified by \p hGraph.
+ *
+ * Limitations:
+ *
+ * - Kernel nodes:
+ *   - The owning context of the function cannot change.
+ *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ *     to a function which uses CDP.
+ *   - A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
+ *   - If the graph was instantiated with cudaGraphInstantiateFlagUseNodePriority, the
+ *     priority attribute cannot change. Equality is checked on the originally requested
+ *     priority values, before they are clamped to the device's supported range.
+ * - Memset and memcpy nodes:
+ *   - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
+ *   - The source/destination memory must be allocated from the same contexts as the original
+ *     source/destination memory.
+ *   - Only 1D memsets can be changed.
+ * - Additional memcpy node restrictions:
+ *   - Changing either the source or destination memory type (i.e. CU_MEMORYTYPE_DEVICE,
+ *     CU_MEMORYTYPE_ARRAY, etc.) is not supported.
+ *
+ * Note:  The API may add further restrictions in future releases.  The return code should always be checked.
+ *
+ * cudaGraphExecUpdate sets \p updateResult_out to cudaGraphExecUpdateErrorTopologyChanged under
+ * the following conditions:
+ *
+ * - The count of nodes directly in \p hGraphExec and \p hGraph differs, in which case \p hErrorNode_out
+ *   is NULL.
+ * - A node is deleted in \p hGraph but not its pair from \p hGraphExec, in which case \p hErrorNode_out
+ *   is NULL.
+ * - A node is deleted in \p hGraphExec but not its pair from \p hGraph, in which case \p hErrorNode_out is
+ *   the pairless node from \p hGraph.
+ * - The dependent nodes of a pair differ, in which case \p hErrorNode_out is the node from \p hGraph.
+ *
+ * cudaGraphExecUpdate sets \p updateResult_out to:
+ * - cudaGraphExecUpdateError if passed an invalid value.
+ * - cudaGraphExecUpdateErrorTopologyChanged if the graph topology changed
+ * - cudaGraphExecUpdateErrorNodeTypeChanged if the type of a node changed, in which case
+ *   \p hErrorNode_out is set to the node from \p hGraph.
+ * - cudaGraphExecUpdateErrorFunctionChanged if the function of a kernel node changed (CUDA driver < 11.2)
+ * - cudaGraphExecUpdateErrorUnsupportedFunctionChange if the func field of a kernel changed in an
+ *   unsupported way (see note above), in which case \p hErrorNode_out is set to the node from \p hGraph
+ * - cudaGraphExecUpdateErrorParametersChanged if any parameters to a node changed in a way 
+ *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph
+ * - cudaGraphExecUpdateErrorAttributesChanged if any attributes of a node changed in a way 
+ *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph
+ * - cudaGraphExecUpdateErrorNotSupported if something about a node is unsupported, like 
+ *   the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph
+ *
+ * If \p updateResult_out isn't set in one of the situations described above, the update check passes
+ * and cudaGraphExecUpdate updates \p hGraphExec to match the contents of \p hGraph.  If an error happens
+ * during the update, \p updateResult_out will be set to cudaGraphExecUpdateError; otherwise,
+ * \p updateResult_out is set to cudaGraphExecUpdateSuccess.
+ *
+ * cudaGraphExecUpdate returns cudaSuccess when the update was performed successfully.  It returns
+ * cudaErrorGraphExecUpdateFailure if the graph update was not performed because it included 
+ * changes which violated constraints specific to instantiated graph update.
+ *
+ * \param hGraphExec The instantiated graph to be updated
+ * \param hGraph The graph containing the updated parameters
+ * \param hErrorNode_out The node which caused the permissibility check to forbid the update, if any
+ * \param updateResult_out Whether the graph update was permitted.  If it was forbidden, the reason why
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorGraphExecUpdateFailure,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, cudaGraphNode_t *hErrorNode_out, enum cudaGraphExecUpdateResult *updateResult_out);
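+
+/*
+ * A minimal sketch of the update-or-reinstantiate pattern, assuming
+ * hypothetical handles graph, graphExec and stream, with node parameters of
+ * graph having been modified between launches:
+ *
+ *   cudaGraphNode_t errorNode = NULL;
+ *   enum cudaGraphExecUpdateResult updateResult;
+ *   if (cudaGraphExecUpdate(graphExec, graph, &errorNode, &updateResult) != cudaSuccess) {
+ *       // Topology or unsupported parameter change: rebuild the executable graph.
+ *       cudaGraphExecDestroy(graphExec);
+ *       cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
+ *   }
+ *   cudaGraphLaunch(graphExec, stream);
+ */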
+
+/**
+ * \brief Uploads an executable graph in a stream
+ *
+ * Uploads \p graphExec to the device in \p stream without executing it. Uploads of
+ * the same \p graphExec will be serialized. Each upload is ordered behind both any
+ * previous work in \p stream and any previous launches of \p graphExec.
+ * Uses memory cached by \p stream to back the allocations owned by \p graphExec.
+ *
+ * \param graphExec - Executable graph to upload
+ * \param stream    - Stream in which to upload the graph
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_init_rt
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphLaunch,
+ * ::cudaGraphExecDestroy
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphUpload(cudaGraphExec_t graphExec, cudaStream_t stream);
+#endif
+
+/**
+ * \brief Launches an executable graph in a stream
+ *
+ * Executes \p graphExec in \p stream. Only one instance of \p graphExec may be executing
+ * at a time. Each launch is ordered behind both any previous work in \p stream
+ * and any previous launches of \p graphExec. To execute a graph concurrently, it must be
+ * instantiated multiple times into multiple executable graphs.
+ *
+ * If any allocations created by \p graphExec remain unfreed (from a previous launch) and
+ * \p graphExec was not instantiated with ::cudaGraphInstantiateFlagAutoFreeOnLaunch,
+ * the launch will fail with ::cudaErrorInvalidValue.
+ *
+ * \param graphExec - Executable graph to launch
+ * \param stream    - Stream in which to launch the graph
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphUpload,
+ * ::cudaGraphExecDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream);
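+
+/*
+ * A minimal capture-instantiate-launch sketch, assuming a hypothetical stream
+ * with asynchronous work enqueued between the capture calls:
+ *
+ *   cudaGraph_t graph;
+ *   cudaGraphExec_t graphExec;
+ *   cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
+ *   // ... enqueue kernels / memcpys into stream ...
+ *   cudaStreamEndCapture(stream, &graph);
+ *   cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
+ *   cudaGraphUpload(graphExec, stream);   // optional: stage the graph without executing it
+ *   cudaGraphLaunch(graphExec, stream);
+ *   cudaStreamSynchronize(stream);
+ */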
+
+/**
+ * \brief Destroys an executable graph
+ *
+ * Destroys the executable graph specified by \p graphExec.
+ *
+ * \param graphExec - Executable graph to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphUpload,
+ * ::cudaGraphLaunch
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecDestroy(cudaGraphExec_t graphExec);
+
+/**
+ * \brief Destroys a graph
+ *
+ * Destroys the graph specified by \p graph, as well as all of its nodes.
+ *
+ * \param graph - Graph to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph);
+
+/**
+ * \brief Write a DOT file describing graph structure
+ *
+ * Using the provided \p graph, write to \p path a DOT formatted description of the graph.
+ * By default this includes the graph topology, node types, node id, kernel names and memcpy direction.
+ * \p flags can be specified to write more detailed information about each node type such as
+ * parameter values, kernel attributes, node and function handles.
+ *
+ * \param graph - The graph to create a DOT file from
+ * \param path  - The path to write the DOT file to
+ * \param flags - Flags from cudaGraphDebugDotFlags for specifying which additional node information to write
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorOperatingSystem
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphDebugDotPrint(cudaGraph_t graph, const char *path, unsigned int flags);
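+
+/*
+ * A minimal usage sketch, assuming a hypothetical graph handle; the output
+ * path is arbitrary and the resulting file can be rendered with Graphviz dot:
+ *
+ *   cudaGraphDebugDotPrint(graph, "/tmp/graph.dot", cudaGraphDebugDotFlagsVerbose);
+ */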
+
+/**
+ * \brief Create a user object
+ *
+ * Create a user object with the specified destructor callback and initial reference count. The
+ * initial references are owned by the caller.
+ *
+ * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they
+ * are executed by a shared internal thread. Another thread may be signaled to perform such
+ * actions, if it does not block forward progress of tasks scheduled through CUDA.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object_out      - Location to return the user object handle
+ * \param ptr             - The pointer to pass to the destroy function
+ * \param destroy         - Callback to free the user object when it is no longer in use
+ * \param initialRefcount - The initial refcount to create the object with, typically 1. The
+ *                          initial references are owned by the calling thread.
+ * \param flags           - Currently it is required to pass ::cudaUserObjectNoDestructorSync,
+ *                          which is the only defined flag. This indicates that the destroy
+ *                          callback cannot be waited on by any CUDA API. Users requiring
+ *                          synchronization of the callback should signal its completion
+ *                          manually.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectRetain,
+ * ::cudaUserObjectRelease,
+ * ::cudaGraphRetainUserObject,
+ * ::cudaGraphReleaseUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaUserObjectCreate(cudaUserObject_t *object_out, void *ptr, cudaHostFn_t destroy, unsigned int initialRefcount, unsigned int flags);
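+
+/*
+ * A minimal lifetime-management sketch, assuming a hypothetical resource
+ * pointer, a destructor deleteResource with the cudaHostFn_t signature, and a
+ * graph handle; ownership of the single reference is moved into the graph, so
+ * the resource is released only once the graph no longer needs it:
+ *
+ *   cudaUserObject_t obj;
+ *   cudaUserObjectCreate(&obj, resourcePtr, deleteResource, 1, cudaUserObjectNoDestructorSync);
+ *   cudaGraphRetainUserObject(graph, obj, 1, cudaGraphUserObjectMove);
+ */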
+
+/**
+ * \brief Retain a reference to a user object
+ *
+ * Retains new references to a user object. The new references are owned by the caller.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to retain
+ * \param count  - The number of references to retain, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate,
+ * ::cudaUserObjectRelease,
+ * ::cudaGraphRetainUserObject,
+ * ::cudaGraphReleaseUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaUserObjectRetain(cudaUserObject_t object, unsigned int count __dv(1));
+
+/**
+ * \brief Release a reference to a user object
+ *
+ * Releases user object references owned by the caller. The object's destructor is invoked if
+ * the reference count reaches zero.
+ *
+ * It is undefined behavior to release references not owned by the caller, or to use a user
+ * object handle after all references are released.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param object - The object to release
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate,
+ * ::cudaUserObjectRetain,
+ * ::cudaGraphRetainUserObject,
+ * ::cudaGraphReleaseUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaUserObjectRelease(cudaUserObject_t object, unsigned int count __dv(1));
+
+/**
+ * \brief Retain a reference to a user object from a graph
+ *
+ * Creates or moves user object references that will be owned by a CUDA graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph to associate the reference with
+ * \param object - The user object to retain a reference for
+ * \param count  - The number of references to add to the graph, typically 1. Must be
+ *                 nonzero and not larger than INT_MAX.
+ * \param flags  - The optional flag ::cudaGraphUserObjectMove transfers references
+ *                 from the calling thread, rather than create new references. Pass 0
+ *                 to create new references.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate,
+ * ::cudaUserObjectRetain,
+ * ::cudaUserObjectRelease,
+ * ::cudaGraphReleaseUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphRetainUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1), unsigned int flags __dv(0));
+
+/**
+ * \brief Release a user object reference from a graph
+ *
+ * Releases user object references owned by a graph.
+ *
+ * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects.
+ *
+ * \param graph  - The graph that will release the reference
+ * \param object - The user object to release a reference for
+ * \param count  - The number of references to release, typically 1. Must be nonzero
+ *                 and not larger than INT_MAX.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate,
+ * ::cudaUserObjectRetain,
+ * ::cudaUserObjectRelease,
+ * ::cudaGraphRetainUserObject,
+ * ::cudaGraphCreate
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1));
+
+/** @} */ /* END CUDART_GRAPH */
+
+/**
+ * \defgroup CUDART_DRIVER_ENTRY_POINT Driver Entry Point Access
+ *
+ * ___MANBRIEF___ driver entry point access functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the driver entry point access functions of CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the requested driver API function pointer
+ *
+ * Returns in \p funcPtr the address of the CUDA driver function for the requested flags.
+ *
+ * For a requested driver symbol, if the CUDA version in which the driver symbol was
+ * introduced is less than or equal to the CUDA runtime version, the API will return
+ * the function pointer to the corresponding versioned driver function.
+ *
+ * The pointer returned by the API should be cast to a function pointer matching the
+ * requested driver function's definition in the API header file. The function pointer
+ * typedef can be picked up from the corresponding typedefs header file. For example,
+ * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h.
+ *
+ * The API will return ::cudaErrorSymbolNotFound if the requested driver function is not
+ * supported on the platform, no ABI compatible driver function exists for the CUDA runtime
+ * version or if the driver symbol is invalid.
+ *
+ * The requested flags can be:
+ * - ::cudaEnableDefault: This is the default mode. This is equivalent to
+ *   ::cudaEnablePerThreadDefaultStream if the code is compiled with
+ *   --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM
+ *   is defined; ::cudaEnableLegacyStream otherwise.
+ * - ::cudaEnableLegacyStream: This will enable the search for all driver symbols
+ *   that match the requested driver symbol name except the corresponding per-thread versions.
+ * - ::cudaEnablePerThreadDefaultStream: This will enable the search for all
+ *   driver symbols that match the requested driver symbol name including the per-thread
+ *   versions. If a per-thread version is not found, the API will return the legacy version
+ *   of the driver function.
+ *
+ * \param symbol - The base name of the driver API function to look for. As an example,
+ *                 for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc.
+ *                 Note that the API will use the CUDA runtime version to return the
+ *                 address to the most recent ABI compatible driver symbol, ::cuMemAlloc
+ *                 or ::cuMemAlloc_v2.
+ * \param funcPtr - Location to return the function pointer to the requested driver function
+ * \param flags -  Flags to specify search options.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorSymbolNotFound
+ * \note_version_mixing
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuGetProcAddress
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(const char *symbol, void **funcPtr, unsigned long long flags);
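+
+/*
+ * A minimal usage sketch; the PFN typedef is spelled out locally here instead
+ * of being taken from cudaTypedefs.h, and the driver types CUresult and
+ * CUdeviceptr come from cuda.h:
+ *
+ *   typedef CUresult (CUDAAPI *cuMemAlloc_pfn)(CUdeviceptr *dptr, size_t bytesize);
+ *   void *fn = NULL;
+ *   if (cudaGetDriverEntryPoint("cuMemAlloc", &fn, cudaEnableDefault) == cudaSuccess) {
+ *       cuMemAlloc_pfn pfnMemAlloc = (cuMemAlloc_pfn)fn;
+ *       CUdeviceptr dptr;
+ *       pfnMemAlloc(&dptr, 1024);   // resolves to the most recent ABI-compatible cuMemAlloc
+ *   }
+ */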
+
+/** @} */ /* END CUDART_DRIVER_ENTRY_POINT */
+
+/** \cond impl_private */
+extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pExportTableId);
+/** \endcond impl_private */
+
+/**
+ * \defgroup CUDART_HIGHLEVEL C++ API Routines
+ *
+ * ___MANBRIEF___ C++ high level API functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the C++ high level API functions of the CUDA runtime
+ * application programming interface. To use these functions, your
+ * application needs to be compiled with the \p nvcc compiler.
+ *
+ * \brief C++-style interface built on top of CUDA runtime API
+ */
+
+/**
+ * \defgroup CUDART_DRIVER Interactions with the CUDA Driver API
+ *
+ * ___MANBRIEF___ interactions between CUDA Driver API and CUDA Runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the interactions between the CUDA Driver API and the CUDA Runtime API.
+ *
+ * @{
+ *
+ * \section CUDART_CUDA_primary Primary Contexts
+ *
+ * There exists a one-to-one relationship between CUDA devices in the CUDA Runtime
+ * API and ::CUcontext s in the CUDA Driver API within a process.  The specific
+ * context which the CUDA Runtime API uses for a device is called the device's
+ * primary context.  From the perspective of the CUDA Runtime API, a device and 
+ * its primary context are synonymous.
+ *
+ * \section CUDART_CUDA_init Initialization and Tear-Down
+ *
+ * CUDA Runtime API calls operate on the CUDA Driver API ::CUcontext which is current to
+ * the calling host thread.
+ *
+ * The function ::cudaSetDevice() makes the primary context for the
+ * specified device current to the calling thread by calling ::cuCtxSetCurrent().
+ *
+ * The CUDA Runtime API will automatically initialize the primary context for
+ * a device at the first CUDA Runtime API call which requires an active context.
+ * If no ::CUcontext is current to the calling thread when a CUDA Runtime API call 
+ * which requires an active context is made, then the primary context for a device 
+ * will be selected, made current to the calling thread, and initialized.
+ *
+ * The context which the CUDA Runtime API initializes will be initialized using 
+ * the parameters specified by the CUDA Runtime API functions
+ * ::cudaSetDeviceFlags(), 
+ * ::cudaD3D9SetDirect3DDevice(), 
+ * ::cudaD3D10SetDirect3DDevice(), 
+ * ::cudaD3D11SetDirect3DDevice(), 
+ * ::cudaGLSetGLDevice(), and
+ * ::cudaVDPAUSetVDPAUDevice().
+ * Note that these functions will fail with ::cudaErrorSetOnActiveProcess if they are 
+ * called when the primary context for the specified device has already been initialized
+ * (or if the current device has already been initialized, in the case of
+ * ::cudaSetDeviceFlags()).
+ *
+ * Primary contexts will remain active until they are explicitly deinitialized 
+ * using ::cudaDeviceReset().  The function ::cudaDeviceReset() will deinitialize the 
+ * primary context for the calling thread's current device immediately.  The context 
+ * will remain current to all of the threads that it was current to.  The next CUDA 
+ * Runtime API call on any thread which requires an active context will trigger the 
+ * reinitialization of that device's primary context.
+ *
+ * Note that primary contexts are shared resources. It is recommended that
+ * the primary context not be reset except just before exit or to recover from an
+ * unspecified launch failure.
+ * 
+ * \section CUDART_CUDA_context Context Interoperability
+ *
+ * Note that the use of multiple ::CUcontext s per device within a single process 
+ * will substantially degrade performance and is strongly discouraged.  Instead,
+ * it is highly recommended that the implicit one-to-one device-to-context mapping
+ * for the process provided by the CUDA Runtime API be used.
+ *
+ * If a non-primary ::CUcontext created by the CUDA Driver API is current to a
+ * thread then the CUDA Runtime API calls to that thread will operate on that 
+ * ::CUcontext, with some exceptions listed below.  Interoperability between data
+ * types is discussed in the following sections.
+ *
+ * The function ::cudaPointerGetAttributes() will return the error 
+ * ::cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a 
+ * non-primary context.  The function ::cudaDeviceEnablePeerAccess() and the rest of 
+ * the peer access API may not be called when a non-primary ::CUcontext is current.  
+ * To use the pointer query and peer access APIs with a context created using the 
+ * CUDA Driver API, it is necessary that the CUDA Driver API be used to access
+ * these features.
+ *
+ * All CUDA Runtime API state (e.g., global variables' addresses and values) travels
+ * with its underlying ::CUcontext.  In particular, if a ::CUcontext is moved from one 
+ * thread to another then all CUDA Runtime API state will move to that thread as well.
+ *
+ * Please note that attaching to legacy contexts (those with a version of 3010 as returned
+ * by ::cuCtxGetApiVersion()) is not possible. The CUDA Runtime will return
+ * ::cudaErrorIncompatibleDriverContext in such cases.
+ *
+ * \section CUDART_CUDA_stream Interactions between CUstream and cudaStream_t
+ *
+ * The types ::CUstream and ::cudaStream_t are identical and may be used interchangeably.
+ *
+ * \section CUDART_CUDA_event Interactions between CUevent and cudaEvent_t
+ *
+ * The types ::CUevent and ::cudaEvent_t are identical and may be used interchangeably.
+ *
+ * \section CUDART_CUDA_array Interactions between CUarray and cudaArray_t 
+ *
+ * The types ::CUarray and struct ::cudaArray * represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray *,
+ * it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray *.
+ *
+ * In order to use a struct ::cudaArray * in a CUDA Driver API function which takes a ::CUarray,
+ * it is necessary to explicitly cast the struct ::cudaArray * to a ::CUarray .
+ *
+ * \section CUDART_CUDA_graphicsResource Interactions between CUgraphicsResource and cudaGraphicsResource_t
+ *
+ * The types ::CUgraphicsResource and ::cudaGraphicsResource_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUgraphicsResource in a CUDA Runtime API function which takes a 
+ * ::cudaGraphicsResource_t, it is necessary to explicitly cast the ::CUgraphicsResource 
+ * to a ::cudaGraphicsResource_t.
+ *
+ * In order to use a ::cudaGraphicsResource_t in a CUDA Driver API function which takes a
+ * ::CUgraphicsResource, it is necessary to explicitly cast the ::cudaGraphicsResource_t 
+ * to a ::CUgraphicsResource.
+ *
+ * \section CUDART_CUDA_texture_objects Interactions between CUtexObject and cudaTextureObject_t
+ *
+ * The types ::CUtexObject and ::cudaTextureObject_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUtexObject in a CUDA Runtime API function which takes a ::cudaTextureObject_t,
+ * it is necessary to explicitly cast the ::CUtexObject to a ::cudaTextureObject_t.
+ *
+ * In order to use a ::cudaTextureObject_t in a CUDA Driver API function which takes a ::CUtexObject,
+ * it is necessary to explicitly cast the ::cudaTextureObject_t to a ::CUtexObject.
+ *
+ * \section CUDART_CUDA_surface_objects Interactions between CUsurfObject and cudaSurfaceObject_t
+ *
+ * The types ::CUsurfObject and ::cudaSurfaceObject_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::CUsurfObject in a CUDA Runtime API function which takes a ::cudaSurfaceObject_t,
+ * it is necessary to explicitly cast the ::CUsurfObject to a ::cudaSurfaceObject_t.
+ *
+ * In order to use a ::cudaSurfaceObject_t in a CUDA Driver API function which takes a ::CUsurfObject,
+ * it is necessary to explicitly cast the ::cudaSurfaceObject_t to a ::CUsurfObject.
+ *
+ * \section CUDART_CUDA_module Interactions between CUfunction and cudaFunction_t
+ *
+ * The types ::CUfunction and ::cudaFunction_t represent the same data type and may be used
+ * interchangeably by casting the two types between each other.
+ *
+ * In order to use a ::cudaFunction_t in a CUDA Driver API function which takes a ::CUfunction,
+ * it is necessary to explicitly cast the ::cudaFunction_t to a ::CUfunction.
+ *
+ */
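+
+/*
+ * A minimal interoperability sketch, assuming a hypothetical CUarray handle
+ * cuArr obtained through the driver API: stream handles are interchangeable,
+ * while array handles require an explicit cast in each direction.
+ *
+ *   cudaStream_t stream;
+ *   cudaStreamCreate(&stream);
+ *   cuStreamSynchronize((CUstream)stream);                 // CUstream and cudaStream_t are identical
+ *   struct cudaArray *rtArr = (struct cudaArray *)cuArr;   // driver handle -> runtime handle
+ *   CUarray backToDriver    = (CUarray)rtArr;              // runtime handle -> driver handle
+ */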
+
+ /**
+  * \brief Get pointer to device entry function that matches entry function \p symbolPtr
+  *
+  * Returns in \p functionPtr the device entry function corresponding to the symbol \p symbolPtr.
+  *
+  * \param functionPtr     - Returns the device entry function
+  * \param symbolPtr       - Pointer to device entry function to search for
+  *
+  * \return
+  * ::cudaSuccess
+  *
+  */
+extern __host__ cudaError_t CUDARTAPI_CDECL cudaGetFuncBySymbol(cudaFunction_t* functionPtr, const void* symbolPtr);
+
+/** @} */ /* END CUDART_DRIVER */
+
+#if defined(__CUDA_API_VERSION_INTERNAL)
+    #undef cudaMemcpy
+    #undef cudaMemcpyToSymbol
+    #undef cudaMemcpyFromSymbol
+    #undef cudaMemcpy2D
+    #undef cudaMemcpyToArray
+    #undef cudaMemcpy2DToArray
+    #undef cudaMemcpyFromArray
+    #undef cudaMemcpy2DFromArray
+    #undef cudaMemcpyArrayToArray
+    #undef cudaMemcpy2DArrayToArray
+    #undef cudaMemcpy3D
+    #undef cudaMemcpy3DPeer
+    #undef cudaMemset
+    #undef cudaMemset2D
+    #undef cudaMemset3D
+    #undef cudaMemcpyAsync
+    #undef cudaMemcpyToSymbolAsync
+    #undef cudaMemcpyFromSymbolAsync
+    #undef cudaMemcpy2DAsync
+    #undef cudaMemcpyToArrayAsync
+    #undef cudaMemcpy2DToArrayAsync
+    #undef cudaMemcpyFromArrayAsync
+    #undef cudaMemcpy2DFromArrayAsync
+    #undef cudaMemcpy3DAsync
+    #undef cudaMemcpy3DPeerAsync
+    #undef cudaMemsetAsync
+    #undef cudaMemset2DAsync
+    #undef cudaMemset3DAsync
+    #undef cudaStreamQuery
+    #undef cudaStreamGetFlags
+    #undef cudaStreamGetPriority
+    #undef cudaEventRecord
+    #undef cudaEventRecordWithFlags
+    #undef cudaStreamWaitEvent
+    #undef cudaStreamAddCallback
+    #undef cudaStreamAttachMemAsync
+    #undef cudaStreamSynchronize
+    #undef cudaLaunchKernel
+    #undef cudaLaunchKernelExC
+    #undef cudaLaunchHostFunc
+    #undef cudaMemPrefetchAsync
+    #undef cudaLaunchCooperativeKernel
+    #undef cudaSignalExternalSemaphoresAsync
+    #undef cudaWaitExternalSemaphoresAsync
+    #undef cudaGraphUpload
+    #undef cudaGraphLaunch
+    #undef cudaStreamBeginCapture
+    #undef cudaStreamEndCapture
+    #undef cudaStreamIsCapturing
+    #undef cudaStreamGetCaptureInfo
+    #undef cudaStreamGetCaptureInfo_v2
+    #undef cudaStreamCopyAttributes
+    #undef cudaStreamGetAttribute
+    #undef cudaStreamSetAttribute
+    #undef cudaMallocAsync
+    #undef cudaFreeAsync
+    #undef cudaMallocFromPoolAsync
+    #undef cudaGetDriverEntryPoint
+
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p);
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p);
+    extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count);
+    extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height);
+    extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream __dv(0), unsigned int flags __dv(0));
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags);
+    extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length, unsigned int flags);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args);
+    extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream, cudaHostFn_t fn, void *userData);
+    extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync_ptsz(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync_ptsz(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaGraphUpload(cudaGraphExec_t graphExec, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_v2(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0), const cudaGraphNode_t **dependencies_out __dv(0), size_t *numDependencies_out __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies_ptsz(cudaStream_t stream, cudaGraphNode_t *dependencies, size_t numDependencies, unsigned int flags __dv(0));
+    extern __host__ cudaError_t CUDARTAPI cudaStreamCopyAttributes(cudaStream_t dstStream, cudaStream_t srcStream);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamGetAttribute(cudaStream_t stream, cudaStreamAttrID attr, cudaStreamAttrValue *value);
+    extern __host__ cudaError_t CUDARTAPI cudaStreamSetAttribute(cudaStream_t stream, cudaStreamAttrID attr, const cudaStreamAttrValue *param);
+
+    extern __host__ cudaError_t CUDARTAPI cudaMallocAsync(void **devPtr, size_t size, cudaStream_t hStream);
+    extern __host__ cudaError_t CUDARTAPI cudaFreeAsync(void *devPtr, cudaStream_t hStream);
+    extern __host__ cudaError_t CUDARTAPI cudaMallocFromPoolAsync(void **ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream);
+    extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(const char *symbol, void **funcPtr, unsigned long long flags);
+
+#elif defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM)
+    // nvcc stubs reference the 'cudaLaunch'/'cudaLaunchKernel' identifier even if it was defined
+    // to 'cudaLaunch_ptsz'/'cudaLaunchKernel_ptsz'. Redirect through a static inline function.
+    #undef cudaLaunchKernel
+    static __inline__ __host__ cudaError_t cudaLaunchKernel(const void *func, 
+                                                            dim3 gridDim, dim3 blockDim, 
+                                                            void **args, size_t sharedMem, 
+                                                            cudaStream_t stream)
+    {
+        return cudaLaunchKernel_ptsz(func, gridDim, blockDim, args, sharedMem, stream);
+    }
+    #define cudaLaunchKernel __CUDART_API_PTSZ(cudaLaunchKernel)
+    #undef cudaLaunchKernelExC
+    static __inline__ __host__ cudaError_t cudaLaunchKernelExC(const cudaLaunchConfig_t *config,
+                                                               const void *func,
+                                                               void **args)
+    {
+        return cudaLaunchKernelExC_ptsz(config, func, args);
+    }
+    #define cudaLaunchKernelExC __CUDART_API_PTSZ(cudaLaunchKernelExC)
+#endif
+
+#if defined(__cplusplus)
+}
+
+#endif /* __cplusplus */
+
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+
+#undef __dv
+#undef __CUDA_DEPRECATED
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_API_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_API_H__
+#endif
+
+#endif /* !__CUDA_RUNTIME_API_H__ */
diff --git a/ext/cudart/include/cuda_surface_types.h b/ext/cudart/include/cuda_surface_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..68e88cfe0e0bbf12eee81fed496c86f39904d4e2
--- /dev/null
+++ b/ext/cudart/include/cuda_surface_types.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_SURFACE_TYPES_H__)
+#define __CUDA_SURFACE_TYPES_H__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "channel_descriptor.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+template<class T, int dim = 1>
+struct __device_builtin_surface_type__ surface : public surfaceReference
+{
+#if !defined(__CUDACC_RTC__)
+  __host__ surface(void)
+  {
+    channelDesc = cudaCreateChannelDesc<T>();
+  }
+
+  __host__ surface(struct cudaChannelFormatDesc desc)
+  {
+    channelDesc = desc;
+  }
+#endif /* !__CUDACC_RTC__ */  
+};
+
+template<int dim>
+struct  __device_builtin_surface_type__  surface<void, dim> : public surfaceReference
+{
+#if !defined(__CUDACC_RTC__)
+  __host__ surface(void)
+  {
+    channelDesc = cudaCreateChannelDesc<void>();
+  }
+#endif /* !__CUDACC_RTC__ */  
+};
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__CUDA_SURFACE_TYPES_H__ */
diff --git a/ext/cudart/include/cuda_texture_types.h b/ext/cudart/include/cuda_texture_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..21c8d944930456a914ba86e0972dbf6ba1bc5bba
--- /dev/null
+++ b/ext/cudart/include/cuda_texture_types.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_TEXTURE_TYPES_H__)
+#define __CUDA_TEXTURE_TYPES_H__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "channel_descriptor.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+template<class T, int texType = cudaTextureType1D, enum cudaTextureReadMode mode = cudaReadModeElementType>
+struct __device_builtin_texture_type__ texture : public textureReference
+{
+#if !defined(__CUDACC_RTC__)
+  __host__ texture(int                         norm  = 0,
+                   enum cudaTextureFilterMode  fMode = cudaFilterModePoint,
+                   enum cudaTextureAddressMode aMode = cudaAddressModeClamp)
+  {
+    normalized     = norm;
+    filterMode     = fMode;
+    addressMode[0] = aMode;
+    addressMode[1] = aMode;
+    addressMode[2] = aMode;
+    channelDesc    = cudaCreateChannelDesc<T>();
+    sRGB           = 0;
+  }
+
+  __host__ texture(int                          norm,
+                   enum cudaTextureFilterMode   fMode,
+                   enum cudaTextureAddressMode  aMode,
+                   struct cudaChannelFormatDesc desc)
+  {
+    normalized     = norm;
+    filterMode     = fMode;
+    addressMode[0] = aMode;
+    addressMode[1] = aMode;
+    addressMode[2] = aMode;
+    channelDesc    = desc;
+    sRGB           = 0;
+  }
+#endif /* !__CUDACC_RTC__ */
+};
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__CUDA_TEXTURE_TYPES_H__ */
diff --git a/ext/cudart/include/cudart_platform.h b/ext/cudart/include/cudart_platform.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f022bbe349eba2219a6b74f1ea315c1ce8551b7
--- /dev/null
+++ b/ext/cudart/include/cudart_platform.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef __CUDART_PLATFORM_H__
+#define __CUDART_PLATFORM_H__
+
+#if ((defined(__linux__) || defined(__QNX__)) && (defined(__arm__) || defined(__aarch64__) || defined(__x86_64__)))
+#define isEglSupported 1
+#endif
+
+#endif
diff --git a/ext/cudart/include/device_atomic_functions.h b/ext/cudart/include/device_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..c96298f2e2e5d68f71ba6529277dd6ec2bc8daae
--- /dev/null
+++ b/ext/cudart/include/device_atomic_functions.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_ATOMIC_FUNCTIONS_H__)
+#define __DEVICE_ATOMIC_FUNCTIONS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#ifndef __CUDA_ARCH__
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+
+#ifdef __CUDA_ARCH__
+extern "C"
+{
+extern __device__ __device_builtin__ int          __iAtomicAdd(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicAdd(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicExch(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicExch(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ float        __fAtomicExch(float *address, float val);
+extern __device__ __device_builtin__ int          __iAtomicMin(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicMin(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicMax(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicMax(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicInc(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicDec(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicAnd(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicAnd(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicOr(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicOr(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicXor(int *address, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicXor(unsigned int *address, unsigned int val);
+extern __device__ __device_builtin__ int          __iAtomicCAS(int *address, int compare, int val);
+extern __device__ __device_builtin__ unsigned int __uAtomicCAS(unsigned int *address, unsigned int compare, unsigned int val);
+}
+#endif /* __CUDA_ARCH__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) __DEF_IF_HOST
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if defined(_WIN32)
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync()."\
+    "To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70."
+#else
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+#endif
+
+extern "C"
+{
+#ifdef __CUDA_ARCH__
+extern __device__ __device_builtin__ unsigned long long int __ullAtomicAdd(unsigned long long int *address, unsigned long long int val);
+extern __device__ __device_builtin__ unsigned long long int __ullAtomicExch(unsigned long long int *address, unsigned long long int val);
+extern __device__ __device_builtin__ unsigned long long int __ullAtomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val);
+#endif  /* __CUDA_ARCH__ */
+extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__any)) int __any(int cond);
+extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__all)) int __all(int cond);
+}
+
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__any)) bool any(bool cond) __DEF_IF_HOST
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__all)) bool all(bool cond) __DEF_IF_HOST
+
+#undef __DEPRECATED__
+#undef __WSB_DEPRECATION_MESSAGE
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "device_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
+
+#endif /* !__DEVICE_ATOMIC_FUNCTIONS_H__ */
diff --git a/ext/cudart/include/device_atomic_functions.hpp b/ext/cudart/include/device_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..50e427c39b164e6e145c3930cd66cc03806175a6
--- /dev/null
+++ b/ext/cudart/include/device_atomic_functions.hpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__)
+#define __DEVICE_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val)
+{
+  return __iAtomicAdd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val)
+{
+  return __iAtomicAdd(address, (unsigned int)-(int)val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd(address, (unsigned int)-(int)val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val)
+{
+  return __iAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val)
+{
+  return __uAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val)
+{
+  return __fAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val)
+{
+  return __iAtomicMin(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMin(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val)
+{
+  return __iAtomicMax(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMax(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val)
+{
+  return __uAtomicInc(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val)
+{
+  return __uAtomicDec(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val)
+{
+  return __iAtomicAnd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAnd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val)
+{
+  return __iAtomicOr(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val)
+{
+  return __uAtomicOr(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val)
+{
+  return __iAtomicXor(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val)
+{
+  return __uAtomicXor(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val)
+{
+  return __iAtomicCAS(address, compare, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
+{
+  return __uAtomicCAS(address, compare, val);
+}
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val)
+{
+  return __ullAtomicAdd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val)
+{
+  return __ullAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
+{
+  return __ullAtomicCAS(address, compare, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond)
+{
+  return (bool)__any((int)cond);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond)
+{
+  return (bool)__all((int)cond);
+}
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/ext/cudart/include/device_double_functions.h b/ext/cudart/include/device_double_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..82b25e59b40aeaf1e475ff3179e49640a44918b8
--- /dev/null
+++ b/ext/cudart/include/device_double_functions.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("device_double_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "device_double_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
+#endif
+
+#include "crt/device_double_functions.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
+#endif
diff --git a/ext/cudart/include/device_functions.h b/ext/cudart/include/device_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..0094cc9a0a57f53f47421a8ecc400fb84c26babe
--- /dev/null
+++ b/ext/cudart/include/device_functions.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("device_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "device_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
+#endif
+
+#include "crt/device_functions.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
+#endif
diff --git a/ext/cudart/include/device_launch_parameters.h b/ext/cudart/include/device_launch_parameters.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f552db8faab7d21e90e06a1ea2184a5563d3bf2
--- /dev/null
+++ b/ext/cudart/include/device_launch_parameters.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_LAUNCH_PARAMETERS_H__)
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+
+#include "vector_types.h"
+
+#if !defined(__STORAGE__)
+
+#if defined(__CUDACC_RTC__)
+#define __STORAGE__ \
+        extern const __device__
+#else /* !__CUDACC_RTC__ */
+#define __STORAGE__ \
+        extern const
+#endif /* __CUDACC_RTC__ */
+
+#endif /* __STORAGE__ */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+uint3 __device_builtin__ __STORAGE__ threadIdx;
+uint3 __device_builtin__ __STORAGE__ blockIdx;
+dim3 __device_builtin__ __STORAGE__ blockDim;
+dim3 __device_builtin__ __STORAGE__ gridDim;
+int __device_builtin__ __STORAGE__ warpSize;
+
+#undef __STORAGE__
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#if !defined(__cudaGet_threadIdx)
+
+#define __cudaGet_threadIdx() \
+        threadIdx
+
+#endif /* __cudaGet_threadIdx */
+
+#if !defined(__cudaGet_blockIdx)
+
+#define __cudaGet_blockIdx() \
+        blockIdx
+
+#endif /* __cudaGet_blockIdx */
+
+#if !defined(__cudaGet_blockDim)
+
+#define __cudaGet_blockDim() \
+        blockDim
+
+#endif /* __cudaGet_blockDim */
+
+#if !defined(__cudaGet_gridDim)
+
+#define __cudaGet_gridDim() \
+        gridDim
+
+#endif /* __cudaGet_gridDim */
+
+#if !defined(__cudaGet_warpSize)
+
+#define __cudaGet_warpSize() \
+        warpSize
+
+#endif /* __cudaGet_warpSize */
+
+#endif /* !__DEVICE_LAUNCH_PARAMETERS_H__ */
diff --git a/ext/cudart/include/device_types.h b/ext/cudart/include/device_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b575a1014c6cdb9bf2f722c2a67e329186079e6
--- /dev/null
+++ b/ext/cudart/include/device_types.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_TYPES_H__)
+#define __DEVICE_TYPES_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
+#endif
+
+#ifndef __DOXYGEN_ONLY__
+#include "crt/host_defines.h"
+#endif
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+enum __device_builtin__ cudaRoundMode
+{
+    cudaRoundNearest,
+    cudaRoundZero,
+    cudaRoundPosInf,
+    cudaRoundMinInf
+};
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__
+#endif
+
+#endif /* !__DEVICE_TYPES_H__ */
diff --git a/ext/cudart/include/driver_functions.h b/ext/cudart/include/driver_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..94767974220594550d496cad4d14c45349b27737
--- /dev/null
+++ b/ext/cudart/include/driver_functions.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DRIVER_FUNCTIONS_H__)
+#define __DRIVER_FUNCTIONS_H__
+
+#include "builtin_types.h"
+#include "crt/host_defines.h"
+#include "driver_types.h"
+
+/**
+ * \addtogroup CUDART_MEMORY
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns a cudaPitchedPtr based on input parameters
+ *
+ * Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
+ * \p p, \p xsz, and \p ysz.
+ *
+ * \param d   - Pointer to allocated memory
+ * \param p   - Pitch of allocated memory in bytes
+ * \param xsz - Logical width of allocation in elements
+ * \param ysz - Logical height of allocation in elements
+ *
+ * \return
+ * ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
+ *
+ * \sa make_cudaExtent, make_cudaPos
+ */
+static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz) 
+{
+  struct cudaPitchedPtr s;
+
+  s.ptr   = d;
+  s.pitch = p;
+  s.xsize = xsz;
+  s.ysize = ysz;
+
+  return s;
+}
+
+/**
+ * \brief Returns a cudaPos based on input parameters
+ *
+ * Returns a ::cudaPos based on the specified input parameters \p x,
+ * \p y, and \p z.
+ *
+ * \param x - X position
+ * \param y - Y position
+ * \param z - Z position
+ *
+ * \return
+ * ::cudaPos specified by \p x, \p y, and \p z
+ *
+ * \sa make_cudaExtent, make_cudaPitchedPtr
+ */
+static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z) 
+{
+  struct cudaPos p;
+
+  p.x = x;
+  p.y = y;
+  p.z = z;
+
+  return p;
+}
+
+/**
+ * \brief Returns a cudaExtent based on input parameters
+ *
+ * Returns a ::cudaExtent based on the specified input parameters \p w,
+ * \p h, and \p d.
+ *
+ * \param w - Width in elements when referring to array memory, in bytes when referring to linear memory
+ * \param h - Height in elements
+ * \param d - Depth in elements
+ *
+ * \return
+ * ::cudaExtent specified by \p w, \p h, and \p d
+ *
+ * \sa make_cudaPitchedPtr, make_cudaPos
+ */
+static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d) 
+{
+  struct cudaExtent e;
+
+  e.width  = w;
+  e.height = h;
+  e.depth  = d;
+
+  return e;
+}
+
+/** @} */ /* END CUDART_MEMORY */
+
+#endif /* !__DRIVER_FUNCTIONS_H__ */
diff --git a/ext/cudart/include/driver_types.h b/ext/cudart/include/driver_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..47b54f94d6a1dcd278ddda0dd0fa5a4ca866a5ff
--- /dev/null
+++ b/ext/cudart/include/driver_types.h
@@ -0,0 +1,3093 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DRIVER_TYPES_H__)
+#define __DRIVER_TYPES_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__
+#endif
+
+#ifndef __DOXYGEN_ONLY__
+#include "crt/host_defines.h"
+#endif
+#include "vector_types.h"
+
+
+
+/**
+ * \defgroup CUDART_TYPES Data types used by CUDA Runtime
+ * \ingroup CUDART
+ *
+ * @{
+ */
+
+/*******************************************************************************
+*                                                                              *
+*  TYPE DEFINITIONS USED BY RUNTIME API                                        *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDA_INTERNAL_COMPILATION__)
+
+#if !defined(__CUDACC_RTC__)
+#include <limits.h>
+#include <stddef.h>
+#endif /* !defined(__CUDACC_RTC__) */
+
+#define cudaHostAllocDefault                0x00  /**< Default page-locked allocation flag */
+#define cudaHostAllocPortable               0x01  /**< Pinned memory accessible by all CUDA contexts */
+#define cudaHostAllocMapped                 0x02  /**< Map allocation into device space */
+#define cudaHostAllocWriteCombined          0x04  /**< Write-combined memory */
+
+#define cudaHostRegisterDefault             0x00  /**< Default host memory registration flag */
+#define cudaHostRegisterPortable            0x01  /**< Pinned memory accessible by all CUDA contexts */
+#define cudaHostRegisterMapped              0x02  /**< Map registered memory into device space */
+#define cudaHostRegisterIoMemory            0x04  /**< Memory-mapped I/O space */
+#define cudaHostRegisterReadOnly            0x08  /**< Memory-mapped read-only */
+
+#define cudaPeerAccessDefault               0x00  /**< Default peer addressing enable flag */
+
+#define cudaStreamDefault                   0x00  /**< Default stream flag */
+#define cudaStreamNonBlocking               0x01  /**< Stream does not synchronize with stream 0 (the NULL stream) */
+
+ /**
+ * Legacy stream handle
+ *
+ * Stream handle that can be passed as a cudaStream_t to use an implicit stream
+ * with legacy synchronization behavior.
+ *
+ * See details of the \link_sync_behavior
+ */
+#define cudaStreamLegacy                    ((cudaStream_t)0x1)
+
+/**
+ * Per-thread stream handle
+ *
+ * Stream handle that can be passed as a cudaStream_t to use an implicit stream
+ * with per-thread synchronization behavior.
+ *
+ * See details of the \link_sync_behavior
+ */
+#define cudaStreamPerThread                 ((cudaStream_t)0x2)
+
+#define cudaEventDefault                    0x00  /**< Default event flag */
+#define cudaEventBlockingSync               0x01  /**< Event uses blocking synchronization */
+#define cudaEventDisableTiming              0x02  /**< Event will not record timing data */
+#define cudaEventInterprocess               0x04  /**< Event is suitable for interprocess use. cudaEventDisableTiming must be set */
+
+#define cudaEventRecordDefault              0x00  /**< Default event record flag */
+#define cudaEventRecordExternal             0x01  /**< Event is captured in the graph as an external event node when performing stream capture */
+
+#define cudaEventWaitDefault                0x00  /**< Default event wait flag */
+#define cudaEventWaitExternal               0x01  /**< Event is captured in the graph as an external event node when performing stream capture */
+
+#define cudaDeviceScheduleAuto              0x00  /**< Device flag - Automatic scheduling */
+#define cudaDeviceScheduleSpin              0x01  /**< Device flag - Spin default scheduling */
+#define cudaDeviceScheduleYield             0x02  /**< Device flag - Yield default scheduling */
+#define cudaDeviceScheduleBlockingSync      0x04  /**< Device flag - Use blocking synchronization */
+#define cudaDeviceBlockingSync              0x04  /**< Device flag - Use blocking synchronization 
+                                                    *  \deprecated This flag was deprecated as of CUDA 4.0 and
+                                                    *  replaced with ::cudaDeviceScheduleBlockingSync. */
+#define cudaDeviceScheduleMask              0x07  /**< Device schedule flags mask */
+#define cudaDeviceMapHost                   0x08  /**< Device flag - Support mapped pinned allocations */
+#define cudaDeviceLmemResizeToMax           0x10  /**< Device flag - Keep local memory allocation after launch */
+#define cudaDeviceMask                      0x1f  /**< Device flags mask */
+
+#define cudaArrayDefault                    0x00  /**< Default CUDA array allocation flag */
+#define cudaArrayLayered                    0x01  /**< Must be set in cudaMalloc3DArray to create a layered CUDA array */
+#define cudaArraySurfaceLoadStore           0x02  /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array */
+#define cudaArrayCubemap                    0x04  /**< Must be set in cudaMalloc3DArray to create a cubemap CUDA array */
+#define cudaArrayTextureGather              0x08  /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array */
+#define cudaArrayColorAttachment            0x20  /**< Must be set in cudaExternalMemoryGetMappedMipmappedArray if the mipmapped array is used as a color target in a graphics API */
+#define cudaArraySparse                     0x40  /**< Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a sparse CUDA array or CUDA mipmapped array */
+#define cudaArrayDeferredMapping            0x80  /**< Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a deferred mapping CUDA array or CUDA mipmapped array */
+
+#define cudaIpcMemLazyEnablePeerAccess      0x01  /**< Automatically enable peer access between remote devices as needed */
+
+#define cudaMemAttachGlobal                 0x01  /**< Memory can be accessed by any stream on any device*/
+#define cudaMemAttachHost                   0x02  /**< Memory cannot be accessed by any stream on any device */
+#define cudaMemAttachSingle                 0x04  /**< Memory can only be accessed by a single stream on the associated device */
+
+#define cudaOccupancyDefault                0x00  /**< Default behavior */
+#define cudaOccupancyDisableCachingOverride 0x01  /**< Assume global caching is enabled and cannot be automatically turned off */
+
+#define cudaCpuDeviceId                     ((int)-1) /**< Device id that represents the CPU */
+#define cudaInvalidDeviceId                 ((int)-2) /**< Device id that represents an invalid device */
+
+/**
+ * If set, each kernel launched as part of ::cudaLaunchCooperativeKernelMultiDevice only
+ * waits for prior work in the stream corresponding to that GPU to complete before the
+ * kernel begins execution.
+ */
+#define cudaCooperativeLaunchMultiDeviceNoPreSync  0x01
+
+/**
+ * If set, any subsequent work pushed in a stream that participated in a call to
+ * ::cudaLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on
+ * the GPU corresponding to that stream to complete before it begins execution.
+ */
+#define cudaCooperativeLaunchMultiDeviceNoPostSync 0x02
+
+#endif /* !__CUDA_INTERNAL_COMPILATION__ */
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+/** \endcond impl_private */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * CUDA error types
+ */
+enum __device_builtin__ cudaError
+{
+    /**
+     * The API call returned with no errors. In the case of query calls, this
+     * also means that the operation being queried is complete (see
+     * ::cudaEventQuery() and ::cudaStreamQuery()).
+     */
+    cudaSuccess                           =      0,
+  
+    /**
+     * This indicates that one or more of the parameters passed to the API call
+     * is not within an acceptable range of values.
+     */
+    cudaErrorInvalidValue                 =     1,
+  
+    /**
+     * The API call failed because it was unable to allocate enough memory to
+     * perform the requested operation.
+     */
+    cudaErrorMemoryAllocation             =      2,
+  
+    /**
+     * The API call failed because the CUDA driver and runtime could not be
+     * initialized.
+     */
+    cudaErrorInitializationError          =      3,
+  
+    /**
+     * This indicates that a CUDA Runtime API call cannot be executed because
+     * it is being called during process shut down, at a point in time after
+     * CUDA driver has been unloaded.
+     */
+    cudaErrorCudartUnloading              =     4,
+
+    /**
+     * This indicates profiler is not initialized for this run. This can
+     * happen when the application is running with external profiling tools
+     * like visual profiler.
+     */
+    cudaErrorProfilerDisabled             =     5,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to attempt to enable/disable the profiling via ::cudaProfilerStart or
+     * ::cudaProfilerStop without initialization.
+     */
+    cudaErrorProfilerNotInitialized       =     6,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cudaProfilerStart() when profiling is already enabled.
+     */
+    cudaErrorProfilerAlreadyStarted       =     7,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cudaProfilerStop() when profiling is already disabled.
+     */
+    cudaErrorProfilerAlreadyStopped       =     8,
+  
+    /**
+     * This indicates that a kernel launch is requesting resources that can
+     * never be satisfied by the current device. Requesting more shared memory
+     * per block than the device supports will trigger this error, as will
+     * requesting too many threads or blocks. See ::cudaDeviceProp for more
+     * device limitations.
+     */
+    cudaErrorInvalidConfiguration         =      9,
+  
+    /**
+     * This indicates that one or more of the pitch-related parameters passed
+     * to the API call is not within the acceptable range for pitch.
+     */
+    cudaErrorInvalidPitchValue            =     12,
+  
+    /**
+     * This indicates that the symbol name/identifier passed to the API call
+     * is not a valid name or identifier.
+     */
+    cudaErrorInvalidSymbol                =     13,
+  
+    /**
+     * This indicates that at least one host pointer passed to the API call is
+     * not a valid host pointer.
+     * \deprecated
+     * This error return is deprecated as of CUDA 10.1.
+     */
+    cudaErrorInvalidHostPointer           =     16,
+  
+    /**
+     * This indicates that at least one device pointer passed to the API call is
+     * not a valid device pointer.
+     * \deprecated
+     * This error return is deprecated as of CUDA 10.1.
+     */
+    cudaErrorInvalidDevicePointer         =     17,
+  
+    /**
+     * This indicates that the texture passed to the API call is not a valid
+     * texture.
+     */
+    cudaErrorInvalidTexture               =     18,
+  
+    /**
+     * This indicates that the texture binding is not valid. This occurs if you
+     * call ::cudaGetTextureAlignmentOffset() with an unbound texture.
+     */
+    cudaErrorInvalidTextureBinding        =     19,
+  
+    /**
+     * This indicates that the channel descriptor passed to the API call is not
+     * valid. This occurs if the format is not one of the formats specified by
+     * ::cudaChannelFormatKind, or if one of the dimensions is invalid.
+     */
+    cudaErrorInvalidChannelDescriptor     =     20,
+  
+    /**
+     * This indicates that the direction of the memcpy passed to the API call is
+     * not one of the types specified by ::cudaMemcpyKind.
+     */
+    cudaErrorInvalidMemcpyDirection       =     21,
+  
+    /**
+     * This indicated that the user has taken the address of a constant variable,
+     * which was forbidden up until the CUDA 3.1 release.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Variables in constant
+     * memory may now have their address taken by the runtime via
+     * ::cudaGetSymbolAddress().
+     */
+    cudaErrorAddressOfConstant            =     22,
+  
+    /**
+     * This indicated that a texture fetch was not able to be performed.
+     * This was previously used for device emulation of texture operations.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorTextureFetchFailed           =     23,
+  
+    /**
+     * This indicated that a texture was not bound for access.
+     * This was previously used for device emulation of texture operations.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorTextureNotBound              =     24,
+  
+    /**
+     * This indicated that a synchronization operation had failed.
+     * This was previously used for some device emulation functions.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorSynchronizationError         =     25,
+  
+    /**
+     * This indicates that a non-float texture was being accessed with linear
+     * filtering. This is not supported by CUDA.
+     */
+    cudaErrorInvalidFilterSetting         =     26,
+  
+    /**
+     * This indicates that an attempt was made to read a non-float texture as a
+     * normalized float. This is not supported by CUDA.
+     */
+    cudaErrorInvalidNormSetting           =     27,
+  
+    /**
+     * Mixing of device and device emulation code was not allowed.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorMixedDeviceExecution         =     28,
+
+    /**
+     * This indicates that the API call is not yet implemented. Production
+     * releases of CUDA will never return this error.
+     * \deprecated
+     * This error return is deprecated as of CUDA 4.1.
+     */
+    cudaErrorNotYetImplemented            =     31,
+  
+    /**
+     * This indicated that an emulated device pointer exceeded the 32-bit address
+     * range.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorMemoryValueTooLarge          =     32,
+  
+    /**
+     * This indicates that the CUDA driver that the application has loaded is a
+     * stub library. Applications that run with the stub rather than a real
+     * driver loaded will result in the CUDA API returning this error.
+     */
+    cudaErrorStubLibrary                  =     34,
+
+    /**
+     * This indicates that the installed NVIDIA CUDA driver is older than the
+     * CUDA runtime library. This is not a supported configuration. Users should
+     * install an updated NVIDIA display driver to allow the application to run.
+     */
+    cudaErrorInsufficientDriver           =     35,
+
+    /**
+     * This indicates that the API call requires a newer CUDA driver than the one
+     * currently installed. Users should install an updated NVIDIA CUDA driver
+     * to allow the API call to succeed.
+     */
+    cudaErrorCallRequiresNewerDriver      =     36,
+  
+    /**
+     * This indicates that the surface passed to the API call is not a valid
+     * surface.
+     */
+    cudaErrorInvalidSurface               =     37,
+  
+    /**
+     * This indicates that multiple global or constant variables (across separate
+     * CUDA source files in the application) share the same string name.
+     */
+    cudaErrorDuplicateVariableName        =     43,
+  
+    /**
+     * This indicates that multiple textures (across separate CUDA source
+     * files in the application) share the same string name.
+     */
+    cudaErrorDuplicateTextureName         =     44,
+  
+    /**
+     * This indicates that multiple surfaces (across separate CUDA source
+     * files in the application) share the same string name.
+     */
+    cudaErrorDuplicateSurfaceName         =     45,
+  
+    /**
+     * This indicates that all CUDA devices are busy or unavailable at the current
+     * time. Devices are often busy/unavailable due to use of
+     * ::cudaComputeModeProhibited, ::cudaComputeModeExclusiveProcess, or when long
+     * running CUDA kernels have filled up the GPU and are blocking new work
+     * from starting. They can also be unavailable due to memory constraints
+     * on a device that already has active CUDA work being performed.
+     */
+    cudaErrorDevicesUnavailable           =     46,
+  
+    /**
+     * This indicates that the current context is not compatible with
+     * the CUDA Runtime. This can only occur if you are using CUDA
+     * Runtime/Driver interoperability and have created an existing Driver
+     * context using the driver API. The Driver context may be incompatible
+     * either because the Driver context was created using an older version 
+     * of the API, because the Runtime API call expects a primary driver 
+     * context and the Driver context is not primary, or because the Driver 
+     * context has been destroyed. Please see \ref CUDART_DRIVER "Interactions 
+     * with the CUDA Driver API" for more information.
+     */
+    cudaErrorIncompatibleDriverContext    =     49,
+    
+    /**
+     * The device function being invoked (usually via ::cudaLaunchKernel()) was not
+     * previously configured via the ::cudaConfigureCall() function.
+     */
+    cudaErrorMissingConfiguration         =      52,
+  
+    /**
+     * This indicated that a previous kernel launch failed. This was previously
+     * used for device emulation of kernel launches.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorPriorLaunchFailure           =      53,
+
+    /**
+     * This error indicates that a device runtime grid launch did not occur 
+     * because the depth of the child grid would exceed the maximum supported
+     * number of nested grid launches. 
+     */
+    cudaErrorLaunchMaxDepthExceeded       =     65,
+
+    /**
+     * This error indicates that a grid launch did not occur because the kernel 
+     * uses file-scoped textures which are unsupported by the device runtime. 
+     * Kernels launched via the device runtime only support textures created with 
+     * the Texture Object API.
+     */
+    cudaErrorLaunchFileScopedTex          =     66,
+
+    /**
+     * This error indicates that a grid launch did not occur because the kernel 
+     * uses file-scoped surfaces which are unsupported by the device runtime.
+     * Kernels launched via the device runtime only support surfaces created with
+     * the Surface Object API.
+     */
+    cudaErrorLaunchFileScopedSurf         =     67,
+
+    /**
+     * This error indicates that a call to ::cudaDeviceSynchronize made from
+     * the device runtime failed because the call was made at grid depth greater
+     * than either the default (2 levels of grids) or the user-specified device
+     * limit ::cudaLimitDevRuntimeSyncDepth. To be able to synchronize on 
+     * launched grids at a greater depth successfully, the maximum nested 
+     * depth at which ::cudaDeviceSynchronize will be called must be specified 
+     * by passing the ::cudaLimitDevRuntimeSyncDepth limit to the ::cudaDeviceSetLimit
+     * API before the host-side launch of a kernel using the device runtime.
+     * Keep in mind that additional levels of sync depth require the runtime 
+     * to reserve large amounts of device memory that cannot be used for 
+     * user allocations.
+     */
+    cudaErrorSyncDepthExceeded            =     68,
+
+    /**
+     * This error indicates that a device runtime grid launch failed because
+     * the launch would exceed the limit ::cudaLimitDevRuntimePendingLaunchCount.
+     * For this launch to proceed successfully, ::cudaDeviceSetLimit must be
+     * called to set the ::cudaLimitDevRuntimePendingLaunchCount to be higher 
+     * than the upper bound of outstanding launches that can be issued to the
+     * device runtime. Keep in mind that raising the limit of pending device
+     * runtime launches will require the runtime to reserve device memory that
+     * cannot be used for user allocations.
+     */
+    cudaErrorLaunchPendingCountExceeded   =     69,
+  
+    /**
+     * The requested device function does not exist or is not compiled for the
+     * proper device architecture.
+     */
+    cudaErrorInvalidDeviceFunction        =      98,
+  
+    /**
+     * This indicates that no CUDA-capable devices were detected by the installed
+     * CUDA driver.
+     */
+    cudaErrorNoDevice                     =     100,
+  
+    /**
+     * This indicates that the device ordinal supplied by the user does not
+     * correspond to a valid CUDA device or that the action requested is
+     * invalid for the specified device.
+     */
+    cudaErrorInvalidDevice                =     101,
+
+    /**
+     * This indicates that the device doesn't have a valid Grid License.
+     */
+    cudaErrorDeviceNotLicensed            =     102,
+
+    /**
+     * By default, the CUDA runtime may perform a minimal set of self-tests,
+     * as well as CUDA driver tests, to establish the validity of both.
+     * Introduced in CUDA 11.2, this error return indicates that at least one
+     * of these tests has failed and the validity of either the runtime
+     * or the driver could not be established.
+     */
+    cudaErrorSoftwareValidityNotEstablished  =     103,
+
+    /**
+     * This indicates an internal startup failure in the CUDA runtime.
+     */
+    cudaErrorStartupFailure               =    127,
+  
+    /**
+     * This indicates that the device kernel image is invalid.
+     */
+    cudaErrorInvalidKernelImage           =     200,
+
+    /**
+     * This most frequently indicates that there is no context bound to the
+     * current thread. This can also be returned if the context passed to an
+     * API call is not a valid handle (such as a context that has had
+     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
+     * mixes different API versions (i.e. 3010 context with 3020 API calls).
+     * See ::cuCtxGetApiVersion() for more details.
+     */
+    cudaErrorDeviceUninitialized          =     201,
+
+    /**
+     * This indicates that the buffer object could not be mapped.
+     */
+    cudaErrorMapBufferObjectFailed        =     205,
+  
+    /**
+     * This indicates that the buffer object could not be unmapped.
+     */
+    cudaErrorUnmapBufferObjectFailed      =     206,
+
+    /**
+     * This indicates that the specified array is currently mapped and thus
+     * cannot be destroyed.
+     */
+    cudaErrorArrayIsMapped                =     207,
+
+    /**
+     * This indicates that the resource is already mapped.
+     */
+    cudaErrorAlreadyMapped                =     208,
+  
+    /**
+     * This indicates that there is no kernel image available that is suitable
+     * for the device. This can occur when a user specifies code generation
+     * options for a particular CUDA source file that do not include the
+     * corresponding device configuration.
+     */
+    cudaErrorNoKernelImageForDevice       =     209,
+
+    /**
+     * This indicates that a resource has already been acquired.
+     */
+    cudaErrorAlreadyAcquired              =     210,
+
+    /**
+     * This indicates that a resource is not mapped.
+     */
+    cudaErrorNotMapped                    =     211,
+
+    /**
+     * This indicates that a mapped resource is not available for access as an
+     * array.
+     */
+    cudaErrorNotMappedAsArray             =     212,
+
+    /**
+     * This indicates that a mapped resource is not available for access as a
+     * pointer.
+     */
+    cudaErrorNotMappedAsPointer           =     213,
+  
+    /**
+     * This indicates that an uncorrectable ECC error was detected during
+     * execution.
+     */
+    cudaErrorECCUncorrectable             =     214,
+  
+    /**
+     * This indicates that the ::cudaLimit passed to the API call is not
+     * supported by the active device.
+     */
+    cudaErrorUnsupportedLimit             =     215,
+    
+    /**
+     * This indicates that a call tried to access an exclusive-thread device that 
+     * is already in use by a different thread.
+     */
+    cudaErrorDeviceAlreadyInUse           =     216,
+
+    /**
+     * This error indicates that P2P access is not supported across the given
+     * devices.
+     */
+    cudaErrorPeerAccessUnsupported        =     217,
+
+    /**
+     * A PTX compilation failed. The runtime may fall back to compiling PTX if
+     * an application does not contain a suitable binary for the current device.
+     */
+    cudaErrorInvalidPtx                   =     218,
+
+    /**
+     * This indicates an error with the OpenGL or DirectX context.
+     */
+    cudaErrorInvalidGraphicsContext       =     219,
+
+    /**
+     * This indicates that an uncorrectable NVLink error was detected during the
+     * execution.
+     */
+    cudaErrorNvlinkUncorrectable          =     220,
+
+    /**
+     * This indicates that the PTX JIT compiler library was not found. The JIT Compiler
+     * library is used for PTX compilation. The runtime may fall back to compiling PTX
+     * if an application does not contain a suitable binary for the current device.
+     */
+    cudaErrorJitCompilerNotFound          =     221,
+
+    /**
+     * This indicates that the provided PTX was compiled with an unsupported toolchain.
+     * The most common reason for this, is the PTX was generated by a compiler newer
+     * than what is supported by the CUDA driver and PTX JIT compiler.
+     */
+    cudaErrorUnsupportedPtxVersion        =     222,
+
+    /**
+     * This indicates that the JIT compilation was disabled. The JIT compilation compiles
+     * PTX. The runtime may fall back to compiling PTX if an application does not contain
+     * a suitable binary for the current device.
+     */
+    cudaErrorJitCompilationDisabled       =     223,
+
+    /**
+     * This indicates that the provided execution affinity is not supported by the device.
+     */
+    cudaErrorUnsupportedExecAffinity      =     224,
+
+    /**
+     * This indicates that the device kernel source is invalid.
+     */
+    cudaErrorInvalidSource                =     300,
+
+    /**
+     * This indicates that the file specified was not found.
+     */
+    cudaErrorFileNotFound                 =     301,
+  
+    /**
+     * This indicates that a link to a shared object failed to resolve.
+     */
+    cudaErrorSharedObjectSymbolNotFound   =     302,
+  
+    /**
+     * This indicates that initialization of a shared object failed.
+     */
+    cudaErrorSharedObjectInitFailed       =     303,
+
+    /**
+     * This error indicates that an OS call failed.
+     */
+    cudaErrorOperatingSystem              =     304,
+  
+    /**
+     * This indicates that a resource handle passed to the API call was not
+     * valid. Resource handles are opaque types like ::cudaStream_t and
+     * ::cudaEvent_t.
+     */
+    cudaErrorInvalidResourceHandle        =     400,
+
+    /**
+     * This indicates that a resource required by the API call is not in a
+     * valid state to perform the requested operation.
+     */
+    cudaErrorIllegalState                 =     401,
+
+    /**
+     * This indicates that a named symbol was not found. Examples of symbols
+     * are global/constant variable names, driver function names, texture names,
+     * and surface names.
+     */
+    cudaErrorSymbolNotFound               =     500,
+  
+    /**
+     * This indicates that asynchronous operations issued previously have not
+     * completed yet. This result is not actually an error, but must be indicated
+     * differently than ::cudaSuccess (which indicates completion). Calls that
+     * may return this value include ::cudaEventQuery() and ::cudaStreamQuery().
+     */
+    cudaErrorNotReady                     =     600,
+
+    /**
+     * The device encountered a load or store instruction on an invalid memory address.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorIllegalAddress               =     700,
+  
+    /**
+     * This indicates that a launch did not occur because it did not have
+     * appropriate resources. Although this error is similar to
+     * ::cudaErrorInvalidConfiguration, this error usually indicates that the
+     * user has attempted to pass too many arguments to the device kernel, or the
+     * kernel launch specifies too many threads for the kernel's register count.
+     */
+    cudaErrorLaunchOutOfResources         =      701,
+  
+    /**
+     * This indicates that the device kernel took too long to execute. This can
+     * only occur if timeouts are enabled - see the device property
+     * \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"
+     * for more information.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorLaunchTimeout                =      702,
+
+    /**
+     * This error indicates a kernel launch that uses an incompatible texturing
+     * mode.
+     */
+    cudaErrorLaunchIncompatibleTexturing  =     703,
+      
+    /**
+     * This error indicates that a call to ::cudaDeviceEnablePeerAccess() is
+     * trying to re-enable peer addressing from a context which has already
+     * had peer addressing enabled.
+     */
+    cudaErrorPeerAccessAlreadyEnabled     =     704,
+    
+    /**
+     * This error indicates that ::cudaDeviceDisablePeerAccess() is trying to 
+     * disable peer addressing which has not been enabled yet via 
+     * ::cudaDeviceEnablePeerAccess().
+     */
+    cudaErrorPeerAccessNotEnabled         =     705,
+  
+    /**
+     * This indicates that the user has called ::cudaSetValidDevices(),
+     * ::cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(),
+     * ::cudaD3D10SetDirect3DDevice(), ::cudaD3D11SetDirect3DDevice(), or
+     * ::cudaVDPAUSetVDPAUDevice() after initializing the CUDA runtime by
+     * calling non-device management operations (allocating memory and
+     * launching kernels are examples of non-device management operations).
+     * This error can also be returned if using runtime/driver
+     * interoperability and there is an existing ::CUcontext active on the
+     * host thread.
+     */
+    cudaErrorSetOnActiveProcess           =     708,
+
+    /**
+     * This error indicates that the context current to the calling thread
+     * has been destroyed using ::cuCtxDestroy, or is a primary context which
+     * has not yet been initialized.
+     */
+    cudaErrorContextIsDestroyed           =     709,
+
+    /**
+     * An assert triggered in device code during kernel execution. The device
+     * cannot be used again. All existing allocations are invalid. To continue
+     * using CUDA, the process must be terminated and relaunched.
+     */
+    cudaErrorAssert                        =    710,
+  
+    /**
+     * This error indicates that the hardware resources required to enable
+     * peer access have been exhausted for one or more of the devices 
+     * passed to ::cudaDeviceEnablePeerAccess().
+     */
+    cudaErrorTooManyPeers                 =     711,
+  
+    /**
+     * This error indicates that the memory range passed to ::cudaHostRegister()
+     * has already been registered.
+     */
+    cudaErrorHostMemoryAlreadyRegistered  =     712,
+        
+    /**
+     * This error indicates that the pointer passed to ::cudaHostUnregister()
+     * does not correspond to any currently registered memory region.
+     */
+    cudaErrorHostMemoryNotRegistered      =     713,
+
+    /**
+     * Device encountered an error in the call stack during kernel execution,
+     * possibly due to stack corruption or exceeding the stack size limit.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorHardwareStackError           =     714,
+
+    /**
+     * The device encountered an illegal instruction during kernel execution.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorIllegalInstruction           =     715,
+
+    /**
+     * The device encountered a load or store instruction
+     * on a memory address which is not aligned.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorMisalignedAddress            =     716,
+
+    /**
+     * While executing a kernel, the device encountered an instruction
+     * which can only operate on memory locations in certain address spaces
+     * (global, shared, or local), but was supplied a memory address not
+     * belonging to an allowed address space.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorInvalidAddressSpace          =     717,
+
+    /**
+     * The device encountered an invalid program counter.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorInvalidPc                    =     718,
+  
+    /**
+     * An exception occurred on the device while executing a kernel. Common
+     * causes include dereferencing an invalid device pointer and accessing
+     * out of bounds shared memory. Less common cases can be system specific - more
+     * information about these cases can be found in the system specific user guide.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorLaunchFailure                =      719,
+
+    /**
+     * This error indicates that the number of blocks launched per grid for a kernel that was
+     * launched via either ::cudaLaunchCooperativeKernel or ::cudaLaunchCooperativeKernelMultiDevice
+     * exceeds the maximum number of blocks as allowed by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+     * or ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
+     * as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
+     */
+    cudaErrorCooperativeLaunchTooLarge    =     720,
+    
+    /**
+     * This error indicates the attempted operation is not permitted.
+     */
+    cudaErrorNotPermitted                 =     800,
+
+    /**
+     * This error indicates the attempted operation is not supported
+     * on the current system or device.
+     */
+    cudaErrorNotSupported                 =     801,
+
+    /**
+     * This error indicates that the system is not yet ready to start any CUDA
+     * work.  To continue using CUDA, verify the system configuration is in a
+     * valid state and all required driver daemons are actively running.
+     * More information about this error can be found in the system specific
+     * user guide.
+     */
+    cudaErrorSystemNotReady               =     802,
+
+    /**
+     * This error indicates that there is a mismatch between the versions of
+     * the display driver and the CUDA driver. Refer to the compatibility documentation
+     * for supported versions.
+     */
+    cudaErrorSystemDriverMismatch         =     803,
+
+    /**
+     * This error indicates that the system was upgraded to run with forward compatibility
+     * but the visible hardware detected by CUDA does not support this configuration.
+     * Refer to the compatibility documentation for the supported hardware matrix or ensure
+     * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
+     * environment variable.
+     */
+    cudaErrorCompatNotSupportedOnDevice   =     804,
+
+    /**
+     * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
+     */
+    cudaErrorMpsConnectionFailed          =     805,
+
+    /**
+     * This error indicates that the remote procedural call between the MPS server and the MPS client failed.
+     */
+    cudaErrorMpsRpcFailure                =     806,
+
+    /**
+     * This error indicates that the MPS server is not ready to accept new MPS client requests.
+     * This error can be returned when the MPS server is in the process of recovering from a fatal failure.
+     */
+    cudaErrorMpsServerNotReady            =     807,
+
+    /**
+     * This error indicates that the hardware resources required to create MPS client have been exhausted.
+     */
+    cudaErrorMpsMaxClientsReached         =     808,
+
+    /**
+     * This error indicates that the hardware resources required to support device connections have been exhausted.
+     */
+    cudaErrorMpsMaxConnectionsReached     =     809,
+
+    /**
+     * This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.
+     */
+    cudaErrorMpsClientTerminated          =     810,
+
+    /**
+     * The operation is not permitted when the stream is capturing.
+     */
+    cudaErrorStreamCaptureUnsupported     =    900,
+
+    /**
+     * The current capture sequence on the stream has been invalidated due to
+     * a previous error.
+     */
+    cudaErrorStreamCaptureInvalidated     =    901,
+
+    /**
+     * The operation would have resulted in a merge of two independent capture
+     * sequences.
+     */
+    cudaErrorStreamCaptureMerge           =    902,
+
+    /**
+     * The capture was not initiated in this stream.
+     */
+    cudaErrorStreamCaptureUnmatched       =    903,
+
+    /**
+     * The capture sequence contains a fork that was not joined to the primary
+     * stream.
+     */
+    cudaErrorStreamCaptureUnjoined        =    904,
+
+    /**
+     * A dependency would have been created which crosses the capture sequence
+     * boundary. Only implicit in-stream ordering dependencies are allowed to
+     * cross the boundary.
+     */
+    cudaErrorStreamCaptureIsolation       =    905,
+
+    /**
+     * The operation would have resulted in a disallowed implicit dependency on
+     * a current capture sequence from cudaStreamLegacy.
+     */
+    cudaErrorStreamCaptureImplicit        =    906,
+
+    /**
+     * The operation is not permitted on an event which was last recorded in a
+     * capturing stream.
+     */
+    cudaErrorCapturedEvent                =    907,
+  
+    /**
+     * A stream capture sequence not initiated with the ::cudaStreamCaptureModeRelaxed
+     * argument to ::cudaStreamBeginCapture was passed to ::cudaStreamEndCapture in a
+     * different thread.
+     */
+    cudaErrorStreamCaptureWrongThread     =    908,
+
+    /**
+     * This indicates that the wait operation has timed out.
+     */
+    cudaErrorTimeout                      =    909,
+
+    /**
+     * This error indicates that the graph update was not performed because it included 
+     * changes which violated constraints specific to instantiated graph update.
+     */
+    cudaErrorGraphExecUpdateFailure       =    910,
+
+    /**
+     * This indicates that an async error has occurred in a device outside of CUDA.
+     * If CUDA was waiting for an external device's signal before consuming shared data,
+     * the external device signaled an error indicating that the data is not valid for
+     * consumption. This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must be
+     * terminated and relaunched.
+     */
+    cudaErrorExternalDevice               =    911,
+
+    /**
+     * This indicates that a kernel launch error has occurred due to cluster
+     * misconfiguration.
+     */
+    cudaErrorInvalidClusterSize           =    912,
+
+    /**
+     * This indicates that an unknown internal error has occurred.
+     */
+    cudaErrorUnknown                      =    999,
+
+    /**
+     * Any unhandled CUDA driver error is added to this value and returned via
+     * the runtime. Production releases of CUDA should not return such errors.
+     * \deprecated
+     * This error return is deprecated as of CUDA 4.1.
+     */
+    cudaErrorApiFailureBase               =  10000
+};
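+
+/*
+ * Most runtime entry points return one of the ::cudaError values above. A
+ * minimal sketch of the usual checking pattern (requires <stdio.h> and
+ * <stdlib.h>); note that ::cudaErrorNotReady returned by ::cudaEventQuery or
+ * ::cudaStreamQuery indicates "still running", not a failure:
+ *
+ *   #define CHECK_CUDA(call)                                      \
+ *       do {                                                      \
+ *           cudaError_t err_ = (call);                            \
+ *           if (err_ != cudaSuccess) {                            \
+ *               fprintf(stderr, "%s failed: %s\n", #call,         \
+ *                       cudaGetErrorString(err_));                \
+ *               abort();                                          \
+ *           }                                                     \
+ *       } while (0)
+ *
+ *   CHECK_CUDA(cudaSetDevice(0));
+ */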
+
+/**
+ * Channel format kind
+ */
+enum __device_builtin__ cudaChannelFormatKind
+{
+    cudaChannelFormatKindSigned                         =   0,      /**< Signed channel format */
+    cudaChannelFormatKindUnsigned                       =   1,      /**< Unsigned channel format */
+    cudaChannelFormatKindFloat                          =   2,      /**< Float channel format */
+    cudaChannelFormatKindNone                           =   3,      /**< No channel format */
+    cudaChannelFormatKindNV12                           =   4,      /**< Unsigned 8-bit integers, planar 4:2:0 YUV format */
+    cudaChannelFormatKindUnsignedNormalized8X1          =   5,      /**< 1 channel unsigned 8-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized8X2          =   6,      /**< 2 channel unsigned 8-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized8X4          =   7,      /**< 4 channel unsigned 8-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized16X1         =   8,      /**< 1 channel unsigned 16-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized16X2         =   9,      /**< 2 channel unsigned 16-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized16X4         =   10,     /**< 4 channel unsigned 16-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized8X1            =   11,     /**< 1 channel signed 8-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized8X2            =   12,     /**< 2 channel signed 8-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized8X4            =   13,     /**< 4 channel signed 8-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized16X1           =   14,     /**< 1 channel signed 16-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized16X2           =   15,     /**< 2 channel signed 16-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized16X4           =   16,     /**< 4 channel signed 16-bit normalized integer */
+    cudaChannelFormatKindUnsignedBlockCompressed1       =   17,     /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed1SRGB   =   18,     /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/
+    cudaChannelFormatKindUnsignedBlockCompressed2       =   19,     /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed2SRGB   =   20,     /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding */
+    cudaChannelFormatKindUnsignedBlockCompressed3       =   21,     /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed3SRGB   =   22,     /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding */
+    cudaChannelFormatKindUnsignedBlockCompressed4       =   23,     /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
+    cudaChannelFormatKindSignedBlockCompressed4         =   24,     /**< 1 channel signed normalized block-compressed (BC4 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed5       =   25,     /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
+    cudaChannelFormatKindSignedBlockCompressed5         =   26,     /**< 2 channel signed normalized block-compressed (BC5 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed6H      =   27,     /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
+    cudaChannelFormatKindSignedBlockCompressed6H        =   28,     /**< 3 channel signed half-float block-compressed (BC6H compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed7       =   29,     /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed7SRGB   =   30      /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
+};
+
+/**
+ * CUDA Channel format descriptor
+ */
+struct __device_builtin__ cudaChannelFormatDesc
+{
+    int                        x; /**< x */
+    int                        y; /**< y */
+    int                        z; /**< z */
+    int                        w; /**< w */
+    enum cudaChannelFormatKind f; /**< Channel format kind */
+};
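+
+/*
+ * A minimal sketch (width and height assumed to be defined): a channel
+ * descriptor is usually produced by ::cudaCreateChannelDesc and then used to
+ * allocate a CUDA array.
+ *
+ *   // One 32-bit float channel per element.
+ *   struct cudaChannelFormatDesc desc =
+ *       cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+ *   cudaArray_t array;
+ *   cudaMallocArray(&array, &desc, width, height, cudaArrayDefault);
+ */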
+
+/**
+ * CUDA array
+ */
+typedef struct cudaArray *cudaArray_t;
+
+/**
+ * CUDA array (as source copy argument)
+ */
+typedef const struct cudaArray *cudaArray_const_t;
+
+struct cudaArray;
+
+/**
+ * CUDA mipmapped array
+ */
+typedef struct cudaMipmappedArray *cudaMipmappedArray_t;
+
+/**
+ * CUDA mipmapped array (as source argument)
+ */
+typedef const struct cudaMipmappedArray *cudaMipmappedArray_const_t;
+
+struct cudaMipmappedArray;
+
+/**
+ * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers
+ */
+#define cudaArraySparsePropertiesSingleMipTail   0x1
+
+/**
+ * Sparse CUDA array and CUDA mipmapped array properties
+ */
+struct __device_builtin__ cudaArraySparseProperties {
+    struct {
+        unsigned int width;             /**< Tile width in elements */
+        unsigned int height;            /**< Tile height in elements */
+        unsigned int depth;             /**< Tile depth in elements */
+    } tileExtent;
+    unsigned int miptailFirstLevel;     /**< First mip level at which the mip tail begins */   
+    unsigned long long miptailSize;     /**< Total size of the mip tail. */
+    unsigned int flags;                 /**< Flags will either be zero or ::cudaArraySparsePropertiesSingleMipTail */
+    unsigned int reserved[4];
+};
+
+/**
+ * CUDA array and CUDA mipmapped array memory requirements
+ */
+struct __device_builtin__ cudaArrayMemoryRequirements {
+    size_t size;                    /**< Total size of the array. */
+    size_t alignment;               /**< Alignment necessary for mapping the array. */
+    unsigned int reserved[4];
+};
+
+/**
+ * CUDA memory types
+ */
+enum __device_builtin__ cudaMemoryType
+{
+    cudaMemoryTypeUnregistered = 0, /**< Unregistered memory */
+    cudaMemoryTypeHost         = 1, /**< Host memory */
+    cudaMemoryTypeDevice       = 2, /**< Device memory */
+    cudaMemoryTypeManaged      = 3  /**< Managed memory */
+};
+
+/**
+ * CUDA memory copy types
+ */
+enum __device_builtin__ cudaMemcpyKind
+{
+    cudaMemcpyHostToHost          =   0,      /**< Host   -> Host */
+    cudaMemcpyHostToDevice        =   1,      /**< Host   -> Device */
+    cudaMemcpyDeviceToHost        =   2,      /**< Device -> Host */
+    cudaMemcpyDeviceToDevice      =   3,      /**< Device -> Device */
+    cudaMemcpyDefault             =   4       /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */
+};
+
+/**
+ * CUDA Pitched memory pointer
+ *
+ * \sa ::make_cudaPitchedPtr
+ */
+struct __device_builtin__ cudaPitchedPtr
+{
+    void   *ptr;      /**< Pointer to allocated memory */
+    size_t  pitch;    /**< Pitch of allocated memory in bytes */
+    size_t  xsize;    /**< Logical width of allocation in elements */
+    size_t  ysize;    /**< Logical height of allocation in elements */
+};
+
+/**
+ * CUDA extent
+ *
+ * \sa ::make_cudaExtent
+ */
+struct __device_builtin__ cudaExtent
+{
+    size_t width;     /**< Width in elements when referring to array memory, in bytes when referring to linear memory */
+    size_t height;    /**< Height in elements */
+    size_t depth;     /**< Depth in elements */
+};
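+
+/*
+ * A minimal sketch tying ::cudaPitchedPtr and ::cudaExtent together:
+ * ::cudaMalloc3D pads each row and reports the pitch. For linear memory the
+ * extent width is given in bytes.
+ *
+ *   // 256 x 256 x 64 volume of floats in pitched linear memory.
+ *   struct cudaExtent extent = make_cudaExtent(256 * sizeof(float), 256, 64);
+ *   struct cudaPitchedPtr volume;
+ *   cudaMalloc3D(&volume, extent);  // volume.pitch is the padded row size in bytes
+ */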
+
+/**
+ * CUDA 3D position
+ *
+ * \sa ::make_cudaPos
+ */
+struct __device_builtin__ cudaPos
+{
+    size_t x;     /**< x */
+    size_t y;     /**< y */
+    size_t z;     /**< z */
+};
+
+/**
+ * CUDA 3D memory copying parameters
+ */
+struct __device_builtin__ cudaMemcpy3DParms
+{
+    cudaArray_t            srcArray;  /**< Source memory address */
+    struct cudaPos         srcPos;    /**< Source position offset */
+    struct cudaPitchedPtr  srcPtr;    /**< Pitched source memory address */
+  
+    cudaArray_t            dstArray;  /**< Destination memory address */
+    struct cudaPos         dstPos;    /**< Destination position offset */
+    struct cudaPitchedPtr  dstPtr;    /**< Pitched destination memory address */
+  
+    struct cudaExtent      extent;    /**< Requested memory copy size */
+    enum cudaMemcpyKind    kind;      /**< Type of transfer */
+};
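+
+/*
+ * A minimal sketch (hostBuf is assumed to be a tightly packed 256x256x64 host
+ * buffer of floats and dstArray a matching 3D CUDA array): zero-initialize the
+ * struct and set exactly one source and one destination description.
+ *
+ *   struct cudaMemcpy3DParms p = {0};
+ *   p.srcPtr   = make_cudaPitchedPtr(hostBuf, 256 * sizeof(float), 256, 256);
+ *   p.dstArray = dstArray;
+ *   p.extent   = make_cudaExtent(256, 256, 64);  // in elements, since an array is involved
+ *   p.kind     = cudaMemcpyHostToDevice;
+ *   cudaMemcpy3D(&p);
+ */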
+
+/**
+ * CUDA 3D cross-device memory copying parameters
+ */
+struct __device_builtin__ cudaMemcpy3DPeerParms
+{
+    cudaArray_t            srcArray;  /**< Source memory address */
+    struct cudaPos         srcPos;    /**< Source position offset */
+    struct cudaPitchedPtr  srcPtr;    /**< Pitched source memory address */
+    int                    srcDevice; /**< Source device */
+  
+    cudaArray_t            dstArray;  /**< Destination memory address */
+    struct cudaPos         dstPos;    /**< Destination position offset */
+    struct cudaPitchedPtr  dstPtr;    /**< Pitched destination memory address */
+    int                    dstDevice; /**< Destination device */
+  
+    struct cudaExtent      extent;    /**< Requested memory copy size */
+};
+
+/**
+ * CUDA Memset node parameters
+ */
+struct __device_builtin__  cudaMemsetParams {
+    void *dst;                              /**< Destination device pointer */
+    size_t pitch;                           /**< Pitch of destination device pointer. Unused if height is 1 */
+    unsigned int value;                     /**< Value to be set */
+    unsigned int elementSize;               /**< Size of each element in bytes. Must be 1, 2, or 4. */
+    size_t width;                           /**< Width of the row in elements */
+    size_t height;                          /**< Number of rows */
+};
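+
+/*
+ * A minimal sketch (graph, devPtr and n assumed to exist): the struct
+ * describes a memset node added to a graph with ::cudaGraphAddMemsetNode.
+ *
+ *   struct cudaMemsetParams ms = {0};
+ *   ms.dst         = devPtr;               // device buffer of n unsigned ints
+ *   ms.value       = 0;
+ *   ms.elementSize = sizeof(unsigned int);
+ *   ms.width       = n;                    // elements per row
+ *   ms.height      = 1;                    // single row, so pitch is ignored
+ *   cudaGraphNode_t node;
+ *   cudaGraphAddMemsetNode(&node, graph, NULL, 0, &ms);
+ */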
+
+/**
+ * Specifies performance hint with ::cudaAccessPolicyWindow for hitProp and missProp members.
+ */
+enum __device_builtin__  cudaAccessProperty {
+    cudaAccessPropertyNormal = 0,       /**< Normal cache persistence. */
+    cudaAccessPropertyStreaming = 1,    /**< Streaming access is less likely to persist in the cache. */
+    cudaAccessPropertyPersisting = 2    /**< Persisting access is more likely to persist in cache.*/
+};
+
+/**
+ * Specifies an access policy for a window, a contiguous extent of memory
+ * beginning at base_ptr and ending at base_ptr + num_bytes.
+ * The window is partitioned into many segments and segments are assigned such that:
+ * sum of "hit segments" / window == approx. hitRatio,
+ * sum of "miss segments" / window == approx. (1 - hitRatio).
+ * Segments and ratio specifications are fitted to the capabilities of
+ * the architecture.
+ * Accesses in a hit segment apply the hitProp access policy.
+ * Accesses in a miss segment apply the missProp access policy.
+ */
+struct __device_builtin__ cudaAccessPolicyWindow {
+    void *base_ptr;                     /**< Starting address of the access policy window. CUDA driver may align it. */
+    size_t num_bytes;                   /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */
+    float hitRatio;                     /**< hitRatio specifies the percentage of lines assigned hitProp; the rest are assigned missProp. */
+    enum cudaAccessProperty hitProp;    /**< ::CUaccessProperty set for hit. */
+    enum cudaAccessProperty missProp;   /**< ::CUaccessProperty set for miss. Must be either NORMAL or STREAMING. */
+};
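+
+/*
+ * A minimal sketch, assuming the CUDA 11 stream attribute API
+ * (::cudaStreamSetAttribute / ::cudaStreamAttrValue) and an existing stream
+ * plus a device allocation devPtr of numBytes bytes: the window marks that
+ * region for preferential L2 persistence.
+ *
+ *   cudaStreamAttrValue attr;
+ *   attr.accessPolicyWindow.base_ptr  = devPtr;
+ *   attr.accessPolicyWindow.num_bytes = numBytes;
+ *   attr.accessPolicyWindow.hitRatio  = 0.6f;  // ~60% of lines get hitProp
+ *   attr.accessPolicyWindow.hitProp   = cudaAccessPropertyPersisting;
+ *   attr.accessPolicyWindow.missProp  = cudaAccessPropertyStreaming;
+ *   cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);
+ */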
+
+#ifdef _WIN32
+#define CUDART_CB __stdcall
+#else
+#define CUDART_CB
+#endif
+
+/**
+ * CUDA host function
+ * \param userData Argument value passed to the function
+ */
+typedef void (CUDART_CB *cudaHostFn_t)(void *userData);
+
+/**
+ * CUDA host node parameters
+ */
+struct __device_builtin__ cudaHostNodeParams {
+    cudaHostFn_t fn;    /**< The function to call when the node executes */
+    void* userData; /**< Argument to pass to the function */
+};
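+
+/*
+ * A minimal sketch (stream assumed to exist): the same callback signature is
+ * used by ::cudaLaunchHostFunc, which runs a host function after the work
+ * already queued on the stream. Host functions must not call CUDA APIs.
+ *
+ *   void CUDART_CB onBatchDone(void *userData)
+ *   {
+ *       int *counter = (int *)userData;
+ *       ++*counter;   // runs on a runtime-owned host thread
+ *   }
+ *
+ *   static int completed = 0;
+ *   cudaLaunchHostFunc(stream, onBatchDone, &completed);
+ */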
+
+/**
+ * Possible stream capture statuses returned by ::cudaStreamIsCapturing
+ */
+enum __device_builtin__ cudaStreamCaptureStatus {
+    cudaStreamCaptureStatusNone        = 0, /**< Stream is not capturing */
+    cudaStreamCaptureStatusActive      = 1, /**< Stream is actively capturing */
+    cudaStreamCaptureStatusInvalidated = 2  /**< Stream is part of a capture sequence that
+                                                   has been invalidated, but not terminated */
+};
+
+/**
+ * Possible modes for stream capture thread interactions. For more details see
+ * ::cudaStreamBeginCapture and ::cudaThreadExchangeStreamCaptureMode
+ */
+enum __device_builtin__ cudaStreamCaptureMode {
+    cudaStreamCaptureModeGlobal      = 0,
+    cudaStreamCaptureModeThreadLocal = 1,
+    cudaStreamCaptureModeRelaxed     = 2
+};
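+
+/*
+ * A minimal capture sketch (stream assumed to exist): work queued between
+ * begin and end is recorded into a graph instead of executing; the mode
+ * controls how capture interacts with other threads (see
+ * ::cudaThreadExchangeStreamCaptureMode).
+ *
+ *   cudaGraph_t graph;
+ *   cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
+ *   // ... enqueue kernels / async copies into the stream here ...
+ *   cudaStreamEndCapture(stream, &graph);
+ *
+ *   cudaGraphExec_t exec;
+ *   cudaGraphInstantiate(&exec, graph, NULL, NULL, 0);
+ *   cudaGraphLaunch(exec, stream);
+ */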
+
+enum __device_builtin__ cudaSynchronizationPolicy {
+    cudaSyncPolicyAuto = 1,
+    cudaSyncPolicySpin = 2,
+    cudaSyncPolicyYield = 3,
+    cudaSyncPolicyBlockingSync = 4
+};
+
+/**
+ * Cluster scheduling policies. These may be passed to ::cudaFuncSetAttribute
+ */
+enum __device_builtin__ cudaClusterSchedulingPolicy {
+    cudaClusterSchedulingPolicyDefault       = 0, /**< the default policy */
+    cudaClusterSchedulingPolicySpread        = 1, /**< spread the blocks within a cluster to the SMs */
+    cudaClusterSchedulingPolicyLoadBalancing = 2  /**< allow the hardware to load-balance the blocks in a cluster to the SMs */
+};
+
+/**
+ * Flags for ::cudaStreamUpdateCaptureDependencies
+ */
+enum __device_builtin__ cudaStreamUpdateCaptureDependenciesFlags {
+    cudaStreamAddCaptureDependencies = 0x0, /**< Add new nodes to the dependency set */
+    cudaStreamSetCaptureDependencies = 0x1  /**< Replace the dependency set with the new nodes */
+};
+
+/**
+ * Flags for user objects for graphs
+ */
+enum __device_builtin__ cudaUserObjectFlags {
+    cudaUserObjectNoDestructorSync = 0x1  /**< Indicates the destructor execution is not synchronized by any CUDA handle. */
+};
+
+/**
+ * Flags for retaining user object references for graphs
+ */
+enum __device_builtin__ cudaUserObjectRetainFlags {
+    cudaGraphUserObjectMove = 0x1  /**< Transfer references from the caller rather than creating new references. */
+};
+
+/**
+ * CUDA graphics interop resource
+ */
+struct cudaGraphicsResource;
+
+/**
+ * CUDA graphics interop register flags
+ */
+enum __device_builtin__ cudaGraphicsRegisterFlags
+{
+    cudaGraphicsRegisterFlagsNone             = 0,  /**< Default */
+    cudaGraphicsRegisterFlagsReadOnly         = 1,  /**< CUDA will not write to this resource */ 
+    cudaGraphicsRegisterFlagsWriteDiscard     = 2,  /**< CUDA will only write to and will not read from this resource */
+    cudaGraphicsRegisterFlagsSurfaceLoadStore = 4,  /**< CUDA will bind this resource to a surface reference */
+    cudaGraphicsRegisterFlagsTextureGather    = 8   /**< CUDA will perform texture gather operations on this resource */
+};
+
+/**
+ * CUDA graphics interop map flags
+ */
+enum __device_builtin__ cudaGraphicsMapFlags
+{
+    cudaGraphicsMapFlagsNone         = 0,  /**< Default; Assume resource can be read/written */
+    cudaGraphicsMapFlagsReadOnly     = 1,  /**< CUDA will not write to this resource */
+    cudaGraphicsMapFlagsWriteDiscard = 2   /**< CUDA will only write to and will not read from this resource */
+};
+
+/**
+ * CUDA graphics interop array indices for cube maps
+ */
+enum __device_builtin__ cudaGraphicsCubeFace 
+{
+    cudaGraphicsCubeFacePositiveX = 0x00, /**< Positive X face of cubemap */
+    cudaGraphicsCubeFaceNegativeX = 0x01, /**< Negative X face of cubemap */
+    cudaGraphicsCubeFacePositiveY = 0x02, /**< Positive Y face of cubemap */
+    cudaGraphicsCubeFaceNegativeY = 0x03, /**< Negative Y face of cubemap */
+    cudaGraphicsCubeFacePositiveZ = 0x04, /**< Positive Z face of cubemap */
+    cudaGraphicsCubeFaceNegativeZ = 0x05  /**< Negative Z face of cubemap */
+};
+
+/**
+ * CUDA resource types
+ */
+enum __device_builtin__ cudaResourceType
+{
+    cudaResourceTypeArray          = 0x00, /**< Array resource */
+    cudaResourceTypeMipmappedArray = 0x01, /**< Mipmapped array resource */
+    cudaResourceTypeLinear         = 0x02, /**< Linear resource */
+    cudaResourceTypePitch2D        = 0x03  /**< Pitch 2D resource */
+};
+
+/**
+ * CUDA texture resource view formats
+ */
+enum __device_builtin__ cudaResourceViewFormat
+{
+    cudaResViewFormatNone                      = 0x00, /**< No resource view format (use underlying resource format) */
+    cudaResViewFormatUnsignedChar1             = 0x01, /**< 1 channel unsigned 8-bit integers */
+    cudaResViewFormatUnsignedChar2             = 0x02, /**< 2 channel unsigned 8-bit integers */
+    cudaResViewFormatUnsignedChar4             = 0x03, /**< 4 channel unsigned 8-bit integers */
+    cudaResViewFormatSignedChar1               = 0x04, /**< 1 channel signed 8-bit integers */
+    cudaResViewFormatSignedChar2               = 0x05, /**< 2 channel signed 8-bit integers */
+    cudaResViewFormatSignedChar4               = 0x06, /**< 4 channel signed 8-bit integers */
+    cudaResViewFormatUnsignedShort1            = 0x07, /**< 1 channel unsigned 16-bit integers */
+    cudaResViewFormatUnsignedShort2            = 0x08, /**< 2 channel unsigned 16-bit integers */
+    cudaResViewFormatUnsignedShort4            = 0x09, /**< 4 channel unsigned 16-bit integers */
+    cudaResViewFormatSignedShort1              = 0x0a, /**< 1 channel signed 16-bit integers */
+    cudaResViewFormatSignedShort2              = 0x0b, /**< 2 channel signed 16-bit integers */
+    cudaResViewFormatSignedShort4              = 0x0c, /**< 4 channel signed 16-bit integers */
+    cudaResViewFormatUnsignedInt1              = 0x0d, /**< 1 channel unsigned 32-bit integers */
+    cudaResViewFormatUnsignedInt2              = 0x0e, /**< 2 channel unsigned 32-bit integers */
+    cudaResViewFormatUnsignedInt4              = 0x0f, /**< 4 channel unsigned 32-bit integers */
+    cudaResViewFormatSignedInt1                = 0x10, /**< 1 channel signed 32-bit integers */
+    cudaResViewFormatSignedInt2                = 0x11, /**< 2 channel signed 32-bit integers */
+    cudaResViewFormatSignedInt4                = 0x12, /**< 4 channel signed 32-bit integers */
+    cudaResViewFormatHalf1                     = 0x13, /**< 1 channel 16-bit floating point */
+    cudaResViewFormatHalf2                     = 0x14, /**< 2 channel 16-bit floating point */
+    cudaResViewFormatHalf4                     = 0x15, /**< 4 channel 16-bit floating point */
+    cudaResViewFormatFloat1                    = 0x16, /**< 1 channel 32-bit floating point */
+    cudaResViewFormatFloat2                    = 0x17, /**< 2 channel 32-bit floating point */
+    cudaResViewFormatFloat4                    = 0x18, /**< 4 channel 32-bit floating point */
+    cudaResViewFormatUnsignedBlockCompressed1  = 0x19, /**< Block compressed 1 */
+    cudaResViewFormatUnsignedBlockCompressed2  = 0x1a, /**< Block compressed 2 */
+    cudaResViewFormatUnsignedBlockCompressed3  = 0x1b, /**< Block compressed 3 */
+    cudaResViewFormatUnsignedBlockCompressed4  = 0x1c, /**< Block compressed 4 unsigned */
+    cudaResViewFormatSignedBlockCompressed4    = 0x1d, /**< Block compressed 4 signed */
+    cudaResViewFormatUnsignedBlockCompressed5  = 0x1e, /**< Block compressed 5 unsigned */
+    cudaResViewFormatSignedBlockCompressed5    = 0x1f, /**< Block compressed 5 signed */
+    cudaResViewFormatUnsignedBlockCompressed6H = 0x20, /**< Block compressed 6 unsigned half-float */
+    cudaResViewFormatSignedBlockCompressed6H   = 0x21, /**< Block compressed 6 signed half-float */
+    cudaResViewFormatUnsignedBlockCompressed7  = 0x22  /**< Block compressed 7 */
+};
+
+/**
+ * CUDA resource descriptor
+ */
+struct __device_builtin__ cudaResourceDesc {
+    enum cudaResourceType resType;             /**< Resource type */
+    
+    union {
+        struct {
+            cudaArray_t array;                 /**< CUDA array */
+        } array;
+        struct {
+            cudaMipmappedArray_t mipmap;       /**< CUDA mipmapped array */
+        } mipmap;
+        struct {
+            void *devPtr;                      /**< Device pointer */
+            struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+            size_t sizeInBytes;                /**< Size in bytes */
+        } linear;
+        struct {
+            void *devPtr;                      /**< Device pointer */
+            struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+            size_t width;                      /**< Width of the array in elements */
+            size_t height;                     /**< Height of the array in elements */
+            size_t pitchInBytes;               /**< Pitch between two rows in bytes */
+        } pitch2D;
+    } res;
+};
+
+/**
+ * CUDA resource view descriptor
+ */
+struct __device_builtin__ cudaResourceViewDesc
+{
+    enum cudaResourceViewFormat format;           /**< Resource view format */
+    size_t                      width;            /**< Width of the resource view */
+    size_t                      height;           /**< Height of the resource view */
+    size_t                      depth;            /**< Depth of the resource view */
+    unsigned int                firstMipmapLevel; /**< First defined mipmap level */
+    unsigned int                lastMipmapLevel;  /**< Last defined mipmap level */
+    unsigned int                firstLayer;       /**< First layer index */
+    unsigned int                lastLayer;        /**< Last layer index */
+};
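+
+/*
+ * Usage sketch (illustrative only): creating a texture object from a linear
+ * device buffer. Assumes a float buffer d_buf of N elements allocated with
+ * cudaMalloc, <cuda_runtime.h> and <string.h> included, and error checking
+ * omitted. The resource view descriptor is optional for linear memory.
+ *
+ * \code
+ * struct cudaResourceDesc resDesc;
+ * memset(&resDesc, 0, sizeof(resDesc));
+ * resDesc.resType                = cudaResourceTypeLinear;
+ * resDesc.res.linear.devPtr      = d_buf;
+ * resDesc.res.linear.desc        = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+ * resDesc.res.linear.sizeInBytes = N * sizeof(float);
+ *
+ * struct cudaTextureDesc texDesc;
+ * memset(&texDesc, 0, sizeof(texDesc));
+ * texDesc.readMode = cudaReadModeElementType;
+ *
+ * cudaTextureObject_t tex = 0;
+ * cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+ * \endcode
+ */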
+
+/**
+ * CUDA pointer attributes
+ */
+struct __device_builtin__ cudaPointerAttributes
+{
+    /**
+     * The type of memory - ::cudaMemoryTypeUnregistered, ::cudaMemoryTypeHost,
+     * ::cudaMemoryTypeDevice or ::cudaMemoryTypeManaged.
+     */
+    enum cudaMemoryType type;
+
+    /** 
+     * The device against which the memory was allocated or registered.
+     * If the memory type is ::cudaMemoryTypeDevice then this identifies 
+     * the device on which the memory referred to physically resides.  If
+     * the memory type is ::cudaMemoryTypeHost or ::cudaMemoryTypeManaged then
+     * this identifies the device which was current when the memory was allocated
+     * or registered (and if that device is deinitialized then this allocation
+     * will vanish with that device's state).
+     */
+    int device;
+
+    /**
+     * The address which may be dereferenced on the current device to access 
+     * the memory or NULL if no such address exists.
+     */
+    void *devicePointer;
+
+    /**
+     * The address which may be dereferenced on the host to access the
+     * memory or NULL if no such address exists.
+     *
+     * \note CUDA does not check whether unregistered memory is allocated, so this
+     * field may contain an invalid pointer if an invalid pointer has been passed to CUDA.
+     */
+    void *hostPointer;
+};
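+
+/*
+ * Usage sketch (illustrative; error checking omitted):
+ *
+ * \code
+ * void *p;
+ * cudaMalloc(&p, 1024);
+ *
+ * struct cudaPointerAttributes attr;
+ * cudaPointerGetAttributes(&attr, p);
+ * if (attr.type == cudaMemoryTypeDevice) {
+ *     // p resides on device attr.device and attr.devicePointer == p
+ * }
+ * \endcode
+ */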
+
+/**
+ * CUDA function attributes
+ */
+struct __device_builtin__ cudaFuncAttributes
+{
+   /**
+    * The size in bytes of statically-allocated shared memory per block
+    * required by this function. This does not include dynamically-allocated
+    * shared memory requested by the user at runtime.
+    */
+   size_t sharedSizeBytes;
+
+   /**
+    * The size in bytes of user-allocated constant memory required by this
+    * function.
+    */
+   size_t constSizeBytes;
+
+   /**
+    * The size in bytes of local memory used by each thread of this function.
+    */
+   size_t localSizeBytes;
+
+   /**
+    * The maximum number of threads per block, beyond which a launch of the
+    * function would fail. This number depends on both the function and the
+    * device on which the function is currently loaded.
+    */
+   int maxThreadsPerBlock;
+
+   /**
+    * The number of registers used by each thread of this function.
+    */
+   int numRegs;
+
+   /**
+    * The PTX virtual architecture version for which the function was
+    * compiled. This value is the major PTX version * 10 + the minor PTX
+    * version, so a PTX version 1.3 function would return the value 13.
+    */
+   int ptxVersion;
+
+   /**
+    * The binary architecture version for which the function was compiled.
+    * This value is the major binary version * 10 + the minor binary version,
+    * so a binary version 1.3 function would return the value 13.
+    */
+   int binaryVersion;
+
+   /**
+    * The attribute indicating whether the function has been compiled with the
+    * user-specified option "-Xptxas --dlcm=ca" set.
+    */
+   int cacheModeCA;
+
+   /**
+    * The maximum size in bytes of dynamic shared memory per block for 
+    * this function. Any launch must have a dynamic shared memory size
+    * smaller than this value.
+    */
+   int maxDynamicSharedSizeBytes;
+
+   /**
+    * On devices where the L1 cache and shared memory use the same hardware resources, 
+    * this sets the shared memory carveout preference, in percent of the maximum shared memory. 
+    * Refer to ::cudaDevAttrMaxSharedMemoryPerMultiprocessor.
+    * This is only a hint, and the driver can choose a different ratio if required to execute the function.
+    * See ::cudaFuncSetAttribute
+    */
+   int preferredShmemCarveout;
+};
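+
+/*
+ * Usage sketch (illustrative; assumes a __global__ kernel named myKernel and
+ * omits error checking):
+ *
+ * \code
+ * struct cudaFuncAttributes fa;
+ * cudaFuncGetAttributes(&fa, (const void *)myKernel);
+ * // fa.maxThreadsPerBlock, fa.numRegs and fa.sharedSizeBytes can now be
+ * // used to pick a launch configuration.
+ * \endcode
+ */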
+
+/**
+ * CUDA function attributes that can be set using ::cudaFuncSetAttribute
+ */
+enum __device_builtin__ cudaFuncAttribute
+{
+    cudaFuncAttributeMaxDynamicSharedMemorySize = 8, /**< Maximum dynamic shared memory size */
+    cudaFuncAttributePreferredSharedMemoryCarveout = 9, /**< Preferred shared memory-L1 cache split */
+    cudaFuncAttributeClusterDimMustBeSet = 10, /**< Indicator to enforce valid cluster dimension specification on kernel launch */
+    cudaFuncAttributeRequiredClusterWidth = 11, /**< Required cluster width */
+    cudaFuncAttributeRequiredClusterHeight = 12, /**< Required cluster height */
+    cudaFuncAttributeRequiredClusterDepth = 13, /**< Required cluster depth */
+    cudaFuncAttributeNonPortableClusterSizeAllowed = 14, /**< Whether non-portable cluster scheduling policy is supported */
+    cudaFuncAttributeClusterSchedulingPolicyPreference = 15, /**< Required cluster scheduling policy preference */
+    cudaFuncAttributeMax
+};
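+
+/*
+ * Usage sketch (illustrative; assumes a __global__ kernel myKernel that wants
+ * 96 KiB of dynamic shared memory on a device that supports opting in, with
+ * grid and block as assumed launch dimensions; error checking omitted):
+ *
+ * \code
+ * cudaFuncSetAttribute((const void *)myKernel,
+ *                      cudaFuncAttributeMaxDynamicSharedMemorySize,
+ *                      96 * 1024);
+ * myKernel<<<grid, block, 96 * 1024>>>();
+ * \endcode
+ */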
+
+/**
+ * CUDA function cache configurations
+ */
+enum __device_builtin__ cudaFuncCache
+{
+    cudaFuncCachePreferNone   = 0,    /**< Default function cache configuration, no preference */
+    cudaFuncCachePreferShared = 1,    /**< Prefer larger shared memory and smaller L1 cache  */
+    cudaFuncCachePreferL1     = 2,    /**< Prefer larger L1 cache and smaller shared memory */
+    cudaFuncCachePreferEqual  = 3     /**< Prefer equal size L1 cache and shared memory */
+};
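+
+/*
+ * Usage sketch (illustrative): prefer a larger L1 cache device-wide; the same
+ * preference can be set per function with ::cudaFuncSetCacheConfig.
+ *
+ * \code
+ * cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
+ * \endcode
+ */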
+
+/**
+ * CUDA shared memory configuration
+ */
+
+enum __device_builtin__ cudaSharedMemConfig
+{
+    cudaSharedMemBankSizeDefault   = 0,
+    cudaSharedMemBankSizeFourByte  = 1,
+    cudaSharedMemBankSizeEightByte = 2
+};
+
+/** 
+ * Shared memory carveout configurations. These may be passed to cudaFuncSetAttribute
+ */
+enum __device_builtin__ cudaSharedCarveout {
+    cudaSharedmemCarveoutDefault      = -1,  /**< No preference for shared memory or L1 (default) */
+    cudaSharedmemCarveoutMaxShared    = 100, /**< Prefer maximum available shared memory, minimum L1 cache */
+    cudaSharedmemCarveoutMaxL1        = 0    /**< Prefer maximum available L1 cache, minimum shared memory */
+};
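+
+/*
+ * Usage sketch (illustrative; assumes a __global__ kernel myKernel; the
+ * carveout is only a hint to the driver):
+ *
+ * \code
+ * cudaFuncSetAttribute((const void *)myKernel,
+ *                      cudaFuncAttributePreferredSharedMemoryCarveout,
+ *                      cudaSharedmemCarveoutMaxShared);
+ * \endcode
+ */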
+
+/**
+ * CUDA device compute modes
+ */
+enum __device_builtin__ cudaComputeMode
+{
+    cudaComputeModeDefault          = 0,  /**< Default compute mode (Multiple threads can use ::cudaSetDevice() with this device) */
+    cudaComputeModeExclusive        = 1,  /**< Compute-exclusive-thread mode (Only one thread in one process will be able to use ::cudaSetDevice() with this device) */
+    cudaComputeModeProhibited       = 2,  /**< Compute-prohibited mode (No threads can use ::cudaSetDevice() with this device) */
+    cudaComputeModeExclusiveProcess = 3   /**< Compute-exclusive-process mode (Many threads in one process will be able to use ::cudaSetDevice() with this device) */
+};
+
+/**
+ * CUDA Limits
+ */
+enum __device_builtin__ cudaLimit
+{
+    cudaLimitStackSize                    = 0x00, /**< GPU thread stack size */
+    cudaLimitPrintfFifoSize               = 0x01, /**< GPU printf FIFO size */
+    cudaLimitMallocHeapSize               = 0x02, /**< GPU malloc heap size */
+    cudaLimitDevRuntimeSyncDepth          = 0x03, /**< GPU device runtime synchronize depth */
+    cudaLimitDevRuntimePendingLaunchCount = 0x04, /**< GPU device runtime pending launch count */
+    cudaLimitMaxL2FetchGranularity        = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */
+    cudaLimitPersistingL2CacheSize        = 0x06  /**< A size in bytes for L2 persisting lines cache size */
+};
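+
+/*
+ * Usage sketch (illustrative; error checking omitted):
+ *
+ * \code
+ * size_t heap = 0;
+ * cudaDeviceSetLimit(cudaLimitMallocHeapSize, 64 * 1024 * 1024); // request a 64 MiB device heap
+ * cudaDeviceGetLimit(&heap, cudaLimitMallocHeapSize);            // read back the value actually granted
+ * \endcode
+ */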
+
+/**
+ * CUDA Memory Advise values
+ */
+enum __device_builtin__ cudaMemoryAdvise
+{
+    cudaMemAdviseSetReadMostly          = 1, /**< Data will mostly be read and only occasionally be written to */
+    cudaMemAdviseUnsetReadMostly        = 2, /**< Undo the effect of ::cudaMemAdviseSetReadMostly */
+    cudaMemAdviseSetPreferredLocation   = 3, /**< Set the preferred location for the data as the specified device */
+    cudaMemAdviseUnsetPreferredLocation = 4, /**< Clear the preferred location for the data */
+    cudaMemAdviseSetAccessedBy          = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */
+    cudaMemAdviseUnsetAccessedBy        = 6  /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */
+};
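+
+/*
+ * Usage sketch (illustrative; assumes a managed allocation of `bytes` bytes
+ * and a device ordinal `dev`, error checking omitted):
+ *
+ * \code
+ * void *p;
+ * cudaMallocManaged(&p, bytes, cudaMemAttachGlobal);
+ * cudaMemAdvise(p, bytes, cudaMemAdviseSetReadMostly, dev);
+ * cudaMemPrefetchAsync(p, bytes, dev, 0);  // optional eager migration
+ * \endcode
+ */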
+
+/**
+ * CUDA range attributes
+ */
+enum __device_builtin__ cudaMemRangeAttribute
+{
+    cudaMemRangeAttributeReadMostly           = 1, /**< Whether the range will mostly be read and only occasionally be written to */
+    cudaMemRangeAttributePreferredLocation    = 2, /**< The preferred location of the range */
+    cudaMemRangeAttributeAccessedBy           = 3, /**< Memory range has ::cudaMemAdviseSetAccessedBy set for specified device */
+    cudaMemRangeAttributeLastPrefetchLocation = 4  /**< The last location to which the range was prefetched */
+};
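+
+/*
+ * Usage sketch (illustrative; queries whether a managed range `p` of `bytes`
+ * bytes was marked read-mostly, error checking omitted):
+ *
+ * \code
+ * int readMostly = 0;
+ * cudaMemRangeGetAttribute(&readMostly, sizeof(readMostly),
+ *                          cudaMemRangeAttributeReadMostly, p, bytes);
+ * \endcode
+ */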
+
+/**
+ * CUDA Profiler Output modes
+ */
+enum __device_builtin__ cudaOutputMode
+{
+    cudaKeyValuePair    = 0x00, /**< Output mode Key-Value pair format. */
+    cudaCSV             = 0x01  /**< Output mode Comma separated values format. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes APIs supported on the device
+ */
+enum __device_builtin__ cudaFlushGPUDirectRDMAWritesOptions {
+    cudaFlushGPUDirectRDMAWritesOptionHost   = 1<<0, /**< ::cudaDeviceFlushGPUDirectRDMAWrites() and its CUDA Driver API counterpart are supported on the device. */
+    cudaFlushGPUDirectRDMAWritesOptionMemOps = 1<<1  /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the CUDA device. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes ordering features of the device
+ */
+enum __device_builtin__ cudaGPUDirectRDMAWritesOrdering {
+    cudaGPUDirectRDMAWritesOrderingNone       = 0,   /**< The device does not natively support ordering of GPUDirect RDMA writes. ::cudaFlushGPUDirectRDMAWrites() can be leveraged if supported. */
+    cudaGPUDirectRDMAWritesOrderingOwner      = 100, /**< Natively, the device can consistently consume GPUDirect RDMA writes, although other CUDA devices may not. */
+    cudaGPUDirectRDMAWritesOrderingAllDevices = 200  /**< Any CUDA device in the system can consistently consume GPUDirect RDMA writes to this device. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes scopes
+ */
+enum __device_builtin__ cudaFlushGPUDirectRDMAWritesScope {
+    cudaFlushGPUDirectRDMAWritesToOwner      = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */
+    cudaFlushGPUDirectRDMAWritesToAllDevices = 200  /**< Blocks until remote writes are visible to all CUDA device contexts. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes targets
+ */
+enum __device_builtin__ cudaFlushGPUDirectRDMAWritesTarget {
+    cudaFlushGPUDirectRDMAWritesTargetCurrentDevice /**< Sets the target for ::cudaDeviceFlushGPUDirectRDMAWrites() to the currently active CUDA device context. */
+};
+
+
+/**
+ * CUDA device attributes
+ */
+enum __device_builtin__ cudaDeviceAttr
+{
+    cudaDevAttrMaxThreadsPerBlock             = 1,  /**< Maximum number of threads per block */
+    cudaDevAttrMaxBlockDimX                   = 2,  /**< Maximum block dimension X */
+    cudaDevAttrMaxBlockDimY                   = 3,  /**< Maximum block dimension Y */
+    cudaDevAttrMaxBlockDimZ                   = 4,  /**< Maximum block dimension Z */
+    cudaDevAttrMaxGridDimX                    = 5,  /**< Maximum grid dimension X */
+    cudaDevAttrMaxGridDimY                    = 6,  /**< Maximum grid dimension Y */
+    cudaDevAttrMaxGridDimZ                    = 7,  /**< Maximum grid dimension Z */
+    cudaDevAttrMaxSharedMemoryPerBlock        = 8,  /**< Maximum shared memory available per block in bytes */
+    cudaDevAttrTotalConstantMemory            = 9,  /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    cudaDevAttrWarpSize                       = 10, /**< Warp size in threads */
+    cudaDevAttrMaxPitch                       = 11, /**< Maximum pitch in bytes allowed by memory copies */
+    cudaDevAttrMaxRegistersPerBlock           = 12, /**< Maximum number of 32-bit registers available per block */
+    cudaDevAttrClockRate                      = 13, /**< Peak clock frequency in kilohertz */
+    cudaDevAttrTextureAlignment               = 14, /**< Alignment requirement for textures */
+    cudaDevAttrGpuOverlap                     = 15, /**< Device can possibly copy memory and execute a kernel concurrently */
+    cudaDevAttrMultiProcessorCount            = 16, /**< Number of multiprocessors on device */
+    cudaDevAttrKernelExecTimeout              = 17, /**< Specifies whether there is a run time limit on kernels */
+    cudaDevAttrIntegrated                     = 18, /**< Device is integrated with host memory */
+    cudaDevAttrCanMapHostMemory               = 19, /**< Device can map host memory into CUDA address space */
+    cudaDevAttrComputeMode                    = 20, /**< Compute mode (See ::cudaComputeMode for details) */
+    cudaDevAttrMaxTexture1DWidth              = 21, /**< Maximum 1D texture width */
+    cudaDevAttrMaxTexture2DWidth              = 22, /**< Maximum 2D texture width */
+    cudaDevAttrMaxTexture2DHeight             = 23, /**< Maximum 2D texture height */
+    cudaDevAttrMaxTexture3DWidth              = 24, /**< Maximum 3D texture width */
+    cudaDevAttrMaxTexture3DHeight             = 25, /**< Maximum 3D texture height */
+    cudaDevAttrMaxTexture3DDepth              = 26, /**< Maximum 3D texture depth */
+    cudaDevAttrMaxTexture2DLayeredWidth       = 27, /**< Maximum 2D layered texture width */
+    cudaDevAttrMaxTexture2DLayeredHeight      = 28, /**< Maximum 2D layered texture height */
+    cudaDevAttrMaxTexture2DLayeredLayers      = 29, /**< Maximum layers in a 2D layered texture */
+    cudaDevAttrSurfaceAlignment               = 30, /**< Alignment requirement for surfaces */
+    cudaDevAttrConcurrentKernels              = 31, /**< Device can possibly execute multiple kernels concurrently */
+    cudaDevAttrEccEnabled                     = 32, /**< Device has ECC support enabled */
+    cudaDevAttrPciBusId                       = 33, /**< PCI bus ID of the device */
+    cudaDevAttrPciDeviceId                    = 34, /**< PCI device ID of the device */
+    cudaDevAttrTccDriver                      = 35, /**< Device is using TCC driver model */
+    cudaDevAttrMemoryClockRate                = 36, /**< Peak memory clock frequency in kilohertz */
+    cudaDevAttrGlobalMemoryBusWidth           = 37, /**< Global memory bus width in bits */
+    cudaDevAttrL2CacheSize                    = 38, /**< Size of L2 cache in bytes */
+    cudaDevAttrMaxThreadsPerMultiProcessor    = 39, /**< Maximum resident threads per multiprocessor */
+    cudaDevAttrAsyncEngineCount               = 40, /**< Number of asynchronous engines */
+    cudaDevAttrUnifiedAddressing              = 41, /**< Device shares a unified address space with the host */    
+    cudaDevAttrMaxTexture1DLayeredWidth       = 42, /**< Maximum 1D layered texture width */
+    cudaDevAttrMaxTexture1DLayeredLayers      = 43, /**< Maximum layers in a 1D layered texture */
+    cudaDevAttrMaxTexture2DGatherWidth        = 45, /**< Maximum 2D texture width if cudaArrayTextureGather is set */
+    cudaDevAttrMaxTexture2DGatherHeight       = 46, /**< Maximum 2D texture height if cudaArrayTextureGather is set */
+    cudaDevAttrMaxTexture3DWidthAlt           = 47, /**< Alternate maximum 3D texture width */
+    cudaDevAttrMaxTexture3DHeightAlt          = 48, /**< Alternate maximum 3D texture height */
+    cudaDevAttrMaxTexture3DDepthAlt           = 49, /**< Alternate maximum 3D texture depth */
+    cudaDevAttrPciDomainId                    = 50, /**< PCI domain ID of the device */
+    cudaDevAttrTexturePitchAlignment          = 51, /**< Pitch alignment requirement for textures */
+    cudaDevAttrMaxTextureCubemapWidth         = 52, /**< Maximum cubemap texture width/height */
+    cudaDevAttrMaxTextureCubemapLayeredWidth  = 53, /**< Maximum cubemap layered texture width/height */
+    cudaDevAttrMaxTextureCubemapLayeredLayers = 54, /**< Maximum layers in a cubemap layered texture */
+    cudaDevAttrMaxSurface1DWidth              = 55, /**< Maximum 1D surface width */
+    cudaDevAttrMaxSurface2DWidth              = 56, /**< Maximum 2D surface width */
+    cudaDevAttrMaxSurface2DHeight             = 57, /**< Maximum 2D surface height */
+    cudaDevAttrMaxSurface3DWidth              = 58, /**< Maximum 3D surface width */
+    cudaDevAttrMaxSurface3DHeight             = 59, /**< Maximum 3D surface height */
+    cudaDevAttrMaxSurface3DDepth              = 60, /**< Maximum 3D surface depth */
+    cudaDevAttrMaxSurface1DLayeredWidth       = 61, /**< Maximum 1D layered surface width */
+    cudaDevAttrMaxSurface1DLayeredLayers      = 62, /**< Maximum layers in a 1D layered surface */
+    cudaDevAttrMaxSurface2DLayeredWidth       = 63, /**< Maximum 2D layered surface width */
+    cudaDevAttrMaxSurface2DLayeredHeight      = 64, /**< Maximum 2D layered surface height */
+    cudaDevAttrMaxSurface2DLayeredLayers      = 65, /**< Maximum layers in a 2D layered surface */
+    cudaDevAttrMaxSurfaceCubemapWidth         = 66, /**< Maximum cubemap surface width */
+    cudaDevAttrMaxSurfaceCubemapLayeredWidth  = 67, /**< Maximum cubemap layered surface width */
+    cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68, /**< Maximum layers in a cubemap layered surface */
+    cudaDevAttrMaxTexture1DLinearWidth        = 69, /**< Maximum 1D linear texture width */
+    cudaDevAttrMaxTexture2DLinearWidth        = 70, /**< Maximum 2D linear texture width */
+    cudaDevAttrMaxTexture2DLinearHeight       = 71, /**< Maximum 2D linear texture height */
+    cudaDevAttrMaxTexture2DLinearPitch        = 72, /**< Maximum 2D linear texture pitch in bytes */
+    cudaDevAttrMaxTexture2DMipmappedWidth     = 73, /**< Maximum mipmapped 2D texture width */
+    cudaDevAttrMaxTexture2DMipmappedHeight    = 74, /**< Maximum mipmapped 2D texture height */
+    cudaDevAttrComputeCapabilityMajor         = 75, /**< Major compute capability version number */ 
+    cudaDevAttrComputeCapabilityMinor         = 76, /**< Minor compute capability version number */
+    cudaDevAttrMaxTexture1DMipmappedWidth     = 77, /**< Maximum mipmapped 1D texture width */
+    cudaDevAttrStreamPrioritiesSupported      = 78, /**< Device supports stream priorities */
+    cudaDevAttrGlobalL1CacheSupported         = 79, /**< Device supports caching globals in L1 */
+    cudaDevAttrLocalL1CacheSupported          = 80, /**< Device supports caching locals in L1 */
+    cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81, /**< Maximum shared memory available per multiprocessor in bytes */
+    cudaDevAttrMaxRegistersPerMultiprocessor  = 82, /**< Maximum number of 32-bit registers available per multiprocessor */
+    cudaDevAttrManagedMemory                  = 83, /**< Device can allocate managed memory on this system */
+    cudaDevAttrIsMultiGpuBoard                = 84, /**< Device is on a multi-GPU board */
+    cudaDevAttrMultiGpuBoardGroupID           = 85, /**< Unique identifier for a group of devices on the same multi-GPU board */
+    cudaDevAttrHostNativeAtomicSupported      = 86, /**< Link between the device and the host supports native atomic operations */
+    cudaDevAttrSingleToDoublePrecisionPerfRatio = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
+    cudaDevAttrPageableMemoryAccess           = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
+    cudaDevAttrConcurrentManagedAccess        = 89, /**< Device can coherently access managed memory concurrently with the CPU */
+    cudaDevAttrComputePreemptionSupported     = 90, /**< Device supports Compute Preemption */
+    cudaDevAttrCanUseHostPointerForRegisteredMem = 91, /**< Device can access host registered memory at the same virtual address as the CPU */
+    cudaDevAttrReserved92                     = 92,
+    cudaDevAttrReserved93                     = 93,
+    cudaDevAttrReserved94                     = 94,
+    cudaDevAttrCooperativeLaunch              = 95, /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel*/
+    cudaDevAttrCooperativeMultiDeviceLaunch   = 96, /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
+    cudaDevAttrMaxSharedMemoryPerBlockOptin   = 97, /**< The maximum optin shared memory per block. This value may vary by chip. See ::cudaFuncSetAttribute */
+    cudaDevAttrCanFlushRemoteWrites           = 98, /**< Device supports flushing of outstanding remote writes. */
+    cudaDevAttrHostRegisterSupported          = 99, /**< Device supports host memory registration via ::cudaHostRegister. */
+    cudaDevAttrPageableMemoryAccessUsesHostPageTables = 100, /**< Device accesses pageable memory via the host's page tables. */
+    cudaDevAttrDirectManagedMemAccessFromHost = 101, /**< Host can directly access managed memory on the device without migration. */
+    cudaDevAttrMaxBlocksPerMultiprocessor     = 106, /**< Maximum number of blocks per multiprocessor */
+    cudaDevAttrMaxPersistingL2CacheSize       = 108, /**< Maximum L2 persisting lines capacity setting in bytes. */
+    cudaDevAttrMaxAccessPolicyWindowSize      = 109, /**< Maximum value of cudaAccessPolicyWindow::num_bytes. */
+    cudaDevAttrReservedSharedMemoryPerBlock   = 111, /**< Shared memory reserved by CUDA driver per block in bytes */
+    cudaDevAttrSparseCudaArraySupported       = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
+    cudaDevAttrHostRegisterReadOnlySupported  = 113,  /**< Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU */
+    cudaDevAttrTimelineSemaphoreInteropSupported = 114,  /**< External timeline semaphore interop is supported on the device */
+    cudaDevAttrMaxTimelineSemaphoreInteropSupported = 114,  /**< Deprecated, External timeline semaphore interop is supported on the device */
+    cudaDevAttrMemoryPoolsSupported           = 115, /**< Device supports using the ::cudaMallocAsync and ::cudaMemPool family of APIs */
+    cudaDevAttrGPUDirectRDMASupported         = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
+    cudaDevAttrGPUDirectRDMAFlushWritesOptions = 117, /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are listed in the ::cudaFlushGPUDirectRDMAWritesOptions enum */
+    cudaDevAttrGPUDirectRDMAWritesOrdering    = 118, /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::cudaGPUDirectRDMAWritesOrdering for the numerical values returned here. */
+    cudaDevAttrMemoryPoolSupportedHandleTypes = 119, /**< Handle types supported with mempool based IPC */
+    cudaDevAttrClusterLaunch                  = 120, /**< Indicates device supports cluster launch */
+    cudaDevAttrDeferredMappingCudaArraySupported = 121, /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
+    cudaDevAttrMax
+};
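+
+/*
+ * Usage sketch (illustrative; queries two attributes of device 0, error
+ * checking omitted):
+ *
+ * \code
+ * int smCount = 0, maxSmemOptin = 0;
+ * cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, 0);
+ * cudaDeviceGetAttribute(&maxSmemOptin, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);
+ * \endcode
+ */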
+
+/**
+ * CUDA memory pool attributes
+ */
+enum __device_builtin__ cudaMemPoolAttr
+{
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to use memory asynchronously freed
+     * in another stream as long as a stream ordering dependency
+     * of the allocating stream on the free action exists.
+     * CUDA events and null stream interactions can create the required
+     * stream ordered dependencies. (default enabled)
+     */
+    cudaMemPoolReuseFollowEventDependencies   = 0x1,
+
+    /**
+     * (value type = int)
+     * Allow reuse of already completed frees when there is no dependency
+     * between the free and allocation. (default enabled)
+     */
+    cudaMemPoolReuseAllowOpportunistic        = 0x2,
+
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to insert new stream dependencies
+     * in order to establish the stream ordering required to reuse
+     * a piece of memory released by cuFreeAsync (default enabled).
+     */
+    cudaMemPoolReuseAllowInternalDependencies = 0x3,
+
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of reserved memory in bytes to hold onto before trying
+     * to release memory back to the OS. When more than the release
+     * threshold bytes of memory are held by the memory pool, the
+     * allocator will try to release memory back to the OS on the
+     * next call to stream, event or context synchronize. (default 0)
+     */
+    cudaMemPoolAttrReleaseThreshold           = 0x4,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of backing memory currently allocated for the mempool.
+     */
+    cudaMemPoolAttrReservedMemCurrent         = 0x5,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of backing memory allocated for the mempool since the
+     * last time it was reset. High watermark can only be reset to zero.
+     */
+    cudaMemPoolAttrReservedMemHigh            = 0x6,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory from the pool that is currently in use by the application.
+     */
+    cudaMemPoolAttrUsedMemCurrent             = 0x7,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of the amount of memory from the pool that was in use by the application since
+     * the last time it was reset. High watermark can only be reset to zero.
+     */
+    cudaMemPoolAttrUsedMemHigh                = 0x8
+};
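+
+/*
+ * Usage sketch (illustrative; keeps up to 64 MiB cached in the default pool of
+ * device 0 instead of returning it to the OS; `bytes` and `stream` are assumed
+ * to exist and error checking is omitted):
+ *
+ * \code
+ * cudaMemPool_t pool;
+ * cudaDeviceGetDefaultMemPool(&pool, 0);
+ *
+ * unsigned long long threshold = 64ULL * 1024 * 1024;  // value type is a 64-bit unsigned integer
+ * cudaMemPoolSetAttribute(pool, cudaMemPoolAttrReleaseThreshold, &threshold);
+ *
+ * void *p;
+ * cudaMallocAsync(&p, bytes, stream);
+ * // ... enqueue work that uses p on stream ...
+ * cudaFreeAsync(p, stream);
+ * \endcode
+ */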
+
+/**
+ * Specifies the type of location 
+ */
+enum __device_builtin__ cudaMemLocationType {
+    cudaMemLocationTypeInvalid = 0,
+    cudaMemLocationTypeDevice = 1  /**< Location is a device location, thus id is a device ordinal */
+};
+
+/**
+ * Specifies a memory location.
+ *
+ * To specify a GPU, set type = ::cudaMemLocationTypeDevice and set id = the GPU's device ordinal.
+ */
+struct __device_builtin__ cudaMemLocation {
+    enum cudaMemLocationType type;  /**< Specifies the location type, which modifies the meaning of id. */
+    int id;                         /**< Identifier for the location; its meaning depends on this location's ::cudaMemLocationType. */
+};
+
+/**
+ * Specifies the memory protection flags for mapping.
+ */
+enum __device_builtin__ cudaMemAccessFlags {
+    cudaMemAccessFlagsProtNone      = 0,  /**< Default, make the address range not accessible */
+    cudaMemAccessFlagsProtRead      = 1,  /**< Make the address range read accessible */
+    cudaMemAccessFlagsProtReadWrite = 3   /**< Make the address range read-write accessible */
+};
+
+/**
+ * Memory access descriptor
+ */
+struct __device_builtin__ cudaMemAccessDesc {
+    struct cudaMemLocation  location; /**< Location on which the request is to change its accessibility */
+    enum cudaMemAccessFlags flags;    /**< ::CUmemProt accessibility flags to set on the request */
+};
+
+/**
+ * Defines the allocation types available
+ */
+enum __device_builtin__ cudaMemAllocationType {
+    cudaMemAllocationTypeInvalid = 0x0,
+    /** This allocation type is 'pinned', i.e. cannot migrate from its current
+      * location while the application is actively using it
+      */
+    cudaMemAllocationTypePinned  = 0x1,
+    cudaMemAllocationTypeMax     = 0x7FFFFFFF 
+};
+
+/**
+ * Flags for specifying particular handle types
+ */
+enum __device_builtin__ cudaMemAllocationHandleType {
+    cudaMemHandleTypeNone                    = 0x0,  /**< Does not allow any export mechanism. */
+    cudaMemHandleTypePosixFileDescriptor     = 0x1,  /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */
+    cudaMemHandleTypeWin32                   = 0x2,  /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */
+    cudaMemHandleTypeWin32Kmt                = 0x4   /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */
+};
+
+/**
+ * Specifies the properties of allocations made from the pool.
+ */
+struct __device_builtin__ cudaMemPoolProps {
+    enum cudaMemAllocationType         allocType;   /**< Allocation type. Currently must be specified as cudaMemAllocationTypePinned */
+    enum cudaMemAllocationHandleType   handleTypes; /**< Handle types that will be supported by allocations from the pool. */
+    struct cudaMemLocation             location;    /**< Location where allocations should reside. */
+    /**
+     * Windows-specific LPSECURITYATTRIBUTES required when
+     * ::cudaMemHandleTypeWin32 is specified.  This security attribute defines
+     * the scope of which exported allocations may be transferred to other
+     * processes.  In all other cases, this field is required to be zero.
+     */
+    void                              *win32SecurityAttributes;
+    unsigned char                      reserved[64]; /**< reserved for future use, must be 0 */
+};
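+
+/*
+ * Usage sketch (illustrative; creates an explicit pool on device 0 and
+ * allocates from it on a stream; `bytes` and `stream` are assumed to exist and
+ * error checking is omitted):
+ *
+ * \code
+ * struct cudaMemPoolProps props;
+ * memset(&props, 0, sizeof(props));
+ * props.allocType     = cudaMemAllocationTypePinned;
+ * props.handleTypes   = cudaMemHandleTypeNone;
+ * props.location.type = cudaMemLocationTypeDevice;
+ * props.location.id   = 0;
+ *
+ * cudaMemPool_t pool;
+ * cudaMemPoolCreate(&pool, &props);
+ *
+ * void *p;
+ * cudaMallocFromPoolAsync(&p, bytes, pool, stream);
+ * cudaFreeAsync(p, stream);
+ * \endcode
+ */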
+
+/**
+ * Opaque data for exporting a pool allocation
+ */
+struct __device_builtin__ cudaMemPoolPtrExportData {
+    unsigned char reserved[64];
+};
+
+/**
+ * Memory allocation node parameters
+ */
+struct __device_builtin__ cudaMemAllocNodeParams {
+    /**
+    * in: location where the allocation should reside (specified in ::location).
+    * ::handleTypes must be ::cudaMemHandleTypeNone. IPC is not supported.
+    */
+    struct cudaMemPoolProps         poolProps;       /**< in: allocation properties of the node (see the block comment above) */
+    const struct cudaMemAccessDesc *accessDescs;     /**< in: array of memory access descriptors. Used to describe peer GPU access */
+    size_t                          accessDescCount; /**< in: number of memory access descriptors. Must not exceed the number of GPUs */
+    size_t                          bytesize;        /**< in: size in bytes of the requested allocation */
+    void                           *dptr;            /**< out: address of the allocation returned by CUDA */
+};
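+
+/*
+ * Usage sketch (illustrative; adds an allocation node to a previously created
+ * cudaGraph_t `graph` on device 0; `bytes` is assumed to exist, the surrounding
+ * graph construction is application specific, and error checking is omitted):
+ *
+ * \code
+ * struct cudaMemAllocNodeParams params;
+ * memset(&params, 0, sizeof(params));
+ * params.poolProps.allocType     = cudaMemAllocationTypePinned;
+ * params.poolProps.location.type = cudaMemLocationTypeDevice;
+ * params.poolProps.location.id   = 0;
+ * params.bytesize                = bytes;
+ *
+ * cudaGraphNode_t allocNode;
+ * cudaGraphAddMemAllocNode(&allocNode, graph, NULL, 0, &params);
+ * void *dptr = params.dptr;  // out parameter filled in by CUDA
+ * \endcode
+ */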
+
+/**
+ * Graph memory attributes
+ */
+enum __device_builtin__ cudaGraphMemAttributeType {
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently associated with graphs.
+     */
+    cudaGraphMemAttrUsedMemCurrent      = 0x0,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, associated with graphs since the
+     * last time it was reset.  High watermark can only be reset to zero.
+     */
+    cudaGraphMemAttrUsedMemHigh         = 0x1,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    cudaGraphMemAttrReservedMemCurrent  = 0x2,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    cudaGraphMemAttrReservedMemHigh     = 0x3
+};
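+
+/*
+ * Usage sketch (illustrative; error checking omitted):
+ *
+ * \code
+ * unsigned long long used = 0;  // value type is a 64-bit unsigned integer
+ * cudaDeviceGetGraphMemAttribute(0, cudaGraphMemAttrUsedMemCurrent, &used);
+ * \endcode
+ */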
+
+/**
+ * CUDA device P2P attributes
+ */
+
+enum __device_builtin__ cudaDeviceP2PAttr {
+    cudaDevP2PAttrPerformanceRank              = 1, /**< A relative value indicating the performance of the link between two devices */
+    cudaDevP2PAttrAccessSupported              = 2, /**< Peer access is enabled */
+    cudaDevP2PAttrNativeAtomicSupported        = 3, /**< Native atomic operation over the link supported */
+    cudaDevP2PAttrCudaArrayAccessSupported     = 4  /**< Accessing CUDA arrays over the link supported */
+};
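+
+/*
+ * Usage sketch (illustrative; checks whether device 0 can access memory on
+ * device 1, error checking omitted):
+ *
+ * \code
+ * int accessible = 0;
+ * cudaDeviceGetP2PAttribute(&accessible, cudaDevP2PAttrAccessSupported, 0, 1);
+ * if (accessible) {
+ *     cudaSetDevice(0);
+ *     cudaDeviceEnablePeerAccess(1, 0);  // enable access to device 1 from device 0
+ * }
+ * \endcode
+ */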
+
+/**
+ * CUDA UUID types
+ */
+#ifndef CU_UUID_HAS_BEEN_DEFINED
+#define CU_UUID_HAS_BEEN_DEFINED
+struct __device_builtin__ CUuuid_st {     /**< CUDA definition of UUID */
+    char bytes[16];
+};
+typedef __device_builtin__ struct CUuuid_st CUuuid;
+#endif
+typedef __device_builtin__ struct CUuuid_st cudaUUID_t;
+
+/**
+ * CUDA device properties
+ */
+struct __device_builtin__ cudaDeviceProp
+{
+    char         name[256];                  /**< ASCII string identifying device */
+    cudaUUID_t   uuid;                       /**< 16-byte unique identifier */
+    char         luid[8];                    /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
+    unsigned int luidDeviceNodeMask;         /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
+    size_t       totalGlobalMem;             /**< Global memory available on device in bytes */
+    size_t       sharedMemPerBlock;          /**< Shared memory available per block in bytes */
+    int          regsPerBlock;               /**< 32-bit registers available per block */
+    int          warpSize;                   /**< Warp size in threads */
+    size_t       memPitch;                   /**< Maximum pitch in bytes allowed by memory copies */
+    int          maxThreadsPerBlock;         /**< Maximum number of threads per block */
+    int          maxThreadsDim[3];           /**< Maximum size of each dimension of a block */
+    int          maxGridSize[3];             /**< Maximum size of each dimension of a grid */
+    int          clockRate;                  /**< Clock frequency in kilohertz */
+    size_t       totalConstMem;              /**< Constant memory available on device in bytes */
+    int          major;                      /**< Major compute capability */
+    int          minor;                      /**< Minor compute capability */
+    size_t       textureAlignment;           /**< Alignment requirement for textures */
+    size_t       texturePitchAlignment;      /**< Pitch alignment requirement for texture references bound to pitched memory */
+    int          deviceOverlap;              /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
+    int          multiProcessorCount;        /**< Number of multiprocessors on device */
+    int          kernelExecTimeoutEnabled;   /**< Specifies whether there is a run time limit on kernels */
+    int          integrated;                 /**< Device is integrated as opposed to discrete */
+    int          canMapHostMemory;           /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
+    int          computeMode;                /**< Compute mode (See ::cudaComputeMode) */
+    int          maxTexture1D;               /**< Maximum 1D texture size */
+    int          maxTexture1DMipmap;         /**< Maximum 1D mipmapped texture size */
+    int          maxTexture1DLinear;         /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
+    int          maxTexture2D[2];            /**< Maximum 2D texture dimensions */
+    int          maxTexture2DMipmap[2];      /**< Maximum 2D mipmapped texture dimensions */
+    int          maxTexture2DLinear[3];      /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
+    int          maxTexture2DGather[2];      /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
+    int          maxTexture3D[3];            /**< Maximum 3D texture dimensions */
+    int          maxTexture3DAlt[3];         /**< Maximum alternate 3D texture dimensions */
+    int          maxTextureCubemap;          /**< Maximum Cubemap texture dimensions */
+    int          maxTexture1DLayered[2];     /**< Maximum 1D layered texture dimensions */
+    int          maxTexture2DLayered[3];     /**< Maximum 2D layered texture dimensions */
+    int          maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
+    int          maxSurface1D;               /**< Maximum 1D surface size */
+    int          maxSurface2D[2];            /**< Maximum 2D surface dimensions */
+    int          maxSurface3D[3];            /**< Maximum 3D surface dimensions */
+    int          maxSurface1DLayered[2];     /**< Maximum 1D layered surface dimensions */
+    int          maxSurface2DLayered[3];     /**< Maximum 2D layered surface dimensions */
+    int          maxSurfaceCubemap;          /**< Maximum Cubemap surface dimensions */
+    int          maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
+    size_t       surfaceAlignment;           /**< Alignment requirements for surfaces */
+    int          concurrentKernels;          /**< Device can possibly execute multiple kernels concurrently */
+    int          ECCEnabled;                 /**< Device has ECC support enabled */
+    int          pciBusID;                   /**< PCI bus ID of the device */
+    int          pciDeviceID;                /**< PCI device ID of the device */
+    int          pciDomainID;                /**< PCI domain ID of the device */
+    int          tccDriver;                  /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
+    int          asyncEngineCount;           /**< Number of asynchronous engines */
+    int          unifiedAddressing;          /**< Device shares a unified address space with the host */
+    int          memoryClockRate;            /**< Peak memory clock frequency in kilohertz */
+    int          memoryBusWidth;             /**< Global memory bus width in bits */
+    int          l2CacheSize;                /**< Size of L2 cache in bytes */
+    int          persistingL2CacheMaxSize;   /**< Device's maximum l2 persisting lines capacity setting in bytes */
+    int          maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
+    int          streamPrioritiesSupported;  /**< Device supports stream priorities */
+    int          globalL1CacheSupported;     /**< Device supports caching globals in L1 */
+    int          localL1CacheSupported;      /**< Device supports caching locals in L1 */
+    size_t       sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
+    int          regsPerMultiprocessor;      /**< 32-bit registers available per multiprocessor */
+    int          managedMemory;              /**< Device supports allocating managed memory on this system */
+    int          isMultiGpuBoard;            /**< Device is on a multi-GPU board */
+    int          multiGpuBoardGroupID;       /**< Unique identifier for a group of devices on the same multi-GPU board */
+    int          hostNativeAtomicSupported;  /**< Link between the device and the host supports native atomic operations */
+    int          singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
+    int          pageableMemoryAccess;       /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
+    int          concurrentManagedAccess;    /**< Device can coherently access managed memory concurrently with the CPU */
+    int          computePreemptionSupported; /**< Device supports Compute Preemption */
+    int          canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
+    int          cooperativeLaunch;          /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
+    int          cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
+    size_t       sharedMemPerBlockOptin;     /**< Per device maximum shared memory per block usable by special opt in */
+    int          pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
+    int          directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
+    int          maxBlocksPerMultiProcessor; /**< Maximum number of resident blocks per multiprocessor */
+    int          accessPolicyMaxWindowSize;  /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
+    size_t       reservedSharedMemPerBlock;  /**< Shared memory reserved by CUDA driver per block in bytes */
+};
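+
+/*
+ * Usage sketch (illustrative; error checking omitted):
+ *
+ * \code
+ * struct cudaDeviceProp prop;
+ * cudaGetDeviceProperties(&prop, 0);
+ * // e.g. prop.name, prop.major / prop.minor, prop.totalGlobalMem and
+ * // prop.multiProcessorCount describe device 0.
+ * \endcode
+ */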
+
+#define cudaDevicePropDontCare                                 \
+        {                                                      \
+          {'\0'},    /* char         name[256];               */ \
+          {{0}},     /* cudaUUID_t   uuid;                    */ \
+          {'\0'},    /* char         luid[8];                 */ \
+          0,         /* unsigned int luidDeviceNodeMask       */ \
+          0,         /* size_t       totalGlobalMem;          */ \
+          0,         /* size_t       sharedMemPerBlock;       */ \
+          0,         /* int          regsPerBlock;            */ \
+          0,         /* int          warpSize;                */ \
+          0,         /* size_t       memPitch;                */ \
+          0,         /* int          maxThreadsPerBlock;      */ \
+          {0, 0, 0}, /* int          maxThreadsDim[3];        */ \
+          {0, 0, 0}, /* int          maxGridSize[3];          */ \
+          0,         /* int          clockRate;               */ \
+          0,         /* size_t       totalConstMem;           */ \
+          -1,        /* int          major;                   */ \
+          -1,        /* int          minor;                   */ \
+          0,         /* size_t       textureAlignment;        */ \
+          0,         /* size_t       texturePitchAlignment    */ \
+          -1,        /* int          deviceOverlap;           */ \
+          0,         /* int          multiProcessorCount;     */ \
+          0,         /* int          kernelExecTimeoutEnabled */ \
+          0,         /* int          integrated               */ \
+          0,         /* int          canMapHostMemory         */ \
+          0,         /* int          computeMode              */ \
+          0,         /* int          maxTexture1D             */ \
+          0,         /* int          maxTexture1DMipmap       */ \
+          0,         /* int          maxTexture1DLinear       */ \
+          {0, 0},    /* int          maxTexture2D[2]          */ \
+          {0, 0},    /* int          maxTexture2DMipmap[2]    */ \
+          {0, 0, 0}, /* int          maxTexture2DLinear[3]    */ \
+          {0, 0},    /* int          maxTexture2DGather[2]    */ \
+          {0, 0, 0}, /* int          maxTexture3D[3]          */ \
+          {0, 0, 0}, /* int          maxTexture3DAlt[3]       */ \
+          0,         /* int          maxTextureCubemap        */ \
+          {0, 0},    /* int          maxTexture1DLayered[2]   */ \
+          {0, 0, 0}, /* int          maxTexture2DLayered[3]   */ \
+          {0, 0},    /* int          maxTextureCubemapLayered[2] */ \
+          0,         /* int          maxSurface1D             */ \
+          {0, 0},    /* int          maxSurface2D[2]          */ \
+          {0, 0, 0}, /* int          maxSurface3D[3]          */ \
+          {0, 0},    /* int          maxSurface1DLayered[2]   */ \
+          {0, 0, 0}, /* int          maxSurface2DLayered[3]   */ \
+          0,         /* int          maxSurfaceCubemap        */ \
+          {0, 0},    /* int          maxSurfaceCubemapLayered[2] */ \
+          0,         /* size_t       surfaceAlignment         */ \
+          0,         /* int          concurrentKernels        */ \
+          0,         /* int          ECCEnabled               */ \
+          0,         /* int          pciBusID                 */ \
+          0,         /* int          pciDeviceID              */ \
+          0,         /* int          pciDomainID              */ \
+          0,         /* int          tccDriver                */ \
+          0,         /* int          asyncEngineCount         */ \
+          0,         /* int          unifiedAddressing        */ \
+          0,         /* int          memoryClockRate          */ \
+          0,         /* int          memoryBusWidth           */ \
+          0,         /* int          l2CacheSize              */ \
+          0,         /* int          persistingL2CacheMaxSize   */ \
+          0,         /* int          maxThreadsPerMultiProcessor */ \
+          0,         /* int          streamPrioritiesSupported */ \
+          0,         /* int          globalL1CacheSupported   */ \
+          0,         /* int          localL1CacheSupported    */ \
+          0,         /* size_t       sharedMemPerMultiprocessor; */ \
+          0,         /* int          regsPerMultiprocessor;   */ \
+          0,         /* int          managedMemory            */ \
+          0,         /* int          isMultiGpuBoard          */ \
+          0,         /* int          multiGpuBoardGroupID     */ \
+          0,         /* int          hostNativeAtomicSupported */ \
+          0,         /* int          singleToDoublePrecisionPerfRatio */ \
+          0,         /* int          pageableMemoryAccess     */ \
+          0,         /* int          concurrentManagedAccess  */ \
+          0,         /* int          computePreemptionSupported */ \
+          0,         /* int          canUseHostPointerForRegisteredMem */ \
+          0,         /* int          cooperativeLaunch */ \
+          0,         /* int          cooperativeMultiDeviceLaunch */ \
+          0,         /* size_t       sharedMemPerBlockOptin */ \
+          0,         /* int          pageableMemoryAccessUsesHostPageTables */ \
+          0,         /* int          directManagedMemAccessFromHost */ \
+          0,         /* int          maxBlocksPerMultiProcessor */ \
+          0,         /* int          accessPolicyMaxWindowSize */ \
+          0,         /* size_t       reservedSharedMemPerBlock */ \
+        } /**< Empty device properties */
+
+/**
+ * CUDA IPC Handle Size
+ */
+#define CUDA_IPC_HANDLE_SIZE 64
+
+/**
+ * CUDA IPC event handle
+ */
+typedef __device_builtin__ struct __device_builtin__ cudaIpcEventHandle_st
+{
+    char reserved[CUDA_IPC_HANDLE_SIZE];
+} cudaIpcEventHandle_t;
+
+/**
+ * CUDA IPC memory handle
+ */
+typedef __device_builtin__ struct __device_builtin__ cudaIpcMemHandle_st 
+{
+    char reserved[CUDA_IPC_HANDLE_SIZE];
+} cudaIpcMemHandle_t;
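+
+/*
+ * Usage sketch (illustrative; the handle is passed between processes by an
+ * out-of-band mechanism chosen by the application, `bytes` is assumed to exist
+ * and error checking is omitted):
+ *
+ * \code
+ * // Exporting process
+ * void *d_buf;
+ * cudaMalloc(&d_buf, bytes);
+ * cudaIpcMemHandle_t handle;
+ * cudaIpcGetMemHandle(&handle, d_buf);
+ *
+ * // Importing process, after receiving `handle`
+ * void *d_peer;
+ * cudaIpcOpenMemHandle(&d_peer, handle, cudaIpcMemLazyEnablePeerAccess);
+ * // ... use d_peer ...
+ * cudaIpcCloseMemHandle(d_peer);
+ * \endcode
+ */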
+
+/**
+ * External memory handle types
+ */
+enum __device_builtin__ cudaExternalMemoryHandleType {
+    /**
+     * Handle is an opaque file descriptor
+     */
+    cudaExternalMemoryHandleTypeOpaqueFd         = 1,
+    /**
+     * Handle is an opaque shared NT handle
+     */
+    cudaExternalMemoryHandleTypeOpaqueWin32      = 2,
+    /**
+     * Handle is an opaque, globally shared handle
+     */
+    cudaExternalMemoryHandleTypeOpaqueWin32Kmt   = 3,
+    /**
+     * Handle is a D3D12 heap object
+     */
+    cudaExternalMemoryHandleTypeD3D12Heap        = 4,
+    /**
+     * Handle is a D3D12 committed resource
+     */
+    cudaExternalMemoryHandleTypeD3D12Resource    = 5,
+    /**
+    *  Handle is a shared NT handle to a D3D11 resource
+    */
+    cudaExternalMemoryHandleTypeD3D11Resource    = 6,
+    /**
+    *  Handle is a globally shared handle to a D3D11 resource
+    */
+    cudaExternalMemoryHandleTypeD3D11ResourceKmt = 7,
+    /**
+    *  Handle is an NvSciBuf object
+    */
+    cudaExternalMemoryHandleTypeNvSciBuf         = 8
+};
+
+/**
+ * Indicates that the external memory object is a dedicated resource
+ */
+#define cudaExternalMemoryDedicated   0x1
+
+/** When the \p flags parameter of ::cudaExternalSemaphoreSignalParams
+ * contains this flag, it indicates that signaling an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf,
+ * which otherwise are performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define cudaExternalSemaphoreSignalSkipNvSciBufMemSync     0x01
+
+/** When the \p flags parameter of ::cudaExternalSemaphoreWaitParams
+ * contains this flag, it indicates that waiting an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf,
+ * which otherwise are performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define cudaExternalSemaphoreWaitSkipNvSciBufMemSync       0x02
+
+/**
+ * When \p flags of ::cudaDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that the application needs a signaler-specific NvSciSyncAttr
+ * to be filled by ::cudaDeviceGetNvSciSyncAttributes.
+ */
+#define cudaNvSciSyncAttrSignal       0x1
+
+/**
+ * When \p flags of ::cudaDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that the application needs a waiter-specific NvSciSyncAttr
+ * to be filled by ::cudaDeviceGetNvSciSyncAttributes.
+ */
+#define cudaNvSciSyncAttrWait         0x2
+
+/**
+ * External memory handle descriptor
+ */
+struct __device_builtin__ cudaExternalMemoryHandleDesc {
+    /**
+     * Type of the handle
+     */
+    enum cudaExternalMemoryHandleType type;
+    union {
+        /**
+         * File descriptor referencing the memory object. Valid
+         * when type is
+         * ::cudaExternalMemoryHandleTypeOpaqueFd
+         */
+        int fd;
+        /**
+         * Win32 handle referencing the memory object. Valid when
+         * type is one of the following:
+         * - ::cudaExternalMemoryHandleTypeOpaqueWin32
+         * - ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt
+         * - ::cudaExternalMemoryHandleTypeD3D12Heap
+         * - ::cudaExternalMemoryHandleTypeD3D12Resource
+         * - ::cudaExternalMemoryHandleTypeD3D11Resource
+         * - ::cudaExternalMemoryHandleTypeD3D11ResourceKmt
+         * Exactly one of 'handle' and 'name' must be non-NULL. If
+         * type is one of the following: 
+         * ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt
+         * ::cudaExternalMemoryHandleTypeD3D11ResourceKmt
+         * then 'name' must be NULL.
+         */
+        struct {
+            /**
+             * Valid NT handle. Must be NULL if 'name' is non-NULL
+             */
+            void *handle;
+            /**
+             * Name of a valid memory object.
+             * Must be NULL if 'handle' is non-NULL.
+             */
+            const void *name;
+        } win32;
+        /**
+         * A handle representing NvSciBuf Object. Valid when type
+         * is ::cudaExternalMemoryHandleTypeNvSciBuf
+         */
+        const void *nvSciBufObject;
+    } handle;
+    /**
+     * Size of the memory allocation
+     */
+    unsigned long long size;
+    /**
+     * Flags must either be zero or ::cudaExternalMemoryDedicated
+     */
+    unsigned int flags;
+};
+
+/**
+ * External memory buffer descriptor
+ */
+struct __device_builtin__ cudaExternalMemoryBufferDesc {
+    /**
+     * Offset into the memory object where the buffer's base is
+     */
+    unsigned long long offset;
+    /**
+     * Size of the buffer
+     */
+    unsigned long long size;
+    /**
+     * Flags reserved for future use. Must be zero.
+     */
+    unsigned int flags;
+};
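+
+/*
+ * Usage sketch (illustrative; imports a memory object exported as an opaque
+ * POSIX file descriptor, e.g. by a Vulkan allocator, and maps a buffer out of
+ * it; `fd` and `size` are assumed to come from the exporting API and error
+ * checking is omitted):
+ *
+ * \code
+ * struct cudaExternalMemoryHandleDesc memDesc;
+ * memset(&memDesc, 0, sizeof(memDesc));
+ * memDesc.type      = cudaExternalMemoryHandleTypeOpaqueFd;
+ * memDesc.handle.fd = fd;
+ * memDesc.size      = size;
+ *
+ * cudaExternalMemory_t extMem;
+ * cudaImportExternalMemory(&extMem, &memDesc);
+ *
+ * struct cudaExternalMemoryBufferDesc bufDesc;
+ * memset(&bufDesc, 0, sizeof(bufDesc));
+ * bufDesc.offset = 0;
+ * bufDesc.size   = size;
+ *
+ * void *devPtr;
+ * cudaExternalMemoryGetMappedBuffer(&devPtr, extMem, &bufDesc);
+ * \endcode
+ */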
+ 
+/**
+ * External memory mipmap descriptor
+ */
+struct __device_builtin__ cudaExternalMemoryMipmappedArrayDesc {
+    /**
+     * Offset into the memory object where the base level of the
+     * mipmap chain is.
+     */
+    unsigned long long offset;
+    /**
+     * Format of base level of the mipmap chain
+     */
+    struct cudaChannelFormatDesc formatDesc;
+    /**
+     * Dimensions of base level of the mipmap chain
+     */
+    struct cudaExtent extent;
+    /**
+     * Flags associated with CUDA mipmapped arrays.
+     * See ::cudaMallocMipmappedArray
+     */
+    unsigned int flags;
+    /**
+     * Total number of levels in the mipmap chain
+     */
+    unsigned int numLevels;
+};
+ 
+/**
+ * External semaphore handle types
+ */
+enum __device_builtin__ cudaExternalSemaphoreHandleType {
+    /**
+     * Handle is an opaque file descriptor
+     */
+    cudaExternalSemaphoreHandleTypeOpaqueFd       = 1,
+    /**
+     * Handle is an opaque shared NT handle
+     */
+    cudaExternalSemaphoreHandleTypeOpaqueWin32    = 2,
+    /**
+     * Handle is an opaque, globally shared handle
+     */
+    cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt = 3,
+    /**
+     * Handle is a shared NT handle referencing a D3D12 fence object
+     */
+    cudaExternalSemaphoreHandleTypeD3D12Fence     = 4,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 fence object
+     */
+    cudaExternalSemaphoreHandleTypeD3D11Fence     = 5,
+    /**
+     * Opaque handle to NvSciSync Object
+     */
+     cudaExternalSemaphoreHandleTypeNvSciSync     = 6,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 keyed mutex object
+     */
+    cudaExternalSemaphoreHandleTypeKeyedMutex     = 7,
+    /**
+     * Handle is a shared KMT handle referencing a D3D11 keyed mutex object
+     */
+    cudaExternalSemaphoreHandleTypeKeyedMutexKmt  = 8,
+    /**
+     * Handle is an opaque handle file descriptor referencing a timeline semaphore
+     */
+    cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd  = 9,
+    /**
+     * Handle is an opaque handle file descriptor referencing a timeline semaphore
+     */
+    cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32  = 10
+};
+
+/**
+ * External semaphore handle descriptor
+ */
+struct __device_builtin__ cudaExternalSemaphoreHandleDesc {
+    /**
+     * Type of the handle
+     */
+    enum cudaExternalSemaphoreHandleType type;
+    union {
+        /**
+         * File descriptor referencing the semaphore object. Valid when
+         * type is one of the following:
+         * - ::cudaExternalSemaphoreHandleTypeOpaqueFd
+         * - ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd
+         */
+        int fd;
+        /**
+         * Win32 handle referencing the semaphore object. Valid when
+         * type is one of the following:
+         * - ::cudaExternalSemaphoreHandleTypeOpaqueWin32
+         * - ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt
+         * - ::cudaExternalSemaphoreHandleTypeD3D12Fence
+         * - ::cudaExternalSemaphoreHandleTypeD3D11Fence
+         * - ::cudaExternalSemaphoreHandleTypeKeyedMutex
+         * - ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32
+         * Exactly one of 'handle' and 'name' must be non-NULL. If
+         * type is one of the following:
+         * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt
+         * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt
+         * then 'name' must be NULL.
+         */
+        struct {
+            /**
+             * Valid NT handle. Must be NULL if 'name' is non-NULL
+             */
+            void *handle;
+            /**
+             * Name of a valid synchronization primitive.
+             * Must be NULL if 'handle' is non-NULL.
+             */
+            const void *name;
+        } win32;
+        /**
+         * Valid NvSciSyncObj. Must be non-NULL
+         */
+        const void* nvSciSyncObj;
+    } handle;
+    /**
+     * Flags reserved for the future. Must be zero.
+     */
+    unsigned int flags;
+};
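+
+/*
+ * Usage sketch (illustrative; imports a semaphore exported as an opaque POSIX
+ * file descriptor, e.g. by Vulkan; `fd` and `stream` are assumed to exist and
+ * error checking is omitted):
+ *
+ * \code
+ * struct cudaExternalSemaphoreHandleDesc semDesc;
+ * memset(&semDesc, 0, sizeof(semDesc));
+ * semDesc.type      = cudaExternalSemaphoreHandleTypeOpaqueFd;
+ * semDesc.handle.fd = fd;
+ *
+ * cudaExternalSemaphore_t extSem;
+ * cudaImportExternalSemaphore(&extSem, &semDesc);
+ *
+ * struct cudaExternalSemaphoreSignalParams sigParams;
+ * memset(&sigParams, 0, sizeof(sigParams));
+ * cudaSignalExternalSemaphoresAsync(&extSem, &sigParams, 1, stream);
+ * \endcode
+ */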
+
+/**
+ * External semaphore signal parameters (deprecated)
+ */
+struct __device_builtin__ cudaExternalSemaphoreSignalParams_v1 {
+    struct {
+        /**
+         * Parameters for fence objects
+         */
+        struct {
+            /**
+             * Value of fence to be signaled
+             */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType
+             * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to release the mutex with
+             */
+            unsigned long long key;
+        } keyedMutex;
+    } params;
+    /**
+     * Only when ::cudaExternalSemaphoreSignalParams is used to
+     * signal a ::cudaExternalSemaphore_t of type
+     * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is 
+     * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates
+     * that while signaling the ::cudaExternalSemaphore_t, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf.
+     * For all other types of ::cudaExternalSemaphore_t, flags must be zero.
+     */
+    unsigned int flags;
+};
+
+/**
+* External semaphore wait parameters (deprecated)
+*/
+struct __device_builtin__ cudaExternalSemaphoreWaitParams_v1 {
+    struct {
+        /**
+        * Parameters for fence objects
+        */
+        struct {
+            /**
+            * Value of fence to be waited on
+            */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType
+             * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to acquire the mutex with
+             */
+            unsigned long long key;
+            /**
+             * Timeout in milliseconds to wait to acquire the mutex
+             */
+            unsigned int timeoutMs;
+        } keyedMutex;
+    } params;
+    /**
+     * Only when ::cudaExternalSemaphoreWaitParams is used to
+     * wait on a ::cudaExternalSemaphore_t of type
+     * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is
+     * ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync: which indicates
+     * that while waiting for the ::cudaExternalSemaphore_t, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf.
+     * For all other types of ::cudaExternalSemaphore_t, flags must be zero.
+     */
+    unsigned int flags;
+};
+
+/**
+ * External semaphore signal parameters, compatible with driver type
+ */
+struct __device_builtin__ cudaExternalSemaphoreSignalParams{
+    struct {
+        /**
+         * Parameters for fence objects
+         */
+        struct {
+            /**
+             * Value of fence to be signaled
+             */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType
+             * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to release the mutex with
+             */
+            unsigned long long key;
+        } keyedMutex;
+        unsigned int reserved[12];
+    } params;
+    /**
+     * Only when ::cudaExternalSemaphoreSignalParams is used to
+     * signal a ::cudaExternalSemaphore_t of type
+     * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is 
+     * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates
+     * that while signaling the ::cudaExternalSemaphore_t, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf.
+     * For all other types of ::cudaExternalSemaphore_t, flags must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+};
+
+/**
+ * External semaphore wait parameters, compatible with driver type
+ */
+struct __device_builtin__ cudaExternalSemaphoreWaitParams {
+    struct {
+        /**
+        * Parameters for fence objects
+        */
+        struct {
+            /**
+            * Value of fence to be waited on
+            */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType
+             * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to acquire the mutex with
+             */
+            unsigned long long key;
+            /**
+             * Timeout in milliseconds to wait to acquire the mutex
+             */
+            unsigned int timeoutMs;
+        } keyedMutex;
+        unsigned int reserved[10];
+    } params;
+    /**
+     * Only when ::cudaExternalSemaphoreWaitParams is used to
+     * wait on a ::cudaExternalSemaphore_t of type
+     * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is
+     * ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync: which indicates
+     * that while waiting for the ::cudaExternalSemaphore_t, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf.
+     * For all other types of ::cudaExternalSemaphore_t, flags must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+};
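+
+/*
+ * Illustrative sketch, not part of the original NVIDIA header: signalling and
+ * then waiting on an imported semaphore from a stream with
+ * ::cudaSignalExternalSemaphoresAsync / ::cudaWaitExternalSemaphoresAsync
+ * (declared in cuda_runtime_api.h). Semaphore, stream and value are assumed
+ * to be supplied by the caller.
+ */
+#if 0
+static void signalThenWait(cudaExternalSemaphore_t sem, cudaStream_t stream,
+                           unsigned long long value) {
+    cudaExternalSemaphoreSignalParams signalParams = {};
+    signalParams.params.fence.value = value; /* payload for timeline/fence types */
+    cudaSignalExternalSemaphoresAsync(&sem, &signalParams, 1, stream);
+
+    cudaExternalSemaphoreWaitParams waitParams = {};
+    waitParams.params.fence.value = value;   /* block the stream until reached */
+    cudaWaitExternalSemaphoresAsync(&sem, &waitParams, 1, stream);
+}
+#endif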
+
+/*******************************************************************************
+*                                                                              *
+*  SHORTHAND TYPE DEFINITION USED BY RUNTIME API                               *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * CUDA Error types
+ */
+typedef __device_builtin__ enum cudaError cudaError_t;
+
+/**
+ * CUDA stream
+ */
+typedef __device_builtin__ struct CUstream_st *cudaStream_t;
+
+/**
+ * CUDA event types
+ */
+typedef __device_builtin__ struct CUevent_st *cudaEvent_t;
+
+/**
+ * CUDA graphics resource types
+ */
+typedef __device_builtin__ struct cudaGraphicsResource *cudaGraphicsResource_t;
+
+/**
+ * CUDA output file modes
+ */
+typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t;
+
+/**
+ * CUDA external memory
+ */
+typedef __device_builtin__ struct CUexternalMemory_st *cudaExternalMemory_t;
+
+/**
+ * CUDA external semaphore
+ */
+typedef __device_builtin__ struct CUexternalSemaphore_st *cudaExternalSemaphore_t;
+
+/**
+ * CUDA graph
+ */
+typedef __device_builtin__ struct CUgraph_st *cudaGraph_t;
+
+/**
+ * CUDA graph node.
+ */
+typedef __device_builtin__ struct CUgraphNode_st *cudaGraphNode_t;
+
+/**
+ * CUDA user object for graphs
+ */
+typedef __device_builtin__ struct CUuserObject_st *cudaUserObject_t;
+
+/**
+ * CUDA function
+ */
+typedef __device_builtin__ struct CUfunc_st *cudaFunction_t;
+
+/**
+ * CUDA memory pool
+ */
+typedef __device_builtin__ struct CUmemPoolHandle_st *cudaMemPool_t;
+
+/**
+ * CUDA cooperative group scope
+ */
+enum __device_builtin__ cudaCGScope {
+    cudaCGScopeInvalid   = 0, /**< Invalid cooperative group scope */
+    cudaCGScopeGrid      = 1, /**< Scope represented by a grid_group */
+    cudaCGScopeMultiGrid = 2  /**< Scope represented by a multi_grid_group */
+};
+
+/**
+ * CUDA launch parameters
+ */
+struct __device_builtin__ cudaLaunchParams
+{
+    void *func;          /**< Device function symbol */
+    dim3 gridDim;        /**< Grid dimensions */
+    dim3 blockDim;       /**< Block dimensions */
+    void **args;         /**< Arguments */
+    size_t sharedMem;    /**< Shared memory */
+    cudaStream_t stream; /**< Stream identifier */
+};
+
+/**
+ * CUDA GPU kernel node parameters
+ */
+struct __device_builtin__ cudaKernelNodeParams {
+    void* func;                     /**< Kernel to launch */
+    dim3 gridDim;                   /**< Grid dimensions */
+    dim3 blockDim;                  /**< Block dimensions */
+    unsigned int sharedMemBytes;    /**< Dynamic shared-memory size per thread block in bytes */
+    void **kernelParams;            /**< Array of pointers to individual kernel arguments */
+    void **extra;                   /**< Pointer to kernel arguments in the "extra" format */
+};
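+
+/*
+ * Illustrative sketch, not part of the original NVIDIA header: adding a kernel
+ * node to a graph with ::cudaGraphAddKernelNode (declared in cuda_runtime_api.h).
+ * The kernel symbol and its arguments are assumptions made for the example.
+ */
+#if 0
+__global__ void scaleKernel(float* data, float factor);
+
+static cudaGraphNode_t addScaleNode(cudaGraph_t graph, float* data, float factor) {
+    void* args[] = { &data, &factor };   /* pointers to each kernel argument */
+
+    cudaKernelNodeParams params = {};
+    params.func = (void*)scaleKernel;
+    params.gridDim = dim3(64, 1, 1);
+    params.blockDim = dim3(256, 1, 1);
+    params.sharedMemBytes = 0;
+    params.kernelParams = args;
+    params.extra = nullptr;
+
+    cudaGraphNode_t node = nullptr;
+    cudaGraphAddKernelNode(&node, graph, nullptr, 0, &params); /* no dependencies */
+    return node;
+}
+#endif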
+
+/**
+ * External semaphore signal node parameters
+ */
+struct __device_builtin__ cudaExternalSemaphoreSignalNodeParams {
+    cudaExternalSemaphore_t* extSemArray;                        /**< Array of external semaphore handles. */
+    const struct cudaExternalSemaphoreSignalParams* paramsArray; /**< Array of external semaphore signal parameters. */
+    unsigned int numExtSems;                                     /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+};
+
+/**
+ * External semaphore wait node parameters
+ */
+struct __device_builtin__ cudaExternalSemaphoreWaitNodeParams {
+    cudaExternalSemaphore_t* extSemArray;                      /**< Array of external semaphore handles. */
+    const struct cudaExternalSemaphoreWaitParams* paramsArray; /**< Array of external semaphore wait parameters. */
+    unsigned int numExtSems;                                   /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+};
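+
+/*
+ * Illustrative sketch, not part of the original NVIDIA header: making a graph
+ * wait on an imported semaphore via ::cudaGraphAddExternalSemaphoresWaitNode
+ * (CUDA 11.2+, declared in cuda_runtime_api.h). Graph, semaphore and value are
+ * assumed to exist already.
+ */
+#if 0
+static cudaGraphNode_t addWaitNode(cudaGraph_t graph, cudaExternalSemaphore_t sem,
+                                   unsigned long long value) {
+    cudaExternalSemaphoreWaitParams waitParams = {};
+    waitParams.params.fence.value = value;
+
+    cudaExternalSemaphoreWaitNodeParams nodeParams = {};
+    nodeParams.extSemArray = &sem;        /* one semaphore ... */
+    nodeParams.paramsArray = &waitParams; /* ... with one set of wait parameters */
+    nodeParams.numExtSems = 1;
+
+    cudaGraphNode_t node = nullptr;
+    cudaGraphAddExternalSemaphoresWaitNode(&node, graph, nullptr, 0, &nodeParams);
+    return node;
+}
+#endif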
+
+/**
+* CUDA Graph node types
+*/
+enum __device_builtin__ cudaGraphNodeType {
+    cudaGraphNodeTypeKernel      = 0x00, /**< GPU kernel node */
+    cudaGraphNodeTypeMemcpy      = 0x01, /**< Memcpy node */
+    cudaGraphNodeTypeMemset      = 0x02, /**< Memset node */
+    cudaGraphNodeTypeHost        = 0x03, /**< Host (executable) node */
+    cudaGraphNodeTypeGraph       = 0x04, /**< Node which executes an embedded graph */
+    cudaGraphNodeTypeEmpty       = 0x05, /**< Empty (no-op) node */
+    cudaGraphNodeTypeWaitEvent   = 0x06, /**< External event wait node */
+    cudaGraphNodeTypeEventRecord = 0x07, /**< External event record node */
+    cudaGraphNodeTypeExtSemaphoreSignal = 0x08, /**< External semaphore signal node */
+    cudaGraphNodeTypeExtSemaphoreWait = 0x09, /**< External semaphore wait node */
+    cudaGraphNodeTypeMemAlloc    = 0x0a, /**< Memory allocation node */
+    cudaGraphNodeTypeMemFree     = 0x0b, /**< Memory free node */
+    cudaGraphNodeTypeCount
+};
+
+/**
+ * CUDA executable (launchable) graph
+ */
+typedef struct CUgraphExec_st* cudaGraphExec_t;
+
+/**
+* CUDA Graph Update error types
+*/
+enum __device_builtin__ cudaGraphExecUpdateResult {
+    cudaGraphExecUpdateSuccess                = 0x0, /**< The update succeeded */
+    cudaGraphExecUpdateError                  = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */
+    cudaGraphExecUpdateErrorTopologyChanged   = 0x2, /**< The update failed because the topology changed */
+    cudaGraphExecUpdateErrorNodeTypeChanged   = 0x3, /**< The update failed because a node type changed */
+    cudaGraphExecUpdateErrorFunctionChanged   = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */
+    cudaGraphExecUpdateErrorParametersChanged = 0x5, /**< The update failed because the parameters changed in a way that is not supported */
+    cudaGraphExecUpdateErrorNotSupported      = 0x6, /**< The update failed because something about the node is not supported */
+    cudaGraphExecUpdateErrorUnsupportedFunctionChange = 0x7, /**< The update failed because the function of a kernel node changed in an unsupported way */
+    cudaGraphExecUpdateErrorAttributesChanged = 0x8 /**< The update failed because the node attributes changed in a way that is not supported */
+};
+
+/**
+ * Flags to specify search options to be used with ::cudaGetDriverEntryPoint
+ * For more details see ::cuGetProcAddress
+ */ 
+enum __device_builtin__ cudaGetDriverEntryPointFlags {
+    cudaEnableDefault                = 0x0, /**< Default search mode for driver symbols. */
+    cudaEnableLegacyStream           = 0x1, /**< Search for legacy versions of driver symbols. */
+    cudaEnablePerThreadDefaultStream = 0x2  /**< Search for per-thread versions of driver symbols. */
+};
+
+/**
+ * CUDA Graph debug write options
+ */
+enum __device_builtin__ cudaGraphDebugDotFlags {
+    cudaGraphDebugDotFlagsVerbose                  = 1<<0,  /**< Output all debug data as if every debug flag is enabled */
+    cudaGraphDebugDotFlagsKernelNodeParams         = 1<<2,  /**< Adds cudaKernelNodeParams to output */
+    cudaGraphDebugDotFlagsMemcpyNodeParams         = 1<<3,  /**< Adds cudaMemcpy3DParms to output */
+    cudaGraphDebugDotFlagsMemsetNodeParams         = 1<<4,  /**< Adds cudaMemsetParams to output */
+    cudaGraphDebugDotFlagsHostNodeParams           = 1<<5,  /**< Adds cudaHostNodeParams to output */
+    cudaGraphDebugDotFlagsEventNodeParams          = 1<<6,  /**< Adds cudaEvent_t handle from record and wait nodes to output */
+    cudaGraphDebugDotFlagsExtSemasSignalNodeParams = 1<<7,  /**< Adds cudaExternalSemaphoreSignalNodeParams values to output */
+    cudaGraphDebugDotFlagsExtSemasWaitNodeParams   = 1<<8,  /**< Adds cudaExternalSemaphoreWaitNodeParams to output */
+    cudaGraphDebugDotFlagsKernelNodeAttributes     = 1<<9,  /**< Adds cudaKernelNodeAttrID values to output */
+    cudaGraphDebugDotFlagsHandles                  = 1<<10  /**< Adds node handles and every kernel function handle to output */
+};
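+
+/*
+ * Illustrative sketch, not part of the original NVIDIA header: these flags are
+ * passed to ::cudaGraphDebugDotPrint (declared in cuda_runtime_api.h), which
+ * writes a DOT description of a graph; the output path here is an assumption.
+ */
+#if 0
+static void dumpGraph(cudaGraph_t graph) {
+    cudaGraphDebugDotPrint(graph, "graph_debug.dot",
+                           cudaGraphDebugDotFlagsKernelNodeParams |
+                           cudaGraphDebugDotFlagsKernelNodeAttributes);
+}
+#endif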
+
+/**
+ * Flags for instantiating a graph
+ */
+enum __device_builtin__ cudaGraphInstantiateFlags {
+    cudaGraphInstantiateFlagAutoFreeOnLaunch = 1 /**< Automatically free memory allocated in a graph before relaunching. */
+  , cudaGraphInstantiateFlagUseNodePriority  = 8 /**< Run the graph using the per-node priority attributes rather than the
+                                                      priority of the stream it is launched into. */
+};
+
+/**
+ * Launch attributes enum; used as id field of ::cudaLaunchAttribute
+ */
+typedef __device_builtin__ enum cudaLaunchAttributeID {
+    cudaLaunchAttributeIgnore                = 0 /**< Ignored entry, for convenient composition */
+  , cudaLaunchAttributeAccessPolicyWindow    = 1 /**< Valid for streams, graph nodes, launches. */
+  , cudaLaunchAttributeCooperative           = 2 /**< Valid for graph nodes, launches. */
+  , cudaLaunchAttributeSynchronizationPolicy = 3 /**< Valid for streams. */
+  , cudaLaunchAttributeClusterDimension                  = 4 /**< Valid for graph nodes, launches. */
+  , cudaLaunchAttributeClusterSchedulingPolicyPreference = 5 /**< Valid for graph nodes, launches. */
+  , cudaLaunchAttributeProgrammaticStreamSerialization   = 6 /**< Valid for launches. Setting
+                                                                  programmaticStreamSerializationAllowed to non-0
+                                                                  signals that the kernel will use programmatic
+                                                                  means to resolve its stream dependency, so that
+                                                                  the CUDA runtime should opportunistically allow
+                                                                  the grid's execution to overlap with the previous
+                                                                  kernel in the stream, if that kernel requests the
+                                                                  overlap. */
+  , cudaLaunchAttributeProgrammaticEvent                 = 7 /**< Valid for launches. Event recorded through this launch
+                                                                  attribute is guaranteed to only trigger after all
+                                                                  blocks in the associated kernel trigger the event. A
+                                                                  block can trigger the event through PTX
+                                                                  griddepcontrol.launch_dependents. A trigger can also
+                                                                  be inserted at the beginning of each block's execution
+                                                                  if triggerAtBlockStart is set to non-0. Note that
+                                                                  dependents (including the CPU thread calling
+                                                                  cudaEventSynchronize()) are not guaranteed to observe
+                                                                  the release precisely when it is released. For
+                                                                  example, cudaEventSynchronize() may only observe the
+                                                                  event trigger long after the associated kernel has
+                                                                  completed. This recording type is primarily meant for
+                                                                  establishing programmatic dependency between device
+                                                                  tasks. The event supplied must not be an interprocess
+                                                                  or interop event. The event must disable timing
+                                                                  (i.e. created with ::cudaEventDisableTiming flag
+                                                                  set). */
+  , cudaLaunchAttributePriority              = 8 /**< Valid for graph nodes. */
+} cudaLaunchAttributeID;
+
+/**
+ * Launch attributes union; used as value field of ::cudaLaunchAttribute
+ */
+typedef __device_builtin__ union cudaLaunchAttributeValue {
+    char pad[64]; /* Pad to 64 bytes */
+    struct cudaAccessPolicyWindow accessPolicyWindow;
+    int cooperative;
+    enum cudaSynchronizationPolicy syncPolicy;
+    struct {
+        unsigned int x;
+        unsigned int y;
+        unsigned int z;
+    } clusterDim;
+    enum cudaClusterSchedulingPolicy clusterSchedulingPolicyPreference;
+    int programmaticStreamSerializationAllowed;
+    struct {
+        cudaEvent_t event;
+        int flags;
+        int triggerAtBlockStart;
+    } programmaticEvent;
+    int priority;
+} cudaLaunchAttributeValue;
+
+/**
+ * Launch attribute
+ */
+typedef __device_builtin__ struct cudaLaunchAttribute_st {
+    cudaLaunchAttributeID id;
+    char pad[8 - sizeof(cudaLaunchAttributeID)];
+    cudaLaunchAttributeValue val;
+} cudaLaunchAttribute;
+
+/**
+ * CUDA extensible launch configuration
+ */
+typedef __device_builtin__ struct cudaLaunchConfig_st {
+    dim3 gridDim;               /**< Grid dimensions */
+    dim3 blockDim;              /**< Block dimensions */
+    size_t dynamicSmemBytes;    /**< Dynamic shared-memory size per thread block in bytes */
+    cudaStream_t stream;        /**< Stream identifier */
+    cudaLaunchAttribute *attrs; /**< nullable if numAttrs == 0 */
+    unsigned int numAttrs;      /**< Number of attributes populated in attrs */
+} cudaLaunchConfig_t;
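+
+/*
+ * Illustrative sketch, not part of the original NVIDIA header: launching a
+ * kernel with a per-launch attribute through the extensible-launch entry point
+ * ::cudaLaunchKernelExC (available from CUDA 11.8; the templated
+ * ::cudaLaunchKernelEx wrapper lives in cuda_runtime.h). Kernel symbol, stream
+ * and priority value are assumptions made for the example.
+ */
+#if 0
+__global__ void fillKernel(float* data, float value);
+
+static void launchWithPriority(float* data, float value, cudaStream_t stream) {
+    cudaLaunchAttribute attr = {};
+    attr.id = cudaLaunchAttributePriority;
+    attr.val.priority = 1;
+
+    cudaLaunchConfig_t config = {};
+    config.gridDim = dim3(64, 1, 1);
+    config.blockDim = dim3(256, 1, 1);
+    config.dynamicSmemBytes = 0;
+    config.stream = stream;
+    config.attrs = &attr;
+    config.numAttrs = 1;
+
+    void* args[] = { &data, &value };
+    cudaLaunchKernelExC(&config, (const void*)fillKernel, args);
+}
+#endif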
+
+/**
+ * Stream Attributes
+ */
+#define cudaStreamAttrID cudaLaunchAttributeID
+#define cudaStreamAttributeAccessPolicyWindow    cudaLaunchAttributeAccessPolicyWindow
+#define cudaStreamAttributeSynchronizationPolicy cudaLaunchAttributeSynchronizationPolicy
+
+/**
+ * Stream attributes union used with ::cudaStreamSetAttribute/::cudaStreamGetAttribute
+ */
+#define cudaStreamAttrValue cudaLaunchAttributeValue
+
+/**
+ * Graph kernel node Attributes
+ */
+#define cudaKernelNodeAttrID cudaLaunchAttributeID
+#define cudaKernelNodeAttributeAccessPolicyWindow cudaLaunchAttributeAccessPolicyWindow
+#define cudaKernelNodeAttributeCooperative        cudaLaunchAttributeCooperative
+#define cudaKernelNodeAttributePriority           cudaLaunchAttributePriority
+#define cudaKernelNodeAttributeClusterDimension                     cudaLaunchAttributeClusterDimension
+#define cudaKernelNodeAttributeClusterSchedulingPolicyPreference    cudaLaunchAttributeClusterSchedulingPolicyPreference
+
+/**
+ * Graph kernel node attributes union, used with ::cudaGraphKernelNodeSetAttribute/::cudaGraphKernelNodeGetAttribute
+ */
+#define cudaKernelNodeAttrValue cudaLaunchAttributeValue
+
+/** @} */
+/** @} */ /* END CUDART_TYPES */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__
+#endif
+
+#undef __CUDA_DEPRECATED
+
+#endif /* !__DRIVER_TYPES_H__ */
diff --git a/ext/cudart/include/host_config.h b/ext/cudart/include/host_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..785bec4e5c0652f9605ccf9341b7f761a85471ab
--- /dev/null
+++ b/ext/cudart/include/host_config.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("host_config.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "host_config.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
+#endif
+
+#include "crt/host_config.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
+#endif
diff --git a/ext/cudart/include/host_defines.h b/ext/cudart/include/host_defines.h
new file mode 100644
index 0000000000000000000000000000000000000000..98a9c98a957e8f60e872b94fde762516c5523367
--- /dev/null
+++ b/ext/cudart/include/host_defines.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("host_defines.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "host_defines.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
+#endif
+
+#include "crt/host_defines.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
+#endif
diff --git a/ext/cudart/include/library_types.h b/ext/cudart/include/library_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a7e42c6b89ba4b446d4cf3d52c8bacd74e73b0d
--- /dev/null
+++ b/ext/cudart/include/library_types.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__LIBRARY_TYPES_H__)
+#define __LIBRARY_TYPES_H__
+
+
+
+typedef enum cudaDataType_t
+{
+    CUDA_R_16F  =  2, /* real as a half */
+    CUDA_C_16F  =  6, /* complex as a pair of half numbers */
+    CUDA_R_16BF = 14, /* real as a nv_bfloat16 */
+    CUDA_C_16BF = 15, /* complex as a pair of nv_bfloat16 numbers */
+    CUDA_R_32F  =  0, /* real as a float */
+    CUDA_C_32F  =  4, /* complex as a pair of float numbers */
+    CUDA_R_64F  =  1, /* real as a double */
+    CUDA_C_64F  =  5, /* complex as a pair of double numbers */
+    CUDA_R_4I   = 16, /* real as a signed 4-bit int */
+    CUDA_C_4I   = 17, /* complex as a pair of signed 4-bit int numbers */
+    CUDA_R_4U   = 18, /* real as an unsigned 4-bit int */
+    CUDA_C_4U   = 19, /* complex as a pair of unsigned 4-bit int numbers */
+    CUDA_R_8I   =  3, /* real as a signed 8-bit int */
+    CUDA_C_8I   =  7, /* complex as a pair of signed 8-bit int numbers */
+    CUDA_R_8U   =  8, /* real as an unsigned 8-bit int */
+    CUDA_C_8U   =  9, /* complex as a pair of unsigned 8-bit int numbers */
+    CUDA_R_16I  = 20, /* real as a signed 16-bit int */
+    CUDA_C_16I  = 21, /* complex as a pair of signed 16-bit int numbers */
+    CUDA_R_16U  = 22, /* real as an unsigned 16-bit int */
+    CUDA_C_16U  = 23, /* complex as a pair of unsigned 16-bit int numbers */
+    CUDA_R_32I  = 10, /* real as a signed 32-bit int */
+    CUDA_C_32I  = 11, /* complex as a pair of signed 32-bit int numbers */
+    CUDA_R_32U  = 12, /* real as an unsigned 32-bit int */
+    CUDA_C_32U  = 13, /* complex as a pair of unsigned 32-bit int numbers */
+    CUDA_R_64I  = 24, /* real as a signed 64-bit int */
+    CUDA_C_64I  = 25, /* complex as a pair of signed 64-bit int numbers */
+    CUDA_R_64U  = 26, /* real as an unsigned 64-bit int */
+    CUDA_C_64U  = 27, /* complex as a pair of unsigned 64-bit int numbers */
+    CUDA_R_8F_E4M3 = 28, /* real as a nv_fp8_e4m3 */
+    CUDA_R_8F_E5M2 = 29, /* real as a nv_fp8_e5m2 */
+} cudaDataType;
+
+
+typedef enum libraryPropertyType_t
+{
+    MAJOR_VERSION,
+    MINOR_VERSION,
+    PATCH_LEVEL
+} libraryPropertyType;
+
+
+#ifndef __cplusplus
+typedef enum cudaDataType_t cudaDataType_t;
+typedef enum libraryPropertyType_t libraryPropertyType_t;
+#endif
+
+#endif /* !__LIBRARY_TYPES_H__ */
diff --git a/ext/cudart/include/math_constants.h b/ext/cudart/include/math_constants.h
new file mode 100644
index 0000000000000000000000000000000000000000..39937e980f88a614d847154f9e4364bd9ba95cbd
--- /dev/null
+++ b/ext/cudart/include/math_constants.h
@@ -0,0 +1,152 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__MATH_CONSTANTS_H__)
+#define __MATH_CONSTANTS_H__
+
+/* single precision constants */
+#define CUDART_INF_F            __int_as_float(0x7f800000U)
+#define CUDART_NAN_F            __int_as_float(0x7fffffffU)
+#define CUDART_MIN_DENORM_F     __int_as_float(0x00000001U)
+#define CUDART_MAX_NORMAL_F     __int_as_float(0x7f7fffffU)
+#define CUDART_NEG_ZERO_F       __int_as_float(0x80000000U)
+#define CUDART_ZERO_F           0.0F
+#define CUDART_ONE_F            1.0F
+#define CUDART_SQRT_HALF_F      0.707106781F
+#define CUDART_SQRT_HALF_HI_F   0.707106781F
+#define CUDART_SQRT_HALF_LO_F   1.210161749e-08F
+#define CUDART_SQRT_TWO_F       1.414213562F
+#define CUDART_THIRD_F          0.333333333F
+#define CUDART_PIO4_F           0.785398163F
+#define CUDART_PIO2_F           1.570796327F
+#define CUDART_3PIO4_F          2.356194490F
+#define CUDART_2_OVER_PI_F      0.636619772F
+#define CUDART_SQRT_2_OVER_PI_F 0.797884561F
+#define CUDART_PI_F             3.141592654F
+#define CUDART_L2E_F            1.442695041F
+#define CUDART_L2T_F            3.321928094F
+#define CUDART_LG2_F            0.301029996F
+#define CUDART_LGE_F            0.434294482F
+#define CUDART_LN2_F            0.693147181F
+#define CUDART_LNT_F            2.302585093F
+#define CUDART_LNPI_F           1.144729886F
+#define CUDART_TWO_TO_M126_F    1.175494351e-38F
+#define CUDART_TWO_TO_126_F     8.507059173e37F
+#define CUDART_NORM_HUGE_F      3.402823466e38F
+#define CUDART_TWO_TO_23_F      8388608.0F
+#define CUDART_TWO_TO_24_F      16777216.0F
+#define CUDART_TWO_TO_31_F      2147483648.0F
+#define CUDART_TWO_TO_32_F      4294967296.0F
+#define CUDART_REMQUO_BITS_F    3U
+#define CUDART_REMQUO_MASK_F    (~((~0U)<<CUDART_REMQUO_BITS_F))
+#define CUDART_TRIG_PLOSS_F     105615.0F
+
+/* double precision constants */
+#define CUDART_INF              __longlong_as_double(0x7ff0000000000000ULL)
+#define CUDART_NAN              __longlong_as_double(0xfff8000000000000ULL)
+#define CUDART_NEG_ZERO         __longlong_as_double(0x8000000000000000ULL)
+#define CUDART_MIN_DENORM       __longlong_as_double(0x0000000000000001ULL)
+#define CUDART_ZERO             0.0
+#define CUDART_ONE              1.0
+#define CUDART_SQRT_TWO         1.4142135623730951e+0
+#define CUDART_SQRT_HALF        7.0710678118654757e-1
+#define CUDART_SQRT_HALF_HI     7.0710678118654757e-1
+#define CUDART_SQRT_HALF_LO   (-4.8336466567264567e-17)
+#define CUDART_THIRD            3.3333333333333333e-1
+#define CUDART_TWOTHIRD         6.6666666666666667e-1
+#define CUDART_PIO4             7.8539816339744828e-1
+#define CUDART_PIO4_HI          7.8539816339744828e-1
+#define CUDART_PIO4_LO          3.0616169978683830e-17
+#define CUDART_PIO2             1.5707963267948966e+0
+#define CUDART_PIO2_HI          1.5707963267948966e+0
+#define CUDART_PIO2_LO          6.1232339957367660e-17
+#define CUDART_3PIO4            2.3561944901923448e+0
+#define CUDART_2_OVER_PI        6.3661977236758138e-1
+#define CUDART_PI               3.1415926535897931e+0
+#define CUDART_PI_HI            3.1415926535897931e+0
+#define CUDART_PI_LO            1.2246467991473532e-16
+#define CUDART_SQRT_2PI         2.5066282746310007e+0
+#define CUDART_SQRT_2PI_HI      2.5066282746310007e+0
+#define CUDART_SQRT_2PI_LO    (-1.8328579980459167e-16)
+#define CUDART_SQRT_PIO2        1.2533141373155003e+0
+#define CUDART_SQRT_PIO2_HI     1.2533141373155003e+0
+#define CUDART_SQRT_PIO2_LO   (-9.1642899902295834e-17)
+#define CUDART_SQRT_2OPI        7.9788456080286536e-1
+#define CUDART_L2E              1.4426950408889634e+0
+#define CUDART_L2E_HI           1.4426950408889634e+0
+#define CUDART_L2E_LO           2.0355273740931033e-17
+#define CUDART_L2T              3.3219280948873622e+0
+#define CUDART_LG2              3.0102999566398120e-1
+#define CUDART_LG2_HI           3.0102999566398120e-1
+#define CUDART_LG2_LO         (-2.8037281277851704e-18)
+#define CUDART_LGE              4.3429448190325182e-1
+#define CUDART_LGE_HI           4.3429448190325182e-1
+#define CUDART_LGE_LO           1.09831965021676510e-17
+#define CUDART_LN2              6.9314718055994529e-1
+#define CUDART_LN2_HI           6.9314718055994529e-1
+#define CUDART_LN2_LO           2.3190468138462996e-17
+#define CUDART_LNT              2.3025850929940459e+0
+#define CUDART_LNT_HI           2.3025850929940459e+0
+#define CUDART_LNT_LO         (-2.1707562233822494e-16)
+#define CUDART_LNPI             1.1447298858494002e+0
+#define CUDART_LN2_X_1024       7.0978271289338397e+2
+#define CUDART_LN2_X_1025       7.1047586007394398e+2
+#define CUDART_LN2_X_1075       7.4513321910194122e+2
+#define CUDART_LG2_X_1024       3.0825471555991675e+2
+#define CUDART_LG2_X_1075       3.2360724533877976e+2
+#define CUDART_TWO_TO_23        8388608.0
+#define CUDART_TWO_TO_52        4503599627370496.0
+#define CUDART_TWO_TO_53        9007199254740992.0
+#define CUDART_TWO_TO_54        18014398509481984.0
+#define CUDART_TWO_TO_M54       5.5511151231257827e-17
+#define CUDART_TWO_TO_M1022     2.22507385850720140e-308
+#define CUDART_TRIG_PLOSS       2147483648.0
+#define CUDART_DBL2INT_CVT      6755399441055744.0
+
+#endif /* !__MATH_CONSTANTS_H__ */
diff --git a/ext/cudart/include/math_functions.h b/ext/cudart/include/math_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc806976784e494edc905d8b8bd9ad138054bbea
--- /dev/null
+++ b/ext/cudart/include/math_functions.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("math_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "math_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__
+#endif
+
+#include "crt/math_functions.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__
+#endif
diff --git a/ext/cudart/include/mma.h b/ext/cudart/include/mma.h
new file mode 100644
index 0000000000000000000000000000000000000000..9f36f671c0b3a4e95cbb7bddbe41e75ac783b722
--- /dev/null
+++ b/ext/cudart/include/mma.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
+#endif
+
+#include "crt/mma.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
+#endif
diff --git a/ext/cudart/include/nvfunctional b/ext/cudart/include/nvfunctional
new file mode 100644
index 0000000000000000000000000000000000000000..4fdeeecf6b63f92c5d684a03bb461cd935c0fd35
--- /dev/null
+++ b/ext/cudart/include/nvfunctional
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2014-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H_WRAPPER__
+#endif
+
+#include "crt/nvfunctional"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H_WRAPPER__
+#endif
diff --git a/ext/cudart/include/sm_20_atomic_functions.h b/ext/cudart/include/sm_20_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..12b74c94deef2bdea5bd14c9247814427308870b
--- /dev/null
+++ b/ext/cudart/include/sm_20_atomic_functions.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_20_ATOMIC_FUNCTIONS_H__)
+#define __SM_20_ATOMIC_FUNCTIONS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#ifndef __CUDA_ARCH__
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+
+
+#ifdef __CUDA_ARCH__
+extern "C"
+{
+extern __device__ __device_builtin__ float __fAtomicAdd(float *address, float val);
+}
+#endif /* __CUDA_ARCH__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val) __DEF_IF_HOST
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_20_ATOMIC_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_20_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
+
+#endif /* !__SM_20_ATOMIC_FUNCTIONS_H__ */
diff --git a/ext/cudart/include/sm_20_atomic_functions.hpp b/ext/cudart/include/sm_20_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ac4aa9bfc6b8d5d4d240e05a2fd557889f30c47f
--- /dev/null
+++ b/ext/cudart/include/sm_20_atomic_functions.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_20_ATOMIC_FUNCTIONS_HPP__)
+#define __SM_20_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val)
+{
+  return __fAtomicAdd(address, val);
+}
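+
+/*
+ * Illustrative sketch (not part of the original NVIDIA header; guarded out of
+ * compilation): the atomicAdd() above is a global-memory read-modify-write,
+ * so a common pattern is to reduce within the block first and issue a single
+ * atomicAdd() per block.  Names are hypothetical; BLOCK must match the launch
+ * configuration and be a power of two.
+ */
+#if 0
+#define BLOCK 256
+__global__ void block_reduce_add(const float *in, float *total, int n)
+{
+    __shared__ float s[BLOCK];
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    s[threadIdx.x] = (i < n) ? in[i] : 0.0f;
+    __syncthreads();
+    for (int stride = BLOCK / 2; stride > 0; stride >>= 1) {
+        if (threadIdx.x < stride)
+            s[threadIdx.x] += s[threadIdx.x + stride];
+        __syncthreads();
+    }
+    if (threadIdx.x == 0)
+        atomicAdd(total, s[0]);   /* one contended update per block */
+}
+#endif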
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_20_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__SM_20_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/ext/cudart/include/sm_20_intrinsics.h b/ext/cudart/include/sm_20_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..6965e3ef357042574a411a88fa4f88ae72487618
--- /dev/null
+++ b/ext/cudart/include/sm_20_intrinsics.h
@@ -0,0 +1,1551 @@
+/*
+ * Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_20_INTRINSICS_H__)
+#define __SM_20_INTRINSICS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_20_INTRINSICS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_20_INTRINSICS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#ifndef __CUDA_ARCH__
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+
+#if defined(_WIN32)
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync()."\
+    "To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70."
+#else
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+#endif
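+
+/*
+ * Illustrative sketch of the migration the message above describes (not part
+ * of the original NVIDIA header; guarded out of compilation).  The *_sync
+ * forms take an explicit participation mask; 0xffffffff assumes the full warp
+ * is converged at the call site.  The function name is hypothetical.
+ */
+#if 0
+__device__ unsigned int count_positive_in_warp(float v)
+{
+    /* Deprecated (warns on compute_60+, invalid on compute_70+): */
+    /* unsigned int mask = __ballot(v > 0.0f); */
+
+    /* Preferred replacement: */
+    unsigned int mask = __ballot_sync(0xffffffffu, v > 0.0f);
+    return __popc(mask);
+}
+#endif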
+
+extern "C"
+{
+extern __device__ __device_builtin__ void                   __threadfence_system(void);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Divide two floating-point values in round-to-nearest-even mode.
+ *
+ * Divides two floating-point values \p x by \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x / \p y.
+ *
+ * \note_accuracy_double
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __ddiv_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Divide two floating-point values in round-towards-zero mode.
+ *
+ * Divides two floating-point values \p x by \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x / \p y.
+ *
+ * \note_accuracy_double
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __ddiv_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Divide two floating-point values in round-up mode.
+ * 
+ * Divides two floating-point values \p x by \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x / \p y.
+ *
+ * \note_accuracy_double
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __ddiv_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Divide two floating-point values in round-down mode.
+ *
+ * Divides two floating-point values \p x by \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x / \p y.
+ *
+ * \note_accuracy_double
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __ddiv_rd(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-to-nearest-even mode.
+ * 
+ * Compute the reciprocal of \p x in round-to-nearest-even mode.
+ *
+ * \return Returns 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __drcp_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-towards-zero mode.
+ *
+ * Compute the reciprocal of \p x in round-towards-zero mode.
+ *
+ * \return Returns 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __drcp_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-up mode.
+ * 
+ * Compute the reciprocal of \p x in round-up (to positive infinity) mode.
+ *
+ * \return Returns 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __drcp_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-down mode.
+ * 
+ * Compute the reciprocal of \p x in round-down (to negative infinity) mode.
+ *
+ * \return Returns 
+ * \latexonly $\frac{1}{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mfrac>
+ *     <m:mn>1</m:mn>
+ *     <m:mi>x</m:mi>
+ *   </m:mfrac>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __drcp_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-to-nearest-even mode.
+ * 
+ * Compute the square root of \p x in round-to-nearest-even mode.
+ *
+ * \return Returns 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __dsqrt_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-towards-zero mode.
+ * 
+ * Compute the square root of \p x in round-towards-zero mode.
+ *
+ * \return Returns 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __dsqrt_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-up mode.
+ * 
+ * Compute the square root of \p x in round-up (to positive infinity) mode.
+ *
+ * \return Returns 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __dsqrt_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  in round-down mode.
+ * 
+ * Compute the square root of \p x in round-down (to negative infinity) mode.
+ *
+ * \return Returns 
+ * \latexonly $\sqrt{x}$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:msqrt>
+ *     <m:mi>x</m:mi>
+ *   </m:msqrt>
+ * </m:math>
+ * </d4p_MathML>\endxmlonly.
+ *
+ * \note_accuracy_double
+ * \note_requires_fermi
+ */
+extern __device__ __device_builtin__ double                __dsqrt_rd(double x);
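+
+/*
+ * Illustrative sketch (not part of the original NVIDIA header; guarded out of
+ * compilation): pairing the _rd and _ru variants gives correctly-rounded
+ * lower and upper bounds, the building block of interval arithmetic.  The
+ * struct and function names are hypothetical.
+ */
+#if 0
+struct dinterval { double lo, hi; };
+
+__device__ dinterval rcp_interval(double x)   /* assumes x > 0 */
+{
+    dinterval r;
+    r.lo = __drcp_rd(x);   /* rounded toward -infinity: guaranteed <= 1/x */
+    r.hi = __drcp_ru(x);   /* rounded toward +infinity: guaranteed >= 1/x */
+    return r;
+}
+
+__device__ dinterval sqrt_interval(double x)  /* assumes x >= 0 */
+{
+    dinterval r;
+    r.lo = __dsqrt_rd(x);
+    r.hi = __dsqrt_ru(x);
+    return r;
+}
+#endif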
+extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__ballot)) unsigned int __ballot(int);
+extern __device__ __device_builtin__ int                   __syncthreads_count(int);
+extern __device__ __device_builtin__ int                   __syncthreads_and(int);
+extern __device__ __device_builtin__ int                   __syncthreads_or(int);
+extern __device__ __device_builtin__ long long int         clock64(void);
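+
+/*
+ * Illustrative sketch (not part of the original NVIDIA header; guarded out of
+ * compilation): __syncthreads_count() both synchronizes the block and returns,
+ * to every thread, how many threads passed a non-zero predicate.  Names are
+ * hypothetical.
+ */
+#if 0
+__global__ void converge_check(const float *residual, int *done, float eps, int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    /* Out-of-range threads count as converged so the tail block can finish. */
+    int converged = (i < n) ? (fabsf(residual[i]) < eps) : 1;
+
+    /* Every thread of the block receives the same count. */
+    int num_converged = __syncthreads_count(converged);
+    if (threadIdx.x == 0 && num_converged == blockDim.x)
+        done[blockIdx.x] = 1;
+}
+#endif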
+
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute fused multiply-add operation in round-to-nearest-even mode, ignore \p -ftz=true compiler flag
+ *
+ * Behavior is the same as ::__fmaf_rn(\p x, \p y, \p z), the difference is in
+ * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
+ */
+extern __device__ __device_builtin__ float                  __fmaf_ieee_rn(float x, float y, float z);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute fused multiply-add operation in round-down mode, ignore \p -ftz=true compiler flag
+ *
+ * Behavior is the same as ::__fmaf_rd(\p x, \p y, \p z), the difference is in
+ * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
+ */
+extern __device__ __device_builtin__ float                  __fmaf_ieee_rd(float x, float y, float z);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute fused multiply-add operation in round-up mode, ignore \p -ftz=true compiler flag
+ *
+ * Behavior is the same as ::__fmaf_ru(\p x, \p y, \p z), the difference is in
+ * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
+ */
+extern __device__ __device_builtin__ float                  __fmaf_ieee_ru(float x, float y, float z);
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_SINGLE
+ * \brief Compute fused multiply-add operation in round-towards-zero mode, ignore \p -ftz=true compiler flag
+ *
+ * Behavior is the same as ::__fmaf_rz(\p x, \p y, \p z), the difference is in
+ * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect.
+ */
+extern __device__ __device_builtin__ float                  __fmaf_ieee_rz(float x, float y, float z);
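+
+/*
+ * Illustrative sketch (not part of the original NVIDIA header; guarded out of
+ * compilation): with -ftz=true, __fmaf_rn() flushes a denormal result to
+ * zero, while __fmaf_ieee_rn() preserves it.  Names are hypothetical.
+ */
+#if 0
+__global__ void denormal_probe(float *out)
+{
+    float tiny = 1.0e-20f;   /* tiny*tiny is about 1e-40, a float denormal */
+    out[0] = __fmaf_rn(tiny, tiny, 0.0f);      /* 0.0f under -ftz=true */
+    out[1] = __fmaf_ieee_rn(tiny, tiny, 0.0f); /* keeps the denormal result */
+}
+#endif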
+
+
+// SM_13 intrinsics
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a double as a 64-bit signed integer.
+ *
+ * Reinterpret the bits in the double-precision floating-point value \p x
+ * as a signed 64-bit integer.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ long long int         __double_as_longlong(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret bits in a 64-bit signed integer as a double.
+ *
+ * Reinterpret the bits in the 64-bit signed integer value \p x as
+ * a double-precision floating-point value.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ double                __longlong_as_double(long long int x);
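+
+/*
+ * Illustrative sketch (not part of the original NVIDIA header; guarded out of
+ * compilation): the classic use of these two casts is building a double-
+ * precision atomicAdd out of a 64-bit atomicCAS on devices without native
+ * double atomics.  The function name is hypothetical.
+ */
+#if 0
+__device__ double atomic_add_double(double *address, double val)
+{
+    unsigned long long int *addr_as_ull = (unsigned long long int *)address;
+    unsigned long long int old = *addr_as_ull, assumed;
+    do {
+        assumed = old;
+        /* Reinterpret, add, reinterpret back, and publish only if *address
+           still holds the value we read. */
+        old = atomicCAS(addr_as_ull, assumed,
+                        (unsigned long long int)__double_as_longlong(
+                            val + __longlong_as_double((long long int)assumed)));
+    } while (assumed != old);
+    return __longlong_as_double((long long int)old);
+}
+#endif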
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation in round-to-nearest-even mode.
+ *
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-to-nearest-even mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - fmaf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - fmaf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - fmaf(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ * - fmaf(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                __fma_rn(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation in round-towards-zero mode.
+ *
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-towards-zero mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - fmaf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - fmaf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - fmaf(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ * - fmaf(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                __fma_rz(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation in round-up mode.
+ *
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-up (to positive infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - fmaf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - fmaf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - fmaf(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ * - fmaf(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                __fma_ru(double x, double y, double z);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Compute 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation in round-down mode.
+ *
+ * Computes the value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single ternary operation, rounding the
+ * result once in round-down (to negative infinity) mode.
+ *
+ * \return Returns the rounded value of 
+ * \latexonly $x \times y + z$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ *   <m:mo>+</m:mo>
+ *   <m:mi>z</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  as a single operation.
+ * - fmaf(
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - fmaf(
+ * \latexonly $\pm 0$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>0</m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , 
+ * \latexonly $\pm \infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>&#x00B1;<!-- &PlusMinus; --></m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * , \p z) returns NaN.
+ * - fmaf(\p x, \p y, 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ * - fmaf(\p x, \p y, 
+ * \latexonly $+\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>+</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * ) returns NaN if
+ * \latexonly $x \times y$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mi>x</m:mi>
+ *   <m:mo>&#x00D7;<!-- &Multiply; --></m:mo>
+ *   <m:mi>y</m:mi>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ *  is an exact 
+ * \latexonly $-\infty$ \endlatexonly
+ * \xmlonly
+ * <d4p_MathML outputclass="xmlonly">
+ * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML">
+ *   <m:mo>-</m:mo>
+ *   <m:mn>&#x221E;<!-- &Infinity; --></m:mn>
+ * </m:math>
+ * </d4p_MathML>
+ * \endxmlonly
+ * .
+ *
+ * \note_accuracy_double
+ */
+extern __device__ __device_builtin__ double                __fma_rd(double x, double y, double z);
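+
+/*
+ * Illustrative sketch (not part of the original NVIDIA header; guarded out of
+ * compilation): because the product is rounded only once, __fma_rn() can
+ * recover the exact rounding error of a double multiplication (the "TwoProd"
+ * transformation).  Names are hypothetical.
+ */
+#if 0
+__device__ void two_prod(double x, double y, double *prod, double *err)
+{
+    double p = __dmul_rn(x, y);      /* rounded product */
+    *prod = p;
+    /* x*y - p evaluated with a single rounding: exactly the error of p,
+       barring overflow/underflow. */
+    *err = __fma_rn(x, y, -p);
+}
+#endif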
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-to-nearest-even mode.
+ *
+ * Adds two floating-point values \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dadd_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-towards-zero mode.
+ *
+ * Adds two floating-point values \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dadd_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-up mode.
+ * 
+ * Adds two floating-point values \p x and \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */ 
+extern __device__ __device_builtin__ double                __dadd_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Add two floating-point values in round-down mode.
+ *
+ * Adds two floating-point values \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x + \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dadd_rd(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-to-nearest-even mode.
+ *
+ * Subtracts two floating-point values \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dsub_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-towards-zero mode.
+ *
+ * Subtracts two floating-point values \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dsub_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-up mode.
+ * 
+ * Subtracts two floating-point values \p x and \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */ 
+extern __device__ __device_builtin__ double                __dsub_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Subtract two floating-point values in round-down mode.
+ *
+ * Subtracts two floating-point values \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x - \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dsub_rd(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-to-nearest-even mode.
+ *
+ * Multiplies two floating-point values \p x and \p y in round-to-nearest-even mode.
+ *
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_rn(double x, double y);
+/**      
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-towards-zero mode.
+ *
+ * Multiplies two floating-point values \p x and \p y in round-towards-zero mode.
+ *
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_rz(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-up mode.
+ * 
+ * Multiplies two floating-point values \p x and \p y in round-up (to positive infinity) mode.
+ *    
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_ru(double x, double y);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_DOUBLE
+ * \brief Multiply two floating-point values in round-down mode.
+ *
+ * Multiplies two floating-point values \p x and \p y in round-down (to negative infinity) mode.
+ *
+ * \return Returns \p x * \p y.
+ *
+ * \note_accuracy_double
+ * \note_nofma
+ */
+extern __device__ __device_builtin__ double                __dmul_rd(double x, double y);
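+
+/*
+ * Illustrative sketch (not part of the original NVIDIA header; guarded out of
+ * compilation): each of these intrinsics pins down one IEEE operation and,
+ * per the notes above, is never contracted into an FMA -- exactly what
+ * error-free transformations such as Knuth's TwoSum rely on.  Names are
+ * hypothetical.
+ */
+#if 0
+__device__ void two_sum(double a, double b, double *sum, double *err)
+{
+    double s  = __dadd_rn(a, b);
+    double bb = __dsub_rn(s, a);          /* the part of b that made it into s */
+    /* Rounding error of s, recovered exactly: (a - (s - bb)) + (b - bb) */
+    *err = __dadd_rn(__dsub_rn(a, __dsub_rn(s, bb)),
+                     __dsub_rn(b, bb));
+    *sum = s;
+}
+#endif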
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-towards-zero mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_rz(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a float in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to a single-precision
+ * floating-point value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ float                 __double2float_rd(double x);
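+
+/*
+ * Illustrative sketch (not part of the original NVIDIA header; guarded out of
+ * compilation): the directed conversions bracket a double between the two
+ * nearest representable floats, e.g. for conservative single-precision
+ * bounding boxes.  The function name is hypothetical.
+ */
+#if 0
+__device__ void float_bounds(double x, float *lo, float *hi)
+{
+    *lo = __double2float_rd(x);   /* largest float <= x */
+    *hi = __double2float_ru(x);   /* smallest float >= x */
+}
+#endif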
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ int                   __double2int_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ int                   __double2int_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ int                   __double2int_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned int          __double2uint_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned int          __double2uint_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned int          __double2uint_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ long long int          __double2ll_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ long long int          __double2ll_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to a signed 64-bit int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to a
+ * signed 64-bit integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ long long int          __double2ll_rd(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-to-nearest-even mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned long long int __double2ull_rn(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-up mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned long long int __double2ull_ru(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a double to an unsigned 64-bit int in round-down mode.
+ *
+ * Convert the double-precision floating-point value \p x to an
+ * unsigned 64-bit integer value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ unsigned long long int __double2ull_rd(double x);
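+
+/*
+ * Illustrative sketch (not part of the original NVIDIA header; guarded out of
+ * compilation): a plain C cast truncates toward zero, whereas
+ * __double2int_rn() rounds to nearest-even -- a frequent source of off-by-one
+ * surprises when binning.  The function name is hypothetical.
+ */
+#if 0
+__device__ int bin_index(double x, double bin_width)
+{
+    /* (int)(x / bin_width) would truncate toward zero; round-to-nearest-even
+       is usually what a centred binning scheme wants. */
+    return __double2int_rn(x / bin_width);
+}
+#endif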
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed int to a double.
+ *
+ * Convert the signed integer value \p x to a double-precision floating-point value.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __int2double_rn(int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned int to a double.
+ *
+ * Convert the unsigned integer value \p x to a double-precision floating-point value.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __uint2double_rn(unsigned int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-to-nearest-even mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_rn(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-towards-zero mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_rz(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-up mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_ru(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert a signed 64-bit int to a double in round-down mode.
+ *
+ * Convert the signed 64-bit integer value \p x to a double-precision floating-point
+ * value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ll2double_rd(long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-to-nearest-even mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-to-nearest-even mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_rn(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-towards-zero mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-towards-zero mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_rz(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-up mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-up (to positive infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_ru(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Convert an unsigned 64-bit int to a double in round-down mode.
+ *
+ * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point
+ * value in round-down (to negative infinity) mode.
+ * \return Returns converted value.
+ */
+extern __device__ __device_builtin__ double                 __ull2double_rd(unsigned long long int x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret high 32 bits in a double as a signed integer.
+ *
+ * Reinterpret the high 32 bits in the double-precision floating-point value \p x
+ * as a signed integer.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ int                    __double2hiint(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret low 32 bits in a double as a signed integer.
+ *
+ * Reinterpret the low 32 bits in the double-precision floating-point value \p x
+ * as a signed integer.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ int                    __double2loint(double x);
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_CAST
+ * \brief Reinterpret high and low 32-bit integer values as a double.
+ *
+ * Reinterpret the integer value of \p hi as the high 32 bits of a 
+ * double-precision floating-point value and the integer value of \p lo
+ * as the low 32 bits of the same double-precision floating-point value.
+ * \return Returns reinterpreted value.
+ */
+extern __device__ __device_builtin__ double                 __hiloint2double(int hi, int lo);
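+
+/*
+ * Illustrative sketch (not part of the original NVIDIA header; guarded out of
+ * compilation): __double2hiint/__double2loint/__hiloint2double split a double
+ * into two 32-bit halves and reassemble it, e.g. to move it through 32-bit
+ * warp shuffles.  (Recent CUDA versions let __shfl_sync() take a double
+ * directly; the split is shown only to illustrate these casts.)  The function
+ * name is hypothetical.
+ */
+#if 0
+__device__ double broadcast_double(double v, int src_lane)
+{
+    int hi = __double2hiint(v);
+    int lo = __double2loint(v);
+    hi = __shfl_sync(0xffffffffu, hi, src_lane);
+    lo = __shfl_sync(0xffffffffu, lo, src_lane);
+    return __hiloint2double(hi, lo);
+}
+#endif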
+
+
+}
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+__SM_20_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__ballot)) unsigned int ballot(bool pred) __DEF_IF_HOST
+
+__SM_20_INTRINSICS_DECL__ int syncthreads_count(bool pred) __DEF_IF_HOST
+
+__SM_20_INTRINSICS_DECL__ bool syncthreads_and(bool pred) __DEF_IF_HOST
+
+__SM_20_INTRINSICS_DECL__ bool syncthreads_or(bool pred) __DEF_IF_HOST
+
+#undef __DEPRECATED__
+#undef __WSB_DEPRECATION_MESSAGE
+
+__SM_20_INTRINSICS_DECL__ unsigned int __isGlobal(const void *ptr) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ unsigned int __isShared(const void *ptr) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ unsigned int __isConstant(const void *ptr) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ unsigned int __isLocal(const void *ptr) __DEF_IF_HOST
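+
+/*
+ * Illustrative sketch (not part of the original NVIDIA header; guarded out of
+ * compilation): the __is*() queries let code that receives a generic pointer
+ * pick a path by address space at run time.  The function name is
+ * hypothetical.
+ */
+#if 0
+__device__ void store_result(float *dst, float value)
+{
+    if (__isShared(dst)) {
+        /* dst is in the block's shared memory: visible after __syncthreads(). */
+        *dst = value;
+    } else if (__isGlobal(dst)) {
+        /* dst is in global memory: other blocks may need a fence to observe it. */
+        *dst = value;
+        __threadfence();
+    } else {
+        *dst = value;   /* some other address space, e.g. local */
+    }
+}
+#endif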
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+__SM_20_INTRINSICS_DECL__ unsigned int __isGridConstant(const void *ptr) __DEF_IF_HOST
+#endif  /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_global(const void *ptr) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_shared(const void *ptr) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_constant(const void *ptr) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_local(const void *ptr) __DEF_IF_HOST
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_grid_constant(const void *ptr) __DEF_IF_HOST
+#endif  /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
+
+__SM_20_INTRINSICS_DECL__ void * __cvta_global_to_generic(size_t rawbits) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ void * __cvta_shared_to_generic(size_t rawbits) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ void * __cvta_constant_to_generic(size_t rawbits) __DEF_IF_HOST
+__SM_20_INTRINSICS_DECL__ void * __cvta_local_to_generic(size_t rawbits) __DEF_IF_HOST
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+__SM_20_INTRINSICS_DECL__ void * __cvta_grid_constant_to_generic(size_t rawbits) __DEF_IF_HOST
+#endif  /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_20_INTRINSICS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_20_intrinsics.hpp"
+#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
+#endif /* !__SM_20_INTRINSICS_H__ */
diff --git a/ext/cudart/include/sm_20_intrinsics.hpp b/ext/cudart/include/sm_20_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..30c1ab99e0d66ebbceb8fe88b1122443cbf5f998
--- /dev/null
+++ b/ext/cudart/include/sm_20_intrinsics.hpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_20_INTRINSICS_HPP__)
+#define __SM_20_INTRINSICS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_20_INTRINSICS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_20_INTRINSICS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_20_INTRINSICS_DECL__ unsigned int ballot(bool pred)
+{
+  return __ballot((int)pred);
+}
+
+__SM_20_INTRINSICS_DECL__ int syncthreads_count(bool pred)
+{
+  return __syncthreads_count((int)pred);
+}
+
+__SM_20_INTRINSICS_DECL__ bool syncthreads_and(bool pred)
+{
+  return (bool)__syncthreads_and((int)pred);
+}
+
+__SM_20_INTRINSICS_DECL__ bool syncthreads_or(bool pred)
+{
+  return (bool)__syncthreads_or((int)pred);
+}
+
+
+extern "C" {
+  __device__ unsigned __nv_isGlobal_impl(const void *);
+  __device__ unsigned __nv_isShared_impl(const void *);
+  __device__ unsigned __nv_isConstant_impl(const void *);
+  __device__ unsigned __nv_isLocal_impl(const void *);
+  __device__ unsigned __nv_isGridConstant_impl(const void *);
+}
+
+__SM_20_INTRINSICS_DECL__ unsigned int __isGlobal(const void *ptr)
+{
+  return __nv_isGlobal_impl(ptr); 
+}
+
+__SM_20_INTRINSICS_DECL__ unsigned int __isShared(const void *ptr)
+{
+  return __nv_isShared_impl(ptr); 
+}
+
+__SM_20_INTRINSICS_DECL__ unsigned int __isConstant(const void *ptr)
+{
+  return __nv_isConstant_impl(ptr); 
+}
+
+__SM_20_INTRINSICS_DECL__ unsigned int __isLocal(const void *ptr)
+{
+  return __nv_isLocal_impl(ptr); 
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+__SM_20_INTRINSICS_DECL__ unsigned int __isGridConstant(const void *ptr)
+{
+  return __nv_isGridConstant_impl(ptr); 
+}
+#endif  /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
+
+extern "C" {
+  __device__ size_t __nv_cvta_generic_to_global_impl(const void *);
+  __device__ size_t __nv_cvta_generic_to_shared_impl(const void *);
+  __device__ size_t __nv_cvta_generic_to_constant_impl(const void *);
+  __device__ size_t __nv_cvta_generic_to_local_impl(const void *);
+  __device__ void * __nv_cvta_global_to_generic_impl(size_t);
+  __device__ void * __nv_cvta_shared_to_generic_impl(size_t);
+  __device__ void * __nv_cvta_constant_to_generic_impl(size_t);
+  __device__ void * __nv_cvta_local_to_generic_impl(size_t);
+}
+
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_global(const void *p)
+{
+  return __nv_cvta_generic_to_global_impl(p);
+}
+
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_shared(const void *p)
+{
+  return __nv_cvta_generic_to_shared_impl(p);
+}
+
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_constant(const void *p)
+{
+  return __nv_cvta_generic_to_constant_impl(p);
+}
+
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_local(const void *p)
+{
+  return __nv_cvta_generic_to_local_impl(p);
+}
+
+__SM_20_INTRINSICS_DECL__ void * __cvta_global_to_generic(size_t rawbits)
+{
+  return __nv_cvta_global_to_generic_impl(rawbits);
+}
+
+__SM_20_INTRINSICS_DECL__ void * __cvta_shared_to_generic(size_t rawbits)
+{
+  return __nv_cvta_shared_to_generic_impl(rawbits);
+}
+
+__SM_20_INTRINSICS_DECL__ void * __cvta_constant_to_generic(size_t rawbits)
+{
+  return __nv_cvta_constant_to_generic_impl(rawbits);
+}
+
+__SM_20_INTRINSICS_DECL__ void * __cvta_local_to_generic(size_t rawbits)
+{
+  return __nv_cvta_local_to_generic_impl(rawbits);
+}
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __CVTA_PTR_64 1
+#endif
+
+__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_grid_constant(const void *ptr)
+{
+#if __CVTA_PTR_64  
+  unsigned long long ret;
+  asm("cvta.to.param.u64 %0, %1;"  : "=l"(ret) : "l"(ptr));
+#else  /* !__CVTA_PTR_64 */
+  unsigned ret;
+  asm("cvta.to.param.u32 %0, %1;"  : "=r"(ret) : "r"(ptr));
+#endif  /* __CVTA_PTR_64 */  
+  return (size_t)ret;
+  
+}
+
+__SM_20_INTRINSICS_DECL__ void * __cvta_grid_constant_to_generic(size_t rawbits)
+{
+  void *ret;
+#if __CVTA_PTR_64  
+  unsigned long long in = rawbits;
+  asm("cvta.param.u64 %0, %1;" : "=l"(ret) : "l"(in));
+#else  /* !__CVTA_PTR_64 */
+  unsigned in = rawbits;
+  asm("cvta.param.u32 %0, %1;" : "=r"(ret) : "r"(in));
+#endif  /* __CVTA_PTR_64 */
+  return ret;
+}
+#undef __CVTA_PTR_64
+#endif  /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */
+
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_20_INTRINSICS_DECL__
+
+#endif /* !__SM_20_INTRINSICS_HPP__ */
+
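As a minimal sketch (not vendored code), the block-vote wrapper defined above can be used like this: syncthreads_count() hands every thread the number of threads in the block whose predicate was true. Kernel and buffer names are hypothetical, and the launch is assumed to cover the input exactly.

// Illustrative sketch only, assuming gridDim.x * blockDim.x == element count.
__global__ void count_positive(const float *data, int *block_counts)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    bool pred = data[idx] > 0.0f;

    // All threads of the block must reach this call; each receives the
    // block-wide count of true predicates.
    int hits = syncthreads_count(pred);
    if (threadIdx.x == 0)
        block_counts[blockIdx.x] = hits;
}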
diff --git a/ext/cudart/include/sm_30_intrinsics.h b/ext/cudart/include/sm_30_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..393ddfcb38e0bc21631affe3dca370b01761a464
--- /dev/null
+++ b/ext/cudart/include/sm_30_intrinsics.h
@@ -0,0 +1,215 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_30_INTRINSICS_H__)
+#define __SM_30_INTRINSICS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_30_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_30_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#ifndef __CUDA_ARCH__
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+
+
+/*******************************************************************************
+*                                                                              *
+*  Below are declarations of SM-3.0 intrinsics which are included as           *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize    32
+#define __local_warpSize
+#endif
+
+#if defined(_WIN32)
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#else
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#endif
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)."
+#endif
+
+__SM_30_INTRINSICS_DECL__ unsigned  __fns(unsigned mask, unsigned base, int offset) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ void  __barrier_sync(unsigned id) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ void  __barrier_sync_count(unsigned id, unsigned cnt) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ void  __syncwarp(unsigned mask=0xFFFFFFFF) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __all_sync(unsigned mask, int pred) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __any_sync(unsigned mask, int pred) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __uni_sync(unsigned mask, int pred) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned __ballot_sync(unsigned mask, int pred) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned __activemask() __DEF_IF_HOST
+
+// Warp register exchange (shuffle) intrinsics.
+// Notes:
+// a) Warp size is hardcoded to 32 here, because the compiler does not know
+//    the "warpSize" constant at this time
+// b) we cannot map the float __shfl to the int __shfl because it'll mess with
+//    the register number (especially if you're doing two shfls to move a double).
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) int __shfl(int var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned int __shfl(unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) int __shfl_up(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned int __shfl_up(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) int __shfl_down(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned int __shfl_down(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) int __shfl_xor(int var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned int __shfl_xor(unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) float __shfl(float var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) float __shfl_up(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) float __shfl_down(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) float __shfl_xor(float var, int laneMask, int width=warpSize) __DEF_IF_HOST
+#endif
+
+__SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width=warpSize) __DEF_IF_HOST
+
+// 64-bits SHFL
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long long __shfl(unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long long __shfl(long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long long __shfl_up(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long long __shfl_down(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long long __shfl_xor(long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) double __shfl(double var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) double __shfl_up(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) double __shfl_down(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) double __shfl_xor(double var, int laneMask, int width=warpSize) __DEF_IF_HOST
+#endif
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width=warpSize) __DEF_IF_HOST
+
+// long needs some help to choose between 32-bits and 64-bits
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long __shfl(long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long __shfl(unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long __shfl_up(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long __shfl_up(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long __shfl_down(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long __shfl_down(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long __shfl_xor(long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long __shfl_xor(unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+#endif
+
+__SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST
+
+#undef __DEPRECATED__
+#undef __WSB_DEPRECATION_MESSAGE
+
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_30_INTRINSICS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_30_intrinsics.hpp"
+#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
+
+#endif /* !__SM_30_INTRINSICS_H__ */
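As a minimal sketch (not part of the vendored header), the *_sync shuffle declarations above support the usual warp-level reductions; the helper below sums a value across a full warp with __shfl_down_sync, assuming all 32 lanes are active and named in the mask.

// Illustrative sketch only; every lane named in the mask must call it.
__device__ float warp_reduce_sum(float val)
{
    const unsigned full_mask = 0xFFFFFFFFu;
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(full_mask, val, offset);
    return val;   // lane 0 ends up with the sum of all 32 lanes
}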
diff --git a/ext/cudart/include/sm_30_intrinsics.hpp b/ext/cudart/include/sm_30_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..c5d484255e85ce1e7faa660347d29b8c17d43639
--- /dev/null
+++ b/ext/cudart/include/sm_30_intrinsics.hpp
@@ -0,0 +1,604 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_30_INTRINSICS_HPP__)
+#define __SM_30_INTRINSICS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_30_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_30_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+// In here are intrinsics which are built in to the compiler. These may be
+// referenced by intrinsic implementations from this file.
+extern "C"
+{
+}
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-3.0 intrinsics which are included as        *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined warpSize && !defined __local_warpSize
+#define warpSize    32
+#define __local_warpSize
+#endif
+
+__SM_30_INTRINSICS_DECL__
+unsigned __fns(unsigned mask, unsigned base, int offset) {
+  extern __device__ __device_builtin__ unsigned int __nvvm_fns(unsigned int mask, unsigned int base, int offset);
+  return __nvvm_fns(mask, base, offset);
+}
+
+__SM_30_INTRINSICS_DECL__ 
+void  __barrier_sync(unsigned id) {
+  extern __device__ __device_builtin__ void __nvvm_barrier_sync(unsigned id);
+  return __nvvm_barrier_sync(id);
+}
+
+__SM_30_INTRINSICS_DECL__ 
+void  __barrier_sync_count(unsigned id, unsigned cnt) {
+  extern __device__ __device_builtin__ void __nvvm_barrier_sync_cnt(unsigned id, unsigned cnt);
+  return __nvvm_barrier_sync_cnt(id, cnt);
+}
+
+__SM_30_INTRINSICS_DECL__ 
+void  __syncwarp(unsigned mask) {
+  extern __device__ __device_builtin__ void __nvvm_bar_warp_sync(unsigned mask);
+  return __nvvm_bar_warp_sync(mask);
+}
+
+__SM_30_INTRINSICS_DECL__ 
+int __all_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ int __nvvm_vote_all_sync(unsigned int mask, int pred); 
+  return __nvvm_vote_all_sync(mask, pred);
+}
+
+__SM_30_INTRINSICS_DECL__ 
+int __any_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ int __nvvm_vote_any_sync(unsigned int mask, int pred); 
+  return __nvvm_vote_any_sync(mask, pred);
+}
+
+__SM_30_INTRINSICS_DECL__ 
+int __uni_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ int __nvvm_vote_uni_sync(unsigned int mask, int pred); 
+  return __nvvm_vote_uni_sync(mask, pred);
+}
+
+__SM_30_INTRINSICS_DECL__ 
+unsigned __ballot_sync(unsigned mask, int pred) {
+  extern __device__ __device_builtin__ unsigned int __nvvm_vote_ballot_sync(unsigned int mask, int pred); 
+  return __nvvm_vote_ballot_sync(mask, pred);
+}
+
+__SM_30_INTRINSICS_DECL__
+unsigned __activemask() {
+    unsigned ret;
+    asm volatile ("activemask.b32 %0;" : "=r"(ret));
+    return ret;
+}
+
+// These are removed starting with compute_70 and onwards
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
+
+__SM_30_INTRINSICS_DECL__ int __shfl(int var, int srcLane, int width) {
+	int ret;
+	int c = ((warpSize-width) << 8) | 0x1f;
+	asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(srcLane), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl(unsigned int var, int srcLane, int width) {
+	return (unsigned int) __shfl((int)var, srcLane, width);
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_up(int var, unsigned int delta, int width) {
+	int ret;
+	int c = (warpSize-width) << 8;
+	asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(delta), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up(unsigned int var, unsigned int delta, int width) {
+	return (unsigned int) __shfl_up((int)var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_down(int var, unsigned int delta, int width) {
+	int ret;
+	int c = ((warpSize-width) << 8) | 0x1f;
+	asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(delta), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down(unsigned int var, unsigned int delta, int width) {
+	return (unsigned int) __shfl_down((int)var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_xor(int var, int laneMask, int width) {
+	int ret;
+	int c = ((warpSize-width) << 8) | 0x1f;
+	asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(laneMask), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor(unsigned int var, int laneMask, int width) {
+	return (unsigned int) __shfl_xor((int)var, laneMask, width);
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl(float var, int srcLane, int width) {
+	float ret;
+        int c;
+	c = ((warpSize-width) << 8) | 0x1f;
+	asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(srcLane), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_up(float var, unsigned int delta, int width) {
+	float ret;
+        int c;
+	c = (warpSize-width) << 8;
+	asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_down(float var, unsigned int delta, int width) {
+	float ret;
+        int c;
+	c = ((warpSize-width) << 8) | 0x1f;
+	asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c));
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_xor(float var, int laneMask, int width) {
+	float ret;
+        int c;
+	c = ((warpSize-width) << 8) | 0x1f;
+	asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
+	return ret;
+}
+
+// 64-bits SHFL
+
+__SM_30_INTRINSICS_DECL__ long long __shfl(long long var, int srcLane, int width) {
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
+	hi = __shfl(hi, srcLane, width);
+	lo = __shfl(lo, srcLane, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl(unsigned long long var, int srcLane, int width) {
+	return (unsigned long long) __shfl((long long) var, srcLane, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_up(long long var, unsigned int delta, int width) {
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
+	hi = __shfl_up(hi, delta, width);
+	lo = __shfl_up(lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width) {
+	return (unsigned long long) __shfl_up((long long) var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_down(long long var, unsigned int delta, int width) {
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
+	hi = __shfl_down(hi, delta, width);
+	lo = __shfl_down(lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width) {
+	return (unsigned long long) __shfl_down((long long) var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_xor(long long var, int laneMask, int width) {
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
+	hi = __shfl_xor(hi, laneMask, width);
+	lo = __shfl_xor(lo, laneMask, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width) {
+	return (unsigned long long) __shfl_xor((long long) var, laneMask, width);
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl(double var, int srcLane, int width) {
+	unsigned lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
+	hi = __shfl(hi, srcLane, width);
+	lo = __shfl(lo, srcLane, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_up(double var, unsigned int delta, int width) {
+	unsigned lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
+	hi = __shfl_up(hi, delta, width);
+	lo = __shfl_up(lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_down(double var, unsigned int delta, int width) {
+	unsigned lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
+	hi = __shfl_down(hi, delta, width);
+	lo = __shfl_down(lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_xor(double var, int laneMask, int width) {
+	unsigned lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
+	hi = __shfl_xor(hi, laneMask, width);
+	lo = __shfl_xor(lo, laneMask, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl(long var, int srcLane, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl((long long) var, srcLane, width) :
+		__shfl((int) var, srcLane, width);
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl(unsigned long var, int srcLane, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl((unsigned long long) var, srcLane, width) :
+		__shfl((unsigned int) var, srcLane, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_up(long var, unsigned int delta, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl_up((long long) var, delta, width) :
+		__shfl_up((int) var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up(unsigned long var, unsigned int delta, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl_up((unsigned long long) var, delta, width) :
+		__shfl_up((unsigned int) var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_down(long var, unsigned int delta, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl_down((long long) var, delta, width) :
+		__shfl_down((int) var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down(unsigned long var, unsigned int delta, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl_down((unsigned long long) var, delta, width) :
+		__shfl_down((unsigned int) var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_xor(long var, int laneMask, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl_xor((long long) var, laneMask, width) :
+		__shfl_xor((int) var, laneMask, width);
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor(unsigned long var, int laneMask, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl_xor((unsigned long long) var, laneMask, width) :
+		__shfl_xor((unsigned int) var, laneMask, width);
+}
+
+#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */
+
+// Warp register exchange (shuffle) intrinsics.
+// Notes:
+// a) Warp size is hardcoded to 32 here, because the compiler does not know
+//    the "warpSize" constant at this time
+// b) we cannot map the float __shfl to the int __shfl because it'll mess with
+//    the register number (especially if you're doing two shfls to move a double).
+__SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width) {
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+	int c = ((warpSize-width) << 8) | 0x1f;
+        ret = __nvvm_shfl_idx_sync(mask, var, srcLane, c);
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width) {
+        return (unsigned int) __shfl_sync(mask, (int)var, srcLane, width);
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width) {
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+	int c = (warpSize-width) << 8;
+        ret = __nvvm_shfl_up_sync(mask, var, delta, c);
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
+        return (unsigned int) __shfl_up_sync(mask, (int)var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width) {
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+	int c = ((warpSize-width) << 8) | 0x1f;
+        ret = __nvvm_shfl_down_sync(mask, var, delta, c);
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width) {
+        return (unsigned int) __shfl_down_sync(mask, (int)var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width) {
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+	int c = ((warpSize-width) << 8) | 0x1f;
+        ret = __nvvm_shfl_bfly_sync(mask, var, laneMask, c);
+	return ret;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width) {
+	return (unsigned int) __shfl_xor_sync(mask, (int)var, laneMask, width);
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width) {
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+        int ret;
+        int c;
+	c = ((warpSize-width) << 8) | 0x1f;
+        ret = __nvvm_shfl_idx_sync(mask, __float_as_int(var), srcLane, c);
+	return __int_as_float(ret);
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width) {
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+        int c;
+	c = (warpSize-width) << 8;
+        ret = __nvvm_shfl_up_sync(mask, __float_as_int(var), delta, c);
+	return __int_as_float(ret);
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width) {
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+        int c;
+	c = ((warpSize-width) << 8) | 0x1f;
+        ret = __nvvm_shfl_down_sync(mask, __float_as_int(var), delta, c);
+	return __int_as_float(ret);
+}
+
+__SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width) {
+        extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c);
+	int ret;
+        int c;
+	c = ((warpSize-width) << 8) | 0x1f;
+        ret = __nvvm_shfl_bfly_sync(mask, __float_as_int(var), laneMask, c);
+	return __int_as_float(ret);
+}
+
+// 64-bits SHFL
+__SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width) {
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
+	hi = __shfl_sync(mask, hi, srcLane, width);
+	lo = __shfl_sync(mask, lo, srcLane, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width) {
+        return (unsigned long long) __shfl_sync(mask, (long long) var, srcLane, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width) {
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
+	hi = __shfl_up_sync(mask, hi, delta, width);
+	lo = __shfl_up_sync(mask, lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
+        return (unsigned long long) __shfl_up_sync(mask, (long long) var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width) {
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
+	hi = __shfl_down_sync(mask, hi, delta, width);
+	lo = __shfl_down_sync(mask, lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) {
+        return (unsigned long long) __shfl_down_sync(mask, (long long) var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width) {
+	int lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var));
+	hi = __shfl_xor_sync(mask, hi, laneMask, width);
+	lo = __shfl_xor_sync(mask, lo, laneMask, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width) {
+        return (unsigned long long) __shfl_xor_sync(mask, (long long) var, laneMask, width);
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width) {
+	unsigned lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
+	hi = __shfl_sync(mask, hi, srcLane, width);
+	lo = __shfl_sync(mask, lo, srcLane, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width) {
+	unsigned lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
+	hi = __shfl_up_sync(mask, hi, delta, width);
+	lo = __shfl_up_sync(mask, lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width) {
+	unsigned lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
+	hi = __shfl_down_sync(mask, hi, delta, width);
+	lo = __shfl_down_sync(mask, lo, delta, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+__SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width) {
+	unsigned lo, hi;
+	asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var));
+	hi = __shfl_xor_sync(mask, hi, laneMask, width);
+	lo = __shfl_xor_sync(mask, lo, laneMask, width);
+	asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi));
+	return var;
+}
+
+// long needs some help to choose between 32-bits and 64-bits
+
+__SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+                __shfl_sync(mask, (long long) var, srcLane, width) :
+		__shfl_sync(mask, (int) var, srcLane, width);
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+                __shfl_sync(mask, (unsigned long long) var, srcLane, width) :
+		__shfl_sync(mask, (unsigned int) var, srcLane, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl_up_sync(mask, (long long) var, delta, width) :
+		__shfl_up_sync(mask, (int) var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl_up_sync(mask, (unsigned long long) var, delta, width) :
+		__shfl_up_sync(mask, (unsigned int) var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl_down_sync(mask, (long long) var, delta, width) :
+		__shfl_down_sync(mask, (int) var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl_down_sync(mask, (unsigned long long) var, delta, width) :
+		__shfl_down_sync(mask, (unsigned int) var, delta, width);
+}
+
+__SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl_xor_sync(mask, (long long) var, laneMask, width) :
+		__shfl_xor_sync(mask, (int) var, laneMask, width);
+}
+
+__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width) {
+	return (sizeof(long) == sizeof(long long)) ?
+		__shfl_xor_sync(mask, (unsigned long long) var, laneMask, width) :
+		__shfl_xor_sync(mask, (unsigned int) var, laneMask, width);
+}
+
+#if defined(__local_warpSize)
+#undef warpSize
+#undef __local_warpSize
+#endif
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_30_INTRINSICS_DECL__
+
+#endif /* !__SM_30_INTRINSICS_HPP__ */
+
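The 64-bit shuffles above split each value into 32-bit halves, shuffle the halves, and reassemble them. As a sketch (not vendored code), the same hi/lo technique can be written with the __double2hiint/__double2loint/__hiloint2double casts instead of inline PTX; in practice the double overload of __shfl_sync already does this, so the helper below is purely illustrative.

// Illustrative sketch only: broadcast a double from src_lane to the full warp.
__device__ double broadcast_double(double v, int src_lane)
{
    const unsigned full_mask = 0xFFFFFFFFu;
    int hi = __double2hiint(v);            // high 32 bits
    int lo = __double2loint(v);            // low 32 bits
    hi = __shfl_sync(full_mask, hi, src_lane);
    lo = __shfl_sync(full_mask, lo, src_lane);
    return __hiloint2double(hi, lo);       // reassemble on every lane
}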
diff --git a/ext/cudart/include/sm_32_atomic_functions.h b/ext/cudart/include/sm_32_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..905732018c9a8ab1866aed932bd292ffa7ba2ac4
--- /dev/null
+++ b/ext/cudart/include/sm_32_atomic_functions.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_32_ATOMIC_FUNCTIONS_H__)
+#define __SM_32_ATOMIC_FUNCTIONS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#ifndef __CUDA_ARCH__
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+
+
+#ifdef __CUDA_ARCH__
+extern "C"
+{
+extern __device__ __device_builtin__ long long __illAtomicMin(long long *address, long long val);
+extern __device__ __device_builtin__ long long __illAtomicMax(long long *address, long long val);
+extern __device__ __device_builtin__ long long __llAtomicAnd(long long *address, long long val);
+extern __device__ __device_builtin__ long long __llAtomicOr(long long *address, long long val);
+extern __device__ __device_builtin__ long long __llAtomicXor(long long *address, long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicMin(unsigned long long *address, unsigned long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicMax(unsigned long long *address, unsigned long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicAnd(unsigned long long *address, unsigned long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicOr (unsigned long long *address, unsigned long long val);
+extern __device__ __device_builtin__ unsigned long long __ullAtomicXor(unsigned long long *address, unsigned long long val);
+}
+#endif /* __CUDA_ARCH__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_32_ATOMIC_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_32_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__  && defined(__CUDA_ARCH__) */
+
+#endif /* !__SM_32_ATOMIC_FUNCTIONS_H__ */
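As a minimal sketch (not part of the vendored header), the 64-bit atomics declared above can be used directly on compute capability 3.2 or newer; kernel and buffer names below are hypothetical.

// Illustrative sketch only. *global_min is pre-initialized to LLONG_MAX by the
// host, and the launch is assumed to cover the array exactly.
__global__ void track_min(const long long *values, long long *global_min)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    atomicMin(global_min, values[idx]);
}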
diff --git a/ext/cudart/include/sm_32_atomic_functions.hpp b/ext/cudart/include/sm_32_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ebe60b8ca83666f07464b06ff04e6fc432c31b7b
--- /dev/null
+++ b/ext/cudart/include/sm_32_atomic_functions.hpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_32_ATOMIC_FUNCTIONS_HPP__)
+#define __SM_32_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val)
+{
+    return __illAtomicMin(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val)
+{
+    return __illAtomicMax(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val)
+{
+    return __llAtomicAnd(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val)
+{
+    return __llAtomicOr(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val)
+{
+    return __llAtomicXor(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicMin(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicMax(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicAnd(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicOr(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicXor(address, val);
+}
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_32_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__SM_32_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/ext/cudart/include/sm_32_intrinsics.h b/ext/cudart/include/sm_32_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..33a805f765400883ddabf405578abb241755aa88
--- /dev/null
+++ b/ext/cudart/include/sm_32_intrinsics.h
@@ -0,0 +1,510 @@
+/*
+ * Copyright 1993-2020 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_32_INTRINSICS_H__)
+#define __SM_32_INTRINSICS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_32_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_32_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#ifndef __CUDA_ARCH__
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+
+
+/*******************************************************************************
+*                                                                              *
+*  Below are declarations of SM-3.5 intrinsics which are included as           *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+/******************************************************************************
+ *                                   __ldg                                    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ char __ldg(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldg(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldg(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldg(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldg(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldg(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldg(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldg(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldg(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldg(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldg(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldg(const longlong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldg(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldg(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldg(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldg(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldg(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldg(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldg(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldg(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldg(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldg(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldg(const ulonglong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ float __ldg(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldg(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldg(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldg(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldg(const double2 *ptr) __DEF_IF_HOST
+/******************************************************************************
+ *                                   __ldcg                                   *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ char __ldcg(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldcg(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldcg(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldcg(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldcg(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldcg(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldcg(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldcg(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldcg(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldcg(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldcg(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcg(const longlong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcg(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcg(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcg(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcg(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcg(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcg(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcg(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcg(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldcg(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldcg(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcg(const ulonglong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ float __ldcg(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldcg(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldcg(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldcg(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldcg(const double2 *ptr) __DEF_IF_HOST
+/******************************************************************************
+ *                                   __ldca                                    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ char __ldca(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldca(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldca(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldca(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldca(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldca(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldca(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldca(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldca(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldca(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldca(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldca(const longlong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldca(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldca(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldca(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldca(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldca(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldca(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldca(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldca(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldca(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldca(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldca(const ulonglong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ float __ldca(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldca(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldca(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldca(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldca(const double2 *ptr) __DEF_IF_HOST
+/******************************************************************************
+ *                                   __ldcs                                    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ char __ldcs(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldcs(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldcs(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldcs(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldcs(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldcs(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldcs(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldcs(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldcs(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldcs(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldcs(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcs(const longlong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcs(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcs(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcs(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcs(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcs(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcs(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcs(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcs(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldcs(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldcs(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcs(const ulonglong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ float __ldcs(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldcs(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldcs(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldcs(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldcs(const double2 *ptr) __DEF_IF_HOST
+/******************************************************************************
+ *                                   __ldlu                                    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ char __ldlu(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldlu(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldlu(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldlu(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldlu(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldlu(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldlu(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldlu(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldlu(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldlu(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldlu(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldlu(const longlong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldlu(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldlu(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldlu(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldlu(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldlu(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldlu(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldlu(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldlu(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldlu(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldlu(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldlu(const ulonglong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ float __ldlu(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldlu(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldlu(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldlu(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldlu(const double2 *ptr) __DEF_IF_HOST
+/******************************************************************************
+ *                                   __ldcv                                    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ char __ldcv(const char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ signed char __ldcv(const signed char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short __ldcv(const short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int __ldcv(const int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ long long __ldcv(const long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char2 __ldcv(const char2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ char4 __ldcv(const char4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short2 __ldcv(const short2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ short4 __ldcv(const short4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int2 __ldcv(const int2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ int4 __ldcv(const int4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcv(const longlong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcv(const unsigned char *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcv(const unsigned short *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcv(const unsigned int *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcv(const unsigned long long *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcv(const uchar2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcv(const uchar4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcv(const ushort2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcv(const ushort4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint2 __ldcv(const uint2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ uint4 __ldcv(const uint4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcv(const ulonglong2 *ptr) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ float __ldcv(const float *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double __ldcv(const double *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float2 __ldcv(const float2 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ float4 __ldcv(const float4 *ptr) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ double2 __ldcv(const double2 *ptr) __DEF_IF_HOST
+/******************************************************************************
+ *                                   __stwb                                    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stwb(char *ptr, char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(signed char *ptr, signed char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(short *ptr, short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(int *ptr, int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(long long *ptr, long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(char2 *ptr, char2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(char4 *ptr, char4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(short2 *ptr, short2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(short4 *ptr, short4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(int2 *ptr, int2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(int4 *ptr, int4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(longlong2 *ptr, longlong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned char *ptr, unsigned char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned short *ptr, unsigned short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned int *ptr, unsigned int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar2 *ptr, uchar2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar4 *ptr, uchar4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort2 *ptr, ushort2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort4 *ptr, ushort4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(uint2 *ptr, uint2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(uint4 *ptr, uint4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stwb(float *ptr, float value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(double *ptr, double value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(float2 *ptr, float2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(float4 *ptr, float4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwb(double2 *ptr, double2 value) __DEF_IF_HOST
+/******************************************************************************
+ *                                   __stcg                                    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stcg(char *ptr, char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(signed char *ptr, signed char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(short *ptr, short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(int *ptr, int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(long long *ptr, long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(char2 *ptr, char2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(char4 *ptr, char4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(short2 *ptr, short2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(short4 *ptr, short4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(int2 *ptr, int2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(int4 *ptr, int4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(longlong2 *ptr, longlong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned char *ptr, unsigned char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned short *ptr, unsigned short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned int *ptr, unsigned int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar2 *ptr, uchar2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar4 *ptr, uchar4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort2 *ptr, ushort2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort4 *ptr, ushort4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(uint2 *ptr, uint2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(uint4 *ptr, uint4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stcg(float *ptr, float value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(double *ptr, double value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(float2 *ptr, float2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(float4 *ptr, float4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcg(double2 *ptr, double2 value) __DEF_IF_HOST
+/******************************************************************************
+ *                                   __stcs                                    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stcs(char *ptr, char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(signed char *ptr, signed char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(short *ptr, short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(int *ptr, int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(long long *ptr, long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(char2 *ptr, char2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(char4 *ptr, char4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(short2 *ptr, short2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(short4 *ptr, short4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(int2 *ptr, int2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(int4 *ptr, int4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(longlong2 *ptr, longlong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned char *ptr, unsigned char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned short *ptr, unsigned short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned int *ptr, unsigned int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar2 *ptr, uchar2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar4 *ptr, uchar4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort2 *ptr, ushort2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort4 *ptr, ushort4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(uint2 *ptr, uint2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(uint4 *ptr, uint4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stcs(float *ptr, float value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(double *ptr, double value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(float2 *ptr, float2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(float4 *ptr, float4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stcs(double2 *ptr, double2 value) __DEF_IF_HOST
+/******************************************************************************
+ *                                   __stwt                                    *
+ ******************************************************************************/
+__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stwt(char *ptr, char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(signed char *ptr, signed char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(short *ptr, short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(int *ptr, int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(long long *ptr, long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(char2 *ptr, char2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(char4 *ptr, char4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(short2 *ptr, short2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(short4 *ptr, short4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(int2 *ptr, int2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(int4 *ptr, int4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(longlong2 *ptr, longlong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned char *ptr, unsigned char value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned short *ptr, unsigned short value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned int *ptr, unsigned int value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar2 *ptr, uchar2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar4 *ptr, uchar4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort2 *ptr, ushort2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort4 *ptr, ushort4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(uint2 *ptr, uint2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(uint4 *ptr, uint4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST
+
+__SM_32_INTRINSICS_DECL__ void __stwt(float *ptr, float value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(double *ptr, double value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(float2 *ptr, float2 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(float4 *ptr, float4 value) __DEF_IF_HOST
+__SM_32_INTRINSICS_DECL__ void __stwt(double2 *ptr, double2 value) __DEF_IF_HOST
+
+
+// SHF is the "funnel shift" operation - an accelerated left/right shift with carry
+// operating on 64-bit quantities, which are concatenations of two 32-bit registers.
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Concatenate \p hi : \p lo, shift left by \p shift & 31 bits, return the most significant 32 bits.
+ *
+ * Shift the 64-bit value formed by concatenating argument \p lo and \p hi left by the amount specified by the argument \p shift.
+ * Argument \p lo holds bits 31:0 and argument \p hi holds bits 63:32 of the 64-bit source value.
+ * The source is shifted left by the wrapped value of \p shift (\p shift & 31).
+ * The most significant 32-bits of the result are returned.
+ *
+ * \return Returns the most significant 32 bits of the shifted 64-bit value.
+ */
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Concatenate \p hi : \p lo, shift left by min(\p shift, 32) bits, return the most significant 32 bits.
+ *
+ * Shift the 64-bit value formed by concatenating argument \p lo and \p hi left by the amount specified by the argument \p shift.
+ * Argument \p lo holds bits 31:0 and argument \p hi holds bits 63:32 of the 64-bit source value.
+ * The source is shifted left by the clamped value of \p shift (min(\p shift, 32)).
+ * The most significant 32-bits of the result are returned.
+ *
+ * \return Returns the most significant 32 bits of the shifted 64-bit value.
+ */
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST
+
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Concatenate \p hi : \p lo, shift right by \p shift & 31 bits, return the least significant 32 bits.
+ *
+ * Shift the 64-bit value formed by concatenating argument \p lo and \p hi right by the amount specified by the argument \p shift.
+ * Argument \p lo holds bits 31:0 and argument \p hi holds bits 63:32 of the 64-bit source value.
+ * The source is shifted right by the wrapped value of \p shift (\p shift & 31).
+ * The least significant 32-bits of the result are returned.
+ *
+ * \return Returns the least significant 32 bits of the shifted 64-bit value.
+ */
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST
+/**
+ * \ingroup CUDA_MATH_INTRINSIC_INT
+ * \brief Concatenate \p hi : \p lo, shift right by min(\p shift, 32) bits, return the least significant 32 bits.
+ *
+ * Shift the 64-bit value formed by concatenating argument \p lo and \p hi right by the amount specified by the argument \p shift.
+ * Argument \p lo holds bits 31:0 and argument \p hi holds bits 63:32 of the 64-bit source value.
+ * The source is shifted right by the clamped value of \p shift (min(\p shift, 32)).
+ * The least significant 32-bits of the result are returned.
+ *
+ * \return Returns the least significant 32 bits of the shifted 64-bit value.
+ */
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST
+
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_32_INTRINSICS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_32_intrinsics.hpp"
+#endif /* !__CUDACC_RTC__  && defined(__CUDA_ARCH__)  */
+
+#endif /* !__SM_32_INTRINSICS_H__ */
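The funnel-shift intrinsics documented above operate on the 64-bit value formed by concatenating hi (bits 63:32) and lo (bits 31:0). The host-side C++ sketch below restates that documented behavior as a reference model; it is illustrative only, the *_ref names are invented, and it is not the vendored device implementation.

    #include <algorithm>
    #include <cstdint>

    // Reference model of the documented semantics: wrapped variants shift by
    // (shift & 31), clamped variants by min(shift, 32); the left forms return
    // the most significant 32 bits, the right forms the least significant 32 bits.
    static uint32_t funnelshift_l_ref(uint32_t lo, uint32_t hi, uint32_t shift)
    {
        uint64_t v = (static_cast<uint64_t>(hi) << 32) | lo;
        return static_cast<uint32_t>((v << (shift & 31)) >> 32);
    }

    static uint32_t funnelshift_lc_ref(uint32_t lo, uint32_t hi, uint32_t shift)
    {
        uint64_t v = (static_cast<uint64_t>(hi) << 32) | lo;
        return static_cast<uint32_t>((v << std::min(shift, 32u)) >> 32);
    }

    static uint32_t funnelshift_r_ref(uint32_t lo, uint32_t hi, uint32_t shift)
    {
        uint64_t v = (static_cast<uint64_t>(hi) << 32) | lo;
        return static_cast<uint32_t>(v >> (shift & 31));
    }

    static uint32_t funnelshift_rc_ref(uint32_t lo, uint32_t hi, uint32_t shift)
    {
        uint64_t v = (static_cast<uint64_t>(hi) << 32) | lo;
        return static_cast<uint32_t>(v >> std::min(shift, 32u));
    }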
diff --git a/ext/cudart/include/sm_32_intrinsics.hpp b/ext/cudart/include/sm_32_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..af5f6634434ff690d9d07a8bdcb7a44702b6fe48
--- /dev/null
+++ b/ext/cudart/include/sm_32_intrinsics.hpp
@@ -0,0 +1,588 @@
+/*
+ * Copyright 1993-2020 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_32_INTRINSICS_HPP__)
+#define __SM_32_INTRINSICS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_32_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_32_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+// In here are intrinsics which are built in to the compiler. These may be
+// referenced by intrinsic implementations from this file.
+extern "C"
+{
+    // There are no intrinsics built in to the compiler for SM-3.5,
+    // all intrinsics are now implemented as inline PTX below.
+}
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-3.5 intrinsics which are included as        *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+// LDG is a "load from global via texture path" command which can exhibit higher
+// bandwidth on GK110 than a regular LD.
+// Define a different pointer storage size for 64 and 32 bit
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __LDG_PTR   "l"
+#else
+#define __LDG_PTR   "r"
+#endif
+
+/******************************************************************************
+ *                                   __ldg                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ char __ldg(const char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldg(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldg(const short *ptr) { unsigned short ret; asm volatile ("ld.global.nc.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldg(const int *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldg(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.nc.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldg(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.nc.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldg(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.nc.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldg(const short2 *ptr) { short2 ret; asm volatile ("ld.global.nc.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldg(const short4 *ptr) { short4 ret; asm volatile ("ld.global.nc.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldg(const int2 *ptr) { int2 ret; asm volatile ("ld.global.nc.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldg(const int4 *ptr) { int4 ret; asm volatile ("ld.global.nc.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldg(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.nc.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldg(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr));  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldg(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.nc.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldg(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.nc.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldg(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.nc.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldg(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.nc.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldg(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.nc.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldg(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.nc.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldg(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.nc.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldg(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.nc.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldg(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldg(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.nc.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+__SM_32_INTRINSICS_DECL__ float __ldg(const float *ptr) { float ret; asm volatile ("ld.global.nc.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldg(const double *ptr) { double ret; asm volatile ("ld.global.nc.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldg(const float2 *ptr) { float2 ret; asm volatile ("ld.global.nc.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldg(const float4 *ptr) { float4 ret; asm volatile ("ld.global.nc.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldg(const double2 *ptr) { double2 ret; asm volatile ("ld.global.nc.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+
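As the comment above explains, __ldg loads read-only global data through the non-coherent (texture) cache path. A minimal, illustrative kernel using the float overload follows; it is not part of the vendored header, and the names are invented for the example.

    // Illustrative only: the read-only input is loaded with __ldg so the access
    // can take the non-coherent cache path described above.
    __global__ void scale(float* out, const float* __restrict__ in, float k, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n)
            out[i] = k * __ldg(&in[i]);  // float overload of __ldg
    }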
+/******************************************************************************
+ *                                   __ldcg                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ char __ldcg(const char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldcg(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldcg(const short *ptr) { unsigned short ret; asm volatile ("ld.global.cg.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldcg(const int *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldcg(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cg.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldcg(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.cg.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldcg(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.cg.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldcg(const short2 *ptr) { short2 ret; asm volatile ("ld.global.cg.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldcg(const short4 *ptr) { short4 ret; asm volatile ("ld.global.cg.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldcg(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cg.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldcg(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cg.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcg(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.cg.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcg(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr));  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcg(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.cg.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcg(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.cg.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcg(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cg.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcg(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.cg.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcg(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.cg.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcg(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.cg.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcg(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.cg.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldcg(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.cg.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldcg(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.cg.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcg(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.cg.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+__SM_32_INTRINSICS_DECL__ float __ldcg(const float *ptr) { float ret; asm volatile ("ld.global.cg.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldcg(const double *ptr) { double ret; asm volatile ("ld.global.cg.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldcg(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cg.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldcg(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cg.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldcg(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cg.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+/******************************************************************************
+ *                                   __ldca                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ char __ldca(const char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldca(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldca(const short *ptr) { unsigned short ret; asm volatile ("ld.global.ca.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldca(const int *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldca(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.ca.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldca(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.ca.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldca(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.ca.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldca(const short2 *ptr) { short2 ret; asm volatile ("ld.global.ca.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldca(const short4 *ptr) { short4 ret; asm volatile ("ld.global.ca.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldca(const int2 *ptr) { int2 ret; asm volatile ("ld.global.ca.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldca(const int4 *ptr) { int4 ret; asm volatile ("ld.global.ca.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldca(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.ca.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldca(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr));  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldca(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.ca.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldca(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.ca.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldca(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.ca.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldca(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.ca.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldca(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.ca.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldca(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.ca.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldca(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.ca.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldca(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.ca.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldca(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.ca.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldca(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.ca.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+__SM_32_INTRINSICS_DECL__ float __ldca(const float *ptr) { float ret; asm volatile ("ld.global.ca.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldca(const double *ptr) { double ret; asm volatile ("ld.global.ca.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldca(const float2 *ptr) { float2 ret; asm volatile ("ld.global.ca.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldca(const float4 *ptr) { float4 ret; asm volatile ("ld.global.ca.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldca(const double2 *ptr) { double2 ret; asm volatile ("ld.global.ca.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+/******************************************************************************
+ *                                   __ldcs                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ char __ldcs(const char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldcs(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldcs(const short *ptr) { unsigned short ret; asm volatile ("ld.global.cs.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldcs(const int *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldcs(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cs.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldcs(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.cs.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldcs(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.cs.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldcs(const short2 *ptr) { short2 ret; asm volatile ("ld.global.cs.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldcs(const short4 *ptr) { short4 ret; asm volatile ("ld.global.cs.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldcs(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cs.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldcs(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cs.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcs(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.cs.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcs(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr));  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcs(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.cs.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcs(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.cs.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcs(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cs.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcs(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.cs.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcs(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.cs.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcs(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.cs.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcs(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.cs.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldcs(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.cs.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldcs(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcs(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.cs.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+__SM_32_INTRINSICS_DECL__ float __ldcs(const float *ptr) { float ret; asm volatile ("ld.global.cs.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldcs(const double *ptr) { double ret; asm volatile ("ld.global.cs.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldcs(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cs.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldcs(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cs.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldcs(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cs.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
+
+/******************************************************************************
+ *                                   __ldlu                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) { unsigned long ret; asm ("ld.global.lu.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.lu.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) { unsigned long ret; asm ("ld.global.lu.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.lu.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ char __ldlu(const char *ptr) { unsigned int ret; asm ("ld.global.lu.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldlu(const signed char *ptr) { unsigned int ret; asm ("ld.global.lu.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldlu(const short *ptr) { unsigned short ret; asm ("ld.global.lu.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldlu(const int *ptr) { unsigned int ret; asm ("ld.global.lu.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldlu(const long long *ptr) { unsigned long long ret; asm ("ld.global.lu.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldlu(const char2 *ptr) { char2 ret; int2 tmp; asm ("ld.global.lu.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldlu(const char4 *ptr) { char4 ret; int4 tmp; asm ("ld.global.lu.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldlu(const short2 *ptr) { short2 ret; asm ("ld.global.lu.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldlu(const short4 *ptr) { short4 ret; asm ("ld.global.lu.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldlu(const int2 *ptr) { int2 ret; asm ("ld.global.lu.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldlu(const int4 *ptr) { int4 ret; asm ("ld.global.lu.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldlu(const longlong2 *ptr) { longlong2 ret; asm ("ld.global.lu.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldlu(const unsigned char *ptr) { unsigned int ret; asm ("ld.global.lu.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory");  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldlu(const unsigned short *ptr) { unsigned short ret; asm ("ld.global.lu.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldlu(const unsigned int *ptr) { unsigned int ret; asm ("ld.global.lu.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldlu(const unsigned long long *ptr) { unsigned long long ret; asm ("ld.global.lu.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldlu(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm ("ld.global.lu.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldlu(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm ("ld.global.lu.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldlu(const ushort2 *ptr) { ushort2 ret; asm ("ld.global.lu.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldlu(const ushort4 *ptr) { ushort4 ret; asm ("ld.global.lu.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldlu(const uint2 *ptr) { uint2 ret; asm ("ld.global.lu.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldlu(const uint4 *ptr) { uint4 ret; asm ("ld.global.lu.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldlu(const ulonglong2 *ptr) { ulonglong2 ret; asm ("ld.global.lu.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+
+__SM_32_INTRINSICS_DECL__ float __ldlu(const float *ptr) { float ret; asm ("ld.global.lu.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldlu(const double *ptr) { double ret; asm ("ld.global.lu.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldlu(const float2 *ptr) { float2 ret; asm ("ld.global.lu.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldlu(const float4 *ptr) { float4 ret; asm ("ld.global.lu.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldlu(const double2 *ptr) { double2 ret; asm ("ld.global.lu.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+
+/******************************************************************************
+ *                                   __ldcv                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) { unsigned long ret; asm ("ld.global.cv.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.cv.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) { unsigned long ret; asm ("ld.global.cv.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.cv.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ char __ldcv(const char *ptr) { unsigned int ret; asm ("ld.global.cv.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldcv(const signed char *ptr) { unsigned int ret; asm ("ld.global.cv.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldcv(const short *ptr) { unsigned short ret; asm ("ld.global.cv.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldcv(const int *ptr) { unsigned int ret; asm ("ld.global.cv.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldcv(const long long *ptr) { unsigned long long ret; asm ("ld.global.cv.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldcv(const char2 *ptr) { char2 ret; int2 tmp; asm ("ld.global.cv.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldcv(const char4 *ptr) { char4 ret; int4 tmp; asm ("ld.global.cv.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldcv(const short2 *ptr) { short2 ret; asm ("ld.global.cv.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldcv(const short4 *ptr) { short4 ret; asm ("ld.global.cv.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldcv(const int2 *ptr) { int2 ret; asm ("ld.global.cv.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldcv(const int4 *ptr) { int4 ret; asm ("ld.global.cv.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcv(const longlong2 *ptr) { longlong2 ret; asm ("ld.global.cv.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcv(const unsigned char *ptr) { unsigned int ret; asm ("ld.global.cv.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory");  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcv(const unsigned short *ptr) { unsigned short ret; asm ("ld.global.cv.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcv(const unsigned int *ptr) { unsigned int ret; asm ("ld.global.cv.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcv(const unsigned long long *ptr) { unsigned long long ret; asm ("ld.global.cv.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcv(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm ("ld.global.cv.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcv(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm ("ld.global.cv.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcv(const ushort2 *ptr) { ushort2 ret; asm ("ld.global.cv.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcv(const ushort4 *ptr) { ushort4 ret; asm ("ld.global.cv.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldcv(const uint2 *ptr) { uint2 ret; asm ("ld.global.cv.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldcv(const uint4 *ptr) { uint4 ret; asm ("ld.global.cv.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcv(const ulonglong2 *ptr) { ulonglong2 ret; asm ("ld.global.cv.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+
+__SM_32_INTRINSICS_DECL__ float __ldcv(const float *ptr) { float ret; asm ("ld.global.cv.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldcv(const double *ptr) { double ret; asm ("ld.global.cv.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldcv(const float2 *ptr) { float2 ret; asm ("ld.global.cv.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldcv(const float4 *ptr) { float4 ret; asm ("ld.global.cv.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldcv(const double2 *ptr) { double2 ret; asm ("ld.global.cv.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+
+/******************************************************************************
+ *                                   __stwb                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) { asm ("st.global.wb.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) { asm ("st.global.wb.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) { asm ("st.global.wb.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) { asm ("st.global.wb.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ void __stwb(char *ptr, char value) { asm ("st.global.wb.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(signed char *ptr, signed char value) { asm ("st.global.wb.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(short *ptr, short value) { asm ("st.global.wb.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(int *ptr, int value) { asm ("st.global.wb.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(long long *ptr, long long value) { asm ("st.global.wb.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.wb.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wb.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(short2 *ptr, short2 value) { asm ("st.global.wb.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(short4 *ptr, short4 value) { asm ("st.global.wb.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(int2 *ptr, int2 value) { asm ("st.global.wb.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(int4 *ptr, int4 value) { asm ("st.global.wb.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(longlong2 *ptr, longlong2 value) { asm ("st.global.wb.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned char *ptr, unsigned char value) { asm ("st.global.wb.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned short *ptr, unsigned short value) { asm ("st.global.wb.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned int *ptr, unsigned int value) { asm ("st.global.wb.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long long *ptr, unsigned long long value) { asm ("st.global.wb.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.wb.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wb.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort2 *ptr, ushort2 value) { asm ("st.global.wb.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort4 *ptr, ushort4 value) { asm ("st.global.wb.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uint2 *ptr, uint2 value) { asm ("st.global.wb.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uint4 *ptr, uint4 value) { asm ("st.global.wb.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.wb.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+
+__SM_32_INTRINSICS_DECL__ void __stwb(float *ptr, float value) { asm ("st.global.wb.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(double *ptr, double value) { asm ("st.global.wb.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(float2 *ptr, float2 value) { asm ("st.global.wb.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(float4 *ptr, float4 value) { asm ("st.global.wb.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(double2 *ptr, double2 value) { asm ("st.global.wb.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+
+/******************************************************************************
+ *                                   __stcg                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) { asm ("st.global.cg.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) { asm ("st.global.cg.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) { asm ("st.global.cg.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) { asm ("st.global.cg.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ void __stcg(char *ptr, char value) { asm ("st.global.cg.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(signed char *ptr, signed char value) { asm ("st.global.cg.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(short *ptr, short value) { asm ("st.global.cg.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(int *ptr, int value) { asm ("st.global.cg.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(long long *ptr, long long value) { asm ("st.global.cg.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.cg.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cg.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(short2 *ptr, short2 value) { asm ("st.global.cg.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(short4 *ptr, short4 value) { asm ("st.global.cg.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(int2 *ptr, int2 value) { asm ("st.global.cg.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(int4 *ptr, int4 value) { asm ("st.global.cg.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(longlong2 *ptr, longlong2 value) { asm ("st.global.cg.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned char *ptr, unsigned char value) { asm ("st.global.cg.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned short *ptr, unsigned short value) { asm ("st.global.cg.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned int *ptr, unsigned int value) { asm ("st.global.cg.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long long *ptr, unsigned long long value) { asm ("st.global.cg.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.cg.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cg.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort2 *ptr, ushort2 value) { asm ("st.global.cg.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort4 *ptr, ushort4 value) { asm ("st.global.cg.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uint2 *ptr, uint2 value) { asm ("st.global.cg.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uint4 *ptr, uint4 value) { asm ("st.global.cg.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.cg.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+
+__SM_32_INTRINSICS_DECL__ void __stcg(float *ptr, float value) { asm ("st.global.cg.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(double *ptr, double value) { asm ("st.global.cg.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(float2 *ptr, float2 value) { asm ("st.global.cg.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(float4 *ptr, float4 value) { asm ("st.global.cg.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(double2 *ptr, double2 value) { asm ("st.global.cg.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+
+/******************************************************************************
+ *                                   __stcs                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) { asm ("st.global.cs.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) { asm ("st.global.cs.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) { asm ("st.global.cs.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) { asm ("st.global.cs.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ void __stcs(char *ptr, char value) { asm ("st.global.cs.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(signed char *ptr, signed char value) { asm ("st.global.cs.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(short *ptr, short value) { asm ("st.global.cs.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(int *ptr, int value) { asm ("st.global.cs.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(long long *ptr, long long value) { asm ("st.global.cs.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.cs.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cs.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(short2 *ptr, short2 value) { asm ("st.global.cs.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(short4 *ptr, short4 value) { asm ("st.global.cs.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(int2 *ptr, int2 value) { asm ("st.global.cs.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(int4 *ptr, int4 value) { asm ("st.global.cs.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(longlong2 *ptr, longlong2 value) { asm ("st.global.cs.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned char *ptr, unsigned char value) { asm ("st.global.cs.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned short *ptr, unsigned short value) { asm ("st.global.cs.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned int *ptr, unsigned int value) { asm ("st.global.cs.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long long *ptr, unsigned long long value) { asm ("st.global.cs.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.cs.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cs.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort2 *ptr, ushort2 value) { asm ("st.global.cs.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort4 *ptr, ushort4 value) { asm ("st.global.cs.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uint2 *ptr, uint2 value) { asm ("st.global.cs.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uint4 *ptr, uint4 value) { asm ("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.cs.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+
+__SM_32_INTRINSICS_DECL__ void __stcs(float *ptr, float value) { asm ("st.global.cs.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(double *ptr, double value) { asm ("st.global.cs.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(float2 *ptr, float2 value) { asm ("st.global.cs.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(float4 *ptr, float4 value) { asm ("st.global.cs.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(double2 *ptr, double2 value) { asm ("st.global.cs.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+
+/******************************************************************************
+ *                                   __stwt                                    *
+ ******************************************************************************/
+
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) { asm ("st.global.wt.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) { asm ("st.global.wt.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) { asm ("st.global.wt.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) { asm ("st.global.wt.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+
+
+__SM_32_INTRINSICS_DECL__ void __stwt(char *ptr, char value) { asm ("st.global.wt.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(signed char *ptr, signed char value) { asm ("st.global.wt.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(short *ptr, short value) { asm ("st.global.wt.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(int *ptr, int value) { asm ("st.global.wt.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(long long *ptr, long long value) { asm ("st.global.wt.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.wt.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wt.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(short2 *ptr, short2 value) { asm ("st.global.wt.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(short4 *ptr, short4 value) { asm ("st.global.wt.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(int2 *ptr, int2 value) { asm ("st.global.wt.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(int4 *ptr, int4 value) { asm ("st.global.wt.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(longlong2 *ptr, longlong2 value) { asm ("st.global.wt.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned char *ptr, unsigned char value) { asm ("st.global.wt.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned short *ptr, unsigned short value) { asm ("st.global.wt.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned int *ptr, unsigned int value) { asm ("st.global.wt.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long long *ptr, unsigned long long value) { asm ("st.global.wt.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.wt.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wt.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort2 *ptr, ushort2 value) { asm ("st.global.wt.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort4 *ptr, ushort4 value) { asm ("st.global.wt.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uint2 *ptr, uint2 value) { asm ("st.global.wt.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uint4 *ptr, uint4 value) { asm ("st.global.wt.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.wt.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+
+__SM_32_INTRINSICS_DECL__ void __stwt(float *ptr, float value) { asm ("st.global.wt.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(double *ptr, double value) { asm ("st.global.wt.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(float2 *ptr, float2 value) { asm ("st.global.wt.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(float4 *ptr, float4 value) { asm ("st.global.wt.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(double2 *ptr, double2 value) { asm ("st.global.wt.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+
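+/*
+ * Minimal usage sketch (illustrative only; the kernel name and shape below are
+ * arbitrary, not part of this header): the overloads above map directly onto
+ * the PTX cache operators visible in their inline asm (.cg/.ca/.cs/.lu/.cv for
+ * loads, .wb/.cg/.cs/.wt for stores). The streaming pair __ldcs/__stcs suits
+ * data that is touched exactly once, so the evict-first policy avoids
+ * displacing reusable cache lines.
+ *
+ *   __global__ void stream_copy(const float *in, float *out, size_t n)
+ *   {
+ *       size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
+ *       if (i < n)
+ *           __stcs(out + i, __ldcs(in + i));  // both accesses use the .cs policy
+ *   }
+ */
+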
+#undef __LDG_PTR
+
+
+// SHF is the "funnel shift" operation - an accelerated left/right shift with carry
+// operating on 64-bit quantities, which are concatenations of two 32-bit registers.
+
+// This shifts [hi:lo] left by "shift" bits, returning the most significant 32 bits of the result.
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
+{
+    unsigned int ret;
+    asm volatile ("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift));
+    return ret;
+}
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift)
+{
+    unsigned int ret;
+    asm volatile ("shf.l.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift));
+    return ret;
+}
+
+// This shifts [hi:lo] right by "shift" bits, returning the least significant 32 bits of the result.
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
+{
+    unsigned int ret;
+    asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift));
+    return ret;
+}
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift)
+{
+    unsigned int ret;
+    asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift));
+    return ret;
+}
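+
+/*
+ * Illustrative usage sketch (the helper names below are arbitrary, not part of
+ * this header's API): passing the same word as both halves turns the funnel
+ * shift into a 32-bit rotate, and pairing it with a plain shift gives a 64-bit
+ * shift for amounts below 32.
+ *
+ *   __device__ unsigned int rotl32(unsigned int x, unsigned int n)
+ *   {
+ *       return __funnelshift_l(x, x, n);                // rotate left; n is taken mod 32
+ *   }
+ *
+ *   __device__ unsigned long long shl64(unsigned long long v, unsigned int n)  // requires n < 32
+ *   {
+ *       unsigned int lo = (unsigned int)v;
+ *       unsigned int hi = (unsigned int)(v >> 32);
+ *       unsigned int new_hi = __funnelshift_l(lo, hi, n);   // high word absorbs bits shifted out of lo
+ *       return ((unsigned long long)new_hi << 32) | (unsigned long long)(lo << n);
+ *   }
+ */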
+
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_32_INTRINSICS_DECL__
+
+#endif /* !__SM_32_INTRINSICS_HPP__ */
+
diff --git a/ext/cudart/include/sm_35_atomic_functions.h b/ext/cudart/include/sm_35_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8961079aeac4c9e73a7c2825cf9ea10b171af09
--- /dev/null
+++ b/ext/cudart/include/sm_35_atomic_functions.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_35_ATOMIC_FUNCTIONS_H__)
+#define __SM_35_ATOMIC_FUNCTIONS_H__
+
+/*******************************************************************************
+* All sm_35 atomics are supported by sm_32 so simply include its header file   *
+*******************************************************************************/
+#include "sm_32_atomic_functions.h"
+
+#endif /* !__SM_35_ATOMIC_FUNCTIONS_H__ */
diff --git a/ext/cudart/include/sm_35_intrinsics.h b/ext/cudart/include/sm_35_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..da1e823a24171ed1ca9414955c6c68159a4411f5
--- /dev/null
+++ b/ext/cudart/include/sm_35_intrinsics.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_35_INTRINSICS_H__)
+#define __SM_35_INTRINSICS_H__
+
+/**********************************************************************************
+* All sm_35 intrinsics are supported by sm_32 so simply include its header file   *
+**********************************************************************************/
+
+#include "sm_32_intrinsics.h"
+
+#endif /* !__SM_35_INTRINSICS_H__ */
+
diff --git a/ext/cudart/include/sm_60_atomic_functions.h b/ext/cudart/include/sm_60_atomic_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..4eae20ab38c6efe8af1fb20a1a3c7a3783ec6834
--- /dev/null
+++ b/ext/cudart/include/sm_60_atomic_functions.h
@@ -0,0 +1,539 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_60_ATOMIC_FUNCTIONS_H__)
+#define __SM_60_ATOMIC_FUNCTIONS_H__
+
+
+#if defined(__CUDACC_RTC__)
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#ifndef __CUDA_ARCH__
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+
+
+
+#ifdef __CUDA_ARCH__ 
+extern "C"
+{
+extern __device__ __device_builtin__ double __dAtomicAdd(double *address, double val);
+
+extern __device__ __device_builtin__
+int __iAtomicAdd_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicAdd_system(int *address, int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAdd_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAdd_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAdd_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAdd_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+float __fAtomicAdd_block(float *address, float val);
+
+extern __device__ __device_builtin__
+float __fAtomicAdd_system(float *address, float val);
+
+extern __device__ __device_builtin__
+double __dAtomicAdd_block(double *address, double val);
+
+extern __device__ __device_builtin__
+double __dAtomicAdd_system(double *address, double val);
+
+extern __device__ __device_builtin__
+int __iAtomicExch_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicExch_system(int *address, int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicExch_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicExch_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicExch_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicExch_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+float __fAtomicExch_block(float *address, float val);
+
+extern __device__ __device_builtin__
+float __fAtomicExch_system(float *address, float val);
+
+extern __device__ __device_builtin__
+int __iAtomicMin_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicMin_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMin_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMin_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMin_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMin_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMin_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMin_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+int __iAtomicMax_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicMax_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMax_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __illAtomicMax_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMax_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicMax_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMax_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicMax_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicInc_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicInc_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicDec_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicDec_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+int __iAtomicCAS_block(int *address, int compare, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicCAS_system(int *address, int compare, int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicCAS_block(unsigned int *address, unsigned int compare,
+                                unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicCAS_system(unsigned int *address, unsigned int compare,
+                                 unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicCAS_block(unsigned long long int *address,
+                                        unsigned long long int compare,
+                                        unsigned long long int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicCAS_system(unsigned long long int *address,
+                                         unsigned long long int compare,
+                                         unsigned long long int val);
+
+extern __device__ __device_builtin__
+int __iAtomicAnd_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicAnd_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __llAtomicAnd_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __llAtomicAnd_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAnd_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicAnd_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAnd_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicAnd_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+int __iAtomicOr_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicOr_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __llAtomicOr_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __llAtomicOr_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicOr_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicOr_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicOr_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicOr_system(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+int __iAtomicXor_block(int *address, int val);
+
+extern __device__ __device_builtin__
+int __iAtomicXor_system(int *address, int val);
+
+extern __device__ __device_builtin__
+long long __llAtomicXor_block(long long *address, long long val);
+
+extern __device__ __device_builtin__
+long long __llAtomicXor_system(long long *address, long long val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicXor_block(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned int __uAtomicXor_system(unsigned int *address, unsigned int val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicXor_block(unsigned long long *address, unsigned long long val);
+
+extern __device__ __device_builtin__
+unsigned long long __ullAtomicXor_system(unsigned long long *address, unsigned long long val);
+}
+#endif /* __CUDA_ARCH__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_block(float *address, float val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_system(float *address, float val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_block(double *address, double val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_system(double *address, double val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicSub_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicSub_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_block(float *address, float val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_system(float *address, float val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_block(int *address, int compare, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_system(int *address, int compare, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_block(unsigned int *address, unsigned int compare,
+                             unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_system(unsigned int *address, unsigned int compare,
+                              unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_block(unsigned long long int *address,
+                                       unsigned long long int compare,
+                                       unsigned long long int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_system(unsigned long long int *address,
+                                        unsigned long long int compare,
+                                        unsigned long long int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_block(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_system(int *address, int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_block(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_system(long long *address, long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_60_ATOMIC_FUNCTIONS_DECL__
+#undef __DEF_IF_HOST
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_60_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__  && defined(__CUDA_ARCH__)  */
+
+#endif /* !__SM_60_ATOMIC_FUNCTIONS_H__ */
+
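[Editorial note, not part of the vendored headers] sm_60_atomic_functions.h adds double-precision atomicAdd plus _block- and _system-scoped variants of the existing atomics. A minimal usage sketch, assuming a device of compute capability 6.0 or newer:

    // Hypothetical kernel exercising the scoped atomics declared above.
    __global__ void accumulate(double* total, unsigned int* block_hits,
                               unsigned int* system_hits, double v)
    {
        atomicAdd(total, v);               // FP64 add, device scope (new in sm_60)
        atomicAdd_block(block_hits, 1u);   // ordered only within the thread block
        atomicAdd_system(system_hits, 1u); // visible to the CPU and peer GPUs
    }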
diff --git a/ext/cudart/include/sm_60_atomic_functions.hpp b/ext/cudart/include/sm_60_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b4d5227023221116868e8446fdac23efb96e94ae
--- /dev/null
+++ b/ext/cudart/include/sm_60_atomic_functions.hpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_60_ATOMIC_FUNCTIONS_HPP__)
+#define __SM_60_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val)
+{
+  return __dAtomicAdd(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_block(int *address, int val)
+{
+  return __iAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_system(int *address, int val)
+{
+  return __iAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_block(float *address, float val)
+{
+  return __fAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_system(float *address, float val)
+{
+  return __fAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_block(double *address, double val)
+{
+  return __dAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_system(double *address, double val)
+{
+  return __dAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicSub_block(int *address, int val)
+{
+  return __iAtomicAdd_block(address, (unsigned int)-(int)val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicSub_system(int *address, int val)
+{
+  return __iAtomicAdd_system(address, (unsigned int)-(int)val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_block(address, (unsigned int)-(int)val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_system(address, (unsigned int)-(int)val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_block(int *address, int val)
+{
+  return __iAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_system(int *address, int val)
+{
+  return __iAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_block(float *address, float val)
+{
+  return __fAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_system(float *address, float val)
+{
+  return __fAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_block(int *address, int val)
+{
+  return __iAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_system(int *address, int val)
+{
+  return __iAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_block(long long *address, long long val)
+{
+  return __illAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_system(long long *address, long long val)
+{
+  return __illAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_block(int *address, int val)
+{
+  return __iAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_system(int *address, int val)
+{
+  return __iAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_block(long long *address, long long val)
+{
+  return __illAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_system(long long *address, long long val)
+{
+  return __illAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicInc_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicInc_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicDec_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicDec_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_block(int *address, int compare, int val)
+{
+  return __iAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_system(int *address, int compare, int val)
+{
+  return __iAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_block(unsigned int *address, unsigned int compare,
+                             unsigned int val)
+{
+  return __uAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_system(unsigned int *address, unsigned int compare,
+                              unsigned int val)
+{
+  return __uAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_block(unsigned long long int *address,
+                                       unsigned long long int compare,
+                                       unsigned long long int val)
+{
+  return __ullAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_system(unsigned long long int *address,
+                                        unsigned long long int compare,
+                                        unsigned long long int val)
+{
+  return __ullAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_block(int *address, int val)
+{
+  return __iAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_system(int *address, int val)
+{
+  return __iAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_block(long long *address, long long val)
+{
+  return __llAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_system(long long *address, long long val)
+{
+  return __llAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_block(int *address, int val)
+{
+  return __iAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_system(int *address, int val)
+{
+  return __iAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_block(long long *address, long long val)
+{
+  return __llAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_system(long long *address, long long val)
+{
+  return __llAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_block(int *address, int val)
+{
+  return __iAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_system(int *address, int val)
+{
+  return __iAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_block(long long *address, long long val)
+{
+  return __llAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_system(long long *address, long long val)
+{
+  return __llAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicXor_system(address, val);
+}
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_60_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__SM_60_ATOMIC_FUNCTIONS_HPP__ */
+
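[Editorial note, not part of the vendored headers] The atomicSub_block/_system wrappers above are implemented as an atomicAdd of the two's-complement negation, which is why the value is routed through (unsigned int)-(int)val. A small sketch of that equivalence, under this reading:

    // Hypothetical illustration: each call below decrements *p by one;
    // the second line is what the wrapper form reduces to.
    __device__ void decrement_twice(unsigned int* p)
    {
        atomicSub_block(p, 1u);                      // wrapper form
        atomicAdd_block(p, (unsigned int)-(int)1u);  // negated-add form
    }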
diff --git a/ext/cudart/include/sm_61_intrinsics.h b/ext/cudart/include/sm_61_intrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf2dce47bb458fc3c093d710b78a72e582bc0fdd
--- /dev/null
+++ b/ext/cudart/include/sm_61_intrinsics.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright 2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_61_INTRINSICS_H__)
+#define __SM_61_INTRINSICS_H__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_61_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#ifndef __CUDA_ARCH__
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+
+/*******************************************************************************
+*                                                                              *
+*  Below are declarations of SM-6.1 intrinsics which are included as           *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+
+/******************************************************************************
+ *                                   __dp2a                                   *
+ ******************************************************************************/
+// Generic [_lo]
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) __DEF_IF_HOST
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
+// Vector-style [_lo]
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) __DEF_IF_HOST
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
+// Generic [_hi]
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) __DEF_IF_HOST
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
+// Vector-style [_hi]
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) __DEF_IF_HOST
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
+
+
+/******************************************************************************
+ *                                   __dp4a                                   *
+ ******************************************************************************/
+// Generic
+__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) __DEF_IF_HOST
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST
+// Vector-style
+__SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) __DEF_IF_HOST
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEF_IF_HOST
+#undef __SM_61_INTRINSICS_DECL__
+
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_61_intrinsics.hpp"
+#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
+
+#endif /* !__SM_61_INTRINSICS_H__ */
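[Editorial note, not part of the vendored headers] __dp4a accumulates a four-way 8-bit dot product into a 32-bit integer, while __dp2a_lo/__dp2a_hi pair two 16-bit terms with the low or high half of a char4/uchar4 operand. A minimal sketch, assuming compute capability 6.1 or newer:

    // Hypothetical helper: int8 dot product with 32-bit accumulation.
    __device__ int dot_int8(char4 a, char4 b, int acc)
    {
        // acc + a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w in a single instruction
        return __dp4a(a, b, acc);
    }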
diff --git a/ext/cudart/include/sm_61_intrinsics.hpp b/ext/cudart/include/sm_61_intrinsics.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f603d04ca4aa7bfef97539fb404150b81e490d67
--- /dev/null
+++ b/ext/cudart/include/sm_61_intrinsics.hpp
@@ -0,0 +1,161 @@
+/*
+ * Copyright 2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_61_INTRINSICS_HPP__)
+#define __SM_61_INTRINSICS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_61_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-6.1 intrinsics which are included as        *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+
+// 4a
+__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) {
+    int ret;
+    asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+    return ret;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) {
+    unsigned int ret;
+    asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+    return ret;
+}
+
+__SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) {
+    int ret;
+    asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
+    return ret;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) {
+    unsigned int ret;
+    asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
+    return ret;
+}
+
+// 2a.lo
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) {
+    int ret;
+    asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+    return ret;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) {
+    unsigned int ret;
+    asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+    return ret;
+}
+
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) {
+    int ret;
+    asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
+    return ret;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) {
+    unsigned int ret;
+    asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
+    return ret;
+}
+
+// 2a.hi
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) {
+    int ret;
+    asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+    return ret;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) {
+    unsigned int ret;
+    asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+    return ret;
+}
+
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) {
+    int ret;
+    asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
+    return ret;
+}
+
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) {
+    unsigned int ret;
+    asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
+    return ret;
+}
+
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_61_INTRINSICS_DECL__
+
+#endif /* !__SM_61_INTRINSICS_HPP__ */
+
diff --git a/ext/cudart/include/surface_functions.h b/ext/cudart/include/surface_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..587a995d0ea8a697b028706e824f9437276401e6
--- /dev/null
+++ b/ext/cudart/include/surface_functions.h
@@ -0,0 +1,439 @@
+/*
+ * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SURFACE_FUNCTIONS_H__)
+#define __SURFACE_FUNCTIONS_H__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+#include "cuda_surface_types.h"
+
+#if defined(_WIN32)
+# define __DEPRECATED__ __declspec(deprecated)
+#else
+# define __DEPRECATED__  __attribute__((deprecated))
+#endif
+
+
+
+#ifdef __CUDA_ARCH__
+template <typename T> struct __nv_surf_trait {  typedef void * cast_type; };
+
+template<> struct __nv_surf_trait<char> {  typedef char * cast_type; };
+template<> struct __nv_surf_trait<signed char> {  typedef signed char * cast_type; };
+template<> struct __nv_surf_trait<unsigned char> {  typedef unsigned char * cast_type; };
+template<> struct __nv_surf_trait<char1> {  typedef char1 * cast_type; };
+template<> struct __nv_surf_trait<uchar1> {  typedef uchar1 * cast_type; };
+template<> struct __nv_surf_trait<char2> {  typedef char2 * cast_type; };
+template<> struct __nv_surf_trait<uchar2> {  typedef uchar2 * cast_type; };
+template<> struct __nv_surf_trait<char4> {  typedef char4 * cast_type; };
+template<> struct __nv_surf_trait<uchar4> {  typedef uchar4 * cast_type; };
+template<> struct __nv_surf_trait<short> {  typedef short * cast_type; };
+template<> struct __nv_surf_trait<unsigned short> {  typedef unsigned short * cast_type; };
+template<> struct __nv_surf_trait<short1> {  typedef short1 * cast_type; };
+template<> struct __nv_surf_trait<ushort1> {  typedef ushort1 * cast_type; };
+template<> struct __nv_surf_trait<short2> {  typedef short2 * cast_type; };
+template<> struct __nv_surf_trait<ushort2> {  typedef ushort2 * cast_type; };
+template<> struct __nv_surf_trait<short4> {  typedef short4 * cast_type; };
+template<> struct __nv_surf_trait<ushort4> {  typedef ushort4 * cast_type; };
+template<> struct __nv_surf_trait<int> {  typedef int * cast_type; };
+template<> struct __nv_surf_trait<unsigned int> {  typedef unsigned int * cast_type; };
+template<> struct __nv_surf_trait<int1> {  typedef int1 * cast_type; };
+template<> struct __nv_surf_trait<uint1> {  typedef uint1 * cast_type; };
+template<> struct __nv_surf_trait<int2> {  typedef int2 * cast_type; };
+template<> struct __nv_surf_trait<uint2> {  typedef uint2 * cast_type; };
+template<> struct __nv_surf_trait<int4> {  typedef int4 * cast_type; };
+template<> struct __nv_surf_trait<uint4> {  typedef uint4 * cast_type; };
+template<> struct __nv_surf_trait<long long> {  typedef long long * cast_type; };
+template<> struct __nv_surf_trait<unsigned long long> {  typedef unsigned long long * cast_type; };
+template<> struct __nv_surf_trait<longlong1> {  typedef longlong1 * cast_type; };
+template<> struct __nv_surf_trait<ulonglong1> {  typedef ulonglong1 * cast_type; };
+template<> struct __nv_surf_trait<longlong2> {  typedef longlong2 * cast_type; };
+template<> struct __nv_surf_trait<ulonglong2> {  typedef ulonglong2 * cast_type; };
+#if !defined(__LP64__)
+template<> struct __nv_surf_trait<long> {  typedef int * cast_type; };
+template<> struct __nv_surf_trait<unsigned long> {  typedef unsigned int * cast_type; };
+template<> struct __nv_surf_trait<long1> {  typedef int1 * cast_type; };
+template<> struct __nv_surf_trait<ulong1> {  typedef uint1 * cast_type; };
+template<> struct __nv_surf_trait<long2> {  typedef int2 * cast_type; };
+template<> struct __nv_surf_trait<ulong2> {  typedef uint2 * cast_type; };
+template<> struct __nv_surf_trait<long4> {  typedef int4 * cast_type; };
+template<> struct __nv_surf_trait<ulong4> {  typedef uint4 * cast_type; };
+#endif
+template<> struct __nv_surf_trait<float> {  typedef float * cast_type; };
+template<> struct __nv_surf_trait<float1> {  typedef float1 * cast_type; };
+template<> struct __nv_surf_trait<float2> {  typedef float2 * cast_type; };
+template<> struct __nv_surf_trait<float4> {  typedef float4 * cast_type; };
+#endif /* defined(__CUDA_ARCH__) */
+
+template <typename T>
+static __DEPRECATED__ __device__  __forceinline__ void surf1Dread(T *res, surface<void, cudaSurfaceType1D> surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf1Dread_v2", (void *)res, s, surf, x, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surf1Dread(surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+  __nv_tex_surf_handler("__surf1Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, mode);
+  return temp;
+#endif
+}
+  
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf1Dread(T *res, surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  *res = surf1Dread<T>(surf, x, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+template <typename T>
+static __DEPRECATED__ __device__  __forceinline__ void surf2Dread(T *res, surface<void, cudaSurfaceType2D> surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf2Dread_v2", (void *)res, s, surf, x, y, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surf2Dread(surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+  __nv_tex_surf_handler("__surf2Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, mode);
+  return temp;
+#endif
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf2Dread(T *res, surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  *res = surf2Dread<T>(surf, x, y, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+template <typename T>
+static __DEPRECATED__ __device__  __forceinline__ void surf3Dread(T *res, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf3Dread_v2", (void *)res, s, surf, x, y, z, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surf3Dread(surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+  __nv_tex_surf_handler("__surf3Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, z, mode);
+  return temp;
+#endif
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf3Dread(T *res, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  *res = surf3Dread<T>(surf, x, y, z, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+
+template <typename T>
+static __DEPRECATED__ __device__  __forceinline__ void surf1DLayeredread(T *res, surface<void, cudaSurfaceType1DLayered> surf, int x, int  layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf1DLayeredread_v2", (void *)res, s, surf, x,  layer, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surf1DLayeredread(surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+  __nv_tex_surf_handler("__surf1DLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, layer, mode);
+  return temp;
+#endif
+}
+
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredread(T *res, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  *res = surf1DLayeredread<T>(surf, x, layer, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+template <typename T>
+static __DEPRECATED__ __device__  __forceinline__ void surf2DLayeredread(T *res, surface<void, cudaSurfaceType2DLayered> surf, int x,  int y, int  layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf2DLayeredread_v2", (void *)res, s, surf, x, y, layer, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surf2DLayeredread(surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+  __nv_tex_surf_handler("__surf2DLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, layer, mode);
+  return temp;
+#endif
+}
+
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredread(T *res, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  *res = surf2DLayeredread<T>(surf, x, y, layer, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+template <typename T>
+static __DEPRECATED__ __device__  __forceinline__ void surfCubemapread(T *res, surface<void, cudaSurfaceTypeCubemap> surf, int x,  int y, int  face, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surfCubemapread_v2", (void *)res, s, surf, x, y, face, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surfCubemapread(surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+
+  __nv_tex_surf_handler("__surfCubemapread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, face, mode);
+  return temp;
+#endif
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surfCubemapread(T *res, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__  
+  *res = surfCubemapread<T>(surf, x, y, face, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+template <typename T>
+static __DEPRECATED__ __device__  __forceinline__ void surfCubemapLayeredread(T *res, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x,  int y, int  layerFace, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surfCubemapLayeredread_v2", (void *)res, s, surf, x, y, layerFace, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surfCubemapLayeredread(surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+  __nv_tex_surf_handler("__surfCubemapLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, layerFace, mode);
+  return temp;
+#endif
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredread(T *res, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__  
+  *res = surfCubemapLayeredread<T>(surf, x, y, layerFace, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+//surf1Dwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface<void, cudaSurfaceType1D> surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf1Dwrite_v2", (void *)&val, s, surf, x, mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surf1Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x,  mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+//surf2Dwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface<void, cudaSurfaceType2D> surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf2Dwrite_v2", (void *)&val,  s, surf, x, y, mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surf2Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y,  mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+//surf3Dwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf3Dwrite_v2", (void *)&val,  s, surf, x, y, z,mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surf3Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, z,  mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+//surf1DLayeredwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf1DLayeredwrite_v2", (void *)&val,  s, surf, x, layer,mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surf1DLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val,  (int)sizeof(T), surf, x, layer, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+//surf2DLayeredwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf2DLayeredwrite_v2", (void *)&val, s, surf, x, y, layer,mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surf2DLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val,  (int)sizeof(T), surf, x, y, layer, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+//surfCubemapwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surfCubemapwrite_v2", (void *)&val, s, surf, x, y, face, mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surfCubemapwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, face,  mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+//surfCubemapLayeredwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (void *)&val, s, surf, x, y, layerFace,  mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, layerFace,  mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+#undef __DEPRECATED__
+
+
+#endif /* __cplusplus && __CUDACC__ */
+#endif /* !__SURFACE_FUNCTIONS_H__ */
diff --git a/ext/cudart/include/surface_indirect_functions.h b/ext/cudart/include/surface_indirect_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d5c4b641e32fbff81ba639460cee3c9d517a0c5
--- /dev/null
+++ b/ext/cudart/include/surface_indirect_functions.h
@@ -0,0 +1,286 @@
+/*
+ * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+
+#ifndef __SURFACE_INDIRECT_FUNCTIONS_H__
+#define __SURFACE_INDIRECT_FUNCTIONS_H__
+
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#include "cuda_runtime_api.h"
+
+template<typename T> struct __nv_isurf_trait { };
+template<> struct __nv_isurf_trait<char> { typedef void type; };
+template<> struct __nv_isurf_trait<signed char> { typedef void type; };
+template<> struct __nv_isurf_trait<char1> { typedef void type; };
+template<> struct __nv_isurf_trait<unsigned char> { typedef void type; };
+template<> struct __nv_isurf_trait<uchar1> { typedef void type; };
+template<> struct __nv_isurf_trait<short> { typedef void type; };
+template<> struct __nv_isurf_trait<short1> { typedef void type; };
+template<> struct __nv_isurf_trait<unsigned short> { typedef void type; };
+template<> struct __nv_isurf_trait<ushort1> { typedef void type; };
+template<> struct __nv_isurf_trait<int> { typedef void type; };
+template<> struct __nv_isurf_trait<int1> { typedef void type; };
+template<> struct __nv_isurf_trait<unsigned int> { typedef void type; };
+template<> struct __nv_isurf_trait<uint1> { typedef void type; };
+template<> struct __nv_isurf_trait<long long> { typedef void type; };
+template<> struct __nv_isurf_trait<longlong1> { typedef void type; };
+template<> struct __nv_isurf_trait<unsigned long long> { typedef void type; };
+template<> struct __nv_isurf_trait<ulonglong1> { typedef void type; };
+template<> struct __nv_isurf_trait<float> { typedef void type; };
+template<> struct __nv_isurf_trait<float1> { typedef void type; };
+
+template<> struct __nv_isurf_trait<char2> { typedef void type; };
+template<> struct __nv_isurf_trait<uchar2> { typedef void type; };
+template<> struct __nv_isurf_trait<short2> { typedef void type; };
+template<> struct __nv_isurf_trait<ushort2> { typedef void type; };
+template<> struct __nv_isurf_trait<int2> { typedef void type; };
+template<> struct __nv_isurf_trait<uint2> { typedef void type; };
+template<> struct __nv_isurf_trait<longlong2> { typedef void type; };
+template<> struct __nv_isurf_trait<ulonglong2> { typedef void type; };
+template<> struct __nv_isurf_trait<float2> { typedef void type; };
+
+template<> struct __nv_isurf_trait<char4> { typedef void type; };
+template<> struct __nv_isurf_trait<uchar4> { typedef void type; };
+template<> struct __nv_isurf_trait<short4> { typedef void type; };
+template<> struct __nv_isurf_trait<ushort4> { typedef void type; };
+template<> struct __nv_isurf_trait<int4> { typedef void type; };
+template<> struct __nv_isurf_trait<uint4> { typedef void type; };
+template<> struct __nv_isurf_trait<float4> { typedef void type; };
+
+
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type  surf1Dread(T *ptr, cudaSurfaceObject_t obj, int x, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurf1Dread", ptr, obj, x, mode);
+#endif /* __CUDA_ARCH__ */
+}
+
+template <class T>
+static __device__ T surf1Dread(cudaSurfaceObject_t surfObject, int x, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__  
+   T ret;
+   surf1Dread(&ret, surfObject, x, boundaryMode);
+   return ret;
+#endif /* __CUDA_ARCH__ */   
+}
+
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type  surf2Dread(T *ptr, cudaSurfaceObject_t obj, int x, int y, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurf2Dread", ptr, obj, x, y, mode);
+#endif /* __CUDA_ARCH__ */
+}
+
+template <class T>
+static __device__ T surf2Dread(cudaSurfaceObject_t surfObject, int x, int y, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__   
+   T ret;
+   surf2Dread(&ret, surfObject, x, y, boundaryMode);
+   return ret;
+#endif /* __CUDA_ARCH__ */   
+}
+
+
+template <typename T>
+static __device__ typename  __nv_isurf_trait<T>::type  surf3Dread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int z, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurf3Dread", ptr, obj, x, y, z, mode);
+#endif /* __CUDA_ARCH__ */
+}
+
+template <class T>
+static __device__ T surf3Dread(cudaSurfaceObject_t surfObject, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__   
+   T ret;
+   surf3Dread(&ret, surfObject, x, y, z, boundaryMode);
+   return ret;
+#endif /* __CUDA_ARCH__ */   
+}
+
+template <typename T>
+static __device__ typename  __nv_isurf_trait<T>::type  surf1DLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurf1DLayeredread", ptr, obj, x, layer, mode);
+#endif /* __CUDA_ARCH__ */
+}
+
+template <class T>
+static __device__ T surf1DLayeredread(cudaSurfaceObject_t surfObject, int x, int layer, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__   
+   T ret;
+   surf1DLayeredread(&ret, surfObject, x, layer, boundaryMode);
+   return ret;
+#endif /* __CUDA_ARCH__ */   
+}
+
+template <typename T>
+static __device__  typename __nv_isurf_trait<T>::type  surf2DLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurf2DLayeredread", ptr, obj, x, y, layer, mode);
+#endif /* __CUDA_ARCH__ */
+}
+
+template <class T>
+static __device__ T surf2DLayeredread(cudaSurfaceObject_t surfObject, int x, int y, int layer, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__   
+   T ret;
+   surf2DLayeredread(&ret, surfObject, x, y, layer, boundaryMode);
+   return ret;
+#endif /* __CUDA_ARCH__ */   
+}
+
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type  surfCubemapread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int face, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurfCubemapread", ptr, obj, x, y, face, mode);
+#endif /* __CUDA_ARCH__ */
+}
+
+template <class T>
+static __device__ T surfCubemapread(cudaSurfaceObject_t surfObject, int x, int y, int face, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__   
+   T ret;
+   surfCubemapread(&ret, surfObject, x, y, face, boundaryMode);
+   return ret;
+#endif /* __CUDA_ARCH__ */   
+}
+
+template <typename T>
+static __device__  typename __nv_isurf_trait<T>::type  surfCubemapLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int layerface, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurfCubemapLayeredread", ptr, obj, x, y, layerface, mode);
+#endif /* __CUDA_ARCH__ */
+}
+
+template <class T>
+static __device__ T surfCubemapLayeredread(cudaSurfaceObject_t surfObject, int x, int y, int layerface, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__   
+   T ret;
+   surfCubemapLayeredread(&ret, surfObject, x, y, layerface, boundaryMode);
+   return ret;
+#endif /* __CUDA_ARCH__ */   
+}
+
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf1Dwrite(T val, cudaSurfaceObject_t obj, int x, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{ 
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, obj, x, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf2Dwrite(T val, cudaSurfaceObject_t obj, int x, int y, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, obj, x, y, mode);
+#endif /* __CUDA_ARCH__ */ 
+}
+
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf3Dwrite(T val, cudaSurfaceObject_t obj, int x, int y, int z, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, obj, x, y, z, mode);
+#endif /* __CUDA_ARCH__ */
+}
+
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf1DLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, obj, x, layer, mode);
+#endif /* __CUDA_ARCH__ */
+}
+
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surf2DLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int y, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, obj, x, y, layer, mode);
+#endif /* __CUDA_ARCH__ */
+}
+
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surfCubemapwrite(T val, cudaSurfaceObject_t obj, int x, int y, int face, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, obj, x, y, face, mode);
+#endif /* __CUDA_ARCH__ */
+}
+
+template <typename T>
+static __device__ typename __nv_isurf_trait<T>::type surfCubemapLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int y, int layerface, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, obj, x, y, layerface, mode);
+#endif /* __CUDA_ARCH__ */
+}
+
+#endif // __cplusplus && __CUDACC__
+
+#endif // __SURFACE_INDIRECT_FUNCTIONS_H__
+
+
diff --git a/ext/cudart/include/surface_types.h b/ext/cudart/include/surface_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..95ff57ca1bbaa517ade86424a9c5dbe8a2a4b8ee
--- /dev/null
+++ b/ext/cudart/include/surface_types.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SURFACE_TYPES_H__)
+#define __SURFACE_TYPES_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "driver_types.h"
+
+/**
+ * \addtogroup CUDART_TYPES
+ *
+ * @{
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#define cudaSurfaceType1D              0x01
+#define cudaSurfaceType2D              0x02
+#define cudaSurfaceType3D              0x03
+#define cudaSurfaceTypeCubemap         0x0C
+#define cudaSurfaceType1DLayered       0xF1
+#define cudaSurfaceType2DLayered       0xF2
+#define cudaSurfaceTypeCubemapLayered  0xFC
+
+/**
+ * CUDA Surface boundary modes
+ */
+enum __device_builtin__ cudaSurfaceBoundaryMode
+{
+    cudaBoundaryModeZero  = 0,    /**< Zero boundary mode */
+    cudaBoundaryModeClamp = 1,    /**< Clamp boundary mode */
+    cudaBoundaryModeTrap  = 2     /**< Trap boundary mode */
+};
+
+/**
+ * CUDA Surface format modes
+ */
+enum __device_builtin__  cudaSurfaceFormatMode
+{
+    cudaFormatModeForced = 0,     /**< Forced format mode */
+    cudaFormatModeAuto = 1        /**< Auto format mode */
+};
+
+/**
+ * CUDA Surface reference
+ */
+struct __device_builtin__ surfaceReference
+{
+    /**
+     * Channel descriptor for surface reference
+     */
+    struct cudaChannelFormatDesc channelDesc;
+};
+
+/**
+ * An opaque value that represents a CUDA Surface object
+ */
+typedef __device_builtin__ unsigned long long cudaSurfaceObject_t;
+
+/** @} */
+/** @} */ /* END CUDART_TYPES */
+
+#endif /* !__SURFACE_TYPES_H__ */
diff --git a/ext/cudart/include/texture_fetch_functions.h b/ext/cudart/include/texture_fetch_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad970aea7a04023822ba01515a10c6e83c4d7def
--- /dev/null
+++ b/ext/cudart/include/texture_fetch_functions.h
@@ -0,0 +1,739 @@
+/*
+ * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__TEXTURE_FETCH_FUNCTIONS_H__)
+#define __TEXTURE_FETCH_FUNCTIONS_H__
+
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+#include "cuda_texture_types.h"
+
+#if defined(_WIN32)
+# define __DEPRECATED__ __declspec(deprecated)
+#else
+# define __DEPRECATED__  __attribute__((deprecated))
+#endif
+
+
+template <typename T>
+struct __nv_tex_rmet_ret { };
+
+template<> struct __nv_tex_rmet_ret<char> { typedef char type; };
+template<> struct __nv_tex_rmet_ret<signed char> { typedef signed char type; };
+template<> struct __nv_tex_rmet_ret<unsigned char> { typedef unsigned char type; };
+template<> struct __nv_tex_rmet_ret<char1> { typedef char1 type; };
+template<> struct __nv_tex_rmet_ret<uchar1> { typedef uchar1 type; };
+template<> struct __nv_tex_rmet_ret<char2> { typedef char2 type; };
+template<> struct __nv_tex_rmet_ret<uchar2> { typedef uchar2 type; };
+template<> struct __nv_tex_rmet_ret<char4> { typedef char4 type; };
+template<> struct __nv_tex_rmet_ret<uchar4> { typedef uchar4 type; };
+
+template<> struct __nv_tex_rmet_ret<short> { typedef short type; };
+template<> struct __nv_tex_rmet_ret<unsigned short> { typedef unsigned short type; };
+template<> struct __nv_tex_rmet_ret<short1> { typedef short1 type; };
+template<> struct __nv_tex_rmet_ret<ushort1> { typedef ushort1 type; };
+template<> struct __nv_tex_rmet_ret<short2> { typedef short2 type; };
+template<> struct __nv_tex_rmet_ret<ushort2> { typedef ushort2 type; };
+template<> struct __nv_tex_rmet_ret<short4> { typedef short4 type; };
+template<> struct __nv_tex_rmet_ret<ushort4> { typedef ushort4 type; };
+
+template<> struct __nv_tex_rmet_ret<int> { typedef int type; };
+template<> struct __nv_tex_rmet_ret<unsigned int> { typedef unsigned int type; };
+template<> struct __nv_tex_rmet_ret<int1> { typedef int1 type; };
+template<> struct __nv_tex_rmet_ret<uint1> { typedef uint1 type; };
+template<> struct __nv_tex_rmet_ret<int2> { typedef int2 type; };
+template<> struct __nv_tex_rmet_ret<uint2> { typedef uint2 type; };
+template<> struct __nv_tex_rmet_ret<int4> { typedef int4 type; };
+template<> struct __nv_tex_rmet_ret<uint4> { typedef uint4 type; };
+
+#if !defined(__LP64__)
+template<> struct __nv_tex_rmet_ret<long> { typedef long type; };
+template<> struct __nv_tex_rmet_ret<unsigned long> { typedef unsigned long type; };
+template<> struct __nv_tex_rmet_ret<long1> { typedef long1 type; };
+template<> struct __nv_tex_rmet_ret<ulong1> { typedef ulong1 type; };
+template<> struct __nv_tex_rmet_ret<long2> { typedef long2 type; };
+template<> struct __nv_tex_rmet_ret<ulong2> { typedef ulong2 type; };
+template<> struct __nv_tex_rmet_ret<long4> { typedef long4 type; };
+template<> struct __nv_tex_rmet_ret<ulong4> { typedef ulong4 type; };
+#endif /* !__LP64__ */
+template<> struct __nv_tex_rmet_ret<float> { typedef float type; };
+template<> struct __nv_tex_rmet_ret<float1> { typedef float1 type; };
+template<> struct __nv_tex_rmet_ret<float2> { typedef float2 type; };
+template<> struct __nv_tex_rmet_ret<float4> { typedef float4 type; };
+
+
+template <typename T> struct __nv_tex_rmet_cast { typedef T* type;  };
+#if !defined(__LP64__)
+template<> struct __nv_tex_rmet_cast<long> { typedef int *type; };
+template<> struct __nv_tex_rmet_cast<unsigned long> { typedef unsigned int *type; };
+template<> struct __nv_tex_rmet_cast<long1> { typedef int1 *type; };
+template<> struct __nv_tex_rmet_cast<ulong1> { typedef uint1 *type; };
+template<> struct __nv_tex_rmet_cast<long2> { typedef int2 *type; };
+template<> struct __nv_tex_rmet_cast<ulong2> { typedef uint2 *type; };
+template<> struct __nv_tex_rmet_cast<long4> { typedef int4 *type; };
+template<> struct __nv_tex_rmet_cast<ulong4> { typedef uint4 *type; };
+#endif /* !__LP64__ */
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__  typename __nv_tex_rmet_ret<T>::type tex1Dfetch(texture<T, cudaTextureType1D, cudaReadModeElementType> t, int x)
+{
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1Dfetch_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x);
+  return temp;
+#endif
+}
+
+template <typename T>
+struct __nv_tex_rmnf_ret { };
+
+template <> struct __nv_tex_rmnf_ret<char> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<signed char> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<unsigned char> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<short> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<unsigned short> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<char1> { typedef float1 type; };
+template <> struct __nv_tex_rmnf_ret<uchar1> { typedef float1 type; };
+template <> struct __nv_tex_rmnf_ret<short1> { typedef float1 type; };
+template <> struct __nv_tex_rmnf_ret<ushort1> { typedef float1 type; };
+template <> struct __nv_tex_rmnf_ret<char2> { typedef float2 type; };
+template <> struct __nv_tex_rmnf_ret<uchar2> { typedef float2 type; };
+template <> struct __nv_tex_rmnf_ret<short2> { typedef float2 type; };
+template <> struct __nv_tex_rmnf_ret<ushort2> { typedef float2 type; };
+template <> struct __nv_tex_rmnf_ret<char4> { typedef float4 type; };
+template <> struct __nv_tex_rmnf_ret<uchar4> { typedef float4 type; };
+template <> struct __nv_tex_rmnf_ret<short4> { typedef float4 type; };
+template <> struct __nv_tex_rmnf_ret<ushort4> { typedef float4 type; };
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1Dfetch(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, int x) 
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1Dfetch_rmnf_v2", &type_dummy, &retval, t, x);
+  return retval;
+#endif /* __CUDA_ARCH__ */  
+}
+
+// tex1D
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1D(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1D(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1D_rmnf_v2", &type_dummy, &retval, t, x);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+//tex2D
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2D(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+
+  __nv_tex_surf_handler("__tex2D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2D(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2D_rmnf_v2", &type_dummy, &retval, t, x, y);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+//tex1DLayered
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayered(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1DLayered_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, layer);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayered(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1DLayered_rmnf_v2", &type_dummy, &retval, t, x, layer);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+//tex2DLayered
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayered(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex2DLayered_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, layer);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayered(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2DLayered_rmnf_v2", &type_dummy, &retval, t, x, y, layer);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex3D
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3D(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex3D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, z);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3D(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex3D_rmnf_v2", &type_dummy, &retval, t, x, y, z);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// texCubemap
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemap(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__texCubemap_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, z);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemap(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__texCubemap_rmnf_v2", &type_dummy, &retval, t, x, y, z);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+template <typename T>
+struct __nv_tex2dgather_ret { };
+template <> struct __nv_tex2dgather_ret<char> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<signed char> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<char1> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<char2> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<char3> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<char4> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<unsigned char> { typedef uchar4 type; };
+template <> struct __nv_tex2dgather_ret<uchar1> { typedef uchar4 type; };
+template <> struct __nv_tex2dgather_ret<uchar2> { typedef uchar4 type; };
+template <> struct __nv_tex2dgather_ret<uchar3> { typedef uchar4 type; };
+template <> struct __nv_tex2dgather_ret<uchar4> { typedef uchar4 type; };
+
+template <> struct __nv_tex2dgather_ret<short> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<short1> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<short2> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<short3> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<short4> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<unsigned short> { typedef ushort4 type; };
+template <> struct __nv_tex2dgather_ret<ushort1> { typedef ushort4 type; };
+template <> struct __nv_tex2dgather_ret<ushort2> { typedef ushort4 type; };
+template <> struct __nv_tex2dgather_ret<ushort3> { typedef ushort4 type; };
+template <> struct __nv_tex2dgather_ret<ushort4> { typedef ushort4 type; };
+
+template <> struct __nv_tex2dgather_ret<int> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<int1> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<int2> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<int3> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<int4> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<unsigned int> { typedef uint4 type; };
+template <> struct __nv_tex2dgather_ret<uint1> { typedef uint4 type; };
+template <> struct __nv_tex2dgather_ret<uint2> { typedef uint4 type; };
+template <> struct __nv_tex2dgather_ret<uint3> { typedef uint4 type; };
+template <> struct __nv_tex2dgather_ret<uint4> { typedef uint4 type; };
+
+template <> struct __nv_tex2dgather_ret<float> { typedef float4 type; };
+template <> struct __nv_tex2dgather_ret<float1> { typedef float4 type; };
+template <> struct __nv_tex2dgather_ret<float2> { typedef float4 type; };
+template <> struct __nv_tex2dgather_ret<float3> { typedef float4 type; };
+template <> struct __nv_tex2dgather_ret<float4> { typedef float4 type; };
+
+template <typename T>
+static __device__ __forceinline__ typename __nv_tex2dgather_ret<T>::type tex2Dgather(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, int comp=0)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex2dgather_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2Dgather_v2", &type_dummy, &retval, t, x, y, comp);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+template<typename T> struct __nv_tex2dgather_rmnf_ret { };
+template<> struct __nv_tex2dgather_rmnf_ret<char> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<signed char> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<unsigned char> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<char1> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<uchar1> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<char2> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<uchar2> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<char3> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<uchar3> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<char4> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<uchar4> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<signed short> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<unsigned short> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<short1> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<ushort1> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<short2> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<ushort2> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<short3> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<ushort3> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<short4> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<ushort4> { typedef float4 type; };
+
+template <typename T>
+static __device__ __forceinline__  typename __nv_tex2dgather_rmnf_ret<T>::type tex2Dgather(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, int comp = 0)
+{  
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex2dgather_rmnf_ret<T>::type  retval;
+  __nv_tex_surf_handler("__tex2Dgather_rmnf_v2", &type_dummy, &retval, t, x, y, comp);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+// tex1DLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLod(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1DLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLod(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1DLod_rmnf_v2", &type_dummy, &retval, t, x, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex2DLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLod(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex2DLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLod(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2DLod_rmnf_v2", &type_dummy, &retval, t, x, y, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex1DLayeredLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayeredLod(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1DLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, layer, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayeredLod(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, layer, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex2DLayeredLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayeredLod(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex2DLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, layer, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayeredLod(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, layer, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex3DLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3DLod(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex3DLod_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3DLod(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex3DLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// texCubemapLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLod(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__texCubemapLod_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLod(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__texCubemapLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+// texCubemapLayered
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayered(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer)
+{
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__texCubemapLayered_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayered(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer)
+{
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__texCubemapLayered_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer);
+  return retval;
+#endif /* __CUDA_ARCH__ */
+}
+
+
+// texCubemapLayeredLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayeredLod(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__texCubemapLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayeredLod(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__texCubemapLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+// texCubemapGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapGrad(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__texCubemapGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, &dPdx, &dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapGrad(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__texCubemapGrad_rmnf_v2", &type_dummy, &retval, t,  x, y, z, &dPdx, &dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+// texCubemapLayeredGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayeredGrad(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__texCubemapLayeredGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer, &dPdx, &dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayeredGrad(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy) 
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__texCubemapLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, layer, &dPdx, &dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+// tex1DGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DGrad(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x, float dPdx, float dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, dPdx, dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DGrad(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x, float dPdx, float dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1DGrad_rmnf_v2", &type_dummy, &retval,t, x,dPdx, dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+// tex2DGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DGrad(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, float2 dPdx, float2 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex2DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, &dPdx, &dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DGrad(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, float2 dPdx, float2 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2DGrad_rmnf_v2", &type_dummy, &retval,t, x, y, &dPdx, &dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex1DLayeredGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayeredGrad(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer, float dPdx, float dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1DLayeredGrad_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, layer, dPdx, dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayeredGrad(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer, float dPdx, float dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, layer, dPdx, dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex2DLayeredGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayeredGrad(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex2DLayeredGrad_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, layer, &dPdx, &dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayeredGrad(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, layer, &dPdx, &dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex3DGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3DGrad(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex3DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, &dPdx, &dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3DGrad(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex3DGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, &dPdx, &dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+#undef __DEPRECATED__
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__TEXTURE_FETCH_FUNCTIONS_H__ */
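+
+// Illustrative sketch (not part of the upstream header): the tex*Lod/tex*Grad
+// functions above belong to the legacy texture<> reference API and are marked
+// deprecated. A reference must be declared at file scope and bound on the host
+// before launch; the names below are hypothetical.
+//
+//   texture<float, cudaTextureType2D, cudaReadModeElementType> legacyTex;
+//
+//   __global__ void lodKernel(float *out, float u, float v, float lod)
+//   {
+//       // Sample from an explicit mip level of the bound 2D texture.
+//       out[0] = tex2DLod(legacyTex, u, v, lod);
+//   }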
diff --git a/ext/cudart/include/texture_indirect_functions.h b/ext/cudart/include/texture_indirect_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a70d7ee355195bc2e2d8833234f80bf4797741e
--- /dev/null
+++ b/ext/cudart/include/texture_indirect_functions.h
@@ -0,0 +1,771 @@
+/*
+ * Copyright 1993-2020 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+
+#ifndef __TEXTURE_INDIRECT_FUNCTIONS_H__
+#define __TEXTURE_INDIRECT_FUNCTIONS_H__
+
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#include "cuda_runtime_api.h"
+
+
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
+#define __NV_TEX_SPARSE 1
+#endif  /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
+
+template <typename T> struct __nv_itex_trait {   };
+template<> struct __nv_itex_trait<char> { typedef void type; };
+template<> struct __nv_itex_trait<signed char> { typedef void type; };
+template<> struct __nv_itex_trait<char1> { typedef void type; };
+template<> struct __nv_itex_trait<char2> { typedef void type; };
+template<> struct __nv_itex_trait<char4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned char> { typedef void type; };
+template<> struct __nv_itex_trait<uchar1> { typedef void type; };
+template<> struct __nv_itex_trait<uchar2> { typedef void type; };
+template<> struct __nv_itex_trait<uchar4> { typedef void type; };
+template<> struct __nv_itex_trait<short> { typedef void type; };
+template<> struct __nv_itex_trait<short1> { typedef void type; };
+template<> struct __nv_itex_trait<short2> { typedef void type; };
+template<> struct __nv_itex_trait<short4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned short> { typedef void type; };
+template<> struct __nv_itex_trait<ushort1> { typedef void type; };
+template<> struct __nv_itex_trait<ushort2> { typedef void type; };
+template<> struct __nv_itex_trait<ushort4> { typedef void type; };
+template<> struct __nv_itex_trait<int> { typedef void type; };
+template<> struct __nv_itex_trait<int1> { typedef void type; };
+template<> struct __nv_itex_trait<int2> { typedef void type; };
+template<> struct __nv_itex_trait<int4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned int> { typedef void type; };
+template<> struct __nv_itex_trait<uint1> { typedef void type; };
+template<> struct __nv_itex_trait<uint2> { typedef void type; };
+template<> struct __nv_itex_trait<uint4> { typedef void type; };
+#if !defined(__LP64__)
+template<> struct __nv_itex_trait<long> { typedef void type; };
+template<> struct __nv_itex_trait<long1> { typedef void type; };
+template<> struct __nv_itex_trait<long2> { typedef void type; };
+template<> struct __nv_itex_trait<long4> { typedef void type; };
+template<> struct __nv_itex_trait<unsigned long> { typedef void type; };
+template<> struct __nv_itex_trait<ulong1> { typedef void type; };
+template<> struct __nv_itex_trait<ulong2> { typedef void type; };
+template<> struct __nv_itex_trait<ulong4> { typedef void type; };
+#endif /* !__LP64__ */
+template<> struct __nv_itex_trait<float> { typedef void type; };
+template<> struct __nv_itex_trait<float1> { typedef void type; };
+template<> struct __nv_itex_trait<float2> { typedef void type; };
+template<> struct __nv_itex_trait<float4> { typedef void type; };
+
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1Dfetch(T *ptr, cudaTextureObject_t obj, int x)
+{
+#ifdef __CUDA_ARCH__
+   __nv_tex_surf_handler("__itex1Dfetch", ptr, obj, x);
+#endif   
+}
+
+template <class T>
+static __device__ T tex1Dfetch(cudaTextureObject_t texObject, int x)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex1Dfetch(&ret, texObject, x);
+  return ret;
+#endif  
+}
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1D(T *ptr, cudaTextureObject_t obj, float x)
+{
+#ifdef __CUDA_ARCH__
+   __nv_tex_surf_handler("__itex1D", ptr, obj, x);
+#endif
+}
+
+
+template <class T>
+static __device__  T tex1D(cudaTextureObject_t texObject, float x)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex1D(&ret, texObject, x);
+  return ret;
+#endif
+}
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y)
+{
+#ifdef __CUDA_ARCH__
+   __nv_tex_surf_handler("__itex2D", ptr, obj, x, y);
+#endif
+}
+
+template <class T>
+static __device__  T tex2D(cudaTextureObject_t texObject, float x, float y)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2D(&ret, texObject, x, y);
+  return ret;
+#endif  
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y, 
+                                                          bool* isResident)
+{
+#ifdef __CUDA_ARCH__
+  unsigned char res;
+   __nv_tex_surf_handler("__itex2D_sparse", ptr, obj, x, y, &res);
+   *isResident = (res != 0);
+#endif
+}
+
+template <class T>
+static __device__  T tex2D(cudaTextureObject_t texObject, float x, float y, bool* isResident)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2D(&ret, texObject, x, y, isResident);
+  return ret;
+#endif  
+}
+
+#endif  /* __NV_TEX_SPARSE */
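+
+// Illustrative sketch (not part of the upstream header): the sparse overloads
+// above additionally report whether the sampled tile is resident, which is how
+// misses are detected when sampling a sparse (tiled) texture on sm_60+. The
+// helper name and fallback value are hypothetical.
+//
+//   __device__ float4 sampleOrFallback(cudaTextureObject_t texObj, float u, float v)
+//   {
+//       bool resident = false;
+//       float4 value = tex2D<float4>(texObj, u, v, &resident);
+//       return resident ? value : make_float4(0.f, 0.f, 0.f, 0.f);
+//   }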
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
+{
+#ifdef __CUDA_ARCH__
+   __nv_tex_surf_handler("__itex3D", ptr, obj, x, y, z);
+#endif
+}
+
+template <class T>
+static __device__  T tex3D(cudaTextureObject_t texObject, float x, float y, float z)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex3D(&ret, texObject, x, y, z);
+  return ret;
+#endif
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z, 
+                                                          bool* isResident)
+{
+#ifdef __CUDA_ARCH__
+  unsigned char res;
+   __nv_tex_surf_handler("__itex3D_sparse", ptr, obj, x, y, z, &res);
+   *isResident = (res != 0);
+#endif
+}
+
+template <class T>
+static __device__  T tex3D(cudaTextureObject_t texObject, float x, float y, float z, bool* isResident)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex3D(&ret, texObject, x, y, z, isResident);
+  return ret;
+#endif
+}
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1DLayered(T *ptr, cudaTextureObject_t obj, float x, int layer)
+{
+#ifdef __CUDA_ARCH__
+   __nv_tex_surf_handler("__itex1DLayered", ptr, obj, x, layer);
+#endif
+}
+
+template <class T>
+static __device__  T tex1DLayered(cudaTextureObject_t texObject, float x, int layer)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex1DLayered(&ret, texObject, x, layer);
+  return ret;
+#endif
+}
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itex2DLayered", ptr, obj, x, y, layer);
+#endif
+}
+
+template <class T>
+static __device__  T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2DLayered(&ret, texObject, x, y, layer);
+  return ret;
+#endif
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, bool* isResident)
+{
+#ifdef __CUDA_ARCH__
+  unsigned char res;
+  __nv_tex_surf_handler("__itex2DLayered_sparse", ptr, obj, x, y, layer, &res);
+  *isResident = (res != 0);
+#endif
+}
+
+template <class T>
+static __device__  T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer, bool* isResident)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2DLayered(&ret, texObject, x, y, layer, isResident);
+  return ret;
+#endif
+}
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type texCubemap(T *ptr, cudaTextureObject_t obj, float x, float y, float z)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itexCubemap", ptr, obj, x, y, z);
+#endif
+}
+
+
+template <class T>
+static __device__  T texCubemap(cudaTextureObject_t texObject, float x, float y, float z)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  texCubemap(&ret, texObject, x, y, z);
+  return ret;
+#endif
+}
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type texCubemapLayered(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer)
+{ 
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itexCubemapLayered", ptr, obj, x, y, z, layer);
+#endif
+}
+
+template <class T>
+static __device__  T texCubemapLayered(cudaTextureObject_t texObject, float x, float y, float z, int layer)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  texCubemapLayered(&ret, texObject, x, y, z, layer);
+  return ret;
+#endif  
+}
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, int comp = 0)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itex2Dgather", ptr, obj, x, y, comp);
+#endif
+}
+
+template <class T>
+static __device__  T tex2Dgather(cudaTextureObject_t to, float x, float y, int comp = 0)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2Dgather(&ret, to, x, y, comp);
+  return ret;
+#endif
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, bool* isResident, int comp = 0)
+{
+#ifdef __CUDA_ARCH__
+  unsigned char res;
+  __nv_tex_surf_handler("__itex2Dgather_sparse", ptr, obj, x, y, comp,  &res);
+  *isResident = (res != 0);
+#endif
+}
+
+template <class T>
+static __device__  T tex2Dgather(cudaTextureObject_t to, float x, float y, bool* isResident, int comp = 0)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2Dgather(&ret, to, x, y,  isResident, comp);
+  return ret;
+#endif
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1DLod(T *ptr, cudaTextureObject_t obj, float x, float level)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itex1DLod", ptr, obj, x, level);
+#endif
+}
+
+template <class T>
+static __device__  T tex1DLod(cudaTextureObject_t texObject, float x, float level)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex1DLod(&ret, texObject, x, level);
+  return ret;
+#endif  
+}
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itex2DLod", ptr, obj, x, y, level);
+#endif
+}
+
+template <class T>
+static __device__  T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2DLod(&ret, texObject, x, y, level);
+  return ret;
+#endif  
+}
+
+#if __NV_TEX_SPARSE
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level, bool* isResident)
+{
+#ifdef __CUDA_ARCH__
+  unsigned char res;
+  __nv_tex_surf_handler("__itex2DLod_sparse", ptr, obj, x, y, level, &res);
+  *isResident = (res != 0);
+#endif
+}
+
+template <class T>
+static __device__  T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level, bool* isResident)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2DLod(&ret, texObject, x, y, level, isResident);
+  return ret;
+#endif  
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itex3DLod", ptr, obj, x, y, z, level);
+#endif
+}
+
+template <class T>
+static __device__  T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex3DLod(&ret, texObject, x, y, z, level);
+  return ret;
+#endif  
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level, bool* isResident)
+{ 
+#ifdef __CUDA_ARCH__
+  unsigned char res;
+  __nv_tex_surf_handler("__itex3DLod_sparse", ptr, obj, x, y, z, level, &res);
+  *isResident = (res != 0);
+#endif
+}
+
+template <class T>
+static __device__  T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level, bool* isResident)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex3DLod(&ret, texObject, x, y, z, level, isResident);
+  return ret;
+#endif  
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itex1DLayeredLod", ptr, obj, x, layer, level);
+#endif
+}
+
+template <class T>
+static __device__  T tex1DLayeredLod(cudaTextureObject_t texObject, float x, int layer, float level)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex1DLayeredLod(&ret, texObject, x, layer, level);
+  return ret;
+#endif  
+}
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itex2DLayeredLod", ptr, obj, x, y, layer, level);
+#endif
+}
+
+template <class T>
+static __device__  T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2DLayeredLod(&ret, texObject, x, y, layer, level);
+  return ret;
+#endif  
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level, bool* isResident)
+{ 
+#ifdef __CUDA_ARCH__
+  unsigned char res;
+  __nv_tex_surf_handler("__itex2DLayeredLod_sparse", ptr, obj, x, y, layer, level, &res);
+  *isResident = (res != 0);
+#endif
+}
+
+template <class T>
+static __device__  T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level, bool* isResident)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2DLayeredLod(&ret, texObject, x, y, layer, level, isResident);
+  return ret;
+#endif  
+}
+#endif  /* __NV_TEX_SPARSE */
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type texCubemapLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itexCubemapLod", ptr, obj, x, y, z, level);
+#endif
+}
+
+template <class T>
+static __device__  T texCubemapLod(cudaTextureObject_t texObject, float x, float y, float z, float level)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  texCubemapLod(&ret, texObject, x, y, z, level);
+  return ret;
+#endif  
+}
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type texCubemapGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itexCubemapGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);
+#endif
+}
+
+template <class T>
+static __device__  T texCubemapGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  texCubemapGrad(&ret, texObject, x, y, z, dPdx, dPdy);
+  return ret;
+#endif  
+}
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itexCubemapLayeredLod", ptr, obj, x, y, z, layer, level);
+#endif
+}
+
+template <class T>
+static __device__  T texCubemapLayeredLod(cudaTextureObject_t texObject, float x, float y, float z, int layer, float level)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  texCubemapLayeredLod(&ret, texObject, x, y, z, layer, level);
+  return ret;
+#endif  
+}
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1DGrad(T *ptr, cudaTextureObject_t obj, float x, float dPdx, float dPdy)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itex1DGrad", ptr, obj, x, dPdx, dPdy);
+#endif
+}
+
+template <class T>
+static __device__  T tex1DGrad(cudaTextureObject_t texObject, float x, float dPdx, float dPdy)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex1DGrad(&ret, texObject, x, dPdx, dPdy);
+  return ret;
+#endif  
+}
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itex2DGrad_v2", ptr, obj, x, y, &dPdx, &dPdy);
+#endif
+
+}
+
+template <class T>
+static __device__  T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2DGrad(&ret, texObject, x, y, dPdx, dPdy);
+  return ret;
+#endif  
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy, bool* isResident)
+{ 
+#ifdef __CUDA_ARCH__
+  unsigned char res;
+  __nv_tex_surf_handler("__itex2DGrad_sparse", ptr, obj, x, y, &dPdx, &dPdy, &res);
+  *isResident = (res != 0);
+#endif
+
+}
+
+template <class T>
+static __device__  T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy, bool* isResident)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2DGrad(&ret, texObject, x, y, dPdx, dPdy, isResident);
+  return ret;
+#endif  
+}
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itex3DGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy);
+#endif
+}
+
+template <class T>
+static __device__  T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy);
+  return ret;
+#endif  
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident)
+{ 
+#ifdef __CUDA_ARCH__
+  unsigned char res;
+  __nv_tex_surf_handler("__itex3DGrad_sparse", ptr, obj, x, y, z, &dPdx, &dPdy, &res);
+  *isResident = (res != 0);
+#endif
+}
+
+template <class T>
+static __device__  T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy, isResident);
+  return ret;
+#endif  
+}
+
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex1DLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, int layer, float dPdx, float dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itex1DLayeredGrad", ptr, obj, x, layer, dPdx, dPdy);
+#endif
+}
+
+template <class T>
+static __device__  T tex1DLayeredGrad(cudaTextureObject_t texObject, float x, int layer, float dPdx, float dPdy)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex1DLayeredGrad(&ret, texObject, x, layer, dPdx, dPdy);
+  return ret;
+#endif  
+}
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itex2DLayeredGrad_v2", ptr, obj, x, y, layer, &dPdx, &dPdy);
+#endif
+}
+
+template <class T>
+static __device__  T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy);
+  return ret;
+#endif  
+}
+
+#if __NV_TEX_SPARSE
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident)
+{ 
+#ifdef __CUDA_ARCH__
+  unsigned char res;
+  __nv_tex_surf_handler("__itex2DLayeredGrad_sparse", ptr, obj, x, y, layer, &dPdx, &dPdy, &res);
+  *isResident = (res != 0);
+#endif
+}
+
+template <class T>
+static __device__  T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy, isResident);
+  return ret;
+#endif  
+}
+#endif  /* __NV_TEX_SPARSE */
+
+
+template <typename T>
+static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__itexCubemapLayeredGrad_v2", ptr, obj, x, y, z, layer, &dPdx, &dPdy);
+#endif
+}
+
+template <class T>
+static __device__  T texCubemapLayeredGrad(cudaTextureObject_t texObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{
+#ifdef __CUDA_ARCH__
+  T ret;
+  texCubemapLayeredGrad(&ret, texObject, x, y, z, layer, dPdx, dPdy);
+  return ret;
+#endif  
+}
+
+#undef __NV_TEX_SPARSE
+
+#endif // __cplusplus && __CUDACC__
+#endif // __TEXTURE_INDIRECT_FUNCTIONS_H__
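+
+// Illustrative sketch (not part of the upstream header): typical device-side use
+// of the texture-object fetch functions declared above. The kernel name and
+// output layout are hypothetical; tex2D<T> resolves through __nv_itex_trait<T>.
+//
+//   __global__ void sampleKernel(cudaTextureObject_t texObj, float4 *out,
+//                                int width, int height)
+//   {
+//       int ix = blockIdx.x * blockDim.x + threadIdx.x;
+//       int iy = blockIdx.y * blockDim.y + threadIdx.y;
+//       if (ix >= width || iy >= height) return;
+//       // Centered, normalized coordinates.
+//       float u = (ix + 0.5f) / width;
+//       float v = (iy + 0.5f) / height;
+//       out[iy * width + ix] = tex2D<float4>(texObj, u, v);
+//   }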
diff --git a/ext/cudart/include/texture_types.h b/ext/cudart/include/texture_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..ef319422874e887dd7e5ac7cf275714f7737c26e
--- /dev/null
+++ b/ext/cudart/include/texture_types.h
@@ -0,0 +1,281 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__TEXTURE_TYPES_H__)
+#define __TEXTURE_TYPES_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "driver_types.h"
+
+/**
+ * \addtogroup CUDART_TYPES
+ *
+ * @{
+ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#define cudaTextureType1D              0x01
+#define cudaTextureType2D              0x02
+#define cudaTextureType3D              0x03
+#define cudaTextureTypeCubemap         0x0C
+#define cudaTextureType1DLayered       0xF1
+#define cudaTextureType2DLayered       0xF2
+#define cudaTextureTypeCubemapLayered  0xFC
+
+/**
+ * CUDA texture address modes
+ */
+enum __device_builtin__ cudaTextureAddressMode
+{
+    cudaAddressModeWrap   = 0,    /**< Wrapping address mode */
+    cudaAddressModeClamp  = 1,    /**< Clamp to edge address mode */
+    cudaAddressModeMirror = 2,    /**< Mirror address mode */
+    cudaAddressModeBorder = 3     /**< Border address mode */
+};
+
+/**
+ * CUDA texture filter modes
+ */
+enum __device_builtin__ cudaTextureFilterMode
+{
+    cudaFilterModePoint  = 0,     /**< Point filter mode */
+    cudaFilterModeLinear = 1      /**< Linear filter mode */
+};
+
+/**
+ * CUDA texture read modes
+ */
+enum __device_builtin__ cudaTextureReadMode
+{
+    cudaReadModeElementType     = 0,  /**< Read texture as specified element type */
+    cudaReadModeNormalizedFloat = 1   /**< Read texture as normalized float */
+};
+
+/**
+ * CUDA texture reference
+ */
+struct __device_builtin__ textureReference
+{
+    /**
+     * Indicates whether texture reads are normalized or not
+     */
+    int                          normalized;
+    /**
+     * Texture filter mode
+     */
+    enum cudaTextureFilterMode   filterMode;
+    /**
+     * Texture address mode for up to 3 dimensions
+     */
+    enum cudaTextureAddressMode  addressMode[3];
+    /**
+     * Channel descriptor for the texture reference
+     */
+    struct cudaChannelFormatDesc channelDesc;
+    /**
+     * Perform sRGB->linear conversion during texture read
+     */
+    int                          sRGB;
+    /**
+     * Limit to the anisotropy ratio
+     */
+    unsigned int                 maxAnisotropy;
+    /**
+     * Mipmap filter mode
+     */
+    enum cudaTextureFilterMode   mipmapFilterMode;
+    /**
+     * Offset applied to the supplied mipmap level
+     */
+    float                        mipmapLevelBias;
+    /**
+     * Lower end of the mipmap level range to clamp access to
+     */
+    float                        minMipmapLevelClamp;
+    /**
+     * Upper end of the mipmap level range to clamp access to
+     */
+    float                        maxMipmapLevelClamp;
+    /**
+     * Disable any trilinear filtering optimizations.
+     */
+    int                          disableTrilinearOptimization;
+    int                          __cudaReserved[14];
+};
+
+/**
+ * CUDA texture descriptor
+ */
+struct __device_builtin__ cudaTextureDesc
+{
+    /**
+     * Texture address mode for up to 3 dimensions
+     */
+    enum cudaTextureAddressMode addressMode[3];
+    /**
+     * Texture filter mode
+     */
+    enum cudaTextureFilterMode  filterMode;
+    /**
+     * Texture read mode
+     */
+    enum cudaTextureReadMode    readMode;
+    /**
+     * Perform sRGB->linear conversion during texture read
+     */
+    int                         sRGB;
+    /**
+     * Texture Border Color
+     */
+    float                       borderColor[4];
+    /**
+     * Indicates whether texture reads are normalized or not
+     */
+    int                         normalizedCoords;
+    /**
+     * Limit to the anisotropy ratio
+     */
+    unsigned int                maxAnisotropy;
+    /**
+     * Mipmap filter mode
+     */
+    enum cudaTextureFilterMode  mipmapFilterMode;
+    /**
+     * Offset applied to the supplied mipmap level
+     */
+    float                       mipmapLevelBias;
+    /**
+     * Lower end of the mipmap level range to clamp access to
+     */
+    float                       minMipmapLevelClamp;
+    /**
+     * Upper end of the mipmap level range to clamp access to
+     */
+    float                       maxMipmapLevelClamp;
+    /**
+     * Disable any trilinear filtering optimizations.
+     */
+    int                         disableTrilinearOptimization;
+};
+
+/**
+ * CUDA texture descriptor (v2 variant, adding seamless cube map filtering)
+ */
+struct __device_builtin__ cudaTextureDesc_v2
+{
+    /**
+     * Texture address mode for up to 3 dimensions
+     */
+    enum cudaTextureAddressMode addressMode[3];
+    /**
+     * Texture filter mode
+     */
+    enum cudaTextureFilterMode  filterMode;
+    /**
+     * Texture read mode
+     */
+    enum cudaTextureReadMode    readMode;
+    /**
+     * Perform sRGB->linear conversion during texture read
+     */
+    int                         sRGB;
+    /**
+     * Texture Border Color
+     */
+    float                       borderColor[4];
+    /**
+     * Indicates whether texture reads are normalized or not
+     */
+    int                         normalizedCoords;
+    /**
+     * Limit to the anisotropy ratio
+     */
+    unsigned int                maxAnisotropy;
+    /**
+     * Mipmap filter mode
+     */
+    enum cudaTextureFilterMode  mipmapFilterMode;
+    /**
+     * Offset applied to the supplied mipmap level
+     */
+    float                       mipmapLevelBias;
+    /**
+     * Lower end of the mipmap level range to clamp access to
+     */
+    float                       minMipmapLevelClamp;
+    /**
+     * Upper end of the mipmap level range to clamp access to
+     */
+    float                       maxMipmapLevelClamp;
+    /**
+     * Disable any trilinear filtering optimizations.
+     */
+    int                         disableTrilinearOptimization;
+    /**
+     * Enable seamless cube map filtering.
+     */
+    int                         seamlessCubemap;
+};
+
+/**
+ * An opaque value that represents a CUDA texture object
+ */
+typedef __device_builtin__ unsigned long long cudaTextureObject_t;
+
+/** @} */ /* END CUDART_TYPES */
+
+#endif /* !__TEXTURE_TYPES_H__ */
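+
+// Illustrative sketch (not part of the upstream header): a typical host-side fill
+// of cudaTextureDesc before passing it to cudaCreateTextureObject() (declared in
+// cuda_runtime_api.h). The particular mode values are only an example.
+//
+//   cudaTextureDesc texDesc;
+//   memset(&texDesc, 0, sizeof(texDesc));
+//   texDesc.addressMode[0]   = cudaAddressModeClamp;
+//   texDesc.addressMode[1]   = cudaAddressModeClamp;
+//   texDesc.filterMode       = cudaFilterModeLinear;
+//   texDesc.readMode         = cudaReadModeNormalizedFloat;
+//   texDesc.normalizedCoords = 1;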
diff --git a/ext/cudart/include/vector_functions.h b/ext/cudart/include/vector_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..bee6cd32c36d94bde65aad1c867352493d07a0dc
--- /dev/null
+++ b/ext/cudart/include/vector_functions.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__VECTOR_FUNCTIONS_H__)
+#define __VECTOR_FUNCTIONS_H__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if defined(__CUDACC_RTC__)
+#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#endif /* __CUDACC_RTC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x);
+
+__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x);
+
+__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y);
+
+__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y);
+
+__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z);
+
+__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z);
+
+__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w);
+
+__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w);
+
+__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x);
+
+__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x);
+
+__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y);
+
+__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y);
+
+__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z);
+
+__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z);
+
+__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w);
+
+__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w);
+
+__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x);
+
+__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x);
+
+__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y);
+
+__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y);
+
+__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z);
+
+__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z);
+
+__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w);
+
+__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w);
+
+__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x);
+
+__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x);
+
+__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y);
+
+__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y);
+
+__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z);
+
+__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z);
+
+__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w);
+
+__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w);
+
+__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x);
+
+__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y);
+
+__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z);
+
+__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w);
+
+__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x);
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x);
+
+__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y);
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y);
+
+__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z);
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z);
+
+__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w);
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w);
+
+__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x);
+
+__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y);
+
+__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z);
+
+__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w);
+
+#undef __VECTOR_FUNCTIONS_DECL__
+
+#if !defined(__CUDACC_RTC__)
+#include "vector_functions.hpp"
+#endif /* !__CUDACC_RTC__ */
+
+#endif /* !__VECTOR_FUNCTIONS_H__ */
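+
+// Illustrative sketch (not part of the upstream header): the make_<type> helpers
+// declared above are plain component-wise constructors usable from host and
+// device code; ix and iy below are hypothetical integer indices.
+//
+//   float4 albedo = make_float4(0.8f, 0.6f, 0.4f, 1.0f);
+//   int2   cell   = make_int2(ix, iy);
+//   uchar4 rgba   = make_uchar4(255, 128, 0, 255);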
diff --git a/ext/cudart/include/vector_functions.hpp b/ext/cudart/include/vector_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ab69cf38045a7c44dae67e7149d49ac4c6148747
--- /dev/null
+++ b/ext/cudart/include/vector_functions.hpp
@@ -0,0 +1,316 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__VECTOR_FUNCTIONS_HPP__)
+#define __VECTOR_FUNCTIONS_HPP__
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+#if defined(__CUDACC_RTC__)
+#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#endif /* __CUDACC_RTC__ */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
+{
+  char1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
+{
+  uchar1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
+{
+  char2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
+{
+  uchar2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
+{
+  char3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
+{
+  uchar3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
+{
+  char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
+{
+  uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
+{
+  short1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
+{
+  ushort1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
+{
+  short2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
+{
+  ushort2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x, short y, short z)
+{
+  short3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
+{
+  ushort3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
+{
+  short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
+{
+  ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
+{
+  int1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
+{
+  uint1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
+{
+  int2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
+{
+  uint2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
+{
+  int3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
+{
+  uint3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
+{
+  int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
+{
+  uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
+{
+  long1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
+{
+  ulong1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
+{
+  long2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
+{
+  ulong2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
+{
+  long3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
+{
+  ulong3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
+{
+  long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
+{
+  ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
+{
+  float1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
+{
+  float2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
+{
+  float3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
+{
+  float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
+{
+  longlong1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
+{
+  ulonglong1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
+{
+  longlong2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
+{
+  ulonglong2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
+{
+  longlong3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
+{
+  ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
+{
+  longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
+{
+  ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
+{
+  double1 t; t.x = x; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
+{
+  double2 t; t.x = x; t.y = y; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
+{
+  double3 t; t.x = x; t.y = y; t.z = z; return t;
+}
+
+__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
+{
+  double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
+}
+
+#undef __VECTOR_FUNCTIONS_DECL__
+
+#endif /* !__VECTOR_FUNCTIONS_HPP__ */
+
diff --git a/ext/cudart/include/vector_types.h b/ext/cudart/include/vector_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4cfabcff8a25adaf3f589d38d531bc63cae2fcf6
--- /dev/null
+++ b/ext/cudart/include/vector_types.h
@@ -0,0 +1,443 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__VECTOR_TYPES_H__)
+#define __VECTOR_TYPES_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__
+#endif
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#ifndef __DOXYGEN_ONLY__
+#include "crt/host_defines.h"
+#endif
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && \
+    defined(_WIN32) && !defined(_WIN64)
+
+#pragma warning(push)
+#pragma warning(disable: 4201 4408)
+
+#define __cuda_builtin_vector_align8(tag, members) \
+struct __device_builtin__ tag                      \
+{                                                  \
+    union                                          \
+    {                                              \
+        struct { members };                        \
+        struct { long long int :1,:0; };           \
+    };                                             \
+}
+
+#else /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
+
+#define __cuda_builtin_vector_align8(tag, members) \
+struct __device_builtin__ __align__(8) tag         \
+{                                                  \
+    members                                        \
+}
+
+#endif /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */
+
+struct __device_builtin__ char1
+{
+    signed char x;
+};
+
+struct __device_builtin__ uchar1
+{
+    unsigned char x;
+};
+
+
+struct __device_builtin__ __align__(2) char2
+{
+    signed char x, y;
+};
+
+struct __device_builtin__ __align__(2) uchar2
+{
+    unsigned char x, y;
+};
+
+struct __device_builtin__ char3
+{
+    signed char x, y, z;
+};
+
+struct __device_builtin__ uchar3
+{
+    unsigned char x, y, z;
+};
+
+struct __device_builtin__ __align__(4) char4
+{
+    signed char x, y, z, w;
+};
+
+struct __device_builtin__ __align__(4) uchar4
+{
+    unsigned char x, y, z, w;
+};
+
+struct __device_builtin__ short1
+{
+    short x;
+};
+
+struct __device_builtin__ ushort1
+{
+    unsigned short x;
+};
+
+struct __device_builtin__ __align__(4) short2
+{
+    short x, y;
+};
+
+struct __device_builtin__ __align__(4) ushort2
+{
+    unsigned short x, y;
+};
+
+struct __device_builtin__ short3
+{
+    short x, y, z;
+};
+
+struct __device_builtin__ ushort3
+{
+    unsigned short x, y, z;
+};
+
+__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;);
+__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
+
+struct __device_builtin__ int1
+{
+    int x;
+};
+
+struct __device_builtin__ uint1
+{
+    unsigned int x;
+};
+
+__cuda_builtin_vector_align8(int2, int x; int y;);
+__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;);
+
+struct __device_builtin__ int3
+{
+    int x, y, z;
+};
+
+struct __device_builtin__ uint3
+{
+    unsigned int x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) int4
+{
+    int x, y, z, w;
+};
+
+struct __device_builtin__ __builtin_align__(16) uint4
+{
+    unsigned int x, y, z, w;
+};
+
+struct __device_builtin__ long1
+{
+    long int x;
+};
+
+struct __device_builtin__ ulong1
+{
+    unsigned long int x;
+};
+
+#if defined(_WIN32)
+__cuda_builtin_vector_align8(long2, long int x; long int y;);
+__cuda_builtin_vector_align8(ulong2, unsigned long int x; unsigned long int y;);
+#else /* !_WIN32 */
+
+struct __device_builtin__ __align__(2*sizeof(long int)) long2
+{
+    long int x, y;
+};
+
+struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2
+{
+    unsigned long int x, y;
+};
+
+#endif /* _WIN32 */
+
+struct __device_builtin__ long3
+{
+    long int x, y, z;
+};
+
+struct __device_builtin__ ulong3
+{
+    unsigned long int x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) long4
+{
+    long int x, y, z, w;
+};
+
+struct __device_builtin__ __builtin_align__(16) ulong4
+{
+    unsigned long int x, y, z, w;
+};
+
+struct __device_builtin__ float1
+{
+    float x;
+};
+
+#if !defined(__CUDACC__) && defined(__arm__) && \
+    defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-pedantic"
+
+struct __device_builtin__ __attribute__((aligned(8))) float2
+{
+    float x; float y; float __cuda_gnu_arm_ice_workaround[0];
+};
+
+#pragma GCC poison __cuda_gnu_arm_ice_workaround
+#pragma GCC diagnostic pop
+
+#else /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
+         __GNUC__ == 4 && __GNUC_MINOR__ == 6 */
+
+__cuda_builtin_vector_align8(float2, float x; float y;);
+
+#endif /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP &&
+          __GNUC__ == 4 && __GNUC_MINOR__ == 6 */
+
+struct __device_builtin__ float3
+{
+    float x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) float4
+{
+    float x, y, z, w;
+};
+
+struct __device_builtin__ longlong1
+{
+    long long int x;
+};
+
+struct __device_builtin__ ulonglong1
+{
+    unsigned long long int x;
+};
+
+struct __device_builtin__ __builtin_align__(16) longlong2
+{
+    long long int x, y;
+};
+
+struct __device_builtin__ __builtin_align__(16) ulonglong2
+{
+    unsigned long long int x, y;
+};
+
+struct __device_builtin__ longlong3
+{
+    long long int x, y, z;
+};
+
+struct __device_builtin__ ulonglong3
+{
+    unsigned long long int x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) longlong4
+{
+    long long int x, y, z, w;
+};
+
+struct __device_builtin__ __builtin_align__(16) ulonglong4
+{
+    unsigned long long int x, y, z, w;
+};
+
+struct __device_builtin__ double1
+{
+    double x;
+};
+
+struct __device_builtin__ __builtin_align__(16) double2
+{
+    double x, y;
+};
+
+struct __device_builtin__ double3
+{
+    double x, y, z;
+};
+
+struct __device_builtin__ __builtin_align__(16) double4
+{
+    double x, y, z, w;
+};
+
+#if !defined(__CUDACC__) && defined(_WIN32) && !defined(_WIN64)
+
+#pragma warning(pop)
+
+#endif /* !__CUDACC__ && _WIN32 && !_WIN64 */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+typedef __device_builtin__ struct char1 char1;
+typedef __device_builtin__ struct uchar1 uchar1;
+typedef __device_builtin__ struct char2 char2;
+typedef __device_builtin__ struct uchar2 uchar2;
+typedef __device_builtin__ struct char3 char3;
+typedef __device_builtin__ struct uchar3 uchar3;
+typedef __device_builtin__ struct char4 char4;
+typedef __device_builtin__ struct uchar4 uchar4;
+typedef __device_builtin__ struct short1 short1;
+typedef __device_builtin__ struct ushort1 ushort1;
+typedef __device_builtin__ struct short2 short2;
+typedef __device_builtin__ struct ushort2 ushort2;
+typedef __device_builtin__ struct short3 short3;
+typedef __device_builtin__ struct ushort3 ushort3;
+typedef __device_builtin__ struct short4 short4;
+typedef __device_builtin__ struct ushort4 ushort4;
+typedef __device_builtin__ struct int1 int1;
+typedef __device_builtin__ struct uint1 uint1;
+typedef __device_builtin__ struct int2 int2;
+typedef __device_builtin__ struct uint2 uint2;
+typedef __device_builtin__ struct int3 int3;
+typedef __device_builtin__ struct uint3 uint3;
+typedef __device_builtin__ struct int4 int4;
+typedef __device_builtin__ struct uint4 uint4;
+typedef __device_builtin__ struct long1 long1;
+typedef __device_builtin__ struct ulong1 ulong1;
+typedef __device_builtin__ struct long2 long2;
+typedef __device_builtin__ struct ulong2 ulong2;
+typedef __device_builtin__ struct long3 long3;
+typedef __device_builtin__ struct ulong3 ulong3;
+typedef __device_builtin__ struct long4 long4;
+typedef __device_builtin__ struct ulong4 ulong4;
+typedef __device_builtin__ struct float1 float1;
+typedef __device_builtin__ struct float2 float2;
+typedef __device_builtin__ struct float3 float3;
+typedef __device_builtin__ struct float4 float4;
+typedef __device_builtin__ struct longlong1 longlong1;
+typedef __device_builtin__ struct ulonglong1 ulonglong1;
+typedef __device_builtin__ struct longlong2 longlong2;
+typedef __device_builtin__ struct ulonglong2 ulonglong2;
+typedef __device_builtin__ struct longlong3 longlong3;
+typedef __device_builtin__ struct ulonglong3 ulonglong3;
+typedef __device_builtin__ struct longlong4 longlong4;
+typedef __device_builtin__ struct ulonglong4 ulonglong4;
+typedef __device_builtin__ struct double1 double1;
+typedef __device_builtin__ struct double2 double2;
+typedef __device_builtin__ struct double3 double3;
+typedef __device_builtin__ struct double4 double4;
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+struct __device_builtin__ dim3
+{
+    unsigned int x, y, z;
+#if defined(__cplusplus)
+#if __cplusplus >= 201103L
+    __host__ __device__ constexpr dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
+    __host__ __device__ constexpr dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
+    __host__ __device__ constexpr operator uint3(void) const { return uint3{x, y, z}; }
+#else
+    __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {}
+    __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {}
+    __host__ __device__ operator uint3(void) const { uint3 t; t.x = x; t.y = y; t.z = z; return t; }
+#endif
+#endif /* __cplusplus */
+};
+
+typedef __device_builtin__ struct dim3 dim3;
+
+#undef  __cuda_builtin_vector_align8
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__
+#endif
+
+#endif /* !__VECTOR_TYPES_H__ */
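
A quick host-side usage sketch of the vendored headers above (assumptions only: ext/cudart/include is on the include path and the translation unit is compiled by a plain C++ compiler, where the __host__/__device__ qualifiers expand to nothing):

    #include <vector_functions.h> // pulls in vector_types.h plus the make_* helpers
    #include <cstdio>

    int main()
    {
        // float4 is declared __builtin_align__(16) above, so it is a 16-byte aligned POD
        float4 color = make_float4(1.0f, 0.5f, 0.25f, 1.0f);

        // dim3 defaults unspecified components to 1, so this describes a 16 x 16 x 1 grid
        dim3 grid(16, 16);

        std::printf("color = (%.2f, %.2f, %.2f, %.2f)\n", color.x, color.y, color.z, color.w);
        std::printf("grid = %u x %u x %u, alignof(float4) = %zu\n",
                    grid.x, grid.y, grid.z, alignof(float4));
        return 0;
    }
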
diff --git a/ext/cudart/lib/Win32/OpenCL.lib b/ext/cudart/lib/Win32/OpenCL.lib
new file mode 100644
index 0000000000000000000000000000000000000000..c06b10f396eb1ee2175d459efba571400892e47b
Binary files /dev/null and b/ext/cudart/lib/Win32/OpenCL.lib differ
diff --git a/ext/cudart/lib/Win32/cuda.lib b/ext/cudart/lib/Win32/cuda.lib
new file mode 100644
index 0000000000000000000000000000000000000000..56151c765658e5be3e672dd47594ea6a8d35e9d1
Binary files /dev/null and b/ext/cudart/lib/Win32/cuda.lib differ
diff --git a/ext/cudart/lib/Win32/cudadevrt.lib b/ext/cudart/lib/Win32/cudadevrt.lib
new file mode 100644
index 0000000000000000000000000000000000000000..a8f1e0e36f27a0161b0f4978256857ed77758c0a
Binary files /dev/null and b/ext/cudart/lib/Win32/cudadevrt.lib differ
diff --git a/ext/cudart/lib/Win32/cudart.lib b/ext/cudart/lib/Win32/cudart.lib
new file mode 100644
index 0000000000000000000000000000000000000000..385e92bd4660ffe8771e5ba30803990d7668f4ab
Binary files /dev/null and b/ext/cudart/lib/Win32/cudart.lib differ
diff --git a/ext/cudart/lib/Win32/cudart_static.lib b/ext/cudart/lib/Win32/cudart_static.lib
new file mode 100644
index 0000000000000000000000000000000000000000..b28c233b93739792cfc271ed1f75813de757eb48
Binary files /dev/null and b/ext/cudart/lib/Win32/cudart_static.lib differ
diff --git a/ext/cudart/lib/x64/OpenCL.lib b/ext/cudart/lib/x64/OpenCL.lib
new file mode 100644
index 0000000000000000000000000000000000000000..20ed60d9f2e07df188ab496f61c21f3f5e96862d
Binary files /dev/null and b/ext/cudart/lib/x64/OpenCL.lib differ
diff --git a/ext/cudart/lib/x64/cuda.lib b/ext/cudart/lib/x64/cuda.lib
new file mode 100644
index 0000000000000000000000000000000000000000..f146412d42b563d1962a708d3f628c1852692f69
Binary files /dev/null and b/ext/cudart/lib/x64/cuda.lib differ
diff --git a/ext/cudart/lib/x64/cudadevrt.lib b/ext/cudart/lib/x64/cudadevrt.lib
new file mode 100644
index 0000000000000000000000000000000000000000..d933b7fbdd650777540d5ca6cf62a5785700e851
Binary files /dev/null and b/ext/cudart/lib/x64/cudadevrt.lib differ
diff --git a/ext/cudart/lib/x64/cudart.lib b/ext/cudart/lib/x64/cudart.lib
new file mode 100644
index 0000000000000000000000000000000000000000..b023e8d3116164aa1dada4cb60eee025c748cd06
Binary files /dev/null and b/ext/cudart/lib/x64/cudart.lib differ
diff --git a/ext/cudart/lib/x64/cudart_static.lib b/ext/cudart/lib/x64/cudart_static.lib
new file mode 100644
index 0000000000000000000000000000000000000000..d975c98869fddb1a48fa7a8f5aa393501e9cc655
Binary files /dev/null and b/ext/cudart/lib/x64/cudart_static.lib differ
diff --git a/src/encoder/nvidia_encoder.cpp b/src/encoder/nvidia_encoder.cpp
index 0951a0d70c7aa38ae4390f1f41b11f395b46c6d6..45abd9100e5cf6d366d53a4d87c96a65acfae2d3 100644
--- a/src/encoder/nvidia_encoder.cpp
+++ b/src/encoder/nvidia_encoder.cpp
@@ -1,5 +1,7 @@
 #include "nvidia_encoder.hpp"
 
+#include <cuda.h>
+
 #if defined(__unix__)
     #include <dlfcn.h>
 #endif
@@ -8,6 +10,8 @@ typedef NVENCSTATUS(NVENCAPI* NvEncodeAPICreateInstance_Type)(NV_ENCODE_API_FUNC
 
 bool NvidiaEncoder::create(lava::device_ptr device, const lava::renderer& renderer, const glm::uvec2& size, uint32_t input_buffers)
 {
+    cuInit(0); // initialize the CUDA driver API; must run before any other cu* call
+
     if (!this->load_library())
     {
         return false;
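
For reference, a more defensive variant of the cuInit(0) call added above (a sketch only, assuming the standard CUresult, CUDA_SUCCESS, and cuGetErrorString symbols from the driver API header cuda.h):

    #include <cuda.h>
    #include <cstdio>

    // Returns true when the CUDA driver API is usable; logs the failure otherwise.
    static bool init_cuda_driver()
    {
        CUresult result = cuInit(0); // the flags argument must be 0 per the driver API
        if (result != CUDA_SUCCESS)
        {
            const char* message = nullptr;
            cuGetErrorString(result, &message); // message stays null for unknown codes
            std::fprintf(stderr, "cuInit failed: %s\n", message ? message : "unknown error");
            return false;
        }
        return true;
    }

Whether NvidiaEncoder::create should bail out on a failed cuInit is a design choice for the encoder; the helper above only illustrates how the result could be surfaced.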