diff --git a/.gitmodules b/.gitmodules
index 64372bf3cc41964cc9fbc298d80f154fc6f9dee1..79827d19945515b283d73c2fa3312ddf097d6b8b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -54,6 +54,3 @@
 [submodule "ext/openxr"]
 	path = ext/openxr
 	url = https://github.com/KhronosGroup/OpenXR-SDK.git
-[submodule "ext/cudart"]
-	path = ext/cudart
-	url = https://gitlab.com/nvidia/headers/cuda-individual/cudart.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 15ab7f4724bf65b19c97ec5dbb0b0b69959c5a25..7b0e9914efa1edfaae3e38e2582e1213171ecc26 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -194,6 +194,7 @@ target_include_directories(lava.base PUBLIC
         $<BUILD_INTERFACE:${LIBLAVA_EXT_DIR}/VulkanMemoryAllocator/include>
         $<BUILD_INTERFACE:${LIBLAVA_EXT_DIR}/volk>
         $<BUILD_INTERFACE:${LIBLAVA_INC_DIR}/nvenc_headers>
+        $<BUILD_INTERFACE:${LIBLAVA_EXT_DIR}/cudart/include>
         )

 target_link_libraries(lava.base
@@ -500,6 +501,7 @@ set(EXT_INCLUDE_DIRS
     ${LIBLAVA_EXT_DIR}/argh
     ${LIBLAVA_EXT_DIR}/imgui
     ${LIBLAVA_INC_DIR}/nvenc_headers
+    ${LIBLAVA_EXT_DIR}/cudart/include
     )

 foreach(DIR ${EXT_INCLUDE_DIRS})
@@ -801,6 +803,7 @@ message("=======================================================================
     target_link_libraries(${NAME} lava::app assimp::assimp)
     target_include_directories(${NAME} PUBLIC ${OPENVR_HEADER_DIR})
     target_link_libraries(${NAME} "${CMAKE_SOURCE_DIR}/ext/openvr/lib/win64/openvr_api.lib")
+    target_link_libraries(${NAME} "${CMAKE_SOURCE_DIR}/ext/cudart/lib/x64/cuda.lib")

     if(MSVC)
         source_group("" FILES ${SRC_DIR}/main.cpp)
diff --git a/ext/cudart b/ext/cudart
deleted file mode 160000
index b92c3c0351d6d4402b81101ffcd6660474d1306b..0000000000000000000000000000000000000000
--- a/ext/cudart
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit b92c3c0351d6d4402b81101ffcd6660474d1306b
diff --git a/ext/cudart/bin/cudart32_110.dll b/ext/cudart/bin/cudart32_110.dll
new file mode 100644
index 0000000000000000000000000000000000000000..20cf2aee49aed62ae6d5ccad7bbbf5d4f3474eca
Binary files /dev/null and b/ext/cudart/bin/cudart32_110.dll differ
diff --git a/ext/cudart/bin/cudart64_110.dll b/ext/cudart/bin/cudart64_110.dll
new file mode 100644
index 0000000000000000000000000000000000000000..b36d7a3e29ff90d1e43460489ec78d3b78b5fda9
Binary files /dev/null and b/ext/cudart/bin/cudart64_110.dll differ
diff --git a/ext/cudart/include/CL/cl.h b/ext/cudart/include/CL/cl.h
new file mode 100644
index 0000000000000000000000000000000000000000..290495a68f1be43916c468b9a3881c6972085e89
--- /dev/null
+++ b/ext/cudart/include/CL/cl.h
@@ -0,0 +1,1578 @@
+/*******************************************************************************
+ * Copyright (c) 2008-2020 The Khronos Group Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ ******************************************************************************/ + +#ifndef __OPENCL_CL_H +#define __OPENCL_CL_H + +#ifdef __APPLE__ +#include <OpenCL/cl_platform.h> +#else +#include <CL/cl_platform.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ + +typedef struct _cl_platform_id * cl_platform_id; +typedef struct _cl_device_id * cl_device_id; +typedef struct _cl_context * cl_context; +typedef struct _cl_command_queue * cl_command_queue; +typedef struct _cl_mem * cl_mem; +typedef struct _cl_program * cl_program; +typedef struct _cl_kernel * cl_kernel; +typedef struct _cl_event * cl_event; +typedef struct _cl_sampler * cl_sampler; + +typedef cl_uint cl_bool; /* WARNING! Unlike cl_ types in cl_platform.h, cl_bool is not guaranteed to be the same size as the bool in kernels. */ +typedef cl_ulong cl_bitfield; +typedef cl_ulong cl_properties; +typedef cl_bitfield cl_device_type; +typedef cl_uint cl_platform_info; +typedef cl_uint cl_device_info; +typedef cl_bitfield cl_device_fp_config; +typedef cl_uint cl_device_mem_cache_type; +typedef cl_uint cl_device_local_mem_type; +typedef cl_bitfield cl_device_exec_capabilities; +typedef cl_bitfield cl_device_svm_capabilities; +typedef cl_bitfield cl_command_queue_properties; +typedef intptr_t cl_device_partition_property; +typedef cl_bitfield cl_device_affinity_domain; + +typedef intptr_t cl_context_properties; +typedef cl_uint cl_context_info; +typedef cl_properties cl_queue_properties; +typedef cl_uint cl_command_queue_info; +typedef cl_uint cl_channel_order; +typedef cl_uint cl_channel_type; +typedef cl_bitfield cl_mem_flags; +typedef cl_bitfield cl_svm_mem_flags; +typedef cl_uint cl_mem_object_type; +typedef cl_uint cl_mem_info; +typedef cl_bitfield cl_mem_migration_flags; +typedef cl_uint cl_image_info; +typedef cl_uint cl_buffer_create_type; +typedef cl_uint cl_addressing_mode; +typedef cl_uint cl_filter_mode; +typedef cl_uint cl_sampler_info; +typedef cl_bitfield cl_map_flags; +typedef intptr_t cl_pipe_properties; +typedef cl_uint cl_pipe_info; +typedef cl_uint cl_program_info; +typedef cl_uint cl_program_build_info; +typedef cl_uint cl_program_binary_type; +typedef cl_int cl_build_status; +typedef cl_uint cl_kernel_info; +typedef cl_uint cl_kernel_arg_info; +typedef cl_uint cl_kernel_arg_address_qualifier; +typedef cl_uint cl_kernel_arg_access_qualifier; +typedef cl_bitfield cl_kernel_arg_type_qualifier; +typedef cl_uint cl_kernel_work_group_info; +typedef cl_uint cl_kernel_sub_group_info; +typedef cl_uint cl_event_info; +typedef cl_uint cl_command_type; +typedef cl_uint cl_profiling_info; +typedef cl_properties cl_sampler_properties; +typedef cl_uint cl_kernel_exec_info; +typedef cl_bitfield cl_device_atomic_capabilities; +typedef cl_bitfield cl_device_device_enqueue_capabilities; +typedef cl_uint cl_khronos_vendor_id; +typedef cl_properties cl_mem_properties; +typedef cl_uint cl_version; + +typedef struct _cl_image_format { + cl_channel_order image_channel_order; + cl_channel_type image_channel_data_type; +} cl_image_format; + +typedef struct _cl_image_desc { + cl_mem_object_type image_type; + size_t image_width; + size_t image_height; + size_t image_depth; + size_t image_array_size; + size_t image_row_pitch; + size_t image_slice_pitch; + cl_uint num_mip_levels; + cl_uint num_samples; +#ifdef __GNUC__ + __extension__ /* Prevents warnings about anonymous union in -pedantic builds */ +#endif + union { + cl_mem buffer; + cl_mem mem_object; 
+ }; +} cl_image_desc; + +typedef struct _cl_buffer_region { + size_t origin; + size_t size; +} cl_buffer_region; + +#define CL_NAME_VERSION_MAX_NAME_SIZE 64 + +typedef struct _cl_name_version { + cl_version version; + char name[CL_NAME_VERSION_MAX_NAME_SIZE]; +} cl_name_version; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_SUCCESS 0 +#define CL_DEVICE_NOT_FOUND -1 +#define CL_DEVICE_NOT_AVAILABLE -2 +#define CL_COMPILER_NOT_AVAILABLE -3 +#define CL_MEM_OBJECT_ALLOCATION_FAILURE -4 +#define CL_OUT_OF_RESOURCES -5 +#define CL_OUT_OF_HOST_MEMORY -6 +#define CL_PROFILING_INFO_NOT_AVAILABLE -7 +#define CL_MEM_COPY_OVERLAP -8 +#define CL_IMAGE_FORMAT_MISMATCH -9 +#define CL_IMAGE_FORMAT_NOT_SUPPORTED -10 +#define CL_BUILD_PROGRAM_FAILURE -11 +#define CL_MAP_FAILURE -12 +#define CL_MISALIGNED_SUB_BUFFER_OFFSET -13 +#define CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST -14 +#define CL_COMPILE_PROGRAM_FAILURE -15 +#define CL_LINKER_NOT_AVAILABLE -16 +#define CL_LINK_PROGRAM_FAILURE -17 +#define CL_DEVICE_PARTITION_FAILED -18 +#define CL_KERNEL_ARG_INFO_NOT_AVAILABLE -19 + +#define CL_INVALID_VALUE -30 +#define CL_INVALID_DEVICE_TYPE -31 +#define CL_INVALID_PLATFORM -32 +#define CL_INVALID_DEVICE -33 +#define CL_INVALID_CONTEXT -34 +#define CL_INVALID_QUEUE_PROPERTIES -35 +#define CL_INVALID_COMMAND_QUEUE -36 +#define CL_INVALID_HOST_PTR -37 +#define CL_INVALID_MEM_OBJECT -38 +#define CL_INVALID_IMAGE_FORMAT_DESCRIPTOR -39 +#define CL_INVALID_IMAGE_SIZE -40 +#define CL_INVALID_SAMPLER -41 +#define CL_INVALID_BINARY -42 +#define CL_INVALID_BUILD_OPTIONS -43 +#define CL_INVALID_PROGRAM -44 +#define CL_INVALID_PROGRAM_EXECUTABLE -45 +#define CL_INVALID_KERNEL_NAME -46 +#define CL_INVALID_KERNEL_DEFINITION -47 +#define CL_INVALID_KERNEL -48 +#define CL_INVALID_ARG_INDEX -49 +#define CL_INVALID_ARG_VALUE -50 +#define CL_INVALID_ARG_SIZE -51 +#define CL_INVALID_KERNEL_ARGS -52 +#define CL_INVALID_WORK_DIMENSION -53 +#define CL_INVALID_WORK_GROUP_SIZE -54 +#define CL_INVALID_WORK_ITEM_SIZE -55 +#define CL_INVALID_GLOBAL_OFFSET -56 +#define CL_INVALID_EVENT_WAIT_LIST -57 +#define CL_INVALID_EVENT -58 +#define CL_INVALID_OPERATION -59 +#define CL_INVALID_GL_OBJECT -60 +#define CL_INVALID_BUFFER_SIZE -61 +#define CL_INVALID_MIP_LEVEL -62 +#define CL_INVALID_GLOBAL_WORK_SIZE -63 +#define CL_INVALID_PROPERTY -64 +#define CL_INVALID_IMAGE_DESCRIPTOR -65 +#define CL_INVALID_COMPILER_OPTIONS -66 +#define CL_INVALID_LINKER_OPTIONS -67 +#define CL_INVALID_DEVICE_PARTITION_COUNT -68 +#define CL_INVALID_PIPE_SIZE -69 +#define CL_INVALID_DEVICE_QUEUE -70 +#define CL_INVALID_SPEC_ID -71 +#define CL_MAX_SIZE_RESTRICTION_EXCEEDED -72 + +/* OpenCL Version */ +#define CL_VERSION_1_0 1 +#define CL_VERSION_1_1 1 +#define CL_VERSION_1_2 1 +#define CL_VERSION_2_0 1 +#define CL_VERSION_2_1 1 +#define CL_VERSION_2_2 1 +#define CL_VERSION_3_0 1 + +/* cl_bool */ +#define CL_FALSE 0 +#define CL_TRUE 1 +#define CL_BLOCKING CL_TRUE +#define CL_NON_BLOCKING CL_FALSE + +/* cl_platform_info */ +#define CL_PLATFORM_PROFILE 0x0900 +#define CL_PLATFORM_VERSION 0x0901 +#define CL_PLATFORM_NAME 0x0902 +#define CL_PLATFORM_VENDOR 0x0903 +#define CL_PLATFORM_EXTENSIONS 0x0904 +#define CL_PLATFORM_HOST_TIMER_RESOLUTION 0x0905 +#define CL_PLATFORM_NUMERIC_VERSION 0x0906 +#define CL_PLATFORM_EXTENSIONS_WITH_VERSION 0x0907 + +/* cl_device_type - bitfield */ +#define CL_DEVICE_TYPE_DEFAULT (1 << 0) +#define CL_DEVICE_TYPE_CPU (1 << 1) +#define CL_DEVICE_TYPE_GPU (1 << 2) 
+#define CL_DEVICE_TYPE_ACCELERATOR (1 << 3) +#define CL_DEVICE_TYPE_CUSTOM (1 << 4) +#define CL_DEVICE_TYPE_ALL 0xFFFFFFFF + +/* cl_device_info */ +#define CL_DEVICE_TYPE 0x1000 +#define CL_DEVICE_VENDOR_ID 0x1001 +#define CL_DEVICE_MAX_COMPUTE_UNITS 0x1002 +#define CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS 0x1003 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE 0x1004 +#define CL_DEVICE_MAX_WORK_ITEM_SIZES 0x1005 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR 0x1006 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT 0x1007 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT 0x1008 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG 0x1009 +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT 0x100A +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE 0x100B +#define CL_DEVICE_MAX_CLOCK_FREQUENCY 0x100C +#define CL_DEVICE_ADDRESS_BITS 0x100D +#define CL_DEVICE_MAX_READ_IMAGE_ARGS 0x100E +#define CL_DEVICE_MAX_WRITE_IMAGE_ARGS 0x100F +#define CL_DEVICE_MAX_MEM_ALLOC_SIZE 0x1010 +#define CL_DEVICE_IMAGE2D_MAX_WIDTH 0x1011 +#define CL_DEVICE_IMAGE2D_MAX_HEIGHT 0x1012 +#define CL_DEVICE_IMAGE3D_MAX_WIDTH 0x1013 +#define CL_DEVICE_IMAGE3D_MAX_HEIGHT 0x1014 +#define CL_DEVICE_IMAGE3D_MAX_DEPTH 0x1015 +#define CL_DEVICE_IMAGE_SUPPORT 0x1016 +#define CL_DEVICE_MAX_PARAMETER_SIZE 0x1017 +#define CL_DEVICE_MAX_SAMPLERS 0x1018 +#define CL_DEVICE_MEM_BASE_ADDR_ALIGN 0x1019 +#define CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE 0x101A +#define CL_DEVICE_SINGLE_FP_CONFIG 0x101B +#define CL_DEVICE_GLOBAL_MEM_CACHE_TYPE 0x101C +#define CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE 0x101D +#define CL_DEVICE_GLOBAL_MEM_CACHE_SIZE 0x101E +#define CL_DEVICE_GLOBAL_MEM_SIZE 0x101F +#define CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE 0x1020 +#define CL_DEVICE_MAX_CONSTANT_ARGS 0x1021 +#define CL_DEVICE_LOCAL_MEM_TYPE 0x1022 +#define CL_DEVICE_LOCAL_MEM_SIZE 0x1023 +#define CL_DEVICE_ERROR_CORRECTION_SUPPORT 0x1024 +#define CL_DEVICE_PROFILING_TIMER_RESOLUTION 0x1025 +#define CL_DEVICE_ENDIAN_LITTLE 0x1026 +#define CL_DEVICE_AVAILABLE 0x1027 +#define CL_DEVICE_COMPILER_AVAILABLE 0x1028 +#define CL_DEVICE_EXECUTION_CAPABILITIES 0x1029 +#define CL_DEVICE_QUEUE_PROPERTIES 0x102A /* deprecated */ +#define CL_DEVICE_QUEUE_ON_HOST_PROPERTIES 0x102A +#define CL_DEVICE_NAME 0x102B +#define CL_DEVICE_VENDOR 0x102C +#define CL_DRIVER_VERSION 0x102D +#define CL_DEVICE_PROFILE 0x102E +#define CL_DEVICE_VERSION 0x102F +#define CL_DEVICE_EXTENSIONS 0x1030 +#define CL_DEVICE_PLATFORM 0x1031 +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 +/* 0x1033 reserved for CL_DEVICE_HALF_FP_CONFIG which is already defined in "cl_ext.h" */ +#define CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF 0x1034 +#define CL_DEVICE_HOST_UNIFIED_MEMORY 0x1035 /* deprecated */ +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR 0x1036 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT 0x1037 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_INT 0x1038 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG 0x1039 +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT 0x103A +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE 0x103B +#define CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF 0x103C +#define CL_DEVICE_OPENCL_C_VERSION 0x103D +#define CL_DEVICE_LINKER_AVAILABLE 0x103E +#define CL_DEVICE_BUILT_IN_KERNELS 0x103F +#define CL_DEVICE_IMAGE_MAX_BUFFER_SIZE 0x1040 +#define CL_DEVICE_IMAGE_MAX_ARRAY_SIZE 0x1041 +#define CL_DEVICE_PARENT_DEVICE 0x1042 +#define CL_DEVICE_PARTITION_MAX_SUB_DEVICES 0x1043 +#define CL_DEVICE_PARTITION_PROPERTIES 0x1044 +#define CL_DEVICE_PARTITION_AFFINITY_DOMAIN 0x1045 +#define CL_DEVICE_PARTITION_TYPE 0x1046 +#define CL_DEVICE_REFERENCE_COUNT 0x1047 +#define 
CL_DEVICE_PREFERRED_INTEROP_USER_SYNC 0x1048 +#define CL_DEVICE_PRINTF_BUFFER_SIZE 0x1049 +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT 0x104B +#define CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS 0x104C +#define CL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE 0x104D +#define CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES 0x104E +#define CL_DEVICE_QUEUE_ON_DEVICE_PREFERRED_SIZE 0x104F +#define CL_DEVICE_QUEUE_ON_DEVICE_MAX_SIZE 0x1050 +#define CL_DEVICE_MAX_ON_DEVICE_QUEUES 0x1051 +#define CL_DEVICE_MAX_ON_DEVICE_EVENTS 0x1052 +#define CL_DEVICE_SVM_CAPABILITIES 0x1053 +#define CL_DEVICE_GLOBAL_VARIABLE_PREFERRED_TOTAL_SIZE 0x1054 +#define CL_DEVICE_MAX_PIPE_ARGS 0x1055 +#define CL_DEVICE_PIPE_MAX_ACTIVE_RESERVATIONS 0x1056 +#define CL_DEVICE_PIPE_MAX_PACKET_SIZE 0x1057 +#define CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT 0x1058 +#define CL_DEVICE_PREFERRED_GLOBAL_ATOMIC_ALIGNMENT 0x1059 +#define CL_DEVICE_PREFERRED_LOCAL_ATOMIC_ALIGNMENT 0x105A +#define CL_DEVICE_IL_VERSION 0x105B +#define CL_DEVICE_MAX_NUM_SUB_GROUPS 0x105C +#define CL_DEVICE_SUB_GROUP_INDEPENDENT_FORWARD_PROGRESS 0x105D +#define CL_DEVICE_NUMERIC_VERSION 0x105E +#define CL_DEVICE_EXTENSIONS_WITH_VERSION 0x1060 +#define CL_DEVICE_ILS_WITH_VERSION 0x1061 +#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION 0x1062 +#define CL_DEVICE_ATOMIC_MEMORY_CAPABILITIES 0x1063 +#define CL_DEVICE_ATOMIC_FENCE_CAPABILITIES 0x1064 +#define CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT 0x1065 +#define CL_DEVICE_OPENCL_C_ALL_VERSIONS 0x1066 +#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x1067 +#define CL_DEVICE_WORK_GROUP_COLLECTIVE_FUNCTIONS_SUPPORT 0x1068 +#define CL_DEVICE_GENERIC_ADDRESS_SPACE_SUPPORT 0x1069 +/* 0x106A to 0x106E - Reserved for upcoming KHR extension */ +#define CL_DEVICE_OPENCL_C_FEATURES 0x106F +#define CL_DEVICE_DEVICE_ENQUEUE_CAPABILITIES 0x1070 +#define CL_DEVICE_PIPE_SUPPORT 0x1071 +#define CL_DEVICE_LATEST_CONFORMANCE_VERSION_PASSED 0x1072 + +/* cl_device_fp_config - bitfield */ +#define CL_FP_DENORM (1 << 0) +#define CL_FP_INF_NAN (1 << 1) +#define CL_FP_ROUND_TO_NEAREST (1 << 2) +#define CL_FP_ROUND_TO_ZERO (1 << 3) +#define CL_FP_ROUND_TO_INF (1 << 4) +#define CL_FP_FMA (1 << 5) +#define CL_FP_SOFT_FLOAT (1 << 6) +#define CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT (1 << 7) + +/* cl_device_mem_cache_type */ +#define CL_NONE 0x0 +#define CL_READ_ONLY_CACHE 0x1 +#define CL_READ_WRITE_CACHE 0x2 + +/* cl_device_local_mem_type */ +#define CL_LOCAL 0x1 +#define CL_GLOBAL 0x2 + +/* cl_device_exec_capabilities - bitfield */ +#define CL_EXEC_KERNEL (1 << 0) +#define CL_EXEC_NATIVE_KERNEL (1 << 1) + +/* cl_command_queue_properties - bitfield */ +#define CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE (1 << 0) +#define CL_QUEUE_PROFILING_ENABLE (1 << 1) +#define CL_QUEUE_ON_DEVICE (1 << 2) +#define CL_QUEUE_ON_DEVICE_DEFAULT (1 << 3) + +/* cl_context_info */ +#define CL_CONTEXT_REFERENCE_COUNT 0x1080 +#define CL_CONTEXT_DEVICES 0x1081 +#define CL_CONTEXT_PROPERTIES 0x1082 +#define CL_CONTEXT_NUM_DEVICES 0x1083 + +/* cl_context_properties */ +#define CL_CONTEXT_PLATFORM 0x1084 +#define CL_CONTEXT_INTEROP_USER_SYNC 0x1085 + +/* cl_device_partition_property */ +#define CL_DEVICE_PARTITION_EQUALLY 0x1086 +#define CL_DEVICE_PARTITION_BY_COUNTS 0x1087 +#define CL_DEVICE_PARTITION_BY_COUNTS_LIST_END 0x0 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN 0x1088 + +/* cl_device_affinity_domain */ +#define CL_DEVICE_AFFINITY_DOMAIN_NUMA (1 << 0) +#define CL_DEVICE_AFFINITY_DOMAIN_L4_CACHE (1 << 1) +#define 
CL_DEVICE_AFFINITY_DOMAIN_L3_CACHE (1 << 2) +#define CL_DEVICE_AFFINITY_DOMAIN_L2_CACHE (1 << 3) +#define CL_DEVICE_AFFINITY_DOMAIN_L1_CACHE (1 << 4) +#define CL_DEVICE_AFFINITY_DOMAIN_NEXT_PARTITIONABLE (1 << 5) + +/* cl_device_svm_capabilities */ +#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER (1 << 0) +#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER (1 << 1) +#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM (1 << 2) +#define CL_DEVICE_SVM_ATOMICS (1 << 3) + +/* cl_command_queue_info */ +#define CL_QUEUE_CONTEXT 0x1090 +#define CL_QUEUE_DEVICE 0x1091 +#define CL_QUEUE_REFERENCE_COUNT 0x1092 +#define CL_QUEUE_PROPERTIES 0x1093 +#define CL_QUEUE_SIZE 0x1094 +#define CL_QUEUE_DEVICE_DEFAULT 0x1095 +#define CL_QUEUE_PROPERTIES_ARRAY 0x1098 + +/* cl_mem_flags and cl_svm_mem_flags - bitfield */ +#define CL_MEM_READ_WRITE (1 << 0) +#define CL_MEM_WRITE_ONLY (1 << 1) +#define CL_MEM_READ_ONLY (1 << 2) +#define CL_MEM_USE_HOST_PTR (1 << 3) +#define CL_MEM_ALLOC_HOST_PTR (1 << 4) +#define CL_MEM_COPY_HOST_PTR (1 << 5) +/* reserved (1 << 6) */ +#define CL_MEM_HOST_WRITE_ONLY (1 << 7) +#define CL_MEM_HOST_READ_ONLY (1 << 8) +#define CL_MEM_HOST_NO_ACCESS (1 << 9) +#define CL_MEM_SVM_FINE_GRAIN_BUFFER (1 << 10) /* used by cl_svm_mem_flags only */ +#define CL_MEM_SVM_ATOMICS (1 << 11) /* used by cl_svm_mem_flags only */ +#define CL_MEM_KERNEL_READ_AND_WRITE (1 << 12) + +/* cl_mem_migration_flags - bitfield */ +#define CL_MIGRATE_MEM_OBJECT_HOST (1 << 0) +#define CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED (1 << 1) + +/* cl_channel_order */ +#define CL_R 0x10B0 +#define CL_A 0x10B1 +#define CL_RG 0x10B2 +#define CL_RA 0x10B3 +#define CL_RGB 0x10B4 +#define CL_RGBA 0x10B5 +#define CL_BGRA 0x10B6 +#define CL_ARGB 0x10B7 +#define CL_INTENSITY 0x10B8 +#define CL_LUMINANCE 0x10B9 +#define CL_Rx 0x10BA +#define CL_RGx 0x10BB +#define CL_RGBx 0x10BC +#define CL_DEPTH 0x10BD +#define CL_DEPTH_STENCIL 0x10BE +#define CL_sRGB 0x10BF +#define CL_sRGBx 0x10C0 +#define CL_sRGBA 0x10C1 +#define CL_sBGRA 0x10C2 +#define CL_ABGR 0x10C3 + +/* cl_channel_type */ +#define CL_SNORM_INT8 0x10D0 +#define CL_SNORM_INT16 0x10D1 +#define CL_UNORM_INT8 0x10D2 +#define CL_UNORM_INT16 0x10D3 +#define CL_UNORM_SHORT_565 0x10D4 +#define CL_UNORM_SHORT_555 0x10D5 +#define CL_UNORM_INT_101010 0x10D6 +#define CL_SIGNED_INT8 0x10D7 +#define CL_SIGNED_INT16 0x10D8 +#define CL_SIGNED_INT32 0x10D9 +#define CL_UNSIGNED_INT8 0x10DA +#define CL_UNSIGNED_INT16 0x10DB +#define CL_UNSIGNED_INT32 0x10DC +#define CL_HALF_FLOAT 0x10DD +#define CL_FLOAT 0x10DE +#define CL_UNORM_INT24 0x10DF +#define CL_UNORM_INT_101010_2 0x10E0 + +/* cl_mem_object_type */ +#define CL_MEM_OBJECT_BUFFER 0x10F0 +#define CL_MEM_OBJECT_IMAGE2D 0x10F1 +#define CL_MEM_OBJECT_IMAGE3D 0x10F2 +#define CL_MEM_OBJECT_IMAGE2D_ARRAY 0x10F3 +#define CL_MEM_OBJECT_IMAGE1D 0x10F4 +#define CL_MEM_OBJECT_IMAGE1D_ARRAY 0x10F5 +#define CL_MEM_OBJECT_IMAGE1D_BUFFER 0x10F6 +#define CL_MEM_OBJECT_PIPE 0x10F7 + +/* cl_mem_info */ +#define CL_MEM_TYPE 0x1100 +#define CL_MEM_FLAGS 0x1101 +#define CL_MEM_SIZE 0x1102 +#define CL_MEM_HOST_PTR 0x1103 +#define CL_MEM_MAP_COUNT 0x1104 +#define CL_MEM_REFERENCE_COUNT 0x1105 +#define CL_MEM_CONTEXT 0x1106 +#define CL_MEM_ASSOCIATED_MEMOBJECT 0x1107 +#define CL_MEM_OFFSET 0x1108 +#define CL_MEM_USES_SVM_POINTER 0x1109 +#define CL_MEM_PROPERTIES 0x110A + +/* cl_image_info */ +#define CL_IMAGE_FORMAT 0x1110 +#define CL_IMAGE_ELEMENT_SIZE 0x1111 +#define CL_IMAGE_ROW_PITCH 0x1112 +#define CL_IMAGE_SLICE_PITCH 0x1113 +#define CL_IMAGE_WIDTH 0x1114 +#define CL_IMAGE_HEIGHT 0x1115 
+#define CL_IMAGE_DEPTH 0x1116 +#define CL_IMAGE_ARRAY_SIZE 0x1117 +#define CL_IMAGE_BUFFER 0x1118 +#define CL_IMAGE_NUM_MIP_LEVELS 0x1119 +#define CL_IMAGE_NUM_SAMPLES 0x111A + +/* cl_pipe_info */ +#define CL_PIPE_PACKET_SIZE 0x1120 +#define CL_PIPE_MAX_PACKETS 0x1121 +#define CL_PIPE_PROPERTIES 0x1122 + +/* cl_addressing_mode */ +#define CL_ADDRESS_NONE 0x1130 +#define CL_ADDRESS_CLAMP_TO_EDGE 0x1131 +#define CL_ADDRESS_CLAMP 0x1132 +#define CL_ADDRESS_REPEAT 0x1133 +#define CL_ADDRESS_MIRRORED_REPEAT 0x1134 + +/* cl_filter_mode */ +#define CL_FILTER_NEAREST 0x1140 +#define CL_FILTER_LINEAR 0x1141 + +/* cl_sampler_info */ +#define CL_SAMPLER_REFERENCE_COUNT 0x1150 +#define CL_SAMPLER_CONTEXT 0x1151 +#define CL_SAMPLER_NORMALIZED_COORDS 0x1152 +#define CL_SAMPLER_ADDRESSING_MODE 0x1153 +#define CL_SAMPLER_FILTER_MODE 0x1154 +/* These enumerants are for the cl_khr_mipmap_image extension. + They have since been added to cl_ext.h with an appropriate + KHR suffix, but are left here for backwards compatibility. */ +#define CL_SAMPLER_MIP_FILTER_MODE 0x1155 +#define CL_SAMPLER_LOD_MIN 0x1156 +#define CL_SAMPLER_LOD_MAX 0x1157 +#define CL_SAMPLER_PROPERTIES 0x1158 + +/* cl_map_flags - bitfield */ +#define CL_MAP_READ (1 << 0) +#define CL_MAP_WRITE (1 << 1) +#define CL_MAP_WRITE_INVALIDATE_REGION (1 << 2) + +/* cl_program_info */ +#define CL_PROGRAM_REFERENCE_COUNT 0x1160 +#define CL_PROGRAM_CONTEXT 0x1161 +#define CL_PROGRAM_NUM_DEVICES 0x1162 +#define CL_PROGRAM_DEVICES 0x1163 +#define CL_PROGRAM_SOURCE 0x1164 +#define CL_PROGRAM_BINARY_SIZES 0x1165 +#define CL_PROGRAM_BINARIES 0x1166 +#define CL_PROGRAM_NUM_KERNELS 0x1167 +#define CL_PROGRAM_KERNEL_NAMES 0x1168 +#define CL_PROGRAM_IL 0x1169 +#define CL_PROGRAM_SCOPE_GLOBAL_CTORS_PRESENT 0x116A +#define CL_PROGRAM_SCOPE_GLOBAL_DTORS_PRESENT 0x116B + +/* cl_program_build_info */ +#define CL_PROGRAM_BUILD_STATUS 0x1181 +#define CL_PROGRAM_BUILD_OPTIONS 0x1182 +#define CL_PROGRAM_BUILD_LOG 0x1183 +#define CL_PROGRAM_BINARY_TYPE 0x1184 +#define CL_PROGRAM_BUILD_GLOBAL_VARIABLE_TOTAL_SIZE 0x1185 + +/* cl_program_binary_type */ +#define CL_PROGRAM_BINARY_TYPE_NONE 0x0 +#define CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT 0x1 +#define CL_PROGRAM_BINARY_TYPE_LIBRARY 0x2 +#define CL_PROGRAM_BINARY_TYPE_EXECUTABLE 0x4 + +/* cl_build_status */ +#define CL_BUILD_SUCCESS 0 +#define CL_BUILD_NONE -1 +#define CL_BUILD_ERROR -2 +#define CL_BUILD_IN_PROGRESS -3 + +/* cl_kernel_info */ +#define CL_KERNEL_FUNCTION_NAME 0x1190 +#define CL_KERNEL_NUM_ARGS 0x1191 +#define CL_KERNEL_REFERENCE_COUNT 0x1192 +#define CL_KERNEL_CONTEXT 0x1193 +#define CL_KERNEL_PROGRAM 0x1194 +#define CL_KERNEL_ATTRIBUTES 0x1195 + +/* cl_kernel_arg_info */ +#define CL_KERNEL_ARG_ADDRESS_QUALIFIER 0x1196 +#define CL_KERNEL_ARG_ACCESS_QUALIFIER 0x1197 +#define CL_KERNEL_ARG_TYPE_NAME 0x1198 +#define CL_KERNEL_ARG_TYPE_QUALIFIER 0x1199 +#define CL_KERNEL_ARG_NAME 0x119A + +/* cl_kernel_arg_address_qualifier */ +#define CL_KERNEL_ARG_ADDRESS_GLOBAL 0x119B +#define CL_KERNEL_ARG_ADDRESS_LOCAL 0x119C +#define CL_KERNEL_ARG_ADDRESS_CONSTANT 0x119D +#define CL_KERNEL_ARG_ADDRESS_PRIVATE 0x119E + +/* cl_kernel_arg_access_qualifier */ +#define CL_KERNEL_ARG_ACCESS_READ_ONLY 0x11A0 +#define CL_KERNEL_ARG_ACCESS_WRITE_ONLY 0x11A1 +#define CL_KERNEL_ARG_ACCESS_READ_WRITE 0x11A2 +#define CL_KERNEL_ARG_ACCESS_NONE 0x11A3 + +/* cl_kernel_arg_type_qualifier */ +#define CL_KERNEL_ARG_TYPE_NONE 0 +#define CL_KERNEL_ARG_TYPE_CONST (1 << 0) +#define CL_KERNEL_ARG_TYPE_RESTRICT (1 << 1) +#define 
CL_KERNEL_ARG_TYPE_VOLATILE (1 << 2) +#define CL_KERNEL_ARG_TYPE_PIPE (1 << 3) + +/* cl_kernel_work_group_info */ +#define CL_KERNEL_WORK_GROUP_SIZE 0x11B0 +#define CL_KERNEL_COMPILE_WORK_GROUP_SIZE 0x11B1 +#define CL_KERNEL_LOCAL_MEM_SIZE 0x11B2 +#define CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE 0x11B3 +#define CL_KERNEL_PRIVATE_MEM_SIZE 0x11B4 +#define CL_KERNEL_GLOBAL_WORK_SIZE 0x11B5 + +/* cl_kernel_sub_group_info */ +#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE 0x2033 +#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE 0x2034 +#define CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT 0x11B8 +#define CL_KERNEL_MAX_NUM_SUB_GROUPS 0x11B9 +#define CL_KERNEL_COMPILE_NUM_SUB_GROUPS 0x11BA + +/* cl_kernel_exec_info */ +#define CL_KERNEL_EXEC_INFO_SVM_PTRS 0x11B6 +#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM 0x11B7 + +/* cl_event_info */ +#define CL_EVENT_COMMAND_QUEUE 0x11D0 +#define CL_EVENT_COMMAND_TYPE 0x11D1 +#define CL_EVENT_REFERENCE_COUNT 0x11D2 +#define CL_EVENT_COMMAND_EXECUTION_STATUS 0x11D3 +#define CL_EVENT_CONTEXT 0x11D4 + +/* cl_command_type */ +#define CL_COMMAND_NDRANGE_KERNEL 0x11F0 +#define CL_COMMAND_TASK 0x11F1 +#define CL_COMMAND_NATIVE_KERNEL 0x11F2 +#define CL_COMMAND_READ_BUFFER 0x11F3 +#define CL_COMMAND_WRITE_BUFFER 0x11F4 +#define CL_COMMAND_COPY_BUFFER 0x11F5 +#define CL_COMMAND_READ_IMAGE 0x11F6 +#define CL_COMMAND_WRITE_IMAGE 0x11F7 +#define CL_COMMAND_COPY_IMAGE 0x11F8 +#define CL_COMMAND_COPY_IMAGE_TO_BUFFER 0x11F9 +#define CL_COMMAND_COPY_BUFFER_TO_IMAGE 0x11FA +#define CL_COMMAND_MAP_BUFFER 0x11FB +#define CL_COMMAND_MAP_IMAGE 0x11FC +#define CL_COMMAND_UNMAP_MEM_OBJECT 0x11FD +#define CL_COMMAND_MARKER 0x11FE +#define CL_COMMAND_ACQUIRE_GL_OBJECTS 0x11FF +#define CL_COMMAND_RELEASE_GL_OBJECTS 0x1200 +#define CL_COMMAND_READ_BUFFER_RECT 0x1201 +#define CL_COMMAND_WRITE_BUFFER_RECT 0x1202 +#define CL_COMMAND_COPY_BUFFER_RECT 0x1203 +#define CL_COMMAND_USER 0x1204 +#define CL_COMMAND_BARRIER 0x1205 +#define CL_COMMAND_MIGRATE_MEM_OBJECTS 0x1206 +#define CL_COMMAND_FILL_BUFFER 0x1207 +#define CL_COMMAND_FILL_IMAGE 0x1208 +#define CL_COMMAND_SVM_FREE 0x1209 +#define CL_COMMAND_SVM_MEMCPY 0x120A +#define CL_COMMAND_SVM_MEMFILL 0x120B +#define CL_COMMAND_SVM_MAP 0x120C +#define CL_COMMAND_SVM_UNMAP 0x120D +#define CL_COMMAND_SVM_MIGRATE_MEM 0x120E + +/* command execution status */ +#define CL_COMPLETE 0x0 +#define CL_RUNNING 0x1 +#define CL_SUBMITTED 0x2 +#define CL_QUEUED 0x3 + +/* cl_buffer_create_type */ +#define CL_BUFFER_CREATE_TYPE_REGION 0x1220 + +/* cl_profiling_info */ +#define CL_PROFILING_COMMAND_QUEUED 0x1280 +#define CL_PROFILING_COMMAND_SUBMIT 0x1281 +#define CL_PROFILING_COMMAND_START 0x1282 +#define CL_PROFILING_COMMAND_END 0x1283 +#define CL_PROFILING_COMMAND_COMPLETE 0x1284 + +/* cl_device_atomic_capabilities - bitfield */ +#define CL_DEVICE_ATOMIC_ORDER_RELAXED (1 << 0) +#define CL_DEVICE_ATOMIC_ORDER_ACQ_REL (1 << 1) +#define CL_DEVICE_ATOMIC_ORDER_SEQ_CST (1 << 2) +#define CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM (1 << 3) +#define CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP (1 << 4) +#define CL_DEVICE_ATOMIC_SCOPE_DEVICE (1 << 5) +#define CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES (1 << 6) + +/* cl_device_device_enqueue_capabilities - bitfield */ +#define CL_DEVICE_QUEUE_SUPPORTED (1 << 0) +#define CL_DEVICE_QUEUE_REPLACEABLE_DEFAULT (1 << 1) + +/* cl_khronos_vendor_id */ +#define CL_KHRONOS_VENDOR_ID_CODEPLAY 0x10004 + +/* cl_version */ +#define CL_VERSION_MAJOR_BITS (10) +#define CL_VERSION_MINOR_BITS (10) +#define CL_VERSION_PATCH_BITS (12) + +#define 
CL_VERSION_MAJOR_MASK ((1 << CL_VERSION_MAJOR_BITS) - 1) +#define CL_VERSION_MINOR_MASK ((1 << CL_VERSION_MINOR_BITS) - 1) +#define CL_VERSION_PATCH_MASK ((1 << CL_VERSION_PATCH_BITS) - 1) + +#define CL_VERSION_MAJOR(version) \ + ((version) >> (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS)) + +#define CL_VERSION_MINOR(version) \ + (((version) >> CL_VERSION_PATCH_BITS) & CL_VERSION_MINOR_MASK) + +#define CL_VERSION_PATCH(version) ((version) & CL_VERSION_PATCH_MASK) + +#define CL_MAKE_VERSION(major, minor, patch) \ + ((((major) & CL_VERSION_MAJOR_MASK) \ + << (CL_VERSION_MINOR_BITS + CL_VERSION_PATCH_BITS)) | \ + (((minor) & CL_VERSION_MINOR_MASK) << CL_VERSION_PATCH_BITS) | \ + ((patch) & CL_VERSION_PATCH_MASK)) + +/********************************************************************************************************/ + +/* Platform API */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformIDs(cl_uint num_entries, + cl_platform_id * platforms, + cl_uint * num_platforms) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPlatformInfo(cl_platform_id platform, + cl_platform_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +/* Device APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceIDs(cl_platform_id platform, + cl_device_type device_type, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceInfo(cl_device_id device, + cl_device_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevices(cl_device_id in_device, + const cl_device_partition_property * properties, + cl_uint num_devices, + cl_device_id * out_devices, + cl_uint * num_devices_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDevice(cl_device_id device) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetDefaultDeviceCommandQueue(cl_context context, + cl_device_id device, + cl_command_queue command_queue) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceAndHostTimer(cl_device_id device, + cl_ulong* device_timestamp, + cl_ulong* host_timestamp) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetHostTimer(cl_device_id device, + cl_ulong * host_timestamp) CL_API_SUFFIX__VERSION_2_1; + +/* Context APIs */ +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContext(const cl_context_properties * properties, + cl_uint num_devices, + const cl_device_id * devices, + void (CL_CALLBACK * pfn_notify)(const char * errinfo, + const void * private_info, + size_t cb, + void * user_data), + void * user_data, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_context CL_API_CALL +clCreateContextFromType(const cl_context_properties * properties, + cl_device_type device_type, + void (CL_CALLBACK * pfn_notify)(const char * errinfo, + const void * private_info, + size_t cb, + void * user_data), + void * user_data, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainContext(cl_context context) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseContext(cl_context context) 
CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetContextInfo(cl_context context, + cl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetContextDestructorCallback(cl_context context, + void (CL_CALLBACK* pfn_notify)(cl_context context, + void* user_data), + void* user_data) CL_API_SUFFIX__VERSION_3_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueTask(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +/* Command Queue APIs */ +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueue(cl_context context, + cl_device_id device, + cl_command_queue_properties properties, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueueWithProperties(cl_context context, + cl_device_id device, + const cl_queue_properties * properties, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseCommandQueue(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetCommandQueueInfo(cl_command_queue command_queue, + cl_command_queue_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +/* Memory Object APIs */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBuffer(cl_context context, + cl_mem_flags flags, + size_t size, + void * host_ptr, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateSubBuffer(cl_mem buffer, + cl_mem_flags flags, + cl_buffer_create_type buffer_create_type, + const void * buffer_create_info, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImage(cl_context context, + cl_mem_flags flags, + const cl_image_format * image_format, + const cl_image_desc * image_desc, + void * host_ptr, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreatePipe(cl_context context, + cl_mem_flags flags, + cl_uint pipe_packet_size, + cl_uint pipe_max_packets, + const cl_pipe_properties * properties, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBufferWithProperties(cl_context context, + const cl_mem_properties * properties, + cl_mem_flags flags, + size_t size, + void * host_ptr, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_3_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImageWithProperties(cl_context context, + const cl_mem_properties * properties, + cl_mem_flags flags, + const cl_image_format * image_format, + const cl_image_desc * image_desc, + void * host_ptr, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_3_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseMemObject(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSupportedImageFormats(cl_context context, + cl_mem_flags flags, + cl_mem_object_type image_type, + cl_uint num_entries, + cl_image_format * image_formats, + cl_uint * 
num_image_formats) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetMemObjectInfo(cl_mem memobj, + cl_mem_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetImageInfo(cl_mem image, + cl_image_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetPipeInfo(cl_mem pipe, + cl_pipe_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetMemObjectDestructorCallback(cl_mem memobj, + void (CL_CALLBACK * pfn_notify)(cl_mem memobj, + void * user_data), + void * user_data) CL_API_SUFFIX__VERSION_1_1; + +/* SVM Allocation APIs */ +extern CL_API_ENTRY void * CL_API_CALL +clSVMAlloc(cl_context context, + cl_svm_mem_flags flags, + size_t size, + cl_uint alignment) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY void CL_API_CALL +clSVMFree(cl_context context, + void * svm_pointer) CL_API_SUFFIX__VERSION_2_0; + +/* Sampler APIs */ +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSampler(cl_context context, + cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_sampler CL_API_CALL +clCreateSamplerWithProperties(cl_context context, + const cl_sampler_properties * sampler_properties, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSampler(cl_sampler sampler) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSamplerInfo(cl_sampler sampler, + cl_sampler_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +/* Program Object APIs */ +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithSource(cl_context context, + cl_uint count, + const char ** strings, + const size_t * lengths, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBinary(cl_context context, + cl_uint num_devices, + const cl_device_id * device_list, + const size_t * lengths, + const unsigned char ** binaries, + cl_int * binary_status, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithBuiltInKernels(cl_context context, + cl_uint num_devices, + const cl_device_id * device_list, + const char * kernel_names, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithIL(cl_context context, + const void* il, + size_t length, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseProgram(cl_program program) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clBuildProgram(cl_program program, + cl_uint num_devices, + const cl_device_id * device_list, + const char * options, + void (CL_CALLBACK * pfn_notify)(cl_program program, + void * user_data), + void * user_data) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY 
cl_int CL_API_CALL +clCompileProgram(cl_program program, + cl_uint num_devices, + const cl_device_id * device_list, + const char * options, + cl_uint num_input_headers, + const cl_program * input_headers, + const char ** header_include_names, + void (CL_CALLBACK * pfn_notify)(cl_program program, + void * user_data), + void * user_data) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_program CL_API_CALL +clLinkProgram(cl_context context, + cl_uint num_devices, + const cl_device_id * device_list, + const char * options, + cl_uint num_input_programs, + const cl_program * input_programs, + void (CL_CALLBACK * pfn_notify)(cl_program program, + void * user_data), + void * user_data, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_2_2_DEPRECATED cl_int CL_API_CALL +clSetProgramReleaseCallback(cl_program program, + void (CL_CALLBACK * pfn_notify)(cl_program program, + void * user_data), + void * user_data) CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetProgramSpecializationConstant(cl_program program, + cl_uint spec_id, + size_t spec_size, + const void* spec_value) CL_API_SUFFIX__VERSION_2_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clUnloadPlatformCompiler(cl_platform_id platform) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramInfo(cl_program program, + cl_program_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetProgramBuildInfo(cl_program program, + cl_device_id device, + cl_program_build_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +/* Kernel Object APIs */ +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCreateKernel(cl_program program, + const char * kernel_name, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateKernelsInProgram(cl_program program, + cl_uint num_kernels, + cl_kernel * kernels, + cl_uint * num_kernels_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_kernel CL_API_CALL +clCloneKernel(cl_kernel source_kernel, + cl_int* errcode_ret) CL_API_SUFFIX__VERSION_2_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseKernel(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArg(cl_kernel kernel, + cl_uint arg_index, + size_t arg_size, + const void * arg_value) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgSVMPointer(cl_kernel kernel, + cl_uint arg_index, + const void * arg_value) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelExecInfo(cl_kernel kernel, + cl_kernel_exec_info param_name, + size_t param_value_size, + const void * param_value) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelInfo(cl_kernel kernel, + cl_kernel_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelArgInfo(cl_kernel kernel, + cl_uint arg_indx, + cl_kernel_arg_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL 
+clGetKernelWorkGroupInfo(cl_kernel kernel, + cl_device_id device, + cl_kernel_work_group_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelSubGroupInfo(cl_kernel kernel, + cl_device_id device, + cl_kernel_sub_group_info param_name, + size_t input_value_size, + const void* input_value, + size_t param_value_size, + void* param_value, + size_t* param_value_size_ret) CL_API_SUFFIX__VERSION_2_1; + +/* Event Object APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clWaitForEvents(cl_uint num_events, + const cl_event * event_list) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventInfo(cl_event event, + cl_event_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateUserEvent(cl_context context, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetUserEventStatus(cl_event event, + cl_int execution_status) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetEventCallback(cl_event event, + cl_int command_exec_callback_type, + void (CL_CALLBACK * pfn_notify)(cl_event event, + cl_int event_command_status, + void * user_data), + void * user_data) CL_API_SUFFIX__VERSION_1_1; + +/* Profiling APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clGetEventProfilingInfo(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +/* Flush and Finish APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clFlush(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clFinish(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0; + +/* Enqueued Commands APIs */ +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + size_t offset, + size_t size, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadBufferRect(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_read, + const size_t * buffer_offset, + const size_t * host_offset, + const size_t * region, + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t host_row_pitch, + size_t host_slice_pitch, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + size_t offset, + size_t size, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteBufferRect(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_write, + const size_t * buffer_offset, + const size_t * host_offset, + const size_t * region, + size_t buffer_row_pitch, + size_t buffer_slice_pitch, + size_t 
host_row_pitch, + size_t host_slice_pitch, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillBuffer(cl_command_queue command_queue, + cl_mem buffer, + const void * pattern, + size_t pattern_size, + size_t offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBuffer(cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + size_t src_offset, + size_t dst_offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferRect(cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_buffer, + const size_t * src_origin, + const size_t * dst_origin, + const size_t * region, + size_t src_row_pitch, + size_t src_slice_pitch, + size_t dst_row_pitch, + size_t dst_slice_pitch, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReadImage(cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_read, + const size_t * origin, + const size_t * region, + size_t row_pitch, + size_t slice_pitch, + void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWriteImage(cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_write, + const size_t * origin, + const size_t * region, + size_t input_row_pitch, + size_t input_slice_pitch, + const void * ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueFillImage(cl_command_queue command_queue, + cl_mem image, + const void * fill_color, + const size_t * origin, + const size_t * region, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImage(cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_image, + const size_t * src_origin, + const size_t * dst_origin, + const size_t * region, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyImageToBuffer(cl_command_queue command_queue, + cl_mem src_image, + cl_mem dst_buffer, + const size_t * src_origin, + const size_t * region, + size_t dst_offset, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueCopyBufferToImage(cl_command_queue command_queue, + cl_mem src_buffer, + cl_mem dst_image, + size_t src_offset, + const size_t * dst_origin, + const size_t * region, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapBuffer(cl_command_queue command_queue, + cl_mem buffer, + cl_bool blocking_map, + cl_map_flags map_flags, + size_t offset, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * 
event_wait_list, + cl_event * event, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY void * CL_API_CALL +clEnqueueMapImage(cl_command_queue command_queue, + cl_mem image, + cl_bool blocking_map, + cl_map_flags map_flags, + const size_t * origin, + const size_t * region, + size_t * image_row_pitch, + size_t * image_slice_pitch, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueUnmapMemObject(cl_command_queue command_queue, + cl_mem memobj, + void * mapped_ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjects(cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem * mem_objects, + cl_mem_migration_flags flags, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNDRangeKernel(cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t * global_work_offset, + const size_t * global_work_size, + const size_t * local_work_size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueNativeKernel(cl_command_queue command_queue, + void (CL_CALLBACK * user_func)(void *), + void * args, + size_t cb_args, + cl_uint num_mem_objects, + const cl_mem * mem_list, + const void ** args_mem_loc, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMarkerWithWaitList(cl_command_queue command_queue, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueBarrierWithWaitList(cl_command_queue command_queue, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMFree(cl_command_queue command_queue, + cl_uint num_svm_pointers, + void * svm_pointers[], + void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue, + cl_uint num_svm_pointers, + void * svm_pointers[], + void * user_data), + void * user_data, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemcpy(cl_command_queue command_queue, + cl_bool blocking_copy, + void * dst_ptr, + const void * src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemFill(cl_command_queue command_queue, + void * svm_ptr, + const void * pattern, + size_t pattern_size, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMap(cl_command_queue command_queue, + cl_bool blocking_map, + cl_map_flags flags, + void * svm_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_2_0; + +extern 
CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMUnmap(cl_command_queue command_queue, + void * svm_ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_2_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMigrateMem(cl_command_queue command_queue, + cl_uint num_svm_pointers, + const void ** svm_pointers, + const size_t * sizes, + cl_mem_migration_flags flags, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_2_1; + +/* Extension function access + * + * Returns the extension function address for the given function name, + * or NULL if a valid function can not be found. The client must + * check to make sure the address is not NULL, before using or + * calling the returned function address. + */ +extern CL_API_ENTRY void * CL_API_CALL +clGetExtensionFunctionAddressForPlatform(cl_platform_id platform, + const char * func_name) CL_API_SUFFIX__VERSION_1_2; + +#ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + /* + * WARNING: + * This API introduces mutable state into the OpenCL implementation. It has been REMOVED + * to better facilitate thread safety. The 1.0 API is not thread safe. It is not tested by the + * OpenCL 1.1 conformance test, and consequently may not work or may not work dependably. + * It is likely to be non-performant. Use of this API is not advised. Use at your own risk. + * + * Software developers previously relying on this API are instructed to set the command queue + * properties when creating the queue, instead. + */ + extern CL_API_ENTRY cl_int CL_API_CALL + clSetCommandQueueProperty(cl_command_queue command_queue, + cl_command_queue_properties properties, + cl_bool enable, + cl_command_queue_properties * old_properties) CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED; +#endif /* CL_USE_DEPRECATED_OPENCL_1_0_APIS */ + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage2D(cl_context context, + cl_mem_flags flags, + const cl_image_format * image_format, + size_t image_width, + size_t image_height, + size_t image_row_pitch, + void * host_ptr, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateImage3D(cl_context context, + cl_mem_flags flags, + const cl_image_format * image_format, + size_t image_width, + size_t image_height, + size_t image_depth, + size_t image_row_pitch, + size_t image_slice_pitch, + void * host_ptr, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueMarker(cl_command_queue command_queue, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueWaitForEvents(cl_command_queue command_queue, + cl_uint num_events, + const cl_event * event_list) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clEnqueueBarrier(cl_command_queue command_queue) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int CL_API_CALL +clUnloadCompiler(void) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED void * CL_API_CALL +clGetExtensionFunctionAddress(const char * func_name) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +#ifdef __cplusplus +} 
+#endif + +#endif /* __OPENCL_CL_H */ diff --git a/ext/cudart/include/CL/cl.hpp b/ext/cudart/include/CL/cl.hpp new file mode 100644 index 0000000000000000000000000000000000000000..263e0ab4717ed411621f8cb3330aed385c672151 --- /dev/null +++ b/ext/cudart/include/CL/cl.hpp @@ -0,0 +1,12419 @@ +/******************************************************************************* + * Copyright (c) 2008-2013 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + ******************************************************************************/ + +/*! \file + * + * \brief C++ bindings for OpenCL 1.0 (rev 48), OpenCL 1.1 (rev 33) and + * OpenCL 1.2 (rev 15) + * \author Benedict R. Gaster, Laurent Morichetti and Lee Howes + * + * Additions and fixes from: + * Brian Cole, March 3rd 2010 and April 2012 + * Matt Gruenke, April 2012. + * Bruce Merry, February 2013. + * + * \version 1.2.5 + * \date June 2013 + * + * Optional extension support + * + * cl + * cl_ext_device_fission + * #define USE_CL_DEVICE_FISSION + */ + +/*! \mainpage + * \section intro Introduction + * For many large applications C++ is the language of choice and so it seems + * reasonable to define C++ bindings for OpenCL. + * + * + * The interface is contained with a single C++ header file \em cl.hpp and all + * definitions are contained within the namespace \em cl. There is no additional + * requirement to include \em cl.h and to use either the C++ or original C + * bindings it is enough to simply include \em cl.hpp. + * + * The bindings themselves are lightweight and correspond closely to the + * underlying C API. Using the C++ bindings introduces no additional execution + * overhead. + * + * For detail documentation on the bindings see: + * + * The OpenCL C++ Wrapper API 1.2 (revision 09) + * http://www.khronos.org/registry/cl/specs/opencl-cplusplus-1.2.pdf + * + * \section example Example + * + * The following example shows a general use case for the C++ + * bindings, including support for the optional exception feature and + * also the supplied vector and string classes, see following sections for + * decriptions of these features. 
+ * + * \code + * #define __CL_ENABLE_EXCEPTIONS + * + * #if defined(__APPLE__) || defined(__MACOSX) + * #include <OpenCL/cl.hpp> + * #else + * #include <CL/cl.hpp> + * #endif + * #include <cstdio> + * #include <cstdlib> + * #include <iostream> + * + * const char * helloStr = "__kernel void " + * "hello(void) " + * "{ " + * " " + * "} "; + * + * int + * main(void) + * { + * cl_int err = CL_SUCCESS; + * try { + * + * std::vector<cl::Platform> platforms; + * cl::Platform::get(&platforms); + * if (platforms.size() == 0) { + * std::cout << "Platform size 0\n"; + * return -1; + * } + * + * cl_context_properties properties[] = + * { CL_CONTEXT_PLATFORM, (cl_context_properties)(platforms[0])(), 0}; + * cl::Context context(CL_DEVICE_TYPE_CPU, properties); + * + * std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>(); + * + * cl::Program::Sources source(1, + * std::make_pair(helloStr,strlen(helloStr))); + * cl::Program program_ = cl::Program(context, source); + * program_.build(devices); + * + * cl::Kernel kernel(program_, "hello", &err); + * + * cl::Event event; + * cl::CommandQueue queue(context, devices[0], 0, &err); + * queue.enqueueNDRangeKernel( + * kernel, + * cl::NullRange, + * cl::NDRange(4,4), + * cl::NullRange, + * NULL, + * &event); + * + * event.wait(); + * } + * catch (cl::Error err) { + * std::cerr + * << "ERROR: " + * << err.what() + * << "(" + * << err.err() + * << ")" + * << std::endl; + * } + * + * return EXIT_SUCCESS; + * } + * + * \endcode + * + */ +#ifndef CL_HPP_ +#define CL_HPP_ + +#ifdef _WIN32 + +#include <windows.h> +#include <malloc.h> +#include <iterator> +#include <intrin.h> + +#if defined(__CL_ENABLE_EXCEPTIONS) +#include <exception> +#endif // #if defined(__CL_ENABLE_EXCEPTIONS) + +#pragma push_macro("max") +#undef max +#if defined(USE_DX_INTEROP) +#include <CL/cl_d3d10.h> +#include <CL/cl_dx9_media_sharing.h> +#endif +#endif // _WIN32 + +// +#if defined(USE_CL_DEVICE_FISSION) +#include <CL/cl_ext.h> +#endif + +#if defined(__APPLE__) || defined(__MACOSX) +#include <OpenGL/OpenGL.h> +#include <OpenCL/opencl.h> +#include <libkern/OSAtomic.h> +#else +#include <GL/gl.h> +#include <CL/opencl.h> +#endif // !__APPLE__ + +// To avoid accidentally taking ownership of core OpenCL types +// such as cl_kernel constructors are made explicit +// under OpenCL 1.2 +#if defined(CL_VERSION_1_2) && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +#define __CL_EXPLICIT_CONSTRUCTORS explicit +#else // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +#define __CL_EXPLICIT_CONSTRUCTORS +#endif // #if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + +// Define deprecated prefixes and suffixes to ensure compilation +// in case they are not pre-defined +#if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) +#define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) +#if !defined(CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED) +#define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_1_DEPRECATED) +#if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) +#define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) +#if !defined(CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED) +#define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_1_2_DEPRECATED) +#if !defined(CL_EXT_PREFIX__VERSION_2_0_DEPRECATED) +#define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_0_DEPRECATED) +#if 
!defined(CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED) +#define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_0_DEPRECATED) +#if !defined(CL_EXT_PREFIX__VERSION_2_1_DEPRECATED) +#define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_1_DEPRECATED) +#if !defined(CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED) +#define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_1_DEPRECATED) +#if !defined(CL_EXT_PREFIX__VERSION_2_2_DEPRECATED) +#define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_2_DEPRECATED) +#if !defined(CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED) +#define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED +#endif // #if !defined(CL_EXT_PREFIX__VERSION_2_2_DEPRECATED) + +#if !defined(CL_CALLBACK) +#define CL_CALLBACK +#endif //CL_CALLBACK + +#include <utility> +#include <limits> + +#if !defined(__NO_STD_VECTOR) +#include <vector> +#endif + +#if !defined(__NO_STD_STRING) +#include <string> +#endif + +#if defined(linux) || defined(__APPLE__) || defined(__MACOSX) +#include <alloca.h> + +#include <emmintrin.h> +#include <xmmintrin.h> +#endif // linux + +#include <cstring> + + +/*! \namespace cl + * + * \brief The OpenCL C++ bindings are defined within this namespace. + * + */ +namespace cl { + +class Memory; + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) +#define __INIT_CL_EXT_FCN_PTR(name) \ + if(!pfn_##name) { \ + pfn_##name = (PFN_##name) \ + clGetExtensionFunctionAddress(#name); \ + if(!pfn_##name) { \ + } \ + } +#endif // #if defined(CL_VERSION_1_1) + +#if defined(CL_VERSION_1_2) +#define __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, name) \ + if(!pfn_##name) { \ + pfn_##name = (PFN_##name) \ + clGetExtensionFunctionAddressForPlatform(platform, #name); \ + if(!pfn_##name) { \ + } \ + } +#endif // #if defined(CL_VERSION_1_1) + +class Program; +class Device; +class Context; +class CommandQueue; +class Memory; +class Buffer; + +#if defined(__CL_ENABLE_EXCEPTIONS) +/*! \brief Exception class + * + * This may be thrown by API functions when __CL_ENABLE_EXCEPTIONS is defined. + */ +class Error : public std::exception +{ +private: + cl_int err_; + const char * errStr_; +public: + /*! \brief Create a new CL error exception for a given error code + * and corresponding message. + * + * \param err error code value. + * + * \param errStr a descriptive string that must remain in scope until + * handling of the exception has concluded. If set, it + * will be returned by what(). + */ + Error(cl_int err, const char * errStr = NULL) : err_(err), errStr_(errStr) + {} + + ~Error() throw() {} + + /*! \brief Get error string associated with exception + * + * \return A memory pointer to the error message string. + */ + virtual const char * what() const throw () + { + if (errStr_ == NULL) { + return "empty"; + } + else { + return errStr_; + } + } + + /*! \brief Get error code associated with exception + * + * \return The error code. 
+ */ + cl_int err(void) const { return err_; } +}; + +#define __ERR_STR(x) #x +#else +#define __ERR_STR(x) NULL +#endif // __CL_ENABLE_EXCEPTIONS + + +namespace detail +{ +#if defined(__CL_ENABLE_EXCEPTIONS) +static inline cl_int errHandler ( + cl_int err, + const char * errStr = NULL) +{ + if (err != CL_SUCCESS) { + throw Error(err, errStr); + } + return err; +} +#else +static inline cl_int errHandler (cl_int err, const char * errStr = NULL) +{ + (void) errStr; // suppress unused variable warning + return err; +} +#endif // __CL_ENABLE_EXCEPTIONS +} + + + +//! \cond DOXYGEN_DETAIL +#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) +#define __GET_DEVICE_INFO_ERR __ERR_STR(clGetDeviceInfo) +#define __GET_PLATFORM_INFO_ERR __ERR_STR(clGetPlatformInfo) +#define __GET_DEVICE_IDS_ERR __ERR_STR(clGetDeviceIDs) +#define __GET_PLATFORM_IDS_ERR __ERR_STR(clGetPlatformIDs) +#define __GET_CONTEXT_INFO_ERR __ERR_STR(clGetContextInfo) +#define __GET_EVENT_INFO_ERR __ERR_STR(clGetEventInfo) +#define __GET_EVENT_PROFILE_INFO_ERR __ERR_STR(clGetEventProfileInfo) +#define __GET_MEM_OBJECT_INFO_ERR __ERR_STR(clGetMemObjectInfo) +#define __GET_IMAGE_INFO_ERR __ERR_STR(clGetImageInfo) +#define __GET_SAMPLER_INFO_ERR __ERR_STR(clGetSamplerInfo) +#define __GET_KERNEL_INFO_ERR __ERR_STR(clGetKernelInfo) +#if defined(CL_VERSION_1_2) +#define __GET_KERNEL_ARG_INFO_ERR __ERR_STR(clGetKernelArgInfo) +#endif // #if defined(CL_VERSION_1_2) +#define __GET_KERNEL_WORK_GROUP_INFO_ERR __ERR_STR(clGetKernelWorkGroupInfo) +#define __GET_PROGRAM_INFO_ERR __ERR_STR(clGetProgramInfo) +#define __GET_PROGRAM_BUILD_INFO_ERR __ERR_STR(clGetProgramBuildInfo) +#define __GET_COMMAND_QUEUE_INFO_ERR __ERR_STR(clGetCommandQueueInfo) + +#define __CREATE_CONTEXT_ERR __ERR_STR(clCreateContext) +#define __CREATE_CONTEXT_FROM_TYPE_ERR __ERR_STR(clCreateContextFromType) +#define __GET_SUPPORTED_IMAGE_FORMATS_ERR __ERR_STR(clGetSupportedImageFormats) + +#define __CREATE_BUFFER_ERR __ERR_STR(clCreateBuffer) +#define __COPY_ERR __ERR_STR(cl::copy) +#define __CREATE_SUBBUFFER_ERR __ERR_STR(clCreateSubBuffer) +#define __CREATE_GL_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer) +#define __CREATE_GL_RENDER_BUFFER_ERR __ERR_STR(clCreateFromGLBuffer) +#define __GET_GL_OBJECT_INFO_ERR __ERR_STR(clGetGLObjectInfo) +#if defined(CL_VERSION_1_2) +#define __CREATE_IMAGE_ERR __ERR_STR(clCreateImage) +#define __CREATE_GL_TEXTURE_ERR __ERR_STR(clCreateFromGLTexture) +#define __IMAGE_DIMENSION_ERR __ERR_STR(Incorrect image dimensions) +#endif // #if defined(CL_VERSION_1_2) +#define __CREATE_SAMPLER_PROPERTY_ERR __ERR_STR(clCreateSamplerWithProperties) +#define __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR __ERR_STR(clSetMemObjectDestructorCallback) + +#define __CREATE_USER_EVENT_ERR __ERR_STR(clCreateUserEvent) +#define __SET_USER_EVENT_STATUS_ERR __ERR_STR(clSetUserEventStatus) +#define __SET_EVENT_CALLBACK_ERR __ERR_STR(clSetEventCallback) +#define __WAIT_FOR_EVENTS_ERR __ERR_STR(clWaitForEvents) + +#define __CREATE_KERNEL_ERR __ERR_STR(clCreateKernel) +#define __SET_KERNEL_ARGS_ERR __ERR_STR(clSetKernelArg) +#define __CREATE_PROGRAM_WITH_SOURCE_ERR __ERR_STR(clCreateProgramWithSource) +#define __CREATE_PROGRAM_WITH_BINARY_ERR __ERR_STR(clCreateProgramWithBinary) +#if defined(CL_VERSION_1_2) +#define __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR __ERR_STR(clCreateProgramWithBuiltInKernels) +#endif // #if defined(CL_VERSION_1_2) +#define __BUILD_PROGRAM_ERR __ERR_STR(clBuildProgram) +#if defined(CL_VERSION_1_2) +#define __COMPILE_PROGRAM_ERR __ERR_STR(clCompileProgram) + 
+#endif // #if defined(CL_VERSION_1_2) +#define __CREATE_KERNELS_IN_PROGRAM_ERR __ERR_STR(clCreateKernelsInProgram) + +#define __CREATE_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clCreateCommandQueueWithProperties) +#define __SET_COMMAND_QUEUE_PROPERTY_ERR __ERR_STR(clSetCommandQueueProperty) +#define __ENQUEUE_READ_BUFFER_ERR __ERR_STR(clEnqueueReadBuffer) +#define __ENQUEUE_READ_BUFFER_RECT_ERR __ERR_STR(clEnqueueReadBufferRect) +#define __ENQUEUE_WRITE_BUFFER_ERR __ERR_STR(clEnqueueWriteBuffer) +#define __ENQUEUE_WRITE_BUFFER_RECT_ERR __ERR_STR(clEnqueueWriteBufferRect) +#define __ENQEUE_COPY_BUFFER_ERR __ERR_STR(clEnqueueCopyBuffer) +#define __ENQEUE_COPY_BUFFER_RECT_ERR __ERR_STR(clEnqueueCopyBufferRect) +#define __ENQUEUE_FILL_BUFFER_ERR __ERR_STR(clEnqueueFillBuffer) +#define __ENQUEUE_READ_IMAGE_ERR __ERR_STR(clEnqueueReadImage) +#define __ENQUEUE_WRITE_IMAGE_ERR __ERR_STR(clEnqueueWriteImage) +#define __ENQUEUE_COPY_IMAGE_ERR __ERR_STR(clEnqueueCopyImage) +#define __ENQUEUE_FILL_IMAGE_ERR __ERR_STR(clEnqueueFillImage) +#define __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR __ERR_STR(clEnqueueCopyImageToBuffer) +#define __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR __ERR_STR(clEnqueueCopyBufferToImage) +#define __ENQUEUE_MAP_BUFFER_ERR __ERR_STR(clEnqueueMapBuffer) +#define __ENQUEUE_MAP_IMAGE_ERR __ERR_STR(clEnqueueMapImage) +#define __ENQUEUE_UNMAP_MEM_OBJECT_ERR __ERR_STR(clEnqueueUnMapMemObject) +#define __ENQUEUE_NDRANGE_KERNEL_ERR __ERR_STR(clEnqueueNDRangeKernel) +#define __ENQUEUE_NATIVE_KERNEL __ERR_STR(clEnqueueNativeKernel) +#if defined(CL_VERSION_1_2) +#define __ENQUEUE_MIGRATE_MEM_OBJECTS_ERR __ERR_STR(clEnqueueMigrateMemObjects) +#endif // #if defined(CL_VERSION_1_2) + +#define __ENQUEUE_ACQUIRE_GL_ERR __ERR_STR(clEnqueueAcquireGLObjects) +#define __ENQUEUE_RELEASE_GL_ERR __ERR_STR(clEnqueueReleaseGLObjects) + + +#define __RETAIN_ERR __ERR_STR(Retain Object) +#define __RELEASE_ERR __ERR_STR(Release Object) +#define __FLUSH_ERR __ERR_STR(clFlush) +#define __FINISH_ERR __ERR_STR(clFinish) +#define __VECTOR_CAPACITY_ERR __ERR_STR(Vector capacity error) + +/** + * CL 1.2 version that uses device fission. + */ +#if defined(CL_VERSION_1_2) +#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevices) +#else +#define __CREATE_SUB_DEVICES __ERR_STR(clCreateSubDevicesEXT) +#endif // #if defined(CL_VERSION_1_2) + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) +#define __ENQUEUE_MARKER_ERR __ERR_STR(clEnqueueMarker) +#define __ENQUEUE_WAIT_FOR_EVENTS_ERR __ERR_STR(clEnqueueWaitForEvents) +#define __ENQUEUE_BARRIER_ERR __ERR_STR(clEnqueueBarrier) +#define __UNLOAD_COMPILER_ERR __ERR_STR(clUnloadCompiler) +#define __CREATE_GL_TEXTURE_2D_ERR __ERR_STR(clCreateFromGLTexture2D) +#define __CREATE_GL_TEXTURE_3D_ERR __ERR_STR(clCreateFromGLTexture3D) +#define __CREATE_IMAGE2D_ERR __ERR_STR(clCreateImage2D) +#define __CREATE_IMAGE3D_ERR __ERR_STR(clCreateImage3D) +#endif // #if defined(CL_VERSION_1_1) + +#if defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) || !defined(CL_VERSION_2_0) +#define __CREATE_COMMAND_QUEUE_ERR __ERR_STR(clCreateCommandQueue) +#define __CREATE_SAMPLER_ERR __ERR_STR(clCreateSampler) +#define __ENQUEUE_TASK_ERR __ERR_STR(clEnqueueTask) +#endif // #if defined(CL_VERSION_1_2) + +#endif // __CL_USER_OVERRIDE_ERROR_STRINGS +//! 
\endcond + +/** + * CL 1.2 marker and barrier commands + */ +#if defined(CL_VERSION_1_2) +#define __ENQUEUE_MARKER_WAIT_LIST_ERR __ERR_STR(clEnqueueMarkerWithWaitList) +#define __ENQUEUE_BARRIER_WAIT_LIST_ERR __ERR_STR(clEnqueueBarrierWithWaitList) +#endif // #if defined(CL_VERSION_1_2) + +#if !defined(__USE_DEV_STRING) && !defined(__NO_STD_STRING) +typedef std::string STRING_CLASS; +#elif !defined(__USE_DEV_STRING) + +/*! \class string + * \brief Simple string class, that provides a limited subset of std::string + * functionality but avoids many of the issues that come with that class. + + * \note Deprecated. Please use std::string as default or + * re-define the string class to match the std::string + * interface by defining STRING_CLASS + */ +class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED string CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED +{ +private: + ::size_t size_; + char * str_; +public: + //! \brief Constructs an empty string, allocating no memory. + string(void) : size_(0), str_(NULL) + { + } + + /*! \brief Constructs a string populated from an arbitrary value of + * specified size. + * + * An extra '\0' is added, in case none was contained in str. + * + * \param str the initial value of the string instance. Note that '\0' + * characters receive no special treatment. If NULL, + * the string is left empty, with a size of 0. + * + * \param size the number of characters to copy from str. + */ + string(const char * str, ::size_t size) : + size_(size), + str_(NULL) + { + if( size > 0 ) { + str_ = new char[size_+1]; + if (str_ != NULL) { + memcpy(str_, str, size_ * sizeof(char)); + str_[size_] = '\0'; + } + else { + size_ = 0; + } + } + } + + /*! \brief Constructs a string populated from a null-terminated value. + * + * \param str the null-terminated initial value of the string instance. + * If NULL, the string is left empty, with a size of 0. + */ + string(const char * str) : + size_(0), + str_(NULL) + { + if( str ) { + size_= ::strlen(str); + } + if( size_ > 0 ) { + str_ = new char[size_ + 1]; + if (str_ != NULL) { + memcpy(str_, str, (size_ + 1) * sizeof(char)); + } + } + } + + void resize( ::size_t n ) + { + if( size_ == n ) { + return; + } + if (n == 0) { + if( str_ ) { + delete [] str_; + } + str_ = NULL; + size_ = 0; + } + else { + char *newString = new char[n + 1]; + int copySize = n; + if( size_ < n ) { + copySize = size_; + } + size_ = n; + + if(str_) { + memcpy(newString, str_, (copySize + 1) * sizeof(char)); + } + if( copySize < size_ ) { + memset(newString + copySize, 0, size_ - copySize); + } + newString[size_] = '\0'; + + delete [] str_; + str_ = newString; + } + } + + const char& operator[] ( ::size_t pos ) const + { + return str_[pos]; + } + + char& operator[] ( ::size_t pos ) + { + return str_[pos]; + } + + /*! \brief Copies the value of another string to this one. + * + * \param rhs the string to copy. + * + * \returns a reference to the modified instance. + */ + string& operator=(const string& rhs) + { + if (this == &rhs) { + return *this; + } + + if( str_ != NULL ) { + delete [] str_; + str_ = NULL; + size_ = 0; + } + + if (rhs.size_ == 0 || rhs.str_ == NULL) { + str_ = NULL; + size_ = 0; + } + else { + str_ = new char[rhs.size_ + 1]; + size_ = rhs.size_; + + if (str_ != NULL) { + memcpy(str_, rhs.str_, (size_ + 1) * sizeof(char)); + } + else { + size_ = 0; + } + } + + return *this; + } + + /*! \brief Constructs a string by copying the value of another instance. + * + * \param rhs the string to copy. 
+ */ + string(const string& rhs) : + size_(0), + str_(NULL) + { + *this = rhs; + } + + //! \brief Destructor - frees memory used to hold the current value. + ~string() + { + delete[] str_; + str_ = NULL; + } + + //! \brief Queries the length of the string, excluding any added '\0's. + ::size_t size(void) const { return size_; } + + //! \brief Queries the length of the string, excluding any added '\0's. + ::size_t length(void) const { return size(); } + + /*! \brief Returns a pointer to the private copy held by this instance, + * or "" if empty/unset. + */ + const char * c_str(void) const { return (str_) ? str_ : "";} +}; +typedef cl::string STRING_CLASS; +#endif // #elif !defined(__USE_DEV_STRING) + +#if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR) +#define VECTOR_CLASS std::vector +#elif !defined(__USE_DEV_VECTOR) +#define VECTOR_CLASS cl::vector + +#if !defined(__MAX_DEFAULT_VECTOR_SIZE) +#define __MAX_DEFAULT_VECTOR_SIZE 10 +#endif + +/*! \class vector + * \brief Fixed sized vector implementation that mirroring + * + * \note Deprecated. Please use std::vector as default or + * re-define the vector class to match the std::vector + * interface by defining VECTOR_CLASS + + * \note Not recommended for use with custom objects as + * current implementation will construct N elements + * + * std::vector functionality. + * \brief Fixed sized vector compatible with std::vector. + * + * \note + * This differs from std::vector<> not just in memory allocation, + * but also in terms of when members are constructed, destroyed, + * and assigned instead of being copy constructed. + * + * \param T type of element contained in the vector. + * + * \param N maximum size of the vector. + */ +template <typename T, unsigned int N = __MAX_DEFAULT_VECTOR_SIZE> +class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED vector CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED +{ +private: + T data_[N]; + unsigned int size_; + +public: + //! \brief Constructs an empty vector with no memory allocated. + vector() : + size_(static_cast<unsigned int>(0)) + {} + + //! \brief Deallocates the vector's memory and destroys all of its elements. + ~vector() + { + clear(); + } + + //! \brief Returns the number of elements currently contained. + unsigned int size(void) const + { + return size_; + } + + /*! \brief Empties the vector of all elements. + * \note + * This does not deallocate memory but will invoke destructors + * on contained elements. + */ + void clear() + { + while(!empty()) { + pop_back(); + } + } + + /*! \brief Appends an element after the last valid element. + * Calling this on a vector that has reached capacity will throw an + * exception if exceptions are enabled. + */ + void push_back (const T& x) + { + if (size() < N) { + new (&data_[size_]) T(x); + size_++; + } else { + detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR); + } + } + + /*! \brief Removes the last valid element from the vector. + * Calling this on an empty vector will throw an exception + * if exceptions are enabled. + */ + void pop_back(void) + { + if (size_ != 0) { + --size_; + data_[size_].~T(); + } else { + detail::errHandler(CL_MEM_OBJECT_ALLOCATION_FAILURE, __VECTOR_CAPACITY_ERR); + } + } + + /*! \brief Constructs with a value copied from another. + * + * \param vec the vector to copy. + */ + vector(const vector<T, N>& vec) : + size_(vec.size_) + { + if (size_ != 0) { + assign(vec.begin(), vec.end()); + } + } + + /*! \brief Constructs with a specified number of initial elements. + * + * \param size number of initial elements. 
+ * + * \param val value of initial elements. + */ + vector(unsigned int size, const T& val = T()) : + size_(0) + { + for (unsigned int i = 0; i < size; i++) { + push_back(val); + } + } + + /*! \brief Overwrites the current content with that copied from another + * instance. + * + * \param rhs vector to copy. + * + * \returns a reference to this. + */ + vector<T, N>& operator=(const vector<T, N>& rhs) + { + if (this == &rhs) { + return *this; + } + + if (rhs.size_ != 0) { + assign(rhs.begin(), rhs.end()); + } else { + clear(); + } + + return *this; + } + + /*! \brief Tests equality against another instance. + * + * \param vec the vector against which to compare. + */ + bool operator==(vector<T,N> &vec) + { + if (size() != vec.size()) { + return false; + } + + for( unsigned int i = 0; i < size(); ++i ) { + if( operator[](i) != vec[i] ) { + return false; + } + } + return true; + } + + //! \brief Conversion operator to T*. + operator T* () { return data_; } + + //! \brief Conversion operator to const T*. + operator const T* () const { return data_; } + + //! \brief Tests whether this instance has any elements. + bool empty (void) const + { + return size_==0; + } + + //! \brief Returns the maximum number of elements this instance can hold. + unsigned int max_size (void) const + { + return N; + } + + //! \brief Returns the maximum number of elements this instance can hold. + unsigned int capacity () const + { + return N; + } + + /*! \brief Returns a reference to a given element. + * + * \param index which element to access. * + * \note + * The caller is responsible for ensuring index is >= 0 and < size(). + */ + T& operator[](int index) + { + return data_[index]; + } + + /*! \brief Returns a const reference to a given element. + * + * \param index which element to access. + * + * \note + * The caller is responsible for ensuring index is >= 0 and < size(). + */ + const T& operator[](int index) const + { + return data_[index]; + } + + /*! \brief Assigns elements of the vector based on a source iterator range. + * + * \param start Beginning iterator of source range + * \param end Enditerator of source range + * + * \note + * Will throw an exception if exceptions are enabled and size exceeded. + */ + template<class I> + void assign(I start, I end) + { + clear(); + while(start != end) { + push_back(*start); + start++; + } + } + + /*! \class iterator + * \brief Const iterator class for vectors + */ + class iterator + { + private: + const vector<T,N> *vec_; + int index_; + + /** + * Internal iterator constructor to capture reference + * to the vector it iterates over rather than taking + * the vector by copy. 
+ */ + iterator (const vector<T,N> &vec, int index) : + vec_(&vec) + { + if( !vec.empty() ) { + index_ = index; + } else { + index_ = -1; + } + } + + public: + iterator(void) : + index_(-1), + vec_(NULL) + { + } + + iterator(const iterator& rhs) : + vec_(rhs.vec_), + index_(rhs.index_) + { + } + + ~iterator(void) {} + + static iterator begin(const cl::vector<T,N> &vec) + { + iterator i(vec, 0); + + return i; + } + + static iterator end(const cl::vector<T,N> &vec) + { + iterator i(vec, vec.size()); + + return i; + } + + bool operator==(iterator i) + { + return ((vec_ == i.vec_) && + (index_ == i.index_)); + } + + bool operator!=(iterator i) + { + return (!(*this==i)); + } + + iterator& operator++() + { + ++index_; + return *this; + } + + iterator operator++(int) + { + iterator retVal(*this); + ++index_; + return retVal; + } + + iterator& operator--() + { + --index_; + return *this; + } + + iterator operator--(int) + { + iterator retVal(*this); + --index_; + return retVal; + } + + const T& operator *() const + { + return (*vec_)[index_]; + } + }; + + iterator begin(void) + { + return iterator::begin(*this); + } + + iterator begin(void) const + { + return iterator::begin(*this); + } + + iterator end(void) + { + return iterator::end(*this); + } + + iterator end(void) const + { + return iterator::end(*this); + } + + T& front(void) + { + return data_[0]; + } + + T& back(void) + { + return data_[size_]; + } + + const T& front(void) const + { + return data_[0]; + } + + const T& back(void) const + { + return data_[size_-1]; + } +}; +#endif // #if !defined(__USE_DEV_VECTOR) && !defined(__NO_STD_VECTOR) + + + + + +namespace detail { +#define __DEFAULT_NOT_INITIALIZED 1 +#define __DEFAULT_BEING_INITIALIZED 2 +#define __DEFAULT_INITIALIZED 4 + + /* + * Compare and exchange primitives are needed for handling of defaults + */ + inline int compare_exchange(volatile int * dest, int exchange, int comparand) + { +#ifdef _WIN32 + return (int)(InterlockedCompareExchange( + (volatile long*)dest, + (long)exchange, + (long)comparand)); +#elif defined(__APPLE__) || defined(__MACOSX) + return OSAtomicOr32Orig((uint32_t)exchange, (volatile uint32_t*)dest); +#else // !_WIN32 || defined(__APPLE__) || defined(__MACOSX) + return (__sync_val_compare_and_swap( + dest, + comparand, + exchange)); +#endif // !_WIN32 + } + + inline void fence() { _mm_mfence(); } +}; // namespace detail + + +/*! \brief class used to interface between C++ and + * OpenCL C calls that require arrays of size_t values, whose + * size is known statically. + */ +template <int N> +class size_t +{ +private: + ::size_t data_[N]; + +public: + //! \brief Initialize size_t to all 0s + size_t() + { + for( int i = 0; i < N; ++i ) { + data_[i] = 0; + } + } + + ::size_t& operator[](int index) + { + return data_[index]; + } + + const ::size_t& operator[](int index) const + { + return data_[index]; + } + + //! \brief Conversion operator to T*. + operator ::size_t* () { return data_; } + + //! \brief Conversion operator to const T*. + operator const ::size_t* () const { return data_; } +}; + +namespace detail { + +// Generic getInfoHelper. The final parameter is used to guide overload +// resolution: the actual parameter passed is an int, which makes this +// a worse conversion sequence than a specialization that declares the +// parameter as an int. 
+template<typename Functor, typename T> +inline cl_int getInfoHelper(Functor f, cl_uint name, T* param, long) +{ + return f(name, sizeof(T), param, NULL); +} + +// Specialized getInfoHelper for VECTOR_CLASS params +template <typename Func, typename T> +inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, long) +{ + ::size_t required; + cl_int err = f(name, 0, NULL, &required); + if (err != CL_SUCCESS) { + return err; + } + + T* value = (T*) alloca(required); + err = f(name, required, value, NULL); + if (err != CL_SUCCESS) { + return err; + } + + param->assign(&value[0], &value[required/sizeof(T)]); + return CL_SUCCESS; +} + +/* Specialization for reference-counted types. This depends on the + * existence of Wrapper<T>::cl_type, and none of the other types having the + * cl_type member. Note that simplify specifying the parameter as Wrapper<T> + * does not work, because when using a derived type (e.g. Context) the generic + * template will provide a better match. + */ +template <typename Func, typename T> +inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<T>* param, int, typename T::cl_type = 0) +{ + ::size_t required; + cl_int err = f(name, 0, NULL, &required); + if (err != CL_SUCCESS) { + return err; + } + + typename T::cl_type * value = (typename T::cl_type *) alloca(required); + err = f(name, required, value, NULL); + if (err != CL_SUCCESS) { + return err; + } + + ::size_t elements = required / sizeof(typename T::cl_type); + param->assign(&value[0], &value[elements]); + for (::size_t i = 0; i < elements; i++) + { + if (value[i] != NULL) + { + err = (*param)[i].retain(); + if (err != CL_SUCCESS) { + return err; + } + } + } + return CL_SUCCESS; +} + +// Specialized for getInfo<CL_PROGRAM_BINARIES> +template <typename Func> +inline cl_int getInfoHelper(Func f, cl_uint name, VECTOR_CLASS<char *>* param, int) +{ + cl_int err = f(name, param->size() * sizeof(char *), &(*param)[0], NULL); + + if (err != CL_SUCCESS) { + return err; + } + + return CL_SUCCESS; +} + +// Specialized GetInfoHelper for STRING_CLASS params +template <typename Func> +inline cl_int getInfoHelper(Func f, cl_uint name, STRING_CLASS* param, long) +{ + ::size_t required; + cl_int err = f(name, 0, NULL, &required); + if (err != CL_SUCCESS) { + return err; + } + + char* value = (char*) alloca(required); + err = f(name, required, value, NULL); + if (err != CL_SUCCESS) { + return err; + } + + *param = value; + return CL_SUCCESS; +} + +// Specialized GetInfoHelper for cl::size_t params +template <typename Func, ::size_t N> +inline cl_int getInfoHelper(Func f, cl_uint name, size_t<N>* param, long) +{ + ::size_t required; + cl_int err = f(name, 0, NULL, &required); + if (err != CL_SUCCESS) { + return err; + } + + ::size_t* value = (::size_t*) alloca(required); + err = f(name, required, value, NULL); + if (err != CL_SUCCESS) { + return err; + } + + for(int i = 0; i < N; ++i) { + (*param)[i] = value[i]; + } + + return CL_SUCCESS; +} + +template<typename T> struct ReferenceHandler; + +/* Specialization for reference-counted types. This depends on the + * existence of Wrapper<T>::cl_type, and none of the other types having the + * cl_type member. Note that simplify specifying the parameter as Wrapper<T> + * does not work, because when using a derived type (e.g. Context) the generic + * template will provide a better match. 
+ */ +template<typename Func, typename T> +inline cl_int getInfoHelper(Func f, cl_uint name, T* param, int, typename T::cl_type = 0) +{ + typename T::cl_type value; + cl_int err = f(name, sizeof(value), &value, NULL); + if (err != CL_SUCCESS) { + return err; + } + *param = value; + if (value != NULL) + { + err = param->retain(); + if (err != CL_SUCCESS) { + return err; + } + } + return CL_SUCCESS; +} + +#define __PARAM_NAME_INFO_1_0(F) \ + F(cl_platform_info, CL_PLATFORM_PROFILE, STRING_CLASS) \ + F(cl_platform_info, CL_PLATFORM_VERSION, STRING_CLASS) \ + F(cl_platform_info, CL_PLATFORM_NAME, STRING_CLASS) \ + F(cl_platform_info, CL_PLATFORM_VENDOR, STRING_CLASS) \ + F(cl_platform_info, CL_PLATFORM_EXTENSIONS, STRING_CLASS) \ + \ + F(cl_device_info, CL_DEVICE_TYPE, cl_device_type) \ + F(cl_device_info, CL_DEVICE_VENDOR_ID, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_WORK_GROUP_SIZE, ::size_t) \ + F(cl_device_info, CL_DEVICE_MAX_WORK_ITEM_SIZES, VECTOR_CLASS< ::size_t>) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) \ + F(cl_device_info, CL_DEVICE_ADDRESS_BITS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_WIDTH, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE2D_MAX_HEIGHT, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_WIDTH, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_HEIGHT, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE3D_MAX_DEPTH, ::size_t) \ + F(cl_device_info, CL_DEVICE_IMAGE_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_MAX_PARAMETER_SIZE, ::size_t) \ + F(cl_device_info, CL_DEVICE_MAX_SAMPLERS, cl_uint) \ + F(cl_device_info, CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) \ + F(cl_device_info, CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) \ + F(cl_device_info, CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type) \ + F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint)\ + F(cl_device_info, CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) \ + F(cl_device_info, CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type) \ + F(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) \ + F(cl_device_info, CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) \ + F(cl_device_info, CL_DEVICE_PROFILING_TIMER_RESOLUTION, ::size_t) \ + F(cl_device_info, CL_DEVICE_ENDIAN_LITTLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_AVAILABLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_COMPILER_AVAILABLE, cl_bool) \ + F(cl_device_info, CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities) \ + F(cl_device_info, CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties) \ + F(cl_device_info, 
CL_DEVICE_PLATFORM, cl_platform_id) \ + F(cl_device_info, CL_DEVICE_NAME, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_VENDOR, STRING_CLASS) \ + F(cl_device_info, CL_DRIVER_VERSION, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_PROFILE, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_VERSION, STRING_CLASS) \ + F(cl_device_info, CL_DEVICE_EXTENSIONS, STRING_CLASS) \ + \ + F(cl_context_info, CL_CONTEXT_REFERENCE_COUNT, cl_uint) \ + F(cl_context_info, CL_CONTEXT_DEVICES, VECTOR_CLASS<Device>) \ + F(cl_context_info, CL_CONTEXT_PROPERTIES, VECTOR_CLASS<cl_context_properties>) \ + \ + F(cl_event_info, CL_EVENT_COMMAND_QUEUE, cl::CommandQueue) \ + F(cl_event_info, CL_EVENT_COMMAND_TYPE, cl_command_type) \ + F(cl_event_info, CL_EVENT_REFERENCE_COUNT, cl_uint) \ + F(cl_event_info, CL_EVENT_COMMAND_EXECUTION_STATUS, cl_uint) \ + \ + F(cl_profiling_info, CL_PROFILING_COMMAND_QUEUED, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_SUBMIT, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_START, cl_ulong) \ + F(cl_profiling_info, CL_PROFILING_COMMAND_END, cl_ulong) \ + \ + F(cl_mem_info, CL_MEM_TYPE, cl_mem_object_type) \ + F(cl_mem_info, CL_MEM_FLAGS, cl_mem_flags) \ + F(cl_mem_info, CL_MEM_SIZE, ::size_t) \ + F(cl_mem_info, CL_MEM_HOST_PTR, void*) \ + F(cl_mem_info, CL_MEM_MAP_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_REFERENCE_COUNT, cl_uint) \ + F(cl_mem_info, CL_MEM_CONTEXT, cl::Context) \ + \ + F(cl_image_info, CL_IMAGE_FORMAT, cl_image_format) \ + F(cl_image_info, CL_IMAGE_ELEMENT_SIZE, ::size_t) \ + F(cl_image_info, CL_IMAGE_ROW_PITCH, ::size_t) \ + F(cl_image_info, CL_IMAGE_SLICE_PITCH, ::size_t) \ + F(cl_image_info, CL_IMAGE_WIDTH, ::size_t) \ + F(cl_image_info, CL_IMAGE_HEIGHT, ::size_t) \ + F(cl_image_info, CL_IMAGE_DEPTH, ::size_t) \ + \ + F(cl_sampler_info, CL_SAMPLER_REFERENCE_COUNT, cl_uint) \ + F(cl_sampler_info, CL_SAMPLER_CONTEXT, cl::Context) \ + F(cl_sampler_info, CL_SAMPLER_NORMALIZED_COORDS, cl_addressing_mode) \ + F(cl_sampler_info, CL_SAMPLER_ADDRESSING_MODE, cl_filter_mode) \ + F(cl_sampler_info, CL_SAMPLER_FILTER_MODE, cl_bool) \ + \ + F(cl_program_info, CL_PROGRAM_REFERENCE_COUNT, cl_uint) \ + F(cl_program_info, CL_PROGRAM_CONTEXT, cl::Context) \ + F(cl_program_info, CL_PROGRAM_NUM_DEVICES, cl_uint) \ + F(cl_program_info, CL_PROGRAM_DEVICES, VECTOR_CLASS<Device>) \ + F(cl_program_info, CL_PROGRAM_SOURCE, STRING_CLASS) \ + F(cl_program_info, CL_PROGRAM_BINARY_SIZES, VECTOR_CLASS< ::size_t>) \ + F(cl_program_info, CL_PROGRAM_BINARIES, VECTOR_CLASS<char *>) \ + \ + F(cl_program_build_info, CL_PROGRAM_BUILD_STATUS, cl_build_status) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_OPTIONS, STRING_CLASS) \ + F(cl_program_build_info, CL_PROGRAM_BUILD_LOG, STRING_CLASS) \ + \ + F(cl_kernel_info, CL_KERNEL_FUNCTION_NAME, STRING_CLASS) \ + F(cl_kernel_info, CL_KERNEL_NUM_ARGS, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_REFERENCE_COUNT, cl_uint) \ + F(cl_kernel_info, CL_KERNEL_CONTEXT, cl::Context) \ + F(cl_kernel_info, CL_KERNEL_PROGRAM, cl::Program) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_WORK_GROUP_SIZE, ::size_t) \ + F(cl_kernel_work_group_info, CL_KERNEL_COMPILE_WORK_GROUP_SIZE, cl::size_t<3>) \ + F(cl_kernel_work_group_info, CL_KERNEL_LOCAL_MEM_SIZE, cl_ulong) \ + \ + F(cl_command_queue_info, CL_QUEUE_CONTEXT, cl::Context) \ + F(cl_command_queue_info, CL_QUEUE_DEVICE, cl::Device) \ + F(cl_command_queue_info, CL_QUEUE_REFERENCE_COUNT, cl_uint) \ + F(cl_command_queue_info, CL_QUEUE_PROPERTIES, cl_command_queue_properties) + +#if defined(CL_VERSION_1_1) +#define 
__PARAM_NAME_INFO_1_1(F) \ + F(cl_context_info, CL_CONTEXT_NUM_DEVICES, cl_uint)\ + F(cl_device_info, CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint) \ + F(cl_device_info, CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) \ + F(cl_device_info, CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config) \ + F(cl_device_info, CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) \ + F(cl_device_info, CL_DEVICE_OPENCL_C_VERSION, STRING_CLASS) \ + \ + F(cl_mem_info, CL_MEM_ASSOCIATED_MEMOBJECT, cl::Memory) \ + F(cl_mem_info, CL_MEM_OFFSET, ::size_t) \ + \ + F(cl_kernel_work_group_info, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, ::size_t) \ + F(cl_kernel_work_group_info, CL_KERNEL_PRIVATE_MEM_SIZE, cl_ulong) \ + \ + F(cl_event_info, CL_EVENT_CONTEXT, cl::Context) +#endif // CL_VERSION_1_1 + + +#if defined(CL_VERSION_1_2) +#define __PARAM_NAME_INFO_1_2(F) \ + F(cl_image_info, CL_IMAGE_BUFFER, cl::Buffer) \ + \ + F(cl_program_info, CL_PROGRAM_NUM_KERNELS, ::size_t) \ + F(cl_program_info, CL_PROGRAM_KERNEL_NAMES, STRING_CLASS) \ + \ + F(cl_program_build_info, CL_PROGRAM_BINARY_TYPE, cl_program_binary_type) \ + \ + F(cl_kernel_info, CL_KERNEL_ATTRIBUTES, STRING_CLASS) \ + \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ADDRESS_QUALIFIER, cl_kernel_arg_address_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_ACCESS_QUALIFIER, cl_kernel_arg_access_qualifier) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_TYPE_NAME, STRING_CLASS) \ + F(cl_kernel_arg_info, CL_KERNEL_ARG_NAME, STRING_CLASS) \ + \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE, cl_device_id) \ + F(cl_device_info, CL_DEVICE_PARTITION_PROPERTIES, VECTOR_CLASS<cl_device_partition_property>) \ + F(cl_device_info, CL_DEVICE_PARTITION_TYPE, VECTOR_CLASS<cl_device_partition_property>) \ + F(cl_device_info, CL_DEVICE_REFERENCE_COUNT, cl_uint) \ + F(cl_device_info, CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, ::size_t) \ + F(cl_device_info, CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain) \ + F(cl_device_info, CL_DEVICE_BUILT_IN_KERNELS, STRING_CLASS) +#endif // #if defined(CL_VERSION_1_2) + +#if defined(USE_CL_DEVICE_FISSION) +#define __PARAM_NAME_DEVICE_FISSION(F) \ + F(cl_device_info, CL_DEVICE_PARENT_DEVICE_EXT, cl_device_id) \ + F(cl_device_info, CL_DEVICE_PARTITION_TYPES_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \ + F(cl_device_info, CL_DEVICE_AFFINITY_DOMAINS_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) \ + F(cl_device_info, CL_DEVICE_REFERENCE_COUNT_EXT , cl_uint) \ + F(cl_device_info, CL_DEVICE_PARTITION_STYLE_EXT, VECTOR_CLASS<cl_device_partition_property_ext>) +#endif // USE_CL_DEVICE_FISSION + +template <typename enum_type, cl_int Name> +struct param_traits {}; + +#define __CL_DECLARE_PARAM_TRAITS(token, param_name, T) \ +struct token; \ +template<> \ +struct param_traits<detail:: token,param_name> \ +{ \ + enum { value = param_name }; \ + typedef T param_type; \ +}; + +__PARAM_NAME_INFO_1_0(__CL_DECLARE_PARAM_TRAITS) +#if defined(CL_VERSION_1_1) +__PARAM_NAME_INFO_1_1(__CL_DECLARE_PARAM_TRAITS) +#endif // CL_VERSION_1_1 +#if defined(CL_VERSION_1_2) 
+__PARAM_NAME_INFO_1_2(__CL_DECLARE_PARAM_TRAITS) +#endif // CL_VERSION_1_1 + +#if defined(USE_CL_DEVICE_FISSION) +__PARAM_NAME_DEVICE_FISSION(__CL_DECLARE_PARAM_TRAITS); +#endif // USE_CL_DEVICE_FISSION + +#ifdef CL_PLATFORM_ICD_SUFFIX_KHR +__CL_DECLARE_PARAM_TRAITS(cl_platform_info, CL_PLATFORM_ICD_SUFFIX_KHR, STRING_CLASS) +#endif + +#ifdef CL_DEVICE_PROFILING_TIMER_OFFSET_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_PROFILING_TIMER_OFFSET_AMD, cl_ulong) +#endif + +#ifdef CL_DEVICE_GLOBAL_FREE_MEMORY_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_FREE_MEMORY_AMD, VECTOR_CLASS< ::size_t>) +#endif +#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_SIMD_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_WAVEFRONT_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WAVEFRONT_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD, cl_uint) +#endif +#ifdef CL_DEVICE_LOCAL_MEM_BANKS_AMD +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_LOCAL_MEM_BANKS_AMD, cl_uint) +#endif + +#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, cl_uint) +#endif +#ifdef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, cl_uint) +#endif +#ifdef CL_DEVICE_REGISTERS_PER_BLOCK_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_REGISTERS_PER_BLOCK_NV, cl_uint) +#endif +#ifdef CL_DEVICE_WARP_SIZE_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_WARP_SIZE_NV, cl_uint) +#endif +#ifdef CL_DEVICE_GPU_OVERLAP_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_GPU_OVERLAP_NV, cl_bool) +#endif +#ifdef CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV, cl_bool) +#endif +#ifdef CL_DEVICE_INTEGRATED_MEMORY_NV +__CL_DECLARE_PARAM_TRAITS(cl_device_info, CL_DEVICE_INTEGRATED_MEMORY_NV, cl_bool) +#endif + +// Convenience functions + +template <typename Func, typename T> +inline cl_int +getInfo(Func f, cl_uint name, T* param) +{ + return getInfoHelper(f, name, param, 0); +} + +template <typename Func, typename Arg0> +struct GetInfoFunctor0 +{ + Func f_; const Arg0& arg0_; + cl_int operator ()( + cl_uint param, ::size_t size, void* value, ::size_t* size_ret) + { return f_(arg0_, param, size, value, size_ret); } +}; + +template <typename Func, typename Arg0, typename Arg1> +struct GetInfoFunctor1 +{ + Func f_; const Arg0& arg0_; const Arg1& arg1_; + cl_int operator ()( + cl_uint param, ::size_t size, void* value, ::size_t* size_ret) + { return f_(arg0_, arg1_, param, size, value, size_ret); } +}; + +template <typename 
Func, typename Arg0, typename T> +inline cl_int +getInfo(Func f, const Arg0& arg0, cl_uint name, T* param) +{ + GetInfoFunctor0<Func, Arg0> f0 = { f, arg0 }; + return getInfoHelper(f0, name, param, 0); +} + +template <typename Func, typename Arg0, typename Arg1, typename T> +inline cl_int +getInfo(Func f, const Arg0& arg0, const Arg1& arg1, cl_uint name, T* param) +{ + GetInfoFunctor1<Func, Arg0, Arg1> f0 = { f, arg0, arg1 }; + return getInfoHelper(f0, name, param, 0); +} + +template<typename T> +struct ReferenceHandler +{ }; + +#if defined(CL_VERSION_1_2) +/** + * OpenCL 1.2 devices do have retain/release. + */ +template <> +struct ReferenceHandler<cl_device_id> +{ + /** + * Retain the device. + * \param device A valid device created using createSubDevices + * \return + * CL_SUCCESS if the function executed successfully. + * CL_INVALID_DEVICE if device was not a valid subdevice + * CL_OUT_OF_RESOURCES + * CL_OUT_OF_HOST_MEMORY + */ + static cl_int retain(cl_device_id device) + { return ::clRetainDevice(device); } + /** + * Retain the device. + * \param device A valid device created using createSubDevices + * \return + * CL_SUCCESS if the function executed successfully. + * CL_INVALID_DEVICE if device was not a valid subdevice + * CL_OUT_OF_RESOURCES + * CL_OUT_OF_HOST_MEMORY + */ + static cl_int release(cl_device_id device) + { return ::clReleaseDevice(device); } +}; +#else // #if defined(CL_VERSION_1_2) +/** + * OpenCL 1.1 devices do not have retain/release. + */ +template <> +struct ReferenceHandler<cl_device_id> +{ + // cl_device_id does not have retain(). + static cl_int retain(cl_device_id) + { return CL_SUCCESS; } + // cl_device_id does not have release(). + static cl_int release(cl_device_id) + { return CL_SUCCESS; } +}; +#endif // #if defined(CL_VERSION_1_2) + +template <> +struct ReferenceHandler<cl_platform_id> +{ + // cl_platform_id does not have retain(). + static cl_int retain(cl_platform_id) + { return CL_SUCCESS; } + // cl_platform_id does not have release(). 
+ static cl_int release(cl_platform_id) + { return CL_SUCCESS; } +}; + +template <> +struct ReferenceHandler<cl_context> +{ + static cl_int retain(cl_context context) + { return ::clRetainContext(context); } + static cl_int release(cl_context context) + { return ::clReleaseContext(context); } +}; + +template <> +struct ReferenceHandler<cl_command_queue> +{ + static cl_int retain(cl_command_queue queue) + { return ::clRetainCommandQueue(queue); } + static cl_int release(cl_command_queue queue) + { return ::clReleaseCommandQueue(queue); } +}; + +template <> +struct ReferenceHandler<cl_mem> +{ + static cl_int retain(cl_mem memory) + { return ::clRetainMemObject(memory); } + static cl_int release(cl_mem memory) + { return ::clReleaseMemObject(memory); } +}; + +template <> +struct ReferenceHandler<cl_sampler> +{ + static cl_int retain(cl_sampler sampler) + { return ::clRetainSampler(sampler); } + static cl_int release(cl_sampler sampler) + { return ::clReleaseSampler(sampler); } +}; + +template <> +struct ReferenceHandler<cl_program> +{ + static cl_int retain(cl_program program) + { return ::clRetainProgram(program); } + static cl_int release(cl_program program) + { return ::clReleaseProgram(program); } +}; + +template <> +struct ReferenceHandler<cl_kernel> +{ + static cl_int retain(cl_kernel kernel) + { return ::clRetainKernel(kernel); } + static cl_int release(cl_kernel kernel) + { return ::clReleaseKernel(kernel); } +}; + +template <> +struct ReferenceHandler<cl_event> +{ + static cl_int retain(cl_event event) + { return ::clRetainEvent(event); } + static cl_int release(cl_event event) + { return ::clReleaseEvent(event); } +}; + + +// Extracts version number with major in the upper 16 bits, minor in the lower 16 +static cl_uint getVersion(const char *versionInfo) +{ + int highVersion = 0; + int lowVersion = 0; + int index = 7; + while(versionInfo[index] != '.' 
) { + highVersion *= 10; + highVersion += versionInfo[index]-'0'; + ++index; + } + ++index; + while(versionInfo[index] != ' ' ) { + lowVersion *= 10; + lowVersion += versionInfo[index]-'0'; + ++index; + } + return (highVersion << 16) | lowVersion; +} + +static cl_uint getPlatformVersion(cl_platform_id platform) +{ + ::size_t size = 0; + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 0, NULL, &size); + char *versionInfo = (char *) alloca(size); + clGetPlatformInfo(platform, CL_PLATFORM_VERSION, size, &versionInfo[0], &size); + return getVersion(versionInfo); +} + +static cl_uint getDevicePlatformVersion(cl_device_id device) +{ + cl_platform_id platform; + clGetDeviceInfo(device, CL_DEVICE_PLATFORM, sizeof(platform), &platform, NULL); + return getPlatformVersion(platform); +} + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) +static cl_uint getContextPlatformVersion(cl_context context) +{ + // The platform cannot be queried directly, so we first have to grab a + // device and obtain its context + ::size_t size = 0; + clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size); + if (size == 0) + return 0; + cl_device_id *devices = (cl_device_id *) alloca(size); + clGetContextInfo(context, CL_CONTEXT_DEVICES, size, devices, NULL); + return getDevicePlatformVersion(devices[0]); +} +#endif // #if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + +template <typename T> +class Wrapper +{ +public: + typedef T cl_type; + +protected: + cl_type object_; + +public: + Wrapper() : object_(NULL) { } + + Wrapper(const cl_type &obj) : object_(obj) { } + + ~Wrapper() + { + if (object_ != NULL) { release(); } + } + + Wrapper(const Wrapper<cl_type>& rhs) + { + object_ = rhs.object_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + + Wrapper<cl_type>& operator = (const Wrapper<cl_type>& rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + return *this; + } + + Wrapper<cl_type>& operator = (const cl_type &rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs; + return *this; + } + + cl_type operator ()() const { return object_; } + + cl_type& operator ()() { return object_; } + +protected: + template<typename Func, typename U> + friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); + + cl_int retain() const + { + return ReferenceHandler<cl_type>::retain(object_); + } + + cl_int release() const + { + return ReferenceHandler<cl_type>::release(object_); + } +}; + +template <> +class Wrapper<cl_device_id> +{ +public: + typedef cl_device_id cl_type; + +protected: + cl_type object_; + bool referenceCountable_; + + static bool isReferenceCountable(cl_device_id device) + { + bool retVal = false; + if (device != NULL) { + int version = getDevicePlatformVersion(device); + if(version > ((1 << 16) + 1)) { + retVal = true; + } + } + return retVal; + } + +public: + Wrapper() : object_(NULL), referenceCountable_(false) + { + } + + Wrapper(const cl_type &obj) : object_(obj), referenceCountable_(false) + { + referenceCountable_ = isReferenceCountable(obj); + } + + ~Wrapper() + { + if (object_ != NULL) { release(); } + } + + Wrapper(const Wrapper<cl_type>& rhs) + { + object_ = rhs.object_; + referenceCountable_ = isReferenceCountable(object_); + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + } + + Wrapper<cl_type>& operator = 
(const Wrapper<cl_type>& rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs.object_; + referenceCountable_ = rhs.referenceCountable_; + if (object_ != NULL) { detail::errHandler(retain(), __RETAIN_ERR); } + return *this; + } + + Wrapper<cl_type>& operator = (const cl_type &rhs) + { + if (object_ != NULL) { detail::errHandler(release(), __RELEASE_ERR); } + object_ = rhs; + referenceCountable_ = isReferenceCountable(object_); + return *this; + } + + cl_type operator ()() const { return object_; } + + cl_type& operator ()() { return object_; } + +protected: + template<typename Func, typename U> + friend inline cl_int getInfoHelper(Func, cl_uint, U*, int, typename U::cl_type); + + template<typename Func, typename U> + friend inline cl_int getInfoHelper(Func, cl_uint, VECTOR_CLASS<U>*, int, typename U::cl_type); + + cl_int retain() const + { + if( referenceCountable_ ) { + return ReferenceHandler<cl_type>::retain(object_); + } + else { + return CL_SUCCESS; + } + } + + cl_int release() const + { + if( referenceCountable_ ) { + return ReferenceHandler<cl_type>::release(object_); + } + else { + return CL_SUCCESS; + } + } +}; + +} // namespace detail +//! \endcond + +/*! \stuct ImageFormat + * \brief Adds constructors and member functions for cl_image_format. + * + * \see cl_image_format + */ +struct ImageFormat : public cl_image_format +{ + //! \brief Default constructor - performs no initialization. + ImageFormat(){} + + //! \brief Initializing constructor. + ImageFormat(cl_channel_order order, cl_channel_type type) + { + image_channel_order = order; + image_channel_data_type = type; + } + + //! \brief Assignment operator. + ImageFormat& operator = (const ImageFormat& rhs) + { + if (this != &rhs) { + this->image_channel_data_type = rhs.image_channel_data_type; + this->image_channel_order = rhs.image_channel_order; + } + return *this; + } +}; + +/*! \brief Class interface for cl_device_id. + * + * \note Copies of these objects are inexpensive, since they don't 'own' + * any underlying resources or data structures. + * + * \see cl_device_id + */ +class Device : public detail::Wrapper<cl_device_id> +{ +public: + //! \brief Default constructor - initializes to NULL. + Device() : detail::Wrapper<cl_type>() { } + + /*! \brief Copy constructor. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device(const Device& device) : detail::Wrapper<cl_type>(device) { } + + /*! \brief Constructor from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device(const cl_device_id &device) : detail::Wrapper<cl_type>(device) { } + + /*! \brief Returns the first device on the default context. + * + * \see Context::getDefault() + */ + static Device getDefault(cl_int * err = NULL); + + /*! \brief Assignment operator from Device. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device& operator = (const Device& rhs) + { + if (this != &rhs) { + detail::Wrapper<cl_type>::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_device_id. + * + * This simply copies the device ID value, which is an inexpensive operation. + */ + Device& operator = (const cl_device_id& rhs) + { + detail::Wrapper<cl_type>::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetDeviceInfo(). 
+ template <typename T> + cl_int getInfo(cl_device_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetDeviceInfo, object_, name, param), + __GET_DEVICE_INFO_ERR); + } + + //! \brief Wrapper for clGetDeviceInfo() that returns by value. + template <cl_int name> typename + detail::param_traits<detail::cl_device_info, name>::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_device_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /** + * CL 1.2 version + */ +#if defined(CL_VERSION_1_2) + //! \brief Wrapper for clCreateSubDevicesEXT(). + cl_int createSubDevices( + const cl_device_partition_property * properties, + VECTOR_CLASS<Device>* devices) + { + cl_uint n = 0; + cl_int err = clCreateSubDevices(object_, properties, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = clCreateSubDevices(object_, properties, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif // #if defined(CL_VERSION_1_2) + +/** + * CL 1.1 version that uses device fission. + */ +#if defined(CL_VERSION_1_1) +#if defined(USE_CL_DEVICE_FISSION) + cl_int createSubDevices( + const cl_device_partition_property_ext * properties, + VECTOR_CLASS<Device>* devices) + { + typedef CL_API_ENTRY cl_int + ( CL_API_CALL * PFN_clCreateSubDevicesEXT)( + cl_device_id /*in_device*/, + const cl_device_partition_property_ext * /* properties */, + cl_uint /*num_entries*/, + cl_device_id * /*out_devices*/, + cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1; + + static PFN_clCreateSubDevicesEXT pfn_clCreateSubDevicesEXT = NULL; + __INIT_CL_EXT_FCN_PTR(clCreateSubDevicesEXT); + + cl_uint n = 0; + cl_int err = pfn_clCreateSubDevicesEXT(object_, properties, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = pfn_clCreateSubDevicesEXT(object_, properties, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_SUB_DEVICES); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif // #if defined(USE_CL_DEVICE_FISSION) +#endif // #if defined(CL_VERSION_1_1) +}; + +/*! \brief Class interface for cl_platform_id. + * + * \note Copies of these objects are inexpensive, since they don't 'own' + * any underlying resources or data structures. + * + * \see cl_platform_id + */ +class Platform : public detail::Wrapper<cl_platform_id> +{ +public: + //! \brief Default constructor - initializes to NULL. + Platform() : detail::Wrapper<cl_type>() { } + + /*! \brief Copy constructor. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + Platform(const Platform& platform) : detail::Wrapper<cl_type>(platform) { } + + /*! \brief Constructor from cl_platform_id. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + Platform(const cl_platform_id &platform) : detail::Wrapper<cl_type>(platform) { } + + /*! \brief Assignment operator from Platform. + * + * This simply copies the platform ID value, which is an inexpensive operation. 
+ */ + Platform& operator = (const Platform& rhs) + { + if (this != &rhs) { + detail::Wrapper<cl_type>::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_platform_id. + * + * This simply copies the platform ID value, which is an inexpensive operation. + */ + Platform& operator = (const cl_platform_id& rhs) + { + detail::Wrapper<cl_type>::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetPlatformInfo(). + cl_int getInfo(cl_platform_info name, STRING_CLASS* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetPlatformInfo, object_, name, param), + __GET_PLATFORM_INFO_ERR); + } + + //! \brief Wrapper for clGetPlatformInfo() that returns by value. + template <cl_int name> typename + detail::param_traits<detail::cl_platform_info, name>::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_platform_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Gets a list of devices for this platform. + * + * Wraps clGetDeviceIDs(). + */ + cl_int getDevices( + cl_device_type type, + VECTOR_CLASS<Device>* devices) const + { + cl_uint n = 0; + if( devices == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + cl_int err = ::clGetDeviceIDs(object_, type, 0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = ::clGetDeviceIDs(object_, type, n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } + +#if defined(USE_DX_INTEROP) + /*! \brief Get the list of available D3D10 devices. + * + * \param d3d_device_source. + * + * \param d3d_object. + * + * \param d3d_device_set. + * + * \param devices returns a vector of OpenCL D3D10 devices found. The cl::Device + * values returned in devices can be used to identify a specific OpenCL + * device. If \a devices argument is NULL, this argument is ignored. + * + * \return One of the following values: + * - CL_SUCCESS if the function is executed successfully. + * + * The application can query specific capabilities of the OpenCL device(s) + * returned by cl::getDevices. This can be used by the application to + * determine which device(s) to use. + * + * \note In the case that exceptions are enabled and a return value + * other than CL_SUCCESS is generated, then cl::Error exception is + * generated. 
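+     *
+     * Illustrative sketch only (assumes a cl::Platform named platform, a
+     * valid ID3D10Device* named d3dDevice, and a platform that exposes
+     * cl_khr_d3d10_sharing):
+     * \code
+     * VECTOR_CLASS<cl::Device> clDevices;
+     * cl_int err = platform.getDevices(
+     *     CL_D3D10_DEVICE_KHR,
+     *     d3dDevice,
+     *     CL_PREFERRED_DEVICES_FOR_D3D10_KHR,
+     *     &clDevices);
+     * \endcode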
+ */ + cl_int getDevices( + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + VECTOR_CLASS<Device>* devices) const + { + typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clGetDeviceIDsFromD3D10KHR)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint* num_devices); + + if( devices == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_DEVICE_IDS_ERR); + } + + static PFN_clGetDeviceIDsFromD3D10KHR pfn_clGetDeviceIDsFromD3D10KHR = NULL; + __INIT_CL_EXT_FCN_PTR_PLATFORM(object_, clGetDeviceIDsFromD3D10KHR); + + cl_uint n = 0; + cl_int err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + 0, + NULL, + &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + cl_device_id* ids = (cl_device_id*) alloca(n * sizeof(cl_device_id)); + err = pfn_clGetDeviceIDsFromD3D10KHR( + object_, + d3d_device_source, + d3d_object, + d3d_device_set, + n, + ids, + NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_DEVICE_IDS_ERR); + } + + devices->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } +#endif + + /*! \brief Gets a list of available platforms. + * + * Wraps clGetPlatformIDs(). + */ + static cl_int get( + VECTOR_CLASS<Platform>* platforms) + { + cl_uint n = 0; + + if( platforms == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); + } + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + cl_platform_id* ids = (cl_platform_id*) alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + platforms->assign(&ids[0], &ids[n]); + return CL_SUCCESS; + } + + /*! \brief Gets the first available platform. + * + * Wraps clGetPlatformIDs(), returning the first result. + */ + static cl_int get( + Platform * platform) + { + cl_uint n = 0; + + if( platform == NULL ) { + return detail::errHandler(CL_INVALID_ARG_VALUE, __GET_PLATFORM_IDS_ERR); + } + + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + cl_platform_id* ids = (cl_platform_id*) alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + *platform = ids[0]; + return CL_SUCCESS; + } + + /*! \brief Gets the first available platform, returning it by value. + * + * Wraps clGetPlatformIDs(), returning the first result. + */ + static Platform get( + cl_int * errResult = NULL) + { + Platform platform; + cl_uint n = 0; + cl_int err = ::clGetPlatformIDs(0, NULL, &n); + if (err != CL_SUCCESS) { + detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + if (errResult != NULL) { + *errResult = err; + } + } + + cl_platform_id* ids = (cl_platform_id*) alloca( + n * sizeof(cl_platform_id)); + err = ::clGetPlatformIDs(n, ids, NULL); + + if (err != CL_SUCCESS) { + detail::errHandler(err, __GET_PLATFORM_IDS_ERR); + } + + if (errResult != NULL) { + *errResult = err; + } + + return ids[0]; + } + + static Platform getDefault( + cl_int *errResult = NULL ) + { + return get(errResult); + } + + +#if defined(CL_VERSION_1_2) + //! 
\brief Wrapper for clUnloadCompiler(). + cl_int + unloadCompiler() + { + return ::clUnloadPlatformCompiler(object_); + } +#endif // #if defined(CL_VERSION_1_2) +}; // class Platform + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) +/** + * Unload the OpenCL compiler. + * \note Deprecated for OpenCL 1.2. Use Platform::unloadCompiler instead. + */ +inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_int +UnloadCompiler() CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +inline cl_int +UnloadCompiler() +{ + return ::clUnloadCompiler(); +} +#endif // #if defined(CL_VERSION_1_1) + +/*! \brief Class interface for cl_context. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_context as the original. For details, see + * clRetainContext() and clReleaseContext(). + * + * \see cl_context + */ +class Context + : public detail::Wrapper<cl_context> +{ +private: + static volatile int default_initialized_; + static Context default_; + static volatile cl_int default_error_; +public: + /*! \brief Destructor. + * + * This calls clReleaseContext() on the value held by this instance. + */ + ~Context() { } + + /*! \brief Constructs a context including a list of specified devices. + * + * Wraps clCreateContext(). + */ + Context( + const VECTOR_CLASS<Device>& devices, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + ::size_t numDevices = devices.size(); + cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); + for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + object_ = ::clCreateContext( + properties, (cl_uint) numDevices, + deviceIDs, + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + Context( + const Device& device, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + + cl_device_id deviceID = device(); + + object_ = ::clCreateContext( + properties, 1, + &deviceID, + notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a context including all devices of a specified type. + * + * Wraps clCreateContextFromType(). + */ + Context( + cl_device_type type, + cl_context_properties* properties = NULL, + void (CL_CALLBACK * notifyFptr)( + const char *, + const void *, + ::size_t, + void *) = NULL, + void* data = NULL, + cl_int* err = NULL) + { + cl_int error; + +#if !defined(__APPLE__) || !defined(__MACOS) + cl_context_properties prop[4] = {CL_CONTEXT_PLATFORM, 0, 0, 0 }; + if (properties == NULL) { + prop[1] = (cl_context_properties)Platform::get(&error)(); + if (error != CL_SUCCESS) { + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + return; + } + } + + properties = &prop[0]; + } +#endif + object_ = ::clCreateContextFromType( + properties, type, notifyFptr, data, &error); + + detail::errHandler(error, __CREATE_CONTEXT_FROM_TYPE_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! 
\brief Returns a singleton context including all devices of CL_DEVICE_TYPE_DEFAULT. + * + * \note All calls to this function return the same cl_context as the first. + */ + static Context getDefault(cl_int * err = NULL) + { + int state = detail::compare_exchange( + &default_initialized_, + __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); + + if (state & __DEFAULT_INITIALIZED) { + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + if (state & __DEFAULT_BEING_INITIALIZED) { + // Assume writes will propagate eventually... + while(default_initialized_ != __DEFAULT_INITIALIZED) { + detail::fence(); + } + + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + cl_int error; + default_ = Context( + CL_DEVICE_TYPE_DEFAULT, + NULL, + NULL, + NULL, + &error); + + detail::fence(); + + default_error_ = error; + // Assume writes will propagate eventually... + default_initialized_ = __DEFAULT_INITIALIZED; + + detail::fence(); + + if (err != NULL) { + *err = default_error_; + } + return default_; + + } + + //! \brief Default constructor - initializes to NULL. + Context() : detail::Wrapper<cl_type>() { } + + /*! \brief Copy constructor. + * + * This calls clRetainContext() on the parameter's cl_context. + */ + Context(const Context& context) : detail::Wrapper<cl_type>(context) { } + + /*! \brief Constructor from cl_context - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_context + * into the new Context object. + */ + __CL_EXPLICIT_CONSTRUCTORS Context(const cl_context& context) : detail::Wrapper<cl_type>(context) { } + + /*! \brief Assignment operator from Context. + * + * This calls clRetainContext() on the parameter and clReleaseContext() on + * the previous value held by this instance. + */ + Context& operator = (const Context& rhs) + { + if (this != &rhs) { + detail::Wrapper<cl_type>::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_context - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseContext() on the value previously held by this instance. + */ + Context& operator = (const cl_context& rhs) + { + detail::Wrapper<cl_type>::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetContextInfo(). + template <typename T> + cl_int getInfo(cl_context_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetContextInfo, object_, name, param), + __GET_CONTEXT_INFO_ERR); + } + + //! \brief Wrapper for clGetContextInfo() that returns by value. + template <cl_int name> typename + detail::param_traits<detail::cl_context_info, name>::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_context_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Gets a list of supported image formats. + * + * Wraps clGetSupportedImageFormats(). 
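+     *
+     * Illustrative sketch (not part of the original header), assuming a
+     * valid cl::Context named context:
+     * \code
+     * VECTOR_CLASS<cl::ImageFormat> formats;
+     * cl_int err = context.getSupportedImageFormats(
+     *     CL_MEM_READ_ONLY,
+     *     CL_MEM_OBJECT_IMAGE2D,
+     *     &formats);
+     * \endcode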
+ */ + cl_int getSupportedImageFormats( + cl_mem_flags flags, + cl_mem_object_type type, + VECTOR_CLASS<ImageFormat>* formats) const + { + cl_uint numEntries; + cl_int err = ::clGetSupportedImageFormats( + object_, + flags, + type, + 0, + NULL, + &numEntries); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); + } + + ImageFormat* value = (ImageFormat*) + alloca(numEntries * sizeof(ImageFormat)); + err = ::clGetSupportedImageFormats( + object_, + flags, + type, + numEntries, + (cl_image_format*) value, + NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __GET_SUPPORTED_IMAGE_FORMATS_ERR); + } + + formats->assign(&value[0], &value[numEntries]); + return CL_SUCCESS; + } +}; + +inline Device Device::getDefault(cl_int * err) +{ + cl_int error; + Device device; + + Context context = Context::getDefault(&error); +#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); +#else + detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR); +#endif + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + device = context.getInfo<CL_CONTEXT_DEVICES>()[0]; + if (err != NULL) { + *err = CL_SUCCESS; + } + } + + return device; +} + + +#ifdef _WIN32 +__declspec(selectany) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +__declspec(selectany) Context Context::default_; +__declspec(selectany) volatile cl_int Context::default_error_ = CL_SUCCESS; +#else +__attribute__((weak)) volatile int Context::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +__attribute__((weak)) Context Context::default_; +__attribute__((weak)) volatile cl_int Context::default_error_ = CL_SUCCESS; +#endif + +/*! \brief Class interface for cl_event. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_event as the original. For details, see + * clRetainEvent() and clReleaseEvent(). + * + * \see cl_event + */ +class Event : public detail::Wrapper<cl_event> +{ +public: + /*! \brief Destructor. + * + * This calls clReleaseEvent() on the value held by this instance. + */ + ~Event() { } + + //! \brief Default constructor - initializes to NULL. + Event() : detail::Wrapper<cl_type>() { } + + /*! \brief Copy constructor. + * + * This calls clRetainEvent() on the parameter's cl_event. + */ + Event(const Event& event) : detail::Wrapper<cl_type>(event) { } + + /*! \brief Constructor from cl_event - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_event + * into the new Event object. + */ + Event(const cl_event& event) : detail::Wrapper<cl_type>(event) { } + + /*! \brief Assignment operator from cl_event - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseEvent() on the value previously held by this instance. + */ + Event& operator = (const Event& rhs) + { + if (this != &rhs) { + detail::Wrapper<cl_type>::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_event. + * + * This calls clRetainEvent() on the parameter and clReleaseEvent() on + * the previous value held by this instance. + */ + Event& operator = (const cl_event& rhs) + { + detail::Wrapper<cl_type>::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetEventInfo(). 
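+    //!
+    //! Illustrative sketch (not part of the original header), assuming a
+    //! cl::Event named event returned from an enqueue call:
+    //! \code
+    //! cl_int status = event.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>();
+    //! bool finished = (status == CL_COMPLETE);
+    //! \endcode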
+ template <typename T> + cl_int getInfo(cl_event_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetEventInfo, object_, name, param), + __GET_EVENT_INFO_ERR); + } + + //! \brief Wrapper for clGetEventInfo() that returns by value. + template <cl_int name> typename + detail::param_traits<detail::cl_event_info, name>::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_event_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + //! \brief Wrapper for clGetEventProfilingInfo(). + template <typename T> + cl_int getProfilingInfo(cl_profiling_info name, T* param) const + { + return detail::errHandler(detail::getInfo( + &::clGetEventProfilingInfo, object_, name, param), + __GET_EVENT_PROFILE_INFO_ERR); + } + + //! \brief Wrapper for clGetEventProfilingInfo() that returns by value. + template <cl_int name> typename + detail::param_traits<detail::cl_profiling_info, name>::param_type + getProfilingInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_profiling_info, name>::param_type param; + cl_int result = getProfilingInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + /*! \brief Blocks the calling thread until this event completes. + * + * Wraps clWaitForEvents(). + */ + cl_int wait() const + { + return detail::errHandler( + ::clWaitForEvents(1, &object_), + __WAIT_FOR_EVENTS_ERR); + } + +#if defined(CL_VERSION_1_1) + /*! \brief Registers a user callback function for a specific command execution status. + * + * Wraps clSetEventCallback(). + */ + cl_int setCallback( + cl_int type, + void (CL_CALLBACK * pfn_notify)(cl_event, cl_int, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetEventCallback( + object_, + type, + pfn_notify, + user_data), + __SET_EVENT_CALLBACK_ERR); + } +#endif + + /*! \brief Blocks the calling thread until every event specified is complete. + * + * Wraps clWaitForEvents(). + */ + static cl_int + waitForEvents(const VECTOR_CLASS<Event>& events) + { + return detail::errHandler( + ::clWaitForEvents( + (cl_uint) events.size(), (cl_event*)&events.front()), + __WAIT_FOR_EVENTS_ERR); + } +}; + +#if defined(CL_VERSION_1_1) +/*! \brief Class interface for user events (a subset of cl_event's). + * + * See Event for details about copy semantics, etc. + */ +class UserEvent : public Event +{ +public: + /*! \brief Constructs a user event on a given context. + * + * Wraps clCreateUserEvent(). + */ + UserEvent( + const Context& context, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateUserEvent( + context(), + &error); + + detail::errHandler(error, __CREATE_USER_EVENT_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + UserEvent() : Event() { } + + //! \brief Copy constructor - performs shallow copy. + UserEvent(const UserEvent& event) : Event(event) { } + + //! \brief Assignment Operator - performs shallow copy. + UserEvent& operator = (const UserEvent& rhs) + { + if (this != &rhs) { + Event::operator=(rhs); + } + return *this; + } + + /*! \brief Sets the execution status of a user event object. + * + * Wraps clSetUserEventStatus(). + */ + cl_int setStatus(cl_int status) + { + return detail::errHandler( + ::clSetUserEventStatus(object_,status), + __SET_USER_EVENT_STATUS_ERR); + } +}; +#endif + +/*! \brief Blocks the calling thread until every event specified is complete. 
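+ *
+ * Illustrative sketch (not part of the original header), assuming a
+ * populated VECTOR_CLASS<cl::Event> named events:
+ * \code
+ * cl::WaitForEvents(events);
+ * \endcode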
+ * + * Wraps clWaitForEvents(). + */ +inline static cl_int +WaitForEvents(const VECTOR_CLASS<Event>& events) +{ + return detail::errHandler( + ::clWaitForEvents( + (cl_uint) events.size(), (cl_event*)&events.front()), + __WAIT_FOR_EVENTS_ERR); +} + +/*! \brief Class interface for cl_mem. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_mem as the original. For details, see + * clRetainMemObject() and clReleaseMemObject(). + * + * \see cl_mem + */ +class Memory : public detail::Wrapper<cl_mem> +{ +public: + + /*! \brief Destructor. + * + * This calls clReleaseMemObject() on the value held by this instance. + */ + ~Memory() {} + + //! \brief Default constructor - initializes to NULL. + Memory() : detail::Wrapper<cl_type>() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * This calls clRetainMemObject() on the parameter's cl_mem. + */ + Memory(const Memory& memory) : detail::Wrapper<cl_type>(memory) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_mem + * into the new Memory object. + */ + __CL_EXPLICIT_CONSTRUCTORS Memory(const cl_mem& memory) : detail::Wrapper<cl_type>(memory) { } + + /*! \brief Assignment operator from Memory. + * + * This calls clRetainMemObject() on the parameter and clReleaseMemObject() + * on the previous value held by this instance. + */ + Memory& operator = (const Memory& rhs) + { + if (this != &rhs) { + detail::Wrapper<cl_type>::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_mem - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseMemObject() on the value previously held by this instance. + */ + Memory& operator = (const cl_mem& rhs) + { + detail::Wrapper<cl_type>::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetMemObjectInfo(). + template <typename T> + cl_int getInfo(cl_mem_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetMemObjectInfo, object_, name, param), + __GET_MEM_OBJECT_INFO_ERR); + } + + //! \brief Wrapper for clGetMemObjectInfo() that returns by value. + template <cl_int name> typename + detail::param_traits<detail::cl_mem_info, name>::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_mem_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + +#if defined(CL_VERSION_1_1) + /*! \brief Registers a callback function to be called when the memory object + * is no longer needed. + * + * Wraps clSetMemObjectDestructorCallback(). + * + * Repeated calls to this function, for a given cl_mem value, will append + * to the list of functions called (in reverse order) when memory object's + * resources are freed and the memory object is deleted. + * + * \note + * The registered callbacks are associated with the underlying cl_mem + * value - not the Memory class instance. 
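+     *
+     * Illustrative sketch (not part of the original header); the names
+     * onMemDestroyed, hostState and buffer are placeholders:
+     * \code
+     * void CL_CALLBACK onMemDestroyed(cl_mem memobj, void* user_data)
+     * {
+     *     // release host-side state associated with memobj here
+     * }
+     *
+     * buffer.setDestructorCallback(&onMemDestroyed, hostState);
+     * \endcode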
+ */ + cl_int setDestructorCallback( + void (CL_CALLBACK * pfn_notify)(cl_mem, void *), + void * user_data = NULL) + { + return detail::errHandler( + ::clSetMemObjectDestructorCallback( + object_, + pfn_notify, + user_data), + __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR); + } +#endif + +}; + +// Pre-declare copy functions +class Buffer; +template< typename IteratorType > +cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ); +template< typename IteratorType > +cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ); + +/*! \brief Class interface for Buffer Memory Objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Buffer : public Memory +{ +public: + + /*! \brief Constructs a Buffer in a specified context. + * + * Wraps clCreateBuffer(). + * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + */ + Buffer( + const Context& context, + cl_mem_flags flags, + ::size_t size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! \brief Constructs a Buffer in the default context. + * + * Wraps clCreateBuffer(). + * + * \param host_ptr Storage to be used if the CL_MEM_USE_HOST_PTR flag was + * specified. Note alignment & exclusivity requirements. + * + * \see Context::getDefault() + */ + Buffer( + cl_mem_flags flags, + ::size_t size, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(err); + + object_ = ::clCreateBuffer(context(), flags, size, host_ptr, &error); + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + /*! + * \brief Construct a Buffer from a host container via iterators. + * If useHostPtr is specified iterators must be random access. + */ + template< typename IteratorType > + Buffer( + IteratorType startIterator, + IteratorType endIterator, + bool readOnly, + bool useHostPtr = false, + cl_int* err = NULL) + { + typedef typename std::iterator_traits<IteratorType>::value_type DataType; + cl_int error; + + cl_mem_flags flags = 0; + if( readOnly ) { + flags |= CL_MEM_READ_ONLY; + } + else { + flags |= CL_MEM_READ_WRITE; + } + if( useHostPtr ) { + flags |= CL_MEM_USE_HOST_PTR; + } + + ::size_t size = sizeof(DataType)*(endIterator - startIterator); + + Context context = Context::getDefault(err); + + if( useHostPtr ) { + object_ = ::clCreateBuffer(context(), flags, size, static_cast<DataType*>(&*startIterator), &error); + } else { + object_ = ::clCreateBuffer(context(), flags, size, 0, &error); + } + + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + if( !useHostPtr ) { + error = cl::copy(startIterator, endIterator, *this); + detail::errHandler(error, __CREATE_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + } + + //! \brief Default constructor - initializes to NULL. + Buffer() : Memory() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Buffer(const Buffer& buffer) : Memory(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Buffer(const cl_mem& buffer) : Memory(buffer) { } + + /*! 
\brief Assignment from Buffer - performs shallow copy. + * + * See Memory for further details. + */ + Buffer& operator = (const Buffer& rhs) + { + if (this != &rhs) { + Memory::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Buffer& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + +#if defined(CL_VERSION_1_1) + /*! \brief Creates a new buffer object from this. + * + * Wraps clCreateSubBuffer(). + */ + Buffer createSubBuffer( + cl_mem_flags flags, + cl_buffer_create_type buffer_create_type, + const void * buffer_create_info, + cl_int * err = NULL) + { + Buffer result; + cl_int error; + result.object_ = ::clCreateSubBuffer( + object_, + flags, + buffer_create_type, + buffer_create_info, + &error); + + detail::errHandler(error, __CREATE_SUBBUFFER_ERR); + if (err != NULL) { + *err = error; + } + + return result; + } +#endif +}; + +#if defined (USE_DX_INTEROP) +/*! \brief Class interface for creating OpenCL buffers from ID3D10Buffer's. + * + * This is provided to facilitate interoperability with Direct3D. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class BufferD3D10 : public Buffer +{ +public: + typedef CL_API_ENTRY cl_mem (CL_API_CALL *PFN_clCreateFromD3D10BufferKHR)( + cl_context context, cl_mem_flags flags, ID3D10Buffer* buffer, + cl_int* errcode_ret); + + /*! \brief Constructs a BufferD3D10, in a specified context, from a + * given ID3D10Buffer. + * + * Wraps clCreateFromD3D10BufferKHR(). + */ + BufferD3D10( + const Context& context, + cl_mem_flags flags, + ID3D10Buffer* bufobj, + cl_int * err = NULL) + { + static PFN_clCreateFromD3D10BufferKHR pfn_clCreateFromD3D10BufferKHR = NULL; + +#if defined(CL_VERSION_1_2) + vector<cl_context_properties> props = context.getInfo<CL_CONTEXT_PROPERTIES>(); + cl_platform platform = -1; + for( int i = 0; i < props.size(); ++i ) { + if( props[i] == CL_CONTEXT_PLATFORM ) { + platform = props[i+1]; + } + } + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clCreateFromD3D10BufferKHR); +#endif +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clCreateFromD3D10BufferKHR); +#endif + + cl_int error; + object_ = pfn_clCreateFromD3D10BufferKHR( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferD3D10() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10(const BufferD3D10& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferD3D10(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferD3D10 - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10& operator = (const BufferD3D10& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferD3D10& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } +}; +#endif + +/*! \brief Class interface for GL Buffer Memory Objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. 
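+ *
+ * Illustrative sketch (not part of the original header), assuming the
+ * cl::Context was created with GL-sharing properties and glBufObj is a
+ * valid GLuint buffer object:
+ * \code
+ * cl::BufferGL clBuffer(context, CL_MEM_READ_WRITE, glBufObj, &err);
+ * \endcode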
+ * + * \see Memory + */ +class BufferGL : public Buffer +{ +public: + /*! \brief Constructs a BufferGL in a specified context, from a given + * GL buffer. + * + * Wraps clCreateFromGLBuffer(). + */ + BufferGL( + const Context& context, + cl_mem_flags flags, + GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLBuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferGL() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL(const BufferGL& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferGL(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferGL - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL& operator = (const BufferGL& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferGL& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_,type,gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } +}; + +/*! \brief Class interface for GL Render Buffer Memory Objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class BufferRenderGL : public Buffer +{ +public: + /*! \brief Constructs a BufferRenderGL in a specified context, from a given + * GL Renderbuffer. + * + * Wraps clCreateFromGLRenderbuffer(). + */ + BufferRenderGL( + const Context& context, + cl_mem_flags flags, + GLuint bufobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLRenderbuffer( + context(), + flags, + bufobj, + &error); + + detail::errHandler(error, __CREATE_GL_RENDER_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + BufferRenderGL() : Buffer() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL(const BufferGL& buffer) : Buffer(buffer) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS BufferRenderGL(const cl_mem& buffer) : Buffer(buffer) { } + + /*! \brief Assignment from BufferGL - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL& operator = (const BufferRenderGL& rhs) + { + if (this != &rhs) { + Buffer::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + BufferRenderGL& operator = (const cl_mem& rhs) + { + Buffer::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetGLObjectInfo(). + cl_int getObjectInfo( + cl_gl_object_type *type, + GLuint * gl_object_name) + { + return detail::errHandler( + ::clGetGLObjectInfo(object_,type,gl_object_name), + __GET_GL_OBJECT_INFO_ERR); + } +}; + +/*! \brief C++ base class for Image Memory objects. 
+ * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image : public Memory +{ +protected: + //! \brief Default constructor - initializes to NULL. + Image() : Memory() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image(const Image& image) : Memory(image) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image(const cl_mem& image) : Memory(image) { } + + /*! \brief Assignment from Image - performs shallow copy. + * + * See Memory for further details. + */ + Image& operator = (const Image& rhs) + { + if (this != &rhs) { + Memory::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image& operator = (const cl_mem& rhs) + { + Memory::operator=(rhs); + return *this; + } + +public: + //! \brief Wrapper for clGetImageInfo(). + template <typename T> + cl_int getImageInfo(cl_image_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetImageInfo, object_, name, param), + __GET_IMAGE_INFO_ERR); + } + + //! \brief Wrapper for clGetImageInfo() that returns by value. + template <cl_int name> typename + detail::param_traits<detail::cl_image_info, name>::param_type + getImageInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_image_info, name>::param_type param; + cl_int result = getImageInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +}; + +#if defined(CL_VERSION_1_2) +/*! \brief Class interface for 1D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image1D : public Image +{ +public: + /*! \brief Constructs a 1D Image in a specified context. + * + * Wraps clCreateImage(). + */ + Image1D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE1D; + desc.image_width = width; + desc.image_row_pitch = 0; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image1D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image1D(const Image1D& image1D) : Image(image1D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image1D(const cl_mem& image1D) : Image(image1D) { } + + /*! \brief Assignment from Image1D - performs shallow copy. + * + * See Memory for further details. + */ + Image1D& operator = (const Image1D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image1D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; + +/*! \class Image1DBuffer + * \brief Image interface for 1D buffer images. 
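+ *
+ * Illustrative sketch (not part of the original header), assuming a valid
+ * cl::Context named context and a cl::Buffer named buffer large enough to
+ * back the image:
+ * \code
+ * cl::ImageFormat fmt(CL_R, CL_FLOAT);
+ * cl::Image1DBuffer img(context, CL_MEM_READ_ONLY, fmt, width, buffer, &err);
+ * \endcode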
+ */ +class Image1DBuffer : public Image +{ +public: + Image1DBuffer( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + const Buffer &buffer, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER; + desc.image_width = width; + desc.image_row_pitch = 0; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = buffer(); + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + NULL, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DBuffer() { } + + Image1DBuffer(const Image1DBuffer& image1D) : Image(image1D) { } + + __CL_EXPLICIT_CONSTRUCTORS Image1DBuffer(const cl_mem& image1D) : Image(image1D) { } + + Image1DBuffer& operator = (const Image1DBuffer& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image1DBuffer& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; + +/*! \class Image1DArray + * \brief Image interface for arrays of 1D images. + */ +class Image1DArray : public Image +{ +public: + Image1DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t arraySize, + ::size_t width, + ::size_t rowPitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY; + desc.image_array_size = arraySize; + desc.image_width = width; + desc.image_row_pitch = rowPitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image1DArray() { } + + Image1DArray(const Image1DArray& imageArray) : Image(imageArray) { } + + __CL_EXPLICIT_CONSTRUCTORS Image1DArray(const cl_mem& imageArray) : Image(imageArray) { } + + Image1DArray& operator = (const Image1DArray& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image1DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; +#endif // #if defined(CL_VERSION_1_2) + + +/*! \brief Class interface for 2D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image2D : public Image +{ +public: + /*! \brief Constructs a 1D Image in a specified context. + * + * Wraps clCreateImage(). 
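+     *
+     * Illustrative sketch (not part of the original header): creating a 2D
+     * RGBA float image, assuming a valid cl::Context named context.
+     * \code
+     * cl::ImageFormat fmt(CL_RGBA, CL_FLOAT);
+     * cl::Image2D img(context, CL_MEM_READ_WRITE, fmt, width, height, 0, NULL, &err);
+     * \endcode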
+ */ + Image2D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + ::size_t height, + ::size_t row_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif defined(CL_VERSION_1_2) + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if defined(CL_VERSION_1_2) + if (useCreateImage) + { + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE2D; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = row_pitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) +#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + if (!useCreateImage) + { + object_ = ::clCreateImage2D( + context(), flags,&format, width, height, row_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE2D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + } + + //! \brief Default constructor - initializes to NULL. + Image2D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image2D(const Image2D& image2D) : Image(image2D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image2D(const cl_mem& image2D) : Image(image2D) { } + + /*! \brief Assignment from Image2D - performs shallow copy. + * + * See Memory for further details. + */ + Image2D& operator = (const Image2D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image2D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; + + +#if !defined(CL_VERSION_1_2) +/*! \brief Class interface for GL 2D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + * \note Deprecated for OpenCL 1.2. Please use ImageGL instead. + */ +class CL_EXT_PREFIX__VERSION_1_1_DEPRECATED Image2DGL CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED : public Image2D +{ +public: + /*! \brief Constructs an Image2DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture2D(). + */ + Image2DGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture2D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_2D_ERR); + if (err != NULL) { + *err = error; + } + + } + + //! \brief Default constructor - initializes to NULL. + Image2DGL() : Image2D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL(const Image2DGL& image) : Image2D(image) { } + + /*! 
\brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image2DGL(const cl_mem& image) : Image2D(image) { } + + /*! \brief Assignment from Image2DGL - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL& operator = (const Image2DGL& rhs) + { + if (this != &rhs) { + Image2D::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image2DGL& operator = (const cl_mem& rhs) + { + Image2D::operator=(rhs); + return *this; + } +}; +#endif // #if !defined(CL_VERSION_1_2) + +#if defined(CL_VERSION_1_2) +/*! \class Image2DArray + * \brief Image interface for arrays of 2D images. + */ +class Image2DArray : public Image +{ +public: + Image2DArray( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t arraySize, + ::size_t width, + ::size_t height, + ::size_t rowPitch, + ::size_t slicePitch, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY; + desc.image_array_size = arraySize; + desc.image_width = width; + desc.image_height = height; + desc.image_row_pitch = rowPitch; + desc.image_slice_pitch = slicePitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } + + Image2DArray() { } + + Image2DArray(const Image2DArray& imageArray) : Image(imageArray) { } + + __CL_EXPLICIT_CONSTRUCTORS Image2DArray(const cl_mem& imageArray) : Image(imageArray) { } + + Image2DArray& operator = (const Image2DArray& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + Image2DArray& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; +#endif // #if defined(CL_VERSION_1_2) + +/*! \brief Class interface for 3D Image Memory objects. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image3D : public Image +{ +public: + /*! \brief Constructs a 3D Image in a specified context. + * + * Wraps clCreateImage(). 
+ */ + Image3D( + const Context& context, + cl_mem_flags flags, + ImageFormat format, + ::size_t width, + ::size_t height, + ::size_t depth, + ::size_t row_pitch = 0, + ::size_t slice_pitch = 0, + void* host_ptr = NULL, + cl_int* err = NULL) + { + cl_int error; + bool useCreateImage; + +#if defined(CL_VERSION_1_2) && defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + // Run-time decision based on the actual platform + { + cl_uint version = detail::getContextPlatformVersion(context()); + useCreateImage = (version >= 0x10002); // OpenCL 1.2 or above + } +#elif defined(CL_VERSION_1_2) + useCreateImage = true; +#else + useCreateImage = false; +#endif + +#if defined(CL_VERSION_1_2) + if (useCreateImage) + { + cl_image_desc desc; + desc.image_type = CL_MEM_OBJECT_IMAGE3D; + desc.image_width = width; + desc.image_height = height; + desc.image_depth = depth; + desc.image_row_pitch = row_pitch; + desc.image_slice_pitch = slice_pitch; + desc.num_mip_levels = 0; + desc.num_samples = 0; + desc.buffer = 0; + object_ = ::clCreateImage( + context(), + flags, + &format, + &desc, + host_ptr, + &error); + + detail::errHandler(error, __CREATE_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) +#if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + if (!useCreateImage) + { + object_ = ::clCreateImage3D( + context(), flags, &format, width, height, depth, row_pitch, + slice_pitch, host_ptr, &error); + + detail::errHandler(error, __CREATE_IMAGE3D_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if !defined(CL_VERSION_1_2) || defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) + } + + //! \brief Default constructor - initializes to NULL. + Image3D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image3D(const Image3D& image3D) : Image(image3D) { } + + /*! \brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image3D(const cl_mem& image3D) : Image(image3D) { } + + /*! \brief Assignment from Image3D - performs shallow copy. + * + * See Memory for further details. + */ + Image3D& operator = (const Image3D& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image3D& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; + +#if !defined(CL_VERSION_1_2) +/*! \brief Class interface for GL 3D Image Memory objects. + * + * This is provided to facilitate interoperability with OpenGL. + * + * See Memory for details about copy semantics, etc. + * + * \see Memory + */ +class Image3DGL : public Image3D +{ +public: + /*! \brief Constructs an Image3DGL in a specified context, from a given + * GL Texture. + * + * Wraps clCreateFromGLTexture3D(). + */ + Image3DGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture3D( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_3D_ERR); + if (err != NULL) { + *err = error; + } + } + + //! \brief Default constructor - initializes to NULL. + Image3DGL() : Image3D() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL(const Image3DGL& image) : Image3D(image) { } + + /*! 
\brief Constructor from cl_mem - takes ownership. + * + * See Memory for further details. + */ + __CL_EXPLICIT_CONSTRUCTORS Image3DGL(const cl_mem& image) : Image3D(image) { } + + /*! \brief Assignment from Image3DGL - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL& operator = (const Image3DGL& rhs) + { + if (this != &rhs) { + Image3D::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment from cl_mem - performs shallow copy. + * + * See Memory for further details. + */ + Image3DGL& operator = (const cl_mem& rhs) + { + Image3D::operator=(rhs); + return *this; + } +}; +#endif // #if !defined(CL_VERSION_1_2) + +#if defined(CL_VERSION_1_2) +/*! \class ImageGL + * \brief general image interface for GL interop. + * We abstract the 2D and 3D GL images into a single instance here + * that wraps all GL sourced images on the grounds that setup information + * was performed by OpenCL anyway. + */ +class ImageGL : public Image +{ +public: + ImageGL( + const Context& context, + cl_mem_flags flags, + GLenum target, + GLint miplevel, + GLuint texobj, + cl_int * err = NULL) + { + cl_int error; + object_ = ::clCreateFromGLTexture( + context(), + flags, + target, + miplevel, + texobj, + &error); + + detail::errHandler(error, __CREATE_GL_TEXTURE_ERR); + if (err != NULL) { + *err = error; + } + } + + ImageGL() : Image() { } + + ImageGL(const ImageGL& image) : Image(image) { } + + __CL_EXPLICIT_CONSTRUCTORS ImageGL(const cl_mem& image) : Image(image) { } + + ImageGL& operator = (const ImageGL& rhs) + { + if (this != &rhs) { + Image::operator=(rhs); + } + return *this; + } + + ImageGL& operator = (const cl_mem& rhs) + { + Image::operator=(rhs); + return *this; + } +}; +#endif // #if defined(CL_VERSION_1_2) + +/*! \brief Class interface for cl_sampler. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_sampler as the original. For details, see + * clRetainSampler() and clReleaseSampler(). + * + * \see cl_sampler + */ +class Sampler : public detail::Wrapper<cl_sampler> +{ +public: + /*! \brief Destructor. + * + * This calls clReleaseSampler() on the value held by this instance. + */ + ~Sampler() { } + + //! \brief Default constructor - initializes to NULL. + Sampler() { } + + /*! \brief Constructs a Sampler in a specified context. + * + * Wraps clCreateSampler(). + */ + + #if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + Sampler( + const Context& context, + cl_bool normalized_coords, + cl_addressing_mode addressing_mode, + cl_filter_mode filter_mode, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateSampler( + context(), + normalized_coords, + addressing_mode, + filter_mode, + &error); + + detail::errHandler(error, __CREATE_SAMPLER_ERR); + if (err != NULL) { + *err = error; + } + } +#else + Sampler( + const Context& context, + const cl_sampler_properties *sampler_properties, + cl_int* err = NULL) + { + cl_int error; + object_ = ::clCreateSamplerWithProperties( + context(), + sampler_properties, + &error); + + detail::errHandler(error, __CREATE_SAMPLER_PROPERTY_ERR); + if (err != NULL) { + *err = error; + } + } +#endif + + /*! \brief Copy constructor - performs shallow copy. + * + * This calls clRetainSampler() on the parameter's cl_sampler. + */ + Sampler(const Sampler& sampler) : detail::Wrapper<cl_type>(sampler) { } + + /*! \brief Constructor from cl_sampler - takes ownership. 
+ * + * This effectively transfers ownership of a refcount on the cl_sampler + * into the new Sampler object. + */ + Sampler(const cl_sampler& sampler) : detail::Wrapper<cl_type>(sampler) { } + + /*! \brief Assignment operator from Sampler. + * + * This calls clRetainSampler() on the parameter and clReleaseSampler() + * on the previous value held by this instance. + */ + Sampler& operator = (const Sampler& rhs) + { + if (this != &rhs) { + detail::Wrapper<cl_type>::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_sampler - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseSampler() on the value previously held by this instance. + */ + Sampler& operator = (const cl_sampler& rhs) + { + detail::Wrapper<cl_type>::operator=(rhs); + return *this; + } + + //! \brief Wrapper for clGetSamplerInfo(). + template <typename T> + cl_int getInfo(cl_sampler_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetSamplerInfo, object_, name, param), + __GET_SAMPLER_INFO_ERR); + } + + //! \brief Wrapper for clGetSamplerInfo() that returns by value. + template <cl_int name> typename + detail::param_traits<detail::cl_sampler_info, name>::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_sampler_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +}; + +class Program; +class CommandQueue; +class Kernel; + +//! \brief Class interface for specifying NDRange values. +class NDRange +{ +private: + size_t<3> sizes_; + cl_uint dimensions_; + +public: + //! \brief Default constructor - resulting range has zero dimensions. + NDRange() + : dimensions_(0) + { } + + //! \brief Constructs one-dimensional range. + NDRange(::size_t size0) + : dimensions_(1) + { + sizes_[0] = size0; + } + + //! \brief Constructs two-dimensional range. + NDRange(::size_t size0, ::size_t size1) + : dimensions_(2) + { + sizes_[0] = size0; + sizes_[1] = size1; + } + + //! \brief Constructs three-dimensional range. + NDRange(::size_t size0, ::size_t size1, ::size_t size2) + : dimensions_(3) + { + sizes_[0] = size0; + sizes_[1] = size1; + sizes_[2] = size2; + } + + /*! \brief Conversion operator to const ::size_t *. + * + * \returns a pointer to the size of the first dimension. + */ + operator const ::size_t*() const { + return (const ::size_t*) sizes_; + } + + //! \brief Queries the number of dimensions in the range. + ::size_t dimensions() const { return dimensions_; } +}; + +//! \brief A zero-dimensional range. +static const NDRange NullRange; + +//! \brief Local address wrapper for use with Kernel::setArg +struct LocalSpaceArg +{ + ::size_t size_; +}; + +namespace detail { + +template <typename T> +struct KernelArgumentHandler +{ + static ::size_t size(const T&) { return sizeof(T); } + static T* ptr(T& value) { return &value; } +}; + +template <> +struct KernelArgumentHandler<LocalSpaceArg> +{ + static ::size_t size(const LocalSpaceArg& value) { return value.size_; } + static void* ptr(LocalSpaceArg&) { return NULL; } +}; + +} +//! \endcond + +/*! __local + * \brief Helper function for generating LocalSpaceArg objects. + * Deprecated. Replaced with Local. + */ +inline CL_EXT_PREFIX__VERSION_1_1_DEPRECATED LocalSpaceArg +__local(::size_t size) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; +inline LocalSpaceArg +__local(::size_t size) +{ + LocalSpaceArg ret = { size }; + return ret; +} + +/*! 
Local + * \brief Helper function for generating LocalSpaceArg objects. + */ +inline LocalSpaceArg +Local(::size_t size) +{ + LocalSpaceArg ret = { size }; + return ret; +} + +//class KernelFunctor; + +/*! \brief Class interface for cl_kernel. + * + * \note Copies of these objects are shallow, meaning that the copy will refer + * to the same underlying cl_kernel as the original. For details, see + * clRetainKernel() and clReleaseKernel(). + * + * \see cl_kernel + */ +class Kernel : public detail::Wrapper<cl_kernel> +{ +public: + inline Kernel(const Program& program, const char* name, cl_int* err = NULL); + + /*! \brief Destructor. + * + * This calls clReleaseKernel() on the value held by this instance. + */ + ~Kernel() { } + + //! \brief Default constructor - initializes to NULL. + Kernel() { } + + /*! \brief Copy constructor - performs shallow copy. + * + * This calls clRetainKernel() on the parameter's cl_kernel. + */ + Kernel(const Kernel& kernel) : detail::Wrapper<cl_type>(kernel) { } + + /*! \brief Constructor from cl_kernel - takes ownership. + * + * This effectively transfers ownership of a refcount on the cl_kernel + * into the new Kernel object. + */ + __CL_EXPLICIT_CONSTRUCTORS Kernel(const cl_kernel& kernel) : detail::Wrapper<cl_type>(kernel) { } + + /*! \brief Assignment operator from Kernel. + * + * This calls clRetainKernel() on the parameter and clReleaseKernel() + * on the previous value held by this instance. + */ + Kernel& operator = (const Kernel& rhs) + { + if (this != &rhs) { + detail::Wrapper<cl_type>::operator=(rhs); + } + return *this; + } + + /*! \brief Assignment operator from cl_kernel - takes ownership. + * + * This effectively transfers ownership of a refcount on the rhs and calls + * clReleaseKernel() on the value previously held by this instance. 
+ */ + Kernel& operator = (const cl_kernel& rhs) + { + detail::Wrapper<cl_type>::operator=(rhs); + return *this; + } + + template <typename T> + cl_int getInfo(cl_kernel_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetKernelInfo, object_, name, param), + __GET_KERNEL_INFO_ERR); + } + + template <cl_int name> typename + detail::param_traits<detail::cl_kernel_info, name>::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + +#if defined(CL_VERSION_1_2) + template <typename T> + cl_int getArgInfo(cl_uint argIndex, cl_kernel_arg_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetKernelArgInfo, object_, argIndex, name, param), + __GET_KERNEL_ARG_INFO_ERR); + } + + template <cl_int name> typename + detail::param_traits<detail::cl_kernel_arg_info, name>::param_type + getArgInfo(cl_uint argIndex, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_arg_info, name>::param_type param; + cl_int result = getArgInfo(argIndex, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } +#endif // #if defined(CL_VERSION_1_2) + + template <typename T> + cl_int getWorkGroupInfo( + const Device& device, cl_kernel_work_group_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetKernelWorkGroupInfo, object_, device(), name, param), + __GET_KERNEL_WORK_GROUP_INFO_ERR); + } + + template <cl_int name> typename + detail::param_traits<detail::cl_kernel_work_group_info, name>::param_type + getWorkGroupInfo(const Device& device, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_kernel_work_group_info, name>::param_type param; + cl_int result = getWorkGroupInfo(device, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + template <typename T> + cl_int setArg(cl_uint index, T value) + { + return detail::errHandler( + ::clSetKernelArg( + object_, + index, + detail::KernelArgumentHandler<T>::size(value), + detail::KernelArgumentHandler<T>::ptr(value)), + __SET_KERNEL_ARGS_ERR); + } + + cl_int setArg(cl_uint index, ::size_t size, void* argPtr) + { + return detail::errHandler( + ::clSetKernelArg(object_, index, size, argPtr), + __SET_KERNEL_ARGS_ERR); + } +}; + +/*! \class Program + * \brief Program interface that implements cl_program. 
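+ *
+ * A minimal usage sketch for illustration (the OpenCL C source and the kernel
+ * name "vecadd" are placeholders; STRING_CLASS is assumed to be the default
+ * std::string):
+ * \code
+ * cl_int err = CL_SUCCESS;
+ * std::string src =
+ *     "__kernel void vecadd(__global const float* a,"
+ *     "                     __global const float* b,"
+ *     "                     __global float* c)"
+ *     "{ size_t i = get_global_id(0); c[i] = a[i] + b[i]; }";
+ *
+ * cl::Context context = cl::Context::getDefault(&err);
+ * cl::Program program(context, src, true, &err);   // build == true compiles at construction
+ * // on failure, program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(device) holds the compiler log
+ * cl::Kernel vecadd(program, "vecadd", &err);
+ * \endcode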
+ */ +class Program : public detail::Wrapper<cl_program> +{ +public: + typedef VECTOR_CLASS<std::pair<const void*, ::size_t> > Binaries; + typedef VECTOR_CLASS<std::pair<const char*, ::size_t> > Sources; + + Program( + const STRING_CLASS& source, + cl_int* err = NULL) + { + cl_int error; + + const char * strings = source.c_str(); + const ::size_t length = source.size(); + + Context context = Context::getDefault(err); + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)1, &strings, &length, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + + if (error == CL_SUCCESS) { + + error = ::clBuildProgram( + object_, + 0, + NULL, + "", + NULL, + NULL); + + detail::errHandler(error, __BUILD_PROGRAM_ERR); + } + + if (err != NULL) { + *err = error; + } + } + + Program( + const STRING_CLASS& source, + bool build, + cl_int* err = NULL) + { + cl_int error; + + const char * strings = source.c_str(); + const ::size_t length = source.size(); + + Context context = Context::getDefault(err); + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)1, &strings, &length, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + + if (error == CL_SUCCESS && build) { + + error = ::clBuildProgram( + object_, + 0, + NULL, + "", + NULL, + NULL); + + detail::errHandler(error, __BUILD_PROGRAM_ERR); + } + + if (err != NULL) { + *err = error; + } + } + + Program( + const Context& context, + const STRING_CLASS& source, + bool build = false, + cl_int* err = NULL) + { + cl_int error; + + const char * strings = source.c_str(); + const ::size_t length = source.size(); + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)1, &strings, &length, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + + if (error == CL_SUCCESS && build) { + + error = ::clBuildProgram( + object_, + 0, + NULL, + "", + NULL, + NULL); + + detail::errHandler(error, __BUILD_PROGRAM_ERR); + } + + if (err != NULL) { + *err = error; + } + } + + Program( + const Context& context, + const Sources& sources, + cl_int* err = NULL) + { + cl_int error; + + const ::size_t n = (::size_t)sources.size(); + ::size_t* lengths = (::size_t*) alloca(n * sizeof(::size_t)); + const char** strings = (const char**) alloca(n * sizeof(const char*)); + + for (::size_t i = 0; i < n; ++i) { + strings[i] = sources[(int)i].first; + lengths[i] = sources[(int)i].second; + } + + object_ = ::clCreateProgramWithSource( + context(), (cl_uint)n, strings, lengths, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_SOURCE_ERR); + if (err != NULL) { + *err = error; + } + } + + /** + * Construct a program object from a list of devices and a per-device list of binaries. + * \param context A valid OpenCL context in which to construct the program. + * \param devices A vector of OpenCL device objects for which the program will be created. + * \param binaries A vector of pairs of a pointer to a binary object and its length. + * \param binaryStatus An optional vector that on completion will be resized to + * match the size of binaries and filled with values to specify if each binary + * was successfully loaded. + * Set to CL_SUCCESS if the binary was successfully loaded. + * Set to CL_INVALID_VALUE if the length is 0 or the binary pointer is NULL. + * Set to CL_INVALID_BINARY if the binary provided is not valid for the matching device. 
+ * \param err if non-NULL will be set to CL_SUCCESS on successful operation or one of the following errors: + * CL_INVALID_CONTEXT if context is not a valid context. + * CL_INVALID_VALUE if the length of devices is zero; or if the length of binaries does not match the length of devices; + * or if any entry in binaries is NULL or has length 0. + * CL_INVALID_DEVICE if OpenCL devices listed in devices are not in the list of devices associated with context. + * CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device. + * CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host. + */ + Program( + const Context& context, + const VECTOR_CLASS<Device>& devices, + const Binaries& binaries, + VECTOR_CLASS<cl_int>* binaryStatus = NULL, + cl_int* err = NULL) + { + cl_int error; + + const ::size_t numDevices = devices.size(); + + // Catch size mismatch early and return + if(binaries.size() != numDevices) { + error = CL_INVALID_VALUE; + detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); + if (err != NULL) { + *err = error; + } + return; + } + + ::size_t* lengths = (::size_t*) alloca(numDevices * sizeof(::size_t)); + const unsigned char** images = (const unsigned char**) alloca(numDevices * sizeof(const unsigned char**)); + + for (::size_t i = 0; i < numDevices; ++i) { + images[i] = (const unsigned char*)binaries[i].first; + lengths[i] = binaries[(int)i].second; + } + + cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); + for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + if(binaryStatus) { + binaryStatus->resize(numDevices); + } + + object_ = ::clCreateProgramWithBinary( + context(), (cl_uint) devices.size(), + deviceIDs, + lengths, images, binaryStatus != NULL + ? &binaryStatus->front() + : NULL, &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_BINARY_ERR); + if (err != NULL) { + *err = error; + } + } + + +#if defined(CL_VERSION_1_2) + /** + * Create program using builtin kernels. 
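+     *
+     * A minimal usage sketch for illustration; the device list and the names
+     * "copy_a;copy_b" are placeholders for kernels the implementation actually
+     * reports as built-in:
+     * \code
+     * cl_int err;
+     * cl::Program builtins(context, devices, "copy_a;copy_b", &err);
+     * cl::Kernel copyA(builtins, "copy_a", &err);
+     * \endcode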
+ * \param kernelNames Semi-colon separated list of builtin kernel names + */ + Program( + const Context& context, + const VECTOR_CLASS<Device>& devices, + const STRING_CLASS& kernelNames, + cl_int* err = NULL) + { + cl_int error; + + + ::size_t numDevices = devices.size(); + cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); + for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + object_ = ::clCreateProgramWithBuiltInKernels( + context(), + (cl_uint) devices.size(), + deviceIDs, + kernelNames.c_str(), + &error); + + detail::errHandler(error, __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR); + if (err != NULL) { + *err = error; + } + } +#endif // #if defined(CL_VERSION_1_2) + + Program() { } + + Program(const Program& program) : detail::Wrapper<cl_type>(program) { } + + __CL_EXPLICIT_CONSTRUCTORS Program(const cl_program& program) : detail::Wrapper<cl_type>(program) { } + + Program& operator = (const Program& rhs) + { + if (this != &rhs) { + detail::Wrapper<cl_type>::operator=(rhs); + } + return *this; + } + + Program& operator = (const cl_program& rhs) + { + detail::Wrapper<cl_type>::operator=(rhs); + return *this; + } + + cl_int build( + const VECTOR_CLASS<Device>& devices, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + ::size_t numDevices = devices.size(); + cl_device_id* deviceIDs = (cl_device_id*) alloca(numDevices * sizeof(cl_device_id)); + for( ::size_t deviceIndex = 0; deviceIndex < numDevices; ++deviceIndex ) { + deviceIDs[deviceIndex] = (devices[deviceIndex])(); + } + + return detail::errHandler( + ::clBuildProgram( + object_, + (cl_uint) + devices.size(), + deviceIDs, + options, + notifyFptr, + data), + __BUILD_PROGRAM_ERR); + } + + cl_int build( + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + return detail::errHandler( + ::clBuildProgram( + object_, + 0, + NULL, + options, + notifyFptr, + data), + __BUILD_PROGRAM_ERR); + } + +#if defined(CL_VERSION_1_2) + cl_int compile( + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL) const + { + return detail::errHandler( + ::clCompileProgram( + object_, + 0, + NULL, + options, + 0, + NULL, + NULL, + notifyFptr, + data), + __COMPILE_PROGRAM_ERR); + } +#endif + + template <typename T> + cl_int getInfo(cl_program_info name, T* param) const + { + return detail::errHandler( + detail::getInfo(&::clGetProgramInfo, object_, name, param), + __GET_PROGRAM_INFO_ERR); + } + + template <cl_int name> typename + detail::param_traits<detail::cl_program_info, name>::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_program_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + template <typename T> + cl_int getBuildInfo( + const Device& device, cl_program_build_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetProgramBuildInfo, object_, device(), name, param), + __GET_PROGRAM_BUILD_INFO_ERR); + } + + template <cl_int name> typename + detail::param_traits<detail::cl_program_build_info, name>::param_type + getBuildInfo(const Device& device, cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_program_build_info, name>::param_type param; + cl_int result 
= getBuildInfo(device, name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + cl_int createKernels(VECTOR_CLASS<Kernel>* kernels) + { + cl_uint numKernels; + cl_int err = ::clCreateKernelsInProgram(object_, 0, NULL, &numKernels); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); + } + + Kernel* value = (Kernel*) alloca(numKernels * sizeof(Kernel)); + err = ::clCreateKernelsInProgram( + object_, numKernels, (cl_kernel*) value, NULL); + if (err != CL_SUCCESS) { + return detail::errHandler(err, __CREATE_KERNELS_IN_PROGRAM_ERR); + } + + kernels->assign(&value[0], &value[numKernels]); + return CL_SUCCESS; + } +}; + +#if defined(CL_VERSION_1_2) +inline Program linkProgram( + Program input1, + Program input2, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL, + cl_int* err = NULL) +{ + cl_int err_local = CL_SUCCESS; + + cl_program programs[2] = { input1(), input2() }; + + Context ctx = input1.getInfo<CL_PROGRAM_CONTEXT>(); + + cl_program prog = ::clLinkProgram( + ctx(), + 0, + NULL, + options, + 2, + programs, + notifyFptr, + data, + &err_local); + + detail::errHandler(err_local,__COMPILE_PROGRAM_ERR); + if (err != NULL) { + *err = err_local; + } + + return Program(prog); +} + +inline Program linkProgram( + VECTOR_CLASS<Program> inputPrograms, + const char* options = NULL, + void (CL_CALLBACK * notifyFptr)(cl_program, void *) = NULL, + void* data = NULL, + cl_int* err = NULL) +{ + cl_int err_local = CL_SUCCESS; + + cl_program * programs = (cl_program*) alloca(inputPrograms.size() * sizeof(cl_program)); + + if (programs != NULL) { + for (unsigned int i = 0; i < inputPrograms.size(); i++) { + programs[i] = inputPrograms[i](); + } + } + + cl_program prog = ::clLinkProgram( + Context::getDefault()(), + 0, + NULL, + options, + (cl_uint)inputPrograms.size(), + programs, + notifyFptr, + data, + &err_local); + + detail::errHandler(err_local,__COMPILE_PROGRAM_ERR); + if (err != NULL) { + *err = err_local; + } + + return Program(prog); +} +#endif + +template<> +inline VECTOR_CLASS<char *> cl::Program::getInfo<CL_PROGRAM_BINARIES>(cl_int* err) const +{ + VECTOR_CLASS< ::size_t> sizes = getInfo<CL_PROGRAM_BINARY_SIZES>(); + VECTOR_CLASS<char *> binaries; + for (VECTOR_CLASS< ::size_t>::iterator s = sizes.begin(); s != sizes.end(); ++s) + { + char *ptr = NULL; + if (*s != 0) + ptr = new char[*s]; + binaries.push_back(ptr); + } + + cl_int result = getInfo(CL_PROGRAM_BINARIES, &binaries); + if (err != NULL) { + *err = result; + } + return binaries; +} + +inline Kernel::Kernel(const Program& program, const char* name, cl_int* err) +{ + cl_int error; + + object_ = ::clCreateKernel(program(), name, &error); + detail::errHandler(error, __CREATE_KERNEL_ERR); + + if (err != NULL) { + *err = error; + } + +} + +/*! \class CommandQueue + * \brief CommandQueue interface for cl_command_queue. 
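+ *
+ * A minimal usage sketch for illustration ("kernel", "result" and the problem
+ * size are placeholders defined elsewhere):
+ * \code
+ * cl_int err = CL_SUCCESS;
+ * cl::Context context = cl::Context::getDefault(&err);
+ * cl::Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0];
+ * cl::CommandQueue queue(context, device, 0, &err);
+ *
+ * const ::size_t n = 1024;
+ * std::vector<float> host(n);
+ * queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(n), cl::NullRange);
+ * queue.enqueueReadBuffer(result, CL_TRUE, 0, n * sizeof(float), &host[0]); // blocking read
+ * queue.finish();
+ * \endcode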
+ */ +class CommandQueue : public detail::Wrapper<cl_command_queue> +{ +private: + static volatile int default_initialized_; + static CommandQueue default_; + static volatile cl_int default_error_; +public: +#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + CommandQueue( + cl_command_queue_properties properties, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0]; + + object_ = ::clCreateCommandQueue( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } + } +#else + CommandQueue( + const cl_queue_properties *properties, + cl_int* err = NULL) + { + cl_int error; + + Context context = Context::getDefault(&error); + detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR); + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0]; + + object_ = ::clCreateCommandQueueWithProperties( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR); + if (err != NULL) { + *err = error; + } + } + } +#endif + +#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + CommandQueue( + const Context& context, + const Device& device, + cl_command_queue_properties properties = 0, + cl_int* err = NULL) + { + cl_int error; + + object_ = ::clCreateCommandQueue( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); + if (err != NULL) { + *err = error; + } + } +#else + CommandQueue( + const Context& context, + const Device& device, + const cl_queue_properties *properties = NULL, + cl_int* err = NULL) + { + cl_int error; + + object_ = ::clCreateCommandQueueWithProperties( + context(), device(), properties, &error); + + detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR); + if (err != NULL) { + *err = error; + } + } +#endif + + static CommandQueue getDefault(cl_int * err = NULL) + { + int state = detail::compare_exchange( + &default_initialized_, + __DEFAULT_BEING_INITIALIZED, __DEFAULT_NOT_INITIALIZED); + + if (state & __DEFAULT_INITIALIZED) { + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + if (state & __DEFAULT_BEING_INITIALIZED) { + // Assume writes will propagate eventually... 
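+            // Another thread is already constructing the shared default queue, so
+            // spin (with a fence each iteration) until it publishes
+            // __DEFAULT_INITIALIZED.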
+ while(default_initialized_ != __DEFAULT_INITIALIZED) { + detail::fence(); + } + + if (err != NULL) { + *err = default_error_; + } + return default_; + } + + cl_int error; + + Context context = Context::getDefault(&error); +#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); +#else + detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR); +#endif + + if (error != CL_SUCCESS) { + if (err != NULL) { + *err = error; + } + } + else { + Device device = context.getInfo<CL_CONTEXT_DEVICES>()[0]; + + default_ = CommandQueue(context, device, 0, &error); + +#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + detail::errHandler(error, __CREATE_COMMAND_QUEUE_ERR); +#else + detail::errHandler(error, __CREATE_COMMAND_QUEUE_PROPERTY_ERR); +#endif + if (err != NULL) { + *err = error; + } + } + + detail::fence(); + + default_error_ = error; + // Assume writes will propagate eventually... + default_initialized_ = __DEFAULT_INITIALIZED; + + detail::fence(); + + if (err != NULL) { + *err = default_error_; + } + return default_; + + } + + CommandQueue() { } + + CommandQueue(const CommandQueue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { } + + CommandQueue(const cl_command_queue& commandQueue) : detail::Wrapper<cl_type>(commandQueue) { } + + CommandQueue& operator = (const CommandQueue& rhs) + { + if (this != &rhs) { + detail::Wrapper<cl_type>::operator=(rhs); + } + return *this; + } + + CommandQueue& operator = (const cl_command_queue& rhs) + { + detail::Wrapper<cl_type>::operator=(rhs); + return *this; + } + + template <typename T> + cl_int getInfo(cl_command_queue_info name, T* param) const + { + return detail::errHandler( + detail::getInfo( + &::clGetCommandQueueInfo, object_, name, param), + __GET_COMMAND_QUEUE_INFO_ERR); + } + + template <cl_int name> typename + detail::param_traits<detail::cl_command_queue_info, name>::param_type + getInfo(cl_int* err = NULL) const + { + typename detail::param_traits< + detail::cl_command_queue_info, name>::param_type param; + cl_int result = getInfo(name, ¶m); + if (err != NULL) { + *err = result; + } + return param; + } + + cl_int enqueueReadBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + void* ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadBuffer( + object_, buffer(), blocking, offset, size, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + const void* ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteBuffer( + object_, buffer(), blocking, offset, size, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_WRITE_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBuffer( + const Buffer& src, + const Buffer& dst, + ::size_t src_offset, + ::size_t dst_offset, + ::size_t size, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBuffer( + object_, src(), dst(), src_offset, dst_offset, size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQEUE_COPY_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReadBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadBufferRect( + object_, + buffer(), + blocking, + (const ::size_t *)buffer_offset, + (const ::size_t *)host_offset, + (const ::size_t *)region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteBufferRect( + object_, + buffer(), + blocking, + (const ::size_t *)buffer_offset, + (const ::size_t *)host_offset, + (const ::size_t *)region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferRect( + const Buffer& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + ::size_t src_row_pitch, + ::size_t src_slice_pitch, + ::size_t dst_row_pitch, + ::size_t dst_slice_pitch, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferRect( + object_, + src(), + dst(), + (const ::size_t *)src_origin, + (const ::size_t *)dst_origin, + (const ::size_t *)region, + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQEUE_COPY_BUFFER_RECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueue a command to fill a buffer object with a pattern + * of a given size. The pattern is specified a as vector. + * \tparam PatternType The datatype of the pattern field. + * The pattern type must be an accepted OpenCL data type. + */ + template<typename PatternType> + cl_int enqueueFillBuffer( + const Buffer& buffer, + PatternType pattern, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillBuffer( + object_, + buffer(), + static_cast<void*>(&pattern), + sizeof(PatternType), + offset, + size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueReadImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReadImage( + object_, image(), blocking, (const ::size_t *) origin, + (const ::size_t *) region, row_pitch, slice_pitch, ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_READ_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueWriteImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueWriteImage( + object_, image(), blocking, (const ::size_t *) origin, + (const ::size_t *) region, row_pitch, slice_pitch, ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_WRITE_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImage( + object_, src(), dst(), (const ::size_t *) src_origin, + (const ::size_t *)dst_origin, (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. 
+ * This is a four component RGBA floating-point color value if + * the image channel data type is not an unnormalized signed or + * unsigned data type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_float4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast<void*>(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA signed integer color value if + * the image channel data type is an unnormalized signed integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_int4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast<void*>(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueue a command to fill an image object with a specified color. + * \param fillColor is the color to use to fill the image. + * This is a four component RGBA unsigned integer color value if + * the image channel data type is an unnormalized unsigned integer + * type. + */ + cl_int enqueueFillImage( + const Image& image, + cl_uint4 fillColor, + const size_t<3>& origin, + const size_t<3>& region, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueFillImage( + object_, + image(), + static_cast<void*>(&fillColor), + (const ::size_t *) origin, + (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_FILL_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& region, + ::size_t dst_offset, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyImageToBuffer( + object_, src(), dst(), (const ::size_t *) src_origin, + (const ::size_t *) region, dst_offset, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + ::size_t src_offset, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueCopyBufferToImage( + object_, src(), dst(), src_offset, + (const ::size_t *) dst_origin, (const ::size_t *) region, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + void* enqueueMapBuffer( + const Buffer& buffer, + cl_bool blocking, + cl_map_flags flags, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_int error; + void * result = ::clEnqueueMapBuffer( + object_, buffer(), blocking, flags, offset, size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (cl_event*) event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + return result; + } + + void* enqueueMapImage( + const Image& buffer, + cl_bool blocking, + cl_map_flags flags, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t * row_pitch, + ::size_t * slice_pitch, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL, + cl_int* err = NULL) const + { + cl_int error; + void * result = ::clEnqueueMapImage( + object_, buffer(), blocking, flags, + (const ::size_t *) origin, (const ::size_t *) region, + row_pitch, slice_pitch, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (cl_event*) event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_IMAGE_ERR); + if (err != NULL) { + *err = error; + } + return result; + } + + cl_int enqueueUnmapMemObject( + const Memory& memory, + void* mapped_ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueUnmapMemObject( + object_, memory(), mapped_ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined(CL_VERSION_1_2) + /** + * Enqueues a marker command which waits for either a list of events to complete, + * or all previously enqueued commands to complete. + * + * Enqueues a marker command which waits for either a list of events to complete, + * or if the list is empty it waits for all commands previously enqueued in command_queue + * to complete before it completes. This command returns an event which can be waited on, + * i.e. this event can be waited on to insure that all events either in the event_wait_list + * or all previously enqueued commands, queued before this command to command_queue, + * have completed. 
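+     *
+     * A minimal usage sketch for illustration (the "uploads" events are assumed
+     * to come from earlier enqueue calls on the same queue):
+     * \code
+     * VECTOR_CLASS<cl::Event> uploads;  // e.g. filled by enqueueWriteBuffer(..., &event)
+     * cl::Event marker;
+     * queue.enqueueMarkerWithWaitList(&uploads, &marker);
+     * marker.wait();                    // host blocks until the listed events complete
+     * \endcode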
+ */ + cl_int enqueueMarkerWithWaitList( + const VECTOR_CLASS<Event> *events = 0, + Event *event = 0) + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueMarkerWithWaitList( + object_, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_MARKER_WAIT_LIST_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * A synchronization point that enqueues a barrier operation. + * + * Enqueues a barrier command which waits for either a list of events to complete, + * or if the list is empty it waits for all commands previously enqueued in command_queue + * to complete before it completes. This command blocks command execution, that is, any + * following commands enqueued after it do not execute until it completes. This command + * returns an event which can be waited on, i.e. this event can be waited on to insure that + * all events either in the event_wait_list or all previously enqueued commands, queued + * before this command to command_queue, have completed. + */ + cl_int enqueueBarrierWithWaitList( + const VECTOR_CLASS<Event> *events = 0, + Event *event = 0) + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueBarrierWithWaitList( + object_, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_BARRIER_WAIT_LIST_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + /** + * Enqueues a command to indicate with which device a set of memory objects + * should be associated. + */ + cl_int enqueueMigrateMemObjects( + const VECTOR_CLASS<Memory> &memObjects, + cl_mem_migration_flags flags, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL + ) + { + cl_event tmp; + + cl_mem* localMemObjects = static_cast<cl_mem*>(alloca(memObjects.size() * sizeof(cl_mem))); + for( int i = 0; i < (int)memObjects.size(); ++i ) { + localMemObjects[i] = memObjects[i](); + } + + + cl_int err = detail::errHandler( + ::clEnqueueMigrateMemObjects( + object_, + (cl_uint)memObjects.size(), + static_cast<const cl_mem*>(localMemObjects), + flags, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif // #if defined(CL_VERSION_1_2) + + cl_int enqueueNDRangeKernel( + const Kernel& kernel, + const NDRange& offset, + const NDRange& global, + const NDRange& local = NullRange, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueNDRangeKernel( + object_, kernel(), (cl_uint) global.dimensions(), + offset.dimensions() != 0 ? (const ::size_t*) offset : NULL, + (const ::size_t*) global, + local.dimensions() != 0 ? (const ::size_t*) local : NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_NDRANGE_KERNEL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if !defined(CL_VERSION_2_0) || defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS) + cl_int enqueueTask( + const Kernel& kernel, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueTask( + object_, kernel(), + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_TASK_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif + + cl_int enqueueNativeKernel( + void (CL_CALLBACK *userFptr)(void *), + std::pair<void*, ::size_t> args, + const VECTOR_CLASS<Memory>* mem_objects = NULL, + const VECTOR_CLASS<const void*>* mem_locs = NULL, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_mem * mems = (mem_objects != NULL && mem_objects->size() > 0) + ? (cl_mem*) alloca(mem_objects->size() * sizeof(cl_mem)) + : NULL; + + if (mems != NULL) { + for (unsigned int i = 0; i < mem_objects->size(); i++) { + mems[i] = ((*mem_objects)[i])(); + } + } + + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueNativeKernel( + object_, userFptr, args.first, args.second, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + mems, + (mem_locs != NULL) ? (const void **) &mem_locs->front() : NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_NATIVE_KERNEL); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueMarker(Event* event = NULL) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + return detail::errHandler( + ::clEnqueueMarker(object_, (cl_event*) event), + __ENQUEUE_MARKER_ERR); + } + + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueWaitForEvents(const VECTOR_CLASS<Event>& events) const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + return detail::errHandler( + ::clEnqueueWaitForEvents( + object_, + (cl_uint) events.size(), + (const cl_event*) &events.front()), + __ENQUEUE_WAIT_FOR_EVENTS_ERR); + } +#endif // #if defined(CL_VERSION_1_1) + + cl_int enqueueAcquireGLObjects( + const VECTOR_CLASS<Memory>* mem_objects = NULL, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueAcquireGLObjects( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_ACQUIRE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReleaseGLObjects( + const VECTOR_CLASS<Memory>* mem_objects = NULL, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueReleaseGLObjects( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL) ? 
(const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_RELEASE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + +#if defined (USE_DX_INTEROP) +typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueAcquireD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem* mem_objects, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event); +typedef CL_API_ENTRY cl_int (CL_API_CALL *PFN_clEnqueueReleaseD3D10ObjectsKHR)( + cl_command_queue command_queue, cl_uint num_objects, + const cl_mem* mem_objects, cl_uint num_events_in_wait_list, + const cl_event* event_wait_list, cl_event* event); + + cl_int enqueueAcquireD3D10Objects( + const VECTOR_CLASS<Memory>* mem_objects = NULL, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + static PFN_clEnqueueAcquireD3D10ObjectsKHR pfn_clEnqueueAcquireD3D10ObjectsKHR = NULL; +#if defined(CL_VERSION_1_2) + cl_context context = getInfo<CL_QUEUE_CONTEXT>(); + cl::Device device(getInfo<CL_QUEUE_DEVICE>()); + cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>(); + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueAcquireD3D10ObjectsKHR); +#endif +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clEnqueueAcquireD3D10ObjectsKHR); +#endif + + cl_event tmp; + cl_int err = detail::errHandler( + pfn_clEnqueueAcquireD3D10ObjectsKHR( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? &tmp : NULL), + __ENQUEUE_ACQUIRE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } + + cl_int enqueueReleaseD3D10Objects( + const VECTOR_CLASS<Memory>* mem_objects = NULL, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) const + { + static PFN_clEnqueueReleaseD3D10ObjectsKHR pfn_clEnqueueReleaseD3D10ObjectsKHR = NULL; +#if defined(CL_VERSION_1_2) + cl_context context = getInfo<CL_QUEUE_CONTEXT>(); + cl::Device device(getInfo<CL_QUEUE_DEVICE>()); + cl_platform_id platform = device.getInfo<CL_DEVICE_PLATFORM>(); + __INIT_CL_EXT_FCN_PTR_PLATFORM(platform, clEnqueueReleaseD3D10ObjectsKHR); +#endif // #if defined(CL_VERSION_1_2) +#if defined(CL_VERSION_1_1) + __INIT_CL_EXT_FCN_PTR(clEnqueueReleaseD3D10ObjectsKHR); +#endif // #if defined(CL_VERSION_1_1) + + cl_event tmp; + cl_int err = detail::errHandler( + pfn_clEnqueueReleaseD3D10ObjectsKHR( + object_, + (mem_objects != NULL) ? (cl_uint) mem_objects->size() : 0, + (mem_objects != NULL) ? (const cl_mem *) &mem_objects->front(): NULL, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_RELEASE_GL_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; + } +#endif + +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS) || (defined(CL_VERSION_1_1) && !defined(CL_VERSION_1_2)) + CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + cl_int enqueueBarrier() const CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + { + return detail::errHandler( + ::clEnqueueBarrier(object_), + __ENQUEUE_BARRIER_ERR); + } +#endif // #if defined(CL_VERSION_1_1) + + cl_int flush() const + { + return detail::errHandler(::clFlush(object_), __FLUSH_ERR); + } + + cl_int finish() const + { + return detail::errHandler(::clFinish(object_), __FINISH_ERR); + } +}; + +#ifdef _WIN32 +__declspec(selectany) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +__declspec(selectany) CommandQueue CommandQueue::default_; +__declspec(selectany) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; +#else +__attribute__((weak)) volatile int CommandQueue::default_initialized_ = __DEFAULT_NOT_INITIALIZED; +__attribute__((weak)) CommandQueue CommandQueue::default_; +__attribute__((weak)) volatile cl_int CommandQueue::default_error_ = CL_SUCCESS; +#endif + +inline cl_int enqueueReadBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + void* ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueReadBuffer(buffer, blocking, offset, size, ptr, events, event); +} + +inline cl_int enqueueWriteBuffer( + const Buffer& buffer, + cl_bool blocking, + ::size_t offset, + ::size_t size, + const void* ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueWriteBuffer(buffer, blocking, offset, size, ptr, events, event); +} + +inline void* enqueueMapBuffer( + const Buffer& buffer, + cl_bool blocking, + cl_map_flags flags, + ::size_t offset, + ::size_t size, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL, + cl_int* err = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + + void * result = ::clEnqueueMapBuffer( + queue(), buffer(), blocking, flags, offset, size, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (cl_event*) event, + &error); + + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (err != NULL) { + *err = error; + } + return result; +} + +inline cl_int enqueueUnmapMemObject( + const Memory& memory, + void* mapped_ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + detail::errHandler(error, __ENQUEUE_MAP_BUFFER_ERR); + if (error != CL_SUCCESS) { + return error; + } + + cl_event tmp; + cl_int err = detail::errHandler( + ::clEnqueueUnmapMemObject( + queue(), memory(), mapped_ptr, + (events != NULL) ? (cl_uint) events->size() : 0, + (events != NULL && events->size() > 0) ? (cl_event*) &events->front() : NULL, + (event != NULL) ? 
&tmp : NULL), + __ENQUEUE_UNMAP_MEM_OBJECT_ERR); + + if (event != NULL && err == CL_SUCCESS) + *event = tmp; + + return err; +} + +inline cl_int enqueueCopyBuffer( + const Buffer& src, + const Buffer& dst, + ::size_t src_offset, + ::size_t dst_offset, + ::size_t size, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyBuffer(src, dst, src_offset, dst_offset, size, events, event); +} + +/** + * Blocking copy operation between iterators and a buffer. + */ +template< typename IteratorType > +inline cl_int copy( IteratorType startIterator, IteratorType endIterator, cl::Buffer &buffer ) +{ + typedef typename std::iterator_traits<IteratorType>::value_type DataType; + cl_int error; + + ::size_t length = endIterator-startIterator; + ::size_t byteLength = length*sizeof(DataType); + + DataType *pointer = + static_cast<DataType*>(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_WRITE, 0, byteLength, 0, 0, &error)); + // if exceptions enabled, enqueueMapBuffer will throw + if( error != CL_SUCCESS ) { + return error; + } +#if defined(_MSC_VER) + std::copy( + startIterator, + endIterator, + stdext::checked_array_iterator<DataType*>( + pointer, length)); +#else + std::copy(startIterator, endIterator, pointer); +#endif + Event endEvent; + error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); + // if exceptions enabled, enqueueUnmapMemObject will throw + if( error != CL_SUCCESS ) { + return error; + } + endEvent.wait(); + return CL_SUCCESS; +} + +/** + * Blocking copy operation between iterators and a buffer. + */ +template< typename IteratorType > +inline cl_int copy( const cl::Buffer &buffer, IteratorType startIterator, IteratorType endIterator ) +{ + typedef typename std::iterator_traits<IteratorType>::value_type DataType; + cl_int error; + + ::size_t length = endIterator-startIterator; + ::size_t byteLength = length*sizeof(DataType); + + DataType *pointer = + static_cast<DataType*>(enqueueMapBuffer(buffer, CL_TRUE, CL_MAP_READ, 0, byteLength, 0, 0, &error)); + // if exceptions enabled, enqueueMapBuffer will throw + if( error != CL_SUCCESS ) { + return error; + } + std::copy(pointer, pointer + length, startIterator); + Event endEvent; + error = enqueueUnmapMemObject(buffer, pointer, 0, &endEvent); + // if exceptions enabled, enqueueUnmapMemObject will throw + if( error != CL_SUCCESS ) { + return error; + } + endEvent.wait(); + return CL_SUCCESS; +} + +#if defined(CL_VERSION_1_1) +inline cl_int enqueueReadBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueReadBufferRect( + buffer, + blocking, + buffer_offset, + host_offset, + region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueWriteBufferRect( + const Buffer& buffer, + cl_bool blocking, + const size_t<3>& buffer_offset, + const size_t<3>& host_offset, + const size_t<3>& region, + ::size_t buffer_row_pitch, + ::size_t buffer_slice_pitch, + ::size_t 
host_row_pitch, + ::size_t host_slice_pitch, + void *ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueWriteBufferRect( + buffer, + blocking, + buffer_offset, + host_offset, + region, + buffer_row_pitch, + buffer_slice_pitch, + host_row_pitch, + host_slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueCopyBufferRect( + const Buffer& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + ::size_t src_row_pitch, + ::size_t src_slice_pitch, + ::size_t dst_row_pitch, + ::size_t dst_slice_pitch, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyBufferRect( + src, + dst, + src_origin, + dst_origin, + region, + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, + events, + event); +} +#endif + +inline cl_int enqueueReadImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueReadImage( + image, + blocking, + origin, + region, + row_pitch, + slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueWriteImage( + const Image& image, + cl_bool blocking, + const size_t<3>& origin, + const size_t<3>& region, + ::size_t row_pitch, + ::size_t slice_pitch, + void* ptr, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueWriteImage( + image, + blocking, + origin, + region, + row_pitch, + slice_pitch, + ptr, + events, + event); +} + +inline cl_int enqueueCopyImage( + const Image& src, + const Image& dst, + const size_t<3>& src_origin, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyImage( + src, + dst, + src_origin, + dst_origin, + region, + events, + event); +} + +inline cl_int enqueueCopyImageToBuffer( + const Image& src, + const Buffer& dst, + const size_t<3>& src_origin, + const size_t<3>& region, + ::size_t dst_offset, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyImageToBuffer( + src, + dst, + src_origin, + region, + dst_offset, + events, + event); +} + +inline cl_int enqueueCopyBufferToImage( + const Buffer& src, + const Image& dst, + ::size_t src_offset, + const size_t<3>& dst_origin, + const size_t<3>& region, + const VECTOR_CLASS<Event>* events = NULL, + Event* event = NULL) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.enqueueCopyBufferToImage( + src, + dst, + src_offset, + 
dst_origin, + region, + events, + event); +} + + +inline cl_int flush(void) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + return queue.flush(); +} + +inline cl_int finish(void) +{ + cl_int error; + CommandQueue queue = CommandQueue::getDefault(&error); + + if (error != CL_SUCCESS) { + return error; + } + + + return queue.finish(); +} + +// Kernel Functor support +// New interface as of September 2011 +// Requires the C++11 std::tr1::function (note do not support TR1) +// Visual Studio 2010 and GCC 4.2 + +struct EnqueueArgs +{ + CommandQueue queue_; + const NDRange offset_; + const NDRange global_; + const NDRange local_; + VECTOR_CLASS<Event> events_; + + EnqueueArgs(NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange) + { + + } + + EnqueueArgs(NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local) + { + + } + + EnqueueArgs(NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local) + { + + } + + EnqueueArgs(Event e, NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange) + { + events_.push_back(e); + } + + EnqueueArgs(Event e, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(Event e, NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(NullRange), + events_(events) + { + + } + + EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(NullRange), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) : + queue_(CommandQueue::getDefault()), + offset_(offset), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local) + { + + } + + EnqueueArgs(CommandQueue &queue, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local) + { + + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange global) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(NullRange) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, Event e, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local) + { + events_.push_back(e); + } + + EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global) : + queue_(queue), + offset_(NullRange), + 
global_(global), + local_(NullRange), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange global, NDRange local) : + queue_(queue), + offset_(NullRange), + global_(global), + local_(local), + events_(events) + { + + } + + EnqueueArgs(CommandQueue &queue, const VECTOR_CLASS<Event> &events, NDRange offset, NDRange global, NDRange local) : + queue_(queue), + offset_(offset), + global_(global), + local_(local), + events_(events) + { + + } +}; + +namespace detail { + +class NullType {}; + +template<int index, typename T0> +struct SetArg +{ + static void set (Kernel kernel, T0 arg) + { + kernel.setArg(index, arg); + } +}; + +template<int index> +struct SetArg<index, NullType> +{ + static void set (Kernel, NullType) + { + } +}; + +template < + typename T0, typename T1, typename T2, typename T3, + typename T4, typename T5, typename T6, typename T7, + typename T8, typename T9, typename T10, typename T11, + typename T12, typename T13, typename T14, typename T15, + typename T16, typename T17, typename T18, typename T19, + typename T20, typename T21, typename T22, typename T23, + typename T24, typename T25, typename T26, typename T27, + typename T28, typename T29, typename T30, typename T31 +> +class KernelFunctorGlobal +{ +private: + Kernel kernel_; + +public: + KernelFunctorGlobal( + Kernel kernel) : + kernel_(kernel) + {} + + KernelFunctorGlobal( + const Program& program, + const STRING_CLASS name, + cl_int * err = NULL) : + kernel_(program, name.c_str(), err) + {} + + Event operator() ( + const EnqueueArgs& args, + T0 t0, + T1 t1 = NullType(), + T2 t2 = NullType(), + T3 t3 = NullType(), + T4 t4 = NullType(), + T5 t5 = NullType(), + T6 t6 = NullType(), + T7 t7 = NullType(), + T8 t8 = NullType(), + T9 t9 = NullType(), + T10 t10 = NullType(), + T11 t11 = NullType(), + T12 t12 = NullType(), + T13 t13 = NullType(), + T14 t14 = NullType(), + T15 t15 = NullType(), + T16 t16 = NullType(), + T17 t17 = NullType(), + T18 t18 = NullType(), + T19 t19 = NullType(), + T20 t20 = NullType(), + T21 t21 = NullType(), + T22 t22 = NullType(), + T23 t23 = NullType(), + T24 t24 = NullType(), + T25 t25 = NullType(), + T26 t26 = NullType(), + T27 t27 = NullType(), + T28 t28 = NullType(), + T29 t29 = NullType(), + T30 t30 = NullType(), + T31 t31 = NullType() + ) + { + Event event; + SetArg<0, T0>::set(kernel_, t0); + SetArg<1, T1>::set(kernel_, t1); + SetArg<2, T2>::set(kernel_, t2); + SetArg<3, T3>::set(kernel_, t3); + SetArg<4, T4>::set(kernel_, t4); + SetArg<5, T5>::set(kernel_, t5); + SetArg<6, T6>::set(kernel_, t6); + SetArg<7, T7>::set(kernel_, t7); + SetArg<8, T8>::set(kernel_, t8); + SetArg<9, T9>::set(kernel_, t9); + SetArg<10, T10>::set(kernel_, t10); + SetArg<11, T11>::set(kernel_, t11); + SetArg<12, T12>::set(kernel_, t12); + SetArg<13, T13>::set(kernel_, t13); + SetArg<14, T14>::set(kernel_, t14); + SetArg<15, T15>::set(kernel_, t15); + SetArg<16, T16>::set(kernel_, t16); + SetArg<17, T17>::set(kernel_, t17); + SetArg<18, T18>::set(kernel_, t18); + SetArg<19, T19>::set(kernel_, t19); + SetArg<20, T20>::set(kernel_, t20); + SetArg<21, T21>::set(kernel_, t21); + SetArg<22, T22>::set(kernel_, t22); + SetArg<23, T23>::set(kernel_, t23); + SetArg<24, T24>::set(kernel_, t24); + SetArg<25, T25>::set(kernel_, t25); + SetArg<26, T26>::set(kernel_, t26); + SetArg<27, T27>::set(kernel_, t27); + SetArg<28, T28>::set(kernel_, t28); + SetArg<29, T29>::set(kernel_, t29); + SetArg<30, T30>::set(kernel_, t30); + SetArg<31, T31>::set(kernel_, t31); + + 
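/*
 * By this point every kernel argument has been bound through SetArg<index, T>,
 * so all that remains is to launch the kernel with the geometry and dependencies
 * captured in the EnqueueArgs (done just below) and hand back the completion
 * Event. A sketch of how this machinery is normally driven, via the make_kernel
 * wrapper that the stock Khronos cl.hpp defines further down in this file
 * (`program`, `in`, `out` and the sizes here are hypothetical):
 *
 *   cl::make_kernel<cl::Buffer, cl::Buffer, int> plus(program, "plus");
 *   cl::EnqueueArgs args(cl::NDRange(1024), cl::NDRange(64));
 *   cl::Event done = plus(args, in, out, 1024);  // sets args 0..2, enqueues, returns the event
 *   done.wait();
 */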
args.queue_.enqueueNDRangeKernel( + kernel_, + args.offset_, + args.global_, + args.local_, + &args.events_, + &event); + + return event; + } + +}; + +//------------------------------------------------------------------------------------------------------ + + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29, + typename T30, + typename T31> +struct functionImplementation_ +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + T31> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 32)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + T31); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29, + T30 arg30, + T31 arg31) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29, + arg30, + arg31); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29, + typename T30> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, 
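/*
 * The primary functionImplementation_ template above covers the full 32-argument
 * case; each partial specialization that follows (this one and the progressively
 * shorter ones below) pins one more trailing type to detail::NullType, so a
 * functor instantiated with N real argument types resolves to the specialization
 * whose operator() takes exactly N arguments. This is the pre-variadic-template
 * idiom for what C++11 would express as, roughly:
 *
 *   template <typename... Ts>
 *   Event operator()(const EnqueueArgs& args, Ts... ts);
 *
 * hence the same block repeating once per arity all the way down to a single
 * argument.
 */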
+ T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 31)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + T30); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29, + T30 arg30) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29, + arg30); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28, + typename T29> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 30)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + T29); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28, + T29 arg29) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28, + arg29); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27, + typename T28> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 29)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + T28); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27, + T28 arg28) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27, + arg28); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26, + typename T27> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 28)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + T27); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26, + T27 arg27) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26, + arg27); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25, + typename T26> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 27)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + T26); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25, + T26 arg26) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25, + arg26); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24, + typename T25> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 26)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + T25); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24, + T25 arg25) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24, + arg25); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23, + typename T24> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 25)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + T24); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23, + T24 arg24) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23, + arg24); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22, + typename T23> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 24)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + T23); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22, + T23 arg23) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22, + arg23); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21, + typename T22> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 23)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + T22); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21, + T22 arg22) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21, + arg22); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20, + typename T21> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 22)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + T21); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20, + T21 arg21) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20, + arg21); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19, + typename T20> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 21)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + T20); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19, + T20 arg20) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19, + arg20); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18, + typename T19> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 20)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + T19); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18, + T19 arg19) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18, + arg19); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17, + typename T18> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 19)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + T18); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17, + T18 arg18) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17, + arg18); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16, + typename T17> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 18)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + T17); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16, + T17 arg17) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16, + arg17); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15, + typename T16> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 17)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + T16); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15, + T16 arg16) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15, + arg16); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14, + typename T15> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 16)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + T15); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14, + T15 arg15) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14, + arg15); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13, + typename T14> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 15)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + T14); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13, + T14 arg14) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13, + arg14); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12, + typename T13> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 14)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + T13); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12, + T13 arg13) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12, + arg13); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11, + typename T12> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 13)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + T12); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11, + T12 arg12) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11, + arg12); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10, + typename T11> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 12)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + T11); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10, + T11 arg11) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10, + arg11); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9, + typename T10> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 11)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + T10); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9, + T10 arg10) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9, + arg10); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8, + typename T9> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 10)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + T9); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8, + T9 arg9) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8, + arg9); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7, + typename T8> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 9)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. 
Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + T8); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7, + T8 arg8) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7, + arg8); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6, + typename T7> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 8)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. 
+ typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6, + T7); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6, + T7 arg7) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6, + arg7); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5, + typename T6> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + T6, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + T6, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 7)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5, + T6); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5, + T6 arg6) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5, + arg6); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4, + typename T5> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + T5, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + T5, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 6)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! 
\brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4, + T5); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4, + T5 arg5) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4, + arg5); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3, + typename T4> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + T4, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + T4, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 5)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3, + T4); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3, + T4 arg4) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3, + arg4); + } + + +}; + +template< + typename T0, + typename T1, + typename T2, + typename T3> +struct functionImplementation_ +< T0, + T1, + T2, + T3, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + T3, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 4)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! 
\brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2, + T3); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2, + T3 arg3) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2, + arg3); + } + + +}; + +template< + typename T0, + typename T1, + typename T2> +struct functionImplementation_ +< T0, + T1, + T2, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + T2, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 3)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1, + T2); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1, + T2 arg2) + { + return functor_( + enqueueArgs, + arg0, + arg1, + arg2); + } + + +}; + +template< + typename T0, + typename T1> +struct functionImplementation_ +< T0, + T1, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + T1, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 2)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! 
\brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0, + T1); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0, + T1 arg1) + { + return functor_( + enqueueArgs, + arg0, + arg1); + } + + +}; + +template< + typename T0> +struct functionImplementation_ +< T0, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> +{ + typedef detail::KernelFunctorGlobal< + T0, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType, + NullType> FunctorType; + + FunctorType functor_; + + functionImplementation_(const FunctorType &functor) : + functor_(functor) + { + + #if (defined(_WIN32) && defined(_VARIADIC_MAX) && (_VARIADIC_MAX < 1)) + // Fail variadic expansion for dev11 + static_assert(0, "Visual Studio has a hard limit of argument count for a std::function expansion. Please define _VARIADIC_MAX to be 10. If you need more arguments than that VC12 and below cannot support it."); + #endif + + } + + //! \brief Return type of the functor + typedef Event result_type; + + //! \brief Function signature of kernel functor with no event dependency. + typedef Event type_( + const EnqueueArgs&, + T0); + + Event operator()( + const EnqueueArgs& enqueueArgs, + T0 arg0) + { + return functor_( + enqueueArgs, + arg0); + } + + +}; + + + + + +} // namespace detail + +//---------------------------------------------------------------------------------------------- + +template < + typename T0, typename T1 = detail::NullType, typename T2 = detail::NullType, + typename T3 = detail::NullType, typename T4 = detail::NullType, + typename T5 = detail::NullType, typename T6 = detail::NullType, + typename T7 = detail::NullType, typename T8 = detail::NullType, + typename T9 = detail::NullType, typename T10 = detail::NullType, + typename T11 = detail::NullType, typename T12 = detail::NullType, + typename T13 = detail::NullType, typename T14 = detail::NullType, + typename T15 = detail::NullType, typename T16 = detail::NullType, + typename T17 = detail::NullType, typename T18 = detail::NullType, + typename T19 = detail::NullType, typename T20 = detail::NullType, + typename T21 = detail::NullType, typename T22 = detail::NullType, + typename T23 = detail::NullType, typename T24 = detail::NullType, + typename T25 = detail::NullType, typename T26 = detail::NullType, + typename T27 = detail::NullType, typename T28 = detail::NullType, + typename T29 = detail::NullType, typename T30 = detail::NullType, + typename T31 = detail::NullType +> +struct make_kernel : + public detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + > +{ +public: + typedef detail::KernelFunctorGlobal< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, 
T27, + T28, T29, T30, T31 + > FunctorType; + + make_kernel( + const Program& program, + const STRING_CLASS name, + cl_int * err = NULL) : + detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + >( + FunctorType(program, name, err)) + {} + + make_kernel( + const Kernel kernel) : + detail::functionImplementation_< + T0, T1, T2, T3, + T4, T5, T6, T7, + T8, T9, T10, T11, + T12, T13, T14, T15, + T16, T17, T18, T19, + T20, T21, T22, T23, + T24, T25, T26, T27, + T28, T29, T30, T31 + >( + FunctorType(kernel)) + {} +}; + + +//---------------------------------------------------------------------------------------------------------------------- + +#undef __ERR_STR +#if !defined(__CL_USER_OVERRIDE_ERROR_STRINGS) +#undef __GET_DEVICE_INFO_ERR +#undef __GET_PLATFORM_INFO_ERR +#undef __GET_DEVICE_IDS_ERR +#undef __GET_CONTEXT_INFO_ERR +#undef __GET_EVENT_INFO_ERR +#undef __GET_EVENT_PROFILE_INFO_ERR +#undef __GET_MEM_OBJECT_INFO_ERR +#undef __GET_IMAGE_INFO_ERR +#undef __GET_SAMPLER_INFO_ERR +#undef __GET_KERNEL_INFO_ERR +#undef __GET_KERNEL_ARG_INFO_ERR +#undef __GET_KERNEL_WORK_GROUP_INFO_ERR +#undef __GET_PROGRAM_INFO_ERR +#undef __GET_PROGRAM_BUILD_INFO_ERR +#undef __GET_COMMAND_QUEUE_INFO_ERR + +#undef __CREATE_CONTEXT_ERR +#undef __CREATE_CONTEXT_FROM_TYPE_ERR +#undef __GET_SUPPORTED_IMAGE_FORMATS_ERR + +#undef __CREATE_BUFFER_ERR +#undef __CREATE_SUBBUFFER_ERR +#undef __CREATE_IMAGE2D_ERR +#undef __CREATE_IMAGE3D_ERR +#undef __CREATE_SAMPLER_ERR +#undef __CREATE_SAMPLER_PROPERTY_ERR +#undef __SET_MEM_OBJECT_DESTRUCTOR_CALLBACK_ERR + +#undef __CREATE_USER_EVENT_ERR +#undef __SET_USER_EVENT_STATUS_ERR +#undef __SET_EVENT_CALLBACK_ERR +#undef __SET_PRINTF_CALLBACK_ERR + +#undef __WAIT_FOR_EVENTS_ERR + +#undef __CREATE_KERNEL_ERR +#undef __SET_KERNEL_ARGS_ERR +#undef __CREATE_PROGRAM_WITH_SOURCE_ERR +#undef __CREATE_PROGRAM_WITH_BINARY_ERR +#undef __CREATE_PROGRAM_WITH_BUILT_IN_KERNELS_ERR +#undef __BUILD_PROGRAM_ERR +#undef __CREATE_KERNELS_IN_PROGRAM_ERR + +#undef __CREATE_COMMAND_QUEUE_ERR +#undef __CREATE_COMMAND_QUEUE_PROPERTY_ERR +#undef __SET_COMMAND_QUEUE_PROPERTY_ERR +#undef __ENQUEUE_READ_BUFFER_ERR +#undef __ENQUEUE_WRITE_BUFFER_ERR +#undef __ENQUEUE_READ_BUFFER_RECT_ERR +#undef __ENQUEUE_WRITE_BUFFER_RECT_ERR +#undef __ENQEUE_COPY_BUFFER_ERR +#undef __ENQEUE_COPY_BUFFER_RECT_ERR +#undef __ENQUEUE_READ_IMAGE_ERR +#undef __ENQUEUE_WRITE_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_ERR +#undef __ENQUEUE_COPY_IMAGE_TO_BUFFER_ERR +#undef __ENQUEUE_COPY_BUFFER_TO_IMAGE_ERR +#undef __ENQUEUE_MAP_BUFFER_ERR +#undef __ENQUEUE_MAP_IMAGE_ERR +#undef __ENQUEUE_UNMAP_MEM_OBJECT_ERR +#undef __ENQUEUE_NDRANGE_KERNEL_ERR +#undef __ENQUEUE_TASK_ERR +#undef __ENQUEUE_NATIVE_KERNEL + +#undef __CL_EXPLICIT_CONSTRUCTORS + +#undef __UNLOAD_COMPILER_ERR +#endif //__CL_USER_OVERRIDE_ERROR_STRINGS + +#undef __CL_FUNCTION_TYPE + +// Extensions +/** + * Deprecated APIs for 1.2 + */ +#if defined(CL_VERSION_1_1) +#undef __INIT_CL_EXT_FCN_PTR +#endif // #if defined(CL_VERSION_1_1) +#undef __CREATE_SUB_DEVICES + +#if defined(USE_CL_DEVICE_FISSION) +#undef __PARAM_NAME_DEVICE_FISSION +#endif // USE_CL_DEVICE_FISSION + +#undef __DEFAULT_NOT_INITIALIZED +#undef __DEFAULT_BEING_INITIALIZED +#undef __DEFAULT_INITIALIZED + +} // namespace cl + +#ifdef _WIN32 +#pragma pop_macro("max") +#endif // _WIN32 + +#endif // CL_HPP_ diff --git a/ext/cudart/include/CL/cl_d3d10.h 
b/ext/cudart/include/CL/cl_d3d10.h new file mode 100644 index 0000000000000000000000000000000000000000..89f4bfba1f46d894289a885984c309f0f99f1165 --- /dev/null +++ b/ext/cudart/include/CL/cl_d3d10.h @@ -0,0 +1,129 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +#ifndef __OPENCL_CL_D3D10_H +#define __OPENCL_CL_D3D10_H + +#if defined(_MSC_VER) +#if _MSC_VER >=1500 +#pragma warning( push ) +#pragma warning( disable : 4201 ) +#endif +#endif +#include <d3d10.h> +#if defined(_MSC_VER) +#if _MSC_VER >=1500 +#pragma warning( pop ) +#endif +#endif + +#include <CL/cl.h> +#include <CL/cl_platform.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d10_sharing */ +#define cl_khr_d3d10_sharing 1 + +typedef cl_uint cl_d3d10_device_source_khr; +typedef cl_uint cl_d3d10_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D10_DEVICE_KHR -1002 +#define CL_INVALID_D3D10_RESOURCE_KHR -1003 +#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004 +#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005 + +/* cl_d3d10_device_source_nv */ +#define CL_D3D10_DEVICE_KHR 0x4010 +#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011 + +/* cl_d3d10_device_set_nv */ +#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012 +#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013 + +/* cl_context_info */ +#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014 +#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C + +/* cl_mem_info */ +#define CL_MEM_D3D10_RESOURCE_KHR 0x4015 + +/* cl_image_info */ +#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016 + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017 +#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)( + cl_platform_id platform, + cl_d3d10_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture3D * resource, + UINT subresource, + cl_int * 
errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D10_H */ + diff --git a/ext/cudart/include/CL/cl_d3d10_ext.h b/ext/cudart/include/CL/cl_d3d10_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..acbc1b52d984df54ee0b76acf45f46e78c6e8dbf --- /dev/null +++ b/ext/cudart/include/CL/cl_d3d10_ext.h @@ -0,0 +1,122 @@ +/********************************************************************************** + * Copyright (c) 2008-2009 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +#ifndef __OPENCL_CL_D3D10_EXT_H +#define __OPENCL_CL_D3D10_EXT_H + +#include <d3d10.h> +#include <CL/cl.h> +#include <CL/cl_platform.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_nv_d3d10_sharing */ + +typedef cl_uint cl_d3d10_device_source_nv; +typedef cl_uint cl_d3d10_device_set_nv; + +/******************************************************************************/ + +// Error Codes +#define CL_INVALID_D3D10_DEVICE_NV -1002 +#define CL_INVALID_D3D10_RESOURCE_NV -1003 +#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_NV -1004 +#define CL_D3D10_RESOURCE_NOT_ACQUIRED_NV -1005 + +// cl_d3d10_device_source_nv +#define CL_D3D10_DEVICE_NV 0x4010 +#define CL_D3D10_DXGI_ADAPTER_NV 0x4011 + +// cl_d3d10_device_set_nv +#define CL_PREFERRED_DEVICES_FOR_D3D10_NV 0x4012 +#define CL_ALL_DEVICES_FOR_D3D10_NV 0x4013 + +// cl_context_info +#define CL_CONTEXT_D3D10_DEVICE_NV 0x4014 + +// cl_mem_info +#define CL_MEM_D3D10_RESOURCE_NV 0x4015 + +// cl_image_info +#define CL_IMAGE_D3D10_SUBRESOURCE_NV 0x4016 + +// cl_command_type +#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_NV 0x4017 +#define CL_COMMAND_RELEASE_D3D10_OBJECTS_NV 0x4018 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10NV_fn)( + cl_platform_id platform, + cl_d3d10_device_source_nv d3d_device_source, + void * d3d_object, + cl_d3d10_device_set_nv d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferNV_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DNV_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DNV_fn)( + cl_context context, + cl_mem_flags flags, + ID3D10Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsNV_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsNV_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} +#endif + +#endif // __OPENCL_CL_D3D10_H + diff --git a/ext/cudart/include/CL/cl_d3d11.h b/ext/cudart/include/CL/cl_d3d11.h new file mode 100644 index 0000000000000000000000000000000000000000..10023dde0e3715d0f973eed3b7e6e0a397382fd2 --- /dev/null +++ b/ext/cudart/include/CL/cl_d3d11.h @@ -0,0 +1,128 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +#ifndef __OPENCL_CL_D3D11_H +#define __OPENCL_CL_D3D11_H + +#if defined(_MSC_VER) +#if _MSC_VER >=1500 +#pragma warning( push ) +#pragma warning( disable : 4201 ) +#endif +#endif +#include <d3d11.h> +#if defined(_MSC_VER) +#if _MSC_VER >=1500 +#pragma warning( pop ) +#endif +#endif +#include <CL/cl.h> +#include <CL/cl_platform.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_khr_d3d11_sharing */ +#define cl_khr_d3d11_sharing 1 + +typedef cl_uint cl_d3d11_device_source_khr; +typedef cl_uint cl_d3d11_device_set_khr; + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_D3D11_DEVICE_KHR -1006 +#define CL_INVALID_D3D11_RESOURCE_KHR -1007 +#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008 +#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009 + +/* cl_d3d11_device_source */ +#define CL_D3D11_DEVICE_KHR 0x4019 +#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A + +/* cl_d3d11_device_set */ +#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B +#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C + +/* cl_context_info */ +#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D +#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D + +/* cl_mem_info */ +#define CL_MEM_D3D11_RESOURCE_KHR 0x401E + +/* cl_image_info */ +#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020 +#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)( + cl_platform_id platform, + cl_d3d11_device_source_khr d3d_device_source, + void * d3d_object, + cl_d3d11_device_set_khr d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Buffer * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, 
+ cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_D3D11_H */ + diff --git a/ext/cudart/include/CL/cl_d3d11_ext.h b/ext/cudart/include/CL/cl_d3d11_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..99b2c5bda5049a51d6bd32225308a081d1ed696b --- /dev/null +++ b/ext/cudart/include/CL/cl_d3d11_ext.h @@ -0,0 +1,122 @@ +/********************************************************************************** + * Copyright (c) 2008-2009 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. + **********************************************************************************/ + +#ifndef __OPENCL_CL_D3D11_EXT_H +#define __OPENCL_CL_D3D11_EXT_H + +#include <d3d11.h> +#include <CL/cl.h> +#include <CL/cl_platform.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_nv_d3d11_sharing */ + +typedef cl_uint cl_d3d11_device_source_nv; +typedef cl_uint cl_d3d11_device_set_nv; + +/******************************************************************************/ + +// Error Codes +#define CL_INVALID_D3D11_DEVICE_NV -1006 +#define CL_INVALID_D3D11_RESOURCE_NV -1007 +#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_NV -1008 +#define CL_D3D11_RESOURCE_NOT_ACQUIRED_NV -1009 + +// cl_d3d11_device_source_nv +#define CL_D3D11_DEVICE_NV 0x4019 +#define CL_D3D11_DXGI_ADAPTER_NV 0x401A + +// cl_d3d11_device_set_nv +#define CL_PREFERRED_DEVICES_FOR_D3D11_NV 0x401B +#define CL_ALL_DEVICES_FOR_D3D11_NV 0x401C + +// cl_context_info +#define CL_CONTEXT_D3D11_DEVICE_NV 0x401D + +// cl_mem_info +#define CL_MEM_D3D11_RESOURCE_NV 0x401E + +// cl_image_info +#define CL_IMAGE_D3D11_SUBRESOURCE_NV 0x401F + +// cl_command_type +#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_NV 0x4020 +#define CL_COMMAND_RELEASE_D3D11_OBJECTS_NV 0x4021 + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11NV_fn)( + cl_platform_id platform, + cl_d3d11_device_source_nv d3d_device_source, + void * d3d_object, + cl_d3d11_device_set_nv d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferNV_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Buffer * 
resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DNV_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture2D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DNV_fn)( + cl_context context, + cl_mem_flags flags, + ID3D11Texture3D * resource, + UINT subresource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsNV_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsNV_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} +#endif + +#endif // __OPENCL_CL_D3D11_H + diff --git a/ext/cudart/include/CL/cl_d3d9_ext.h b/ext/cudart/include/CL/cl_d3d9_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..ab8d6cf047d0179f78dff78264d1133fce09774a --- /dev/null +++ b/ext/cudart/include/CL/cl_d3d9_ext.h @@ -0,0 +1,143 @@ +/********************************************************************************** + * Copyright (c) 2008-2009 The Khronos Group Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and/or associated documentation files (the + * "Materials"), to deal in the Materials without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Materials, and to + * permit persons to whom the Materials are furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included + * in all copies or substantial portions of the Materials. + * + * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS. 
+ **********************************************************************************/ + +#ifndef __OPENCL_CL_D3D9_EXT_H +#define __OPENCL_CL_D3D9_EXT_H + +#include <d3d9.h> +#include <CL/cl.h> +#include <CL/cl_platform.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/****************************************************************************** + * cl_nv_d3d9_sharing */ + +typedef cl_uint cl_d3d9_device_source_nv; +typedef cl_uint cl_d3d9_device_set_nv; + +/******************************************************************************/ + +// Error Codes +#define CL_INVALID_D3D9_DEVICE_NV -1010 +#define CL_INVALID_D3D9_RESOURCE_NV -1011 +#define CL_D3D9_RESOURCE_ALREADY_ACQUIRED_NV -1012 +#define CL_D3D9_RESOURCE_NOT_ACQUIRED_NV -1013 + +// cl_d3d9_device_source_nv +#define CL_D3D9_DEVICE_NV 0x4022 +#define CL_D3D9_ADAPTER_NAME_NV 0x4023 + +// cl_d3d9_device_set_nv +#define CL_PREFERRED_DEVICES_FOR_D3D9_NV 0x4024 +#define CL_ALL_DEVICES_FOR_D3D9_NV 0x4025 + +// cl_context_info +#define CL_CONTEXT_D3D9_DEVICE_NV 0x4026 + +// cl_mem_info +#define CL_MEM_D3D9_RESOURCE_NV 0x4027 + +// cl_image_info +#define CL_IMAGE_D3D9_FACE_NV 0x4028 +#define CL_IMAGE_D3D9_LEVEL_NV 0x4029 + +// cl_command_type +#define CL_COMMAND_ACQUIRE_D3D9_OBJECTS_NV 0x402A +#define CL_COMMAND_RELEASE_D3D9_OBJECTS_NV 0x402B + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D9NV_fn)( + cl_platform_id platform, + cl_d3d9_device_source_nv d3d_device_source, + void * d3d_object, + cl_d3d9_device_set_nv d3d_device_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VertexBufferNV_fn)( + cl_context context, + cl_mem_flags flags, + IDirect3DVertexBuffer9 * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9IndexBufferNV_fn)( + cl_context context, + cl_mem_flags flags, + IDirect3DIndexBuffer9 * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9SurfaceNV_fn)( + cl_context context, + cl_mem_flags flags, + IDirect3DSurface9 * resource, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9TextureNV_fn)( + cl_context context, + cl_mem_flags flags, + IDirect3DTexture9 *resource, + UINT miplevel, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9CubeTextureNV_fn)( + cl_context context, + cl_mem_flags flags, + IDirect3DCubeTexture9 * resource, + D3DCUBEMAP_FACES facetype, + UINT miplevel, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D9VolumeTextureNV_fn)( + cl_context context, + cl_mem_flags flags, + IDirect3DVolumeTexture9 * resource, + UINT miplevel, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D9ObjectsNV_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem *mem_objects, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D9ObjectsNV_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + cl_mem *mem_objects, + cl_uint num_events_in_wait_list, + const cl_event 
*event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_0; + +#ifdef __cplusplus +} +#endif + +#endif // __OPENCL_CL_D3D9_H + diff --git a/ext/cudart/include/CL/cl_dx9_media_sharing.h b/ext/cudart/include/CL/cl_dx9_media_sharing.h new file mode 100644 index 0000000000000000000000000000000000000000..048937005353a8fb8c4fee624d3f36b9c894fe3e --- /dev/null +++ b/ext/cudart/include/CL/cl_dx9_media_sharing.h @@ -0,0 +1,118 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H +#define __OPENCL_CL_DX9_MEDIA_SHARING_H + +#include <CL/cl.h> +#include <CL/cl_platform.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/******************************************************************************/ +/* cl_khr_dx9_media_sharing */ +#define cl_khr_dx9_media_sharing 1 + +typedef cl_uint cl_dx9_media_adapter_type_khr; +typedef cl_uint cl_dx9_media_adapter_set_khr; + +#if defined(_WIN32) +#include <d3d9.h> +typedef struct _cl_dx9_surface_info_khr +{ + IDirect3DSurface9 *resource; + HANDLE shared_handle; +} cl_dx9_surface_info_khr; +#endif + + +/******************************************************************************/ + +/* Error Codes */ +#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010 +#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011 +#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012 +#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013 + +/* cl_media_adapter_type_khr */ +#define CL_ADAPTER_D3D9_KHR 0x2020 +#define CL_ADAPTER_D3D9EX_KHR 0x2021 +#define CL_ADAPTER_DXVA_KHR 0x2022 + +/* cl_media_adapter_set_khr */ +#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023 +#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024 + +/* cl_context_info */ +#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025 +#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026 +#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027 + +/* cl_mem_info */ +#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028 +#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029 + +/* cl_image_info */ +#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A + +/* cl_command_type */ +#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B +#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C + +/******************************************************************************/ + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)( + cl_platform_id platform, + cl_uint num_media_adapters, + cl_dx9_media_adapter_type_khr * media_adapter_type, + void * media_adapters, + cl_dx9_media_adapter_set_khr media_adapter_set, + cl_uint num_entries, + cl_device_id * devices, + cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)( + cl_context context, + cl_mem_flags flags, + cl_dx9_media_adapter_type_khr adapter_type, + void 
* surface_info, + cl_uint plane, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_2; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */ + diff --git a/ext/cudart/include/CL/cl_egl.h b/ext/cudart/include/CL/cl_egl.h new file mode 100644 index 0000000000000000000000000000000000000000..6fb3721f12533925ba6ca3737eb48bd864aae444 --- /dev/null +++ b/ext/cudart/include/CL/cl_egl.h @@ -0,0 +1,123 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +#ifndef __OPENCL_CL_EGL_H +#define __OPENCL_CL_EGL_H + +#ifdef __APPLE__ +#else +#include <CL/cl.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */ +#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F +#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D +#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E + +/* Error type for clCreateFromEGLImageKHR */ +#define CL_INVALID_EGL_OBJECT_KHR -1093 +#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092 + +/* CLeglImageKHR is an opaque handle to an EGLImage */ +typedef void* CLeglImageKHR; + +/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */ +typedef void* CLeglDisplayKHR; + +/* CLeglSyncKHR is an opaque handle to an EGLSync object */ +typedef void* CLeglSyncKHR; + +/* properties passed to clCreateFromEGLImageKHR */ +typedef intptr_t cl_egl_image_properties_khr; + + +#define cl_khr_egl_image 1 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromEGLImageKHR(cl_context context, + CLeglDisplayKHR egldisplay, + CLeglImageKHR eglimage, + cl_mem_flags flags, + const cl_egl_image_properties_khr * properties, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)( + cl_context context, + CLeglDisplayKHR egldisplay, + CLeglImageKHR eglimage, + cl_mem_flags flags, + const cl_egl_image_properties_khr * properties, + cl_int * errcode_ret); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int 
(CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)( + cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +#define cl_khr_egl_event 1 + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromEGLSyncKHR(cl_context context, + CLeglSyncKHR sync, + CLeglDisplayKHR display, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)( + cl_context context, + CLeglSyncKHR sync, + CLeglDisplayKHR display, + cl_int * errcode_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_EGL_H */ diff --git a/ext/cudart/include/CL/cl_ext.h b/ext/cudart/include/CL/cl_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..b8b26159e64945dc54c0457c7d6acd92487bd325 --- /dev/null +++ b/ext/cudart/include/CL/cl_ext.h @@ -0,0 +1,1131 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +/* cl_ext.h contains OpenCL extensions which don't have external */ +/* (OpenGL, D3D) dependencies. */ + +#ifndef __CL_EXT_H +#define __CL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include <OpenCL/cl.h> + #include <AvailabilityMacros.h> +#else + #include <CL/cl.h> +#endif + +/* cl_khr_fp64 extension - no extension #define since it has no functions */ +/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in CL.h for OpenCL >= 120 */ +#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032 + +/* cl_khr_fp16 extension - no extension #define since it has no functions */ +#define CL_DEVICE_HALF_FP_CONFIG 0x1033 + +/* Memory object destruction + * + * Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR + * + * Registers a user callback function that will be called when the memory object is deleted and its resources + * freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback + * stack associated with memobj. The registered user callback functions are called in the reverse order in + * which they were registered. The user callback functions are called and then the memory object is deleted + * and its resources freed. 
This provides a mechanism for the application (and libraries) using memobj to be + * notified when the memory referenced by host_ptr, specified when the memory object is created and used as + * the storage bits for the memory object, can be reused or freed. + * + * The application may not call CL api's with the cl_mem object passed to the pfn_notify. + * + * Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + */ +#define cl_APPLE_SetMemObjectDestructor 1 +cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem memobj, + void (* pfn_notify)(cl_mem memobj, void * user_data), + void * user_data) CL_EXT_SUFFIX__VERSION_1_0; + + +/* Context Logging Functions + * + * The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext(). + * Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS) + * before using. + * + * clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger + */ +#define cl_APPLE_ContextLoggingFunctions 1 +extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * errstr, + const void * private_info, + size_t cb, + void * user_data) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */ +extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * errstr, + const void * private_info, + size_t cb, + void * user_data) CL_EXT_SUFFIX__VERSION_1_0; + +/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */ +extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * errstr, + const void * private_info, + size_t cb, + void * user_data) CL_EXT_SUFFIX__VERSION_1_0; + + +/************************ +* cl_khr_icd extension * +************************/ +#define cl_khr_icd 1 + +/* cl_platform_info */ +#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920 + +/* Additional Error Codes */ +#define CL_PLATFORM_NOT_FOUND_KHR -1001 + +extern CL_API_ENTRY cl_int CL_API_CALL +clIcdGetPlatformIDsKHR(cl_uint num_entries, + cl_platform_id * platforms, + cl_uint * num_platforms); + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint num_entries, + cl_platform_id * platforms, + cl_uint * num_platforms); + + +/******************************* + * cl_khr_il_program extension * + *******************************/ +#define cl_khr_il_program 1 + +/* New property to clGetDeviceInfo for retrieving supported intermediate + * languages + */ +#define CL_DEVICE_IL_VERSION_KHR 0x105B + +/* New property to clGetProgramInfo for retrieving for retrieving the IL of a + * program + */ +#define CL_PROGRAM_IL_KHR 0x1169 + +extern CL_API_ENTRY cl_program CL_API_CALL +clCreateProgramWithILKHR(cl_context context, + const void * il, + size_t length, + cl_int * errcode_ret); + +typedef CL_API_ENTRY cl_program +(CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context context, + const void * il, + size_t length, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; + +/* Extension: cl_khr_image2d_from_buffer + * + * This extension allows a 2D image to be created from a cl_mem buffer without + * a copy. The type associated with a 2D image created from a buffer in an + * OpenCL program is image2d_t. Both the sampler and sampler-less read_image + * built-in functions are supported for 2D images and 2D images created from + * a buffer. 
Similarly, the write_image built-ins are also supported for 2D + * images created from a buffer. + * + * When the 2D image from buffer is created, the client must specify the + * width, height, image format (i.e. channel order and channel data type) + * and optionally the row pitch. + * + * The pitch specified must be a multiple of + * CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels. + * The base address of the buffer must be aligned to + * CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels. + */ + +#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR 0x104A +#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR 0x104B + + +/************************************** + * cl_khr_initialize_memory extension * + **************************************/ + +#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030 + + +/************************************** + * cl_khr_terminate_context extension * + **************************************/ + +#define CL_CONTEXT_TERMINATED_KHR -1121 + +#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031 +#define CL_CONTEXT_TERMINATE_KHR 0x2032 + +#define cl_khr_terminate_context 1 +extern CL_API_ENTRY cl_int CL_API_CALL +clTerminateContextKHR(cl_context context) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_EXT_SUFFIX__VERSION_1_2; + + +/* + * Extension: cl_khr_spir + * + * This extension adds support to create an OpenCL program object from a + * Standard Portable Intermediate Representation (SPIR) instance + */ + +#define CL_DEVICE_SPIR_VERSIONS 0x40E0 +#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1 + + +/***************************************** + * cl_khr_create_command_queue extension * + *****************************************/ +#define cl_khr_create_command_queue 1 + +typedef cl_properties cl_queue_properties_khr; + +extern CL_API_ENTRY cl_command_queue CL_API_CALL +clCreateCommandQueueWithPropertiesKHR(cl_context context, + cl_device_id device, + const cl_queue_properties_khr* properties, + cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; + +typedef CL_API_ENTRY cl_command_queue +(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context, + cl_device_id device, + const cl_queue_properties_khr* properties, + cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2; + + +/****************************************** +* cl_khr_semaphore extension * +******************************************/ + +typedef enum _cl_semaphore_type__enum { + CL_SEMAPHORE_TYPE_BINARY_KHR = 1, +} cl_semaphore_type; + +typedef cl_properties cl_semaphore_properties_khr; + +typedef cl_uint cl_semaphore_info_khr; + +typedef struct _cl_semaphore* cl_semaphore_khr; +typedef cl_ulong cl_semaphore_payload_khr; + +extern CL_API_ENTRY cl_semaphore_khr CL_API_CALL +clCreateSemaphoreWithPropertiesKHR(cl_context context, + cl_semaphore_properties_khr *sema_props, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueWaitSemaphoresKHR(cl_command_queue command_queue, + cl_uint num_sema_objects, + const cl_semaphore_khr *sema_objects, + const cl_semaphore_payload_khr *sema_payload_list, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSignalSemaphoresKHR(cl_command_queue command_queue, + cl_uint num_sema_objects, + const cl_semaphore_khr *sema_objects, + const cl_semaphore_payload_khr *sema_payload_list, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + 
cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSemaphoreInfoKHR(const cl_semaphore_khr sema_object, + cl_semaphore_info_khr param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSemaphoreKHR(cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainSemaphoreKHR(cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseSemaphoreObjectKHR(cl_semaphore_khr sema_object) CL_API_SUFFIX__VERSION_1_2; + +#define CL_COMMAND_SEMAPHORE_WAIT_KHR 0x2042 +#define CL_COMMAND_SEMAPHORE_SIGNAL_KHR 0x2043 +#define CL_SEMAPHORE_CONTEXT_KHR 0x2039 +#define CL_SEMAPHORE_REFERENCE_COUNT_KHR 0x203A +#define CL_SEMAPHORE_PROPERTIES_KHR 0x203B +#define CL_SEMAPHORE_TYPE_KHR 0x203D +#define CL_PLATFORM_SEMAPHORE_TYPES_KHR 0x2036 +#define CL_SEMAPHORE_PAYLOAD_KHR 0x203C + +/****************************************** +* cl_khr_external_semaphore extension * +******************************************/ + +typedef enum _cl_external_context_type_enum { + CL_EXTERNAL_CONTEXT_TYPE_NONE = 0, + CL_EXTERNAL_CONTEXT_TYPE_CL = 1, + CL_EXTERNAL_CONTEXT_TYPE_VULKAN = 2, +} cl_external_context_type_khr; + +typedef cl_uint cl_external_semaphore_handle_type_khr; +// API-agnostic semaphore handles are defined here in this spec. +#define CL_SEMAPHORE_HANDLE_OPAQUE_FD_KHR 0x2055 +#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR 0x2056 +#define CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR 0x2057 + +typedef struct _cl_semaphore_desc_khr_st { + cl_external_semaphore_handle_type_khr type; + void *handle_ptr; +} cl_semaphore_desc_khr; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetSemaphoreHandleForTypeKHR(const cl_semaphore_khr sema_object, + const cl_device_id device, + cl_external_semaphore_handle_type_khr handle_type, + size_t handle_size, + void *handle_ptr, + size_t *handle_size_ret) CL_API_SUFFIX__VERSION_1_2; + +#define CL_SEMAPHORE_DESC_KHR 0x2460 +#define CL_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR 0x203F + +#define CL_PLATFORM_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR 0x2037 +#define CL_PLATFORM_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR 0x2038 + +#define CL_DEVICE_SEMAPHORE_IMPORT_HANDLE_TYPES_KHR 0x204D +#define CL_DEVICE_SEMAPHORE_EXPORT_HANDLE_TYPES_KHR 0x204E + +// error codes +#define CL_INVALID_SEMAPHORE_KHR -1142 + +/****************************************** +* cl_khr_external_memory extension * +******************************************/ + +typedef cl_uint cl_external_context_info; + +typedef enum _cl_external_context_type_enum cl_external_context_type_khr; + +typedef cl_properties cl_mem_properties_khr; + +typedef cl_uint cl_external_mem_handle_type_khr; +// API-agnostic memory handles are defined here in this spec. 
+#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_FD_KHR 0x2060 +#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KHR 0x2061 +#define CL_EXTERNAL_MEMORY_HANDLE_OPAQUE_WIN32_KMT_KHR 0x2062 + +typedef struct _cl_external_mem_desc_khr_st { + cl_external_mem_handle_type_khr type; + void *handle_ptr; + size_t offset; + unsigned long long size; +} cl_external_mem_desc_khr; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetExternalContextInfoKHR(const cl_context_properties *properties, + cl_external_context_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireExternalMemObjectsKHR(cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem *mem_objects, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseExternalMemObjectsKHR(cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem *mem_objects, + cl_uint num_events_in_wait_list, + const cl_event *event_wait_list, + cl_event *event) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateBufferFromExternalMemoryKHR(cl_context context, + const cl_mem_properties_khr* properties, + cl_mem_flags flags, + cl_external_mem_desc_khr extMem, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateImageFromExternalMemoryKHR(cl_context context, + const cl_mem_properties_khr* properties, + cl_mem_flags flags, + cl_external_mem_desc_khr extMem, + const cl_image_format *image_format, + const cl_image_desc *image_desc, + cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_semaphore_khr CL_API_CALL +clCreateFromExternalSemaphoreKHR(cl_context context, + cl_semaphore_properties_khr *sema_props, + cl_semaphore_desc_khr sema_desc, + cl_int *errcode_ret) + CL_API_SUFFIX__VERSION_1_2; + +#define CL_INVALID_EXTERNAL_DEVICEGROUP_REFERENCE_KHR -1122 +#define CL_INVALID_EXT_MEM_DESC_KHR -1123 +#define CL_INVALID_EXT_MEM_HANDLE_TYPE_KHR -1148 +#define CL_INVALID_EXT_MEM_HANDLE_KHR -1149 +#define CL_INVALID_EXT_MEM_OFFSET_KHR -1150 +#define CL_INVALID_EXT_MEM_SIZE_KHR -1140 + +#define CL_CURRENT_DEVICE_FOR_EXTERNAL_CONTEXT_KHR 0x2036 +#define CL_DEVICES_FOR_EXTERNAL_CONTEXT_KHR 0x2037 +#define CL_EXTERNAL_DEVICE_KHR 0x2038 +#define CL_EXTERNAL_DEVICEGROUP_KHR 0x2039 +#define CL_EXTERNAL_CONTEXT_TYPE_KHR 0x204B +#define CL_DEVICE_HANDLE_LIST_KHR 0x2051 +#define CL_DEVICE_HANDLE_LIST_END_KHR 0x0 + + +#define CL_COMMAND_ACQUIRE_EXTERNAL_MEM_OBJECTS_KHR 0x2047 +#define CL_COMMAND_RELEASE_EXTERNAL_MEM_OBJECTS_KHR 0x2048 +#define CL_EXTERNAL_MEM_DESC_KHR 0x203C +#define CL_EXTERNAL_IMAGE_INFO_KHR 0x203D +#define CL_PLATFORM_EXTERNAL_HANDLE_TYPES_KHR 0x203E +#define CL_PLATFORM_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR 0x2044 +#define CL_DEVICE_EXTERNAL_MEMORY_IMPORT_HANDLE_TYPES_KHR 0x204F +#define CL_DEVICE_EXTERNAL_MEMORY_PROPERTIES_KHR 0x2050 + + +/****************************************** +* cl_nv_device_attribute_query extension * +******************************************/ + +/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */ +#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 +#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 +#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002 +#define CL_DEVICE_WARP_SIZE_NV 0x4003 +#define CL_DEVICE_GPU_OVERLAP_NV 0x4004 +#define 
CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005 +#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006 +#define CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV 0x4007 +#define CL_DEVICE_PCI_BUS_ID_NV 0x4008 +#define CL_DEVICE_PCI_SLOT_ID_NV 0x4009 +#define CL_DEVICE_PCI_DOMAIN_ID_NV 0x400A +#define CL_DEVICE_MAX_LOCAL_MEMORY_PER_SM_NV 0x400B +#define CL_DEVICE_UUID_KHR 0x106A +#define CL_DRIVER_UUID_KHR 0x106B +#define CL_DEVICE_LUID_VALID_KHR 0x106C +#define CL_DEVICE_LUID_KHR 0x106D +#define CL_DEVICE_NODE_MASK_KHR 0x106E +#define CL_UUID_SIZE_KHR 16 +#define CL_LUID_SIZE_KHR 8 + +/****************************************** +* cl_nv_create_buffer extension * +******************************************/ + +typedef cl_bitfield cl_mem_flags_NV; +CL_API_ENTRY cl_mem CL_API_CALL +clCreateBufferNV(cl_context context, + cl_mem_flags flags, + cl_mem_flags_NV flags_NV, + size_t size, + void *host_ptr, + cl_int *errcode_ret); + +/****************************************** +* cl_kernel_attribute_nv extension * +*******************************************/ + +typedef enum kernel_attribute_enum { + CL_KERNEL_PREFERRED_LOCAL_MEMORY_SIZE_NV = 0, /* setting preferred shared memory size */ +} cl_kernel_attribute_nv; + +CL_API_ENTRY cl_int CL_API_CALL +clSetKernelAttributeNV(cl_kernel kernel, + cl_device_id device, + cl_kernel_attribute_nv k_attr, + size_t param_value_size, + const void *param_value); + +CL_API_ENTRY cl_int CL_API_CALL +clGetKernelAttributeNV(cl_kernel kernel, + cl_device_id device, + cl_kernel_attribute_nv k_attr, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + +#define CL_MEM_LOCATION_HOST_NV (1 << 0) +#define CL_MEM_PINNED_NV (1 << 1) + +/********************************* +* cl_amd_device_attribute_query * +*********************************/ + +#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036 +#define CL_DEVICE_TOPOLOGY_AMD 0x4037 +#define CL_DEVICE_BOARD_NAME_AMD 0x4038 +#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039 +#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040 +#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041 +#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042 +#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043 +#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044 +#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045 +#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046 +#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047 +#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048 +#define CL_DEVICE_THREAD_TRACE_SUPPORTED_AMD 0x4049 +#define CL_DEVICE_GFXIP_MAJOR_AMD 0x404A +#define CL_DEVICE_GFXIP_MINOR_AMD 0x404B +#define CL_DEVICE_AVAILABLE_ASYNC_QUEUES_AMD 0x404C +#define CL_DEVICE_PREFERRED_WORK_GROUP_SIZE_AMD 0x4030 +#define CL_DEVICE_MAX_WORK_GROUP_SIZE_AMD 0x4031 +#define CL_DEVICE_PREFERRED_CONSTANT_BUFFER_SIZE_AMD 0x4033 +#define CL_DEVICE_PCIE_ID_AMD 0x4034 + + +/********************************* +* cl_arm_printf extension +*********************************/ + +#define CL_PRINTF_CALLBACK_ARM 0x40B0 +#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1 + + +/*********************************** +* cl_ext_device_fission extension +***********************************/ +#define cl_ext_device_fission 1 + +extern CL_API_ENTRY cl_int CL_API_CALL +clReleaseDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; + +extern CL_API_ENTRY cl_int CL_API_CALL +clRetainDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL 
*clRetainDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1; + +typedef cl_ulong cl_device_partition_property_ext; +extern CL_API_ENTRY cl_int CL_API_CALL +clCreateSubDevicesEXT(cl_device_id in_device, + const cl_device_partition_property_ext * properties, + cl_uint num_entries, + cl_device_id * out_devices, + cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id in_device, + const cl_device_partition_property_ext * properties, + cl_uint num_entries, + cl_device_id * out_devices, + cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1; + +/* cl_device_partition_property_ext */ +#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050 +#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051 +#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052 +#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053 + +/* clDeviceGetInfo selectors */ +#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054 +#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055 +#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056 +#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057 +#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058 + +/* error codes */ +#define CL_DEVICE_PARTITION_FAILED_EXT -1057 +#define CL_INVALID_PARTITION_COUNT_EXT -1058 +#define CL_INVALID_PARTITION_NAME_EXT -1059 + +/* CL_AFFINITY_DOMAINs */ +#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1 +#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2 +#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3 +#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4 +#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10 +#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100 + +/* cl_device_partition_property_ext list terminators */ +#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0) +#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0) +#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1) + + +/*********************************** + * cl_ext_migrate_memobject extension definitions + ***********************************/ +#define cl_ext_migrate_memobject 1 + +typedef cl_bitfield cl_mem_migration_flags_ext; + +#define CL_MIGRATE_MEM_OBJECT_HOST_EXT 0x1 + +#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueMigrateMemObjectEXT(cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem * mem_objects, + cl_mem_migration_flags_ext flags, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + +typedef CL_API_ENTRY cl_int +(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue, + cl_uint num_mem_objects, + const cl_mem * mem_objects, + cl_mem_migration_flags_ext flags, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event); + + +/********************************* +* cl_ext_cxx_for_opencl extension +*********************************/ +#define cl_ext_cxx_for_opencl 1 + +#define CL_DEVICE_CXX_FOR_OPENCL_NUMERIC_VERSION_EXT 0x4230 + +/********************************* +* cl_qcom_ext_host_ptr extension +*********************************/ +#define cl_qcom_ext_host_ptr 1 + +#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29) + +#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0 +#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1 +#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2 +#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3 +#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4 +#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5 +#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6 +#define 
CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7 + +typedef cl_uint cl_image_pitch_info_qcom; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetDeviceImageInfoQCOM(cl_device_id device, + size_t image_width, + size_t image_height, + const cl_image_format *image_format, + cl_image_pitch_info_qcom param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); + +typedef struct _cl_mem_ext_host_ptr +{ + /* Type of external memory allocation. */ + /* Legal values will be defined in layered extensions. */ + cl_uint allocation_type; + + /* Host cache policy for this external memory allocation. */ + cl_uint host_cache_policy; + +} cl_mem_ext_host_ptr; + + +/******************************************* +* cl_qcom_ext_host_ptr_iocoherent extension +********************************************/ + +/* Cache policy specifying io-coherence */ +#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9 + + +/********************************* +* cl_qcom_ion_host_ptr extension +*********************************/ + +#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8 + +typedef struct _cl_mem_ion_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */ + cl_mem_ext_host_ptr ext_host_ptr; + + /* ION file descriptor */ + int ion_filedesc; + + /* Host pointer to the ION allocated memory */ + void* ion_hostptr; + +} cl_mem_ion_host_ptr; + + +/********************************* +* cl_qcom_android_native_buffer_host_ptr extension +*********************************/ + +#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6 + +typedef struct _cl_mem_android_native_buffer_host_ptr +{ + /* Type of external memory allocation. */ + /* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. */ + cl_mem_ext_host_ptr ext_host_ptr; + + /* Virtual pointer to the android native buffer */ + void* anb_ptr; + +} cl_mem_android_native_buffer_host_ptr; + + +/****************************************** + * cl_img_yuv_image extension * + ******************************************/ + +/* Image formats used in clCreateImage */ +#define CL_NV21_IMG 0x40D0 +#define CL_YV12_IMG 0x40D1 + + +/****************************************** + * cl_img_cached_allocations extension * + ******************************************/ + +/* Flag values used by clCreateBuffer */ +#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26) +#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27) + + +/****************************************** + * cl_img_use_gralloc_ptr extension * + ******************************************/ +#define cl_img_use_gralloc_ptr 1 + +/* Flag values used by clCreateBuffer */ +#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28) + +/* To be used by clGetEventInfo: */ +#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2 +#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3 + +/* Error code from clEnqueueReleaseGrallocObjectsIMG */ +#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4 + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGrallocObjectsIMG(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGrallocObjectsIMG(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + + 
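
The cl_img_use_gralloc_ptr block above pairs a clCreateBuffer flag (CL_MEM_USE_GRALLOC_PTR_IMG) with acquire/release enqueue calls that must bracket any use of such a buffer on a command queue. Below is a minimal sketch of that bracketing pattern, not taken from this repository: it assumes `queue`, `kernel` and a gralloc-backed `buf` already exist (how the gralloc handle is obtained and passed to clCreateBuffer is platform-specific and not shown), it uses only the entry points declared above, and in practice such extension symbols are usually resolved through clGetExtensionFunctionAddressForPlatform rather than linked directly.

/* Hedged sketch: bracket kernel use of a gralloc-backed buffer with the
 * IMG acquire/release calls declared above. `queue`, `kernel` and `buf`
 * are assumed to exist already; `buf` is assumed to have been created
 * with CL_MEM_USE_GRALLOC_PTR_IMG. */
#include <CL/cl.h>
#include <CL/cl_ext.h>

static cl_int run_on_gralloc_buffer(cl_command_queue queue,
                                    cl_kernel kernel,
                                    cl_mem buf,
                                    size_t work_items)
{
    cl_event acquired = NULL;
    cl_int err;

    /* Make the externally owned memory visible to OpenCL before use. */
    err = clEnqueueAcquireGrallocObjectsIMG(queue, 1, &buf, 0, NULL, &acquired);
    if (err != CL_SUCCESS)
        return err;

    /* Arg index 0 is arbitrary here; it is whatever slot the kernel expects. */
    err = clSetKernelArg(kernel, 0, sizeof(buf), &buf);
    if (err == CL_SUCCESS)
        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                                     &work_items, NULL, 1, &acquired, NULL);

    /* Hand the memory back regardless of whether the kernel was enqueued. */
    cl_int rel = clEnqueueReleaseGrallocObjectsIMG(queue, 1, &buf, 0, NULL, NULL);
    clReleaseEvent(acquired);
    return (err != CL_SUCCESS) ? err : rel;
}
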
+/********************************* +* cl_khr_subgroups extension +*********************************/ +#define cl_khr_subgroups 1 + +#if !defined(CL_VERSION_2_1) +/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in CL.h. + In hindsight, there should have been a khr suffix on this type for + the extension, but keeping it un-suffixed to maintain backwards + compatibility. */ +typedef cl_uint cl_kernel_sub_group_info; +#endif + +/* cl_kernel_sub_group_info */ +#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033 +#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034 + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetKernelSubGroupInfoKHR(cl_kernel in_kernel, + cl_device_id in_device, + cl_kernel_sub_group_info param_name, + size_t input_value_size, + const void * input_value, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; + +typedef CL_API_ENTRY cl_int +(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel in_kernel, + cl_device_id in_device, + cl_kernel_sub_group_info param_name, + size_t input_value_size, + const void * input_value, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED; + + +/********************************* +* cl_khr_mipmap_image extension +*********************************/ + +/* cl_sampler_properties */ +#define CL_SAMPLER_MIP_FILTER_MODE_KHR 0x1155 +#define CL_SAMPLER_LOD_MIN_KHR 0x1156 +#define CL_SAMPLER_LOD_MAX_KHR 0x1157 + + +/********************************* +* cl_khr_priority_hints extension +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. */ +#define cl_khr_priority_hints 1 + +typedef cl_uint cl_queue_priority_khr; + +/* cl_command_queue_properties */ +#define CL_QUEUE_PRIORITY_KHR 0x1096 + +/* cl_queue_priority_khr */ +#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0) +#define CL_QUEUE_PRIORITY_MED_KHR (1<<1) +#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2) + + +/********************************* +* cl_khr_throttle_hints extension +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. */ +#define cl_khr_throttle_hints 1 + +typedef cl_uint cl_queue_throttle_khr; + +/* cl_command_queue_properties */ +#define CL_QUEUE_THROTTLE_KHR 0x1097 + +/* cl_queue_throttle_khr */ +#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0) +#define CL_QUEUE_THROTTLE_MED_KHR (1<<1) +#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2) + + +/********************************* +* cl_khr_subgroup_named_barrier +*********************************/ +/* This extension define is for backwards compatibility. + It shouldn't be required since this extension has no new functions. 
*/ +#define cl_khr_subgroup_named_barrier 1 + +/* cl_device_info */ +#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035 + + +/********************************* +* cl_khr_extended_versioning +*********************************/ + +#define cl_khr_extended_versioning 1 + +#define CL_VERSION_MAJOR_BITS_KHR (10) +#define CL_VERSION_MINOR_BITS_KHR (10) +#define CL_VERSION_PATCH_BITS_KHR (12) + +#define CL_VERSION_MAJOR_MASK_KHR ((1 << CL_VERSION_MAJOR_BITS_KHR) - 1) +#define CL_VERSION_MINOR_MASK_KHR ((1 << CL_VERSION_MINOR_BITS_KHR) - 1) +#define CL_VERSION_PATCH_MASK_KHR ((1 << CL_VERSION_PATCH_BITS_KHR) - 1) + +#define CL_VERSION_MAJOR_KHR(version) ((version) >> (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) +#define CL_VERSION_MINOR_KHR(version) (((version) >> CL_VERSION_PATCH_BITS_KHR) & CL_VERSION_MINOR_MASK_KHR) +#define CL_VERSION_PATCH_KHR(version) ((version) & CL_VERSION_PATCH_MASK_KHR) + +#define CL_MAKE_VERSION_KHR(major, minor, patch) \ + ((((major) & CL_VERSION_MAJOR_MASK_KHR) << (CL_VERSION_MINOR_BITS_KHR + CL_VERSION_PATCH_BITS_KHR)) | \ + (((minor) & CL_VERSION_MINOR_MASK_KHR) << CL_VERSION_PATCH_BITS_KHR) | \ + ((patch) & CL_VERSION_PATCH_MASK_KHR)) + +typedef cl_uint cl_version_khr; + +#define CL_NAME_VERSION_MAX_NAME_SIZE_KHR 64 + +typedef struct _cl_name_version_khr +{ + cl_version_khr version; + char name[CL_NAME_VERSION_MAX_NAME_SIZE_KHR]; +} cl_name_version_khr; + +/* cl_platform_info */ +#define CL_PLATFORM_NUMERIC_VERSION_KHR 0x0906 +#define CL_PLATFORM_EXTENSIONS_WITH_VERSION_KHR 0x0907 + +/* cl_device_info */ +#define CL_DEVICE_NUMERIC_VERSION_KHR 0x105E +#define CL_DEVICE_OPENCL_C_NUMERIC_VERSION_KHR 0x105F +#define CL_DEVICE_EXTENSIONS_WITH_VERSION_KHR 0x1060 +#define CL_DEVICE_ILS_WITH_VERSION_KHR 0x1061 +#define CL_DEVICE_BUILT_IN_KERNELS_WITH_VERSION_KHR 0x1062 + + +/********************************* +* cl_khr_device_uuid extension +*********************************/ +#define cl_khr_device_uuid 1 + +#define CL_UUID_SIZE_KHR 16 +#define CL_LUID_SIZE_KHR 8 + +#define CL_DEVICE_UUID_KHR 0x106A +#define CL_DRIVER_UUID_KHR 0x106B +#define CL_DEVICE_LUID_VALID_KHR 0x106C +#define CL_DEVICE_LUID_KHR 0x106D +#define CL_DEVICE_NODE_MASK_KHR 0x106E + +/********************************** + * cl_khr_pci_bus_info extension * + **********************************/ +#define cl_khr_pci_bus_info 1 + +#define CL_DEVICE_PCI_BUS_INFO_KHR 0x410F + +typedef struct _cl_device_pci_bus_info_khr { + cl_uint pci_domain; + cl_uint pci_bus; + cl_uint pci_device; + cl_uint pci_function; +} cl_device_pci_bus_info_khr; + +/********************************** + * cl_arm_import_memory extension * + **********************************/ +#define cl_arm_import_memory 1 + +typedef intptr_t cl_import_properties_arm; + +/* Default and valid proporties name for cl_arm_import_memory */ +#define CL_IMPORT_TYPE_ARM 0x40B2 + +/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_HOST_ARM 0x40B3 + +/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4 + +/* Protected memory property */ +#define CL_IMPORT_TYPE_PROTECTED_ARM 0x40B5 + +/* Android hardware buffer type value for CL_IMPORT_TYPE_ARM property */ +#define CL_IMPORT_TYPE_ANDROID_HARDWARE_BUFFER_ARM 0x41E2 + +/* Data consistency with host property */ +#define CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM 0x41E3 + +/* Import memory size value to indicate a size for the whole buffer */ +#define CL_IMPORT_MEMORY_WHOLE_ALLOCATION_ARM SIZE_MAX + 
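
The cl_khr_extended_versioning macros defined earlier in this header pack an OpenCL version into a single cl_version_khr as 10 major, 10 minor and 12 patch bits, so CL_MAKE_VERSION_KHR(3, 0, 0) evaluates to 0x00C00000 and the CL_VERSION_*_KHR helpers reverse the packing. A hedged sketch of decoding a device's packed version follows; it assumes `device` belongs to an implementation that reports the extension, otherwise the query simply returns an error.

/* Hedged sketch: decode the packed device version exposed by
 * cl_khr_extended_versioning, using only the tokens and macros declared
 * above. `device` is assumed to come from a driver that supports the
 * extension. */
#include <stdio.h>
#include <CL/cl.h>
#include <CL/cl_ext.h>

static void print_numeric_version(cl_device_id device)
{
    cl_version_khr v = 0;
    cl_int err = clGetDeviceInfo(device, CL_DEVICE_NUMERIC_VERSION_KHR,
                                 sizeof(v), &v, NULL);
    if (err != CL_SUCCESS) {
        fprintf(stderr, "CL_DEVICE_NUMERIC_VERSION_KHR not available (%d)\n", err);
        return;
    }

    /* e.g. CL_MAKE_VERSION_KHR(3, 0, 0) == 0x00C00000u, which decodes back
     * to major 3, minor 0, patch 0. */
    printf("OpenCL %u.%u (patch %u)\n",
           CL_VERSION_MAJOR_KHR(v),
           CL_VERSION_MINOR_KHR(v),
           CL_VERSION_PATCH_KHR(v));
}
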
+/* This extension adds a new function that allows for direct memory import into + * OpenCL via the clImportMemoryARM function. + * + * Memory imported through this interface will be mapped into the device's page + * tables directly, providing zero copy access. It will never fall back to copy + * operations and aliased buffers. + * + * Types of memory supported for import are specified as additional extension + * strings. + * + * This extension produces cl_mem allocations which are compatible with all other + * users of cl_mem in the standard API. + * + * This extension maps pages with the same properties as the normal buffer creation + * function clCreateBuffer. + */ +extern CL_API_ENTRY cl_mem CL_API_CALL +clImportMemoryARM( cl_context context, + cl_mem_flags flags, + const cl_import_properties_arm *properties, + void *memory, + size_t size, + cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0; + + +/****************************************** + * cl_arm_shared_virtual_memory extension * + ******************************************/ +#define cl_arm_shared_virtual_memory 1 + +/* Used by clGetDeviceInfo */ +#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6 + +/* Used by clGetMemObjectInfo */ +#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7 + +/* Used by clSetKernelExecInfoARM: */ +#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8 +#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9 + +/* To be used by clGetEventInfo: */ +#define CL_COMMAND_SVM_FREE_ARM 0x40BA +#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB +#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC +#define CL_COMMAND_SVM_MAP_ARM 0x40BD +#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE + +/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */ +#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0) +#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1) +#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2) +#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3) + +/* Flag values used by clSVMAllocARM: */ +#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10) +#define CL_MEM_SVM_ATOMICS_ARM (1 << 11) + +typedef cl_bitfield cl_svm_mem_flags_arm; +typedef cl_uint cl_kernel_exec_info_arm; +typedef cl_bitfield cl_device_svm_capabilities_arm; + +extern CL_API_ENTRY void * CL_API_CALL +clSVMAllocARM(cl_context context, + cl_svm_mem_flags_arm flags, + size_t size, + cl_uint alignment) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY void CL_API_CALL +clSVMFreeARM(cl_context context, + void * svm_pointer) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMFreeARM(cl_command_queue command_queue, + cl_uint num_svm_pointers, + void * svm_pointers[], + void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue, + cl_uint num_svm_pointers, + void * svm_pointers[], + void * user_data), + void * user_data, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemcpyARM(cl_command_queue command_queue, + cl_bool blocking_copy, + void * dst_ptr, + const void * src_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMemFillARM(cl_command_queue command_queue, + void * svm_ptr, + const void * pattern, + size_t pattern_size, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + +extern 
CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMMapARM(cl_command_queue command_queue, + cl_bool blocking_map, + cl_map_flags flags, + void * svm_ptr, + size_t size, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueSVMUnmapARM(cl_command_queue command_queue, + void * svm_ptr, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelArgSVMPointerARM(cl_kernel kernel, + cl_uint arg_index, + const void * arg_value) CL_EXT_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_int CL_API_CALL +clSetKernelExecInfoARM(cl_kernel kernel, + cl_kernel_exec_info_arm param_name, + size_t param_value_size, + const void * param_value) CL_EXT_SUFFIX__VERSION_1_2; + +/******************************** + * cl_arm_get_core_id extension * + ********************************/ + +#ifdef CL_VERSION_1_2 + +#define cl_arm_get_core_id 1 + +/* Device info property for bitfield of cores present */ +#define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM 0x40BF + +#endif /* CL_VERSION_1_2 */ + +/********************************* +* cl_arm_job_slot_selection +*********************************/ + +#define cl_arm_job_slot_selection 1 + +/* cl_device_info */ +#define CL_DEVICE_JOB_SLOTS_ARM 0x41E0 + +/* cl_command_queue_properties */ +#define CL_QUEUE_JOB_SLOT_ARM 0x41E1 + +/********************************* +* cl_arm_scheduling_controls +*********************************/ + +#define cl_arm_scheduling_controls 1 + +/* cl_device_info */ +#define CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM 0x41E4 + +#define CL_DEVICE_SCHEDULING_KERNEL_BATCHING_ARM (1 << 0) +#define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_ARM (1 << 1) +#define CL_DEVICE_SCHEDULING_WORKGROUP_BATCH_SIZE_MODIFIER_ARM (1 << 2) + +/* cl_kernel_info */ +#define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_ARM 0x41E5 +#define CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM 0x41E6 + +/* cl_queue_properties */ +#define CL_QUEUE_KERNEL_BATCHING_ARM 0x41E7 + +#ifdef __cplusplus +} +#endif + +#endif /* __CL_EXT_H */ diff --git a/ext/cudart/include/CL/cl_gl.h b/ext/cudart/include/CL/cl_gl.h new file mode 100644 index 0000000000000000000000000000000000000000..4e390d197fd1ef1b36b17c0bcabbabbac7a14d66 --- /dev/null +++ b/ext/cudart/include/CL/cl_gl.h @@ -0,0 +1,154 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ******************************************************************************/ + +#ifndef __OPENCL_CL_GL_H +#define __OPENCL_CL_GL_H + +#ifdef __APPLE__ +#include <OpenCL/cl.h> +#else +#include <CL/cl.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +typedef cl_uint cl_gl_object_type; +typedef cl_uint cl_gl_texture_info; +typedef cl_uint cl_gl_platform_info; +typedef struct __GLsync *cl_GLsync; + +/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */ +#define CL_GL_OBJECT_BUFFER 0x2000 +#define CL_GL_OBJECT_TEXTURE2D 0x2001 +#define CL_GL_OBJECT_TEXTURE3D 0x2002 +#define CL_GL_OBJECT_RENDERBUFFER 0x2003 +#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E +#define CL_GL_OBJECT_TEXTURE1D 0x200F +#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010 +#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011 + +/* cl_gl_texture_info */ +#define CL_GL_TEXTURE_TARGET 0x2004 +#define CL_GL_MIPMAP_LEVEL 0x2005 +#define CL_GL_NUM_SAMPLES 0x2012 + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLBuffer(cl_context context, + cl_mem_flags flags, + cl_GLuint bufobj, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLTexture(cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2; + +extern CL_API_ENTRY cl_mem CL_API_CALL +clCreateFromGLRenderbuffer(cl_context context, + cl_mem_flags flags, + cl_GLuint renderbuffer, + cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLObjectInfo(cl_mem memobj, + cl_gl_object_type * gl_object_type, + cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLTextureInfo(cl_mem memobj, + cl_gl_texture_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueAcquireGLObjects(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + +extern CL_API_ENTRY cl_int CL_API_CALL +clEnqueueReleaseGLObjects(cl_command_queue command_queue, + cl_uint num_objects, + const cl_mem * mem_objects, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) CL_API_SUFFIX__VERSION_1_0; + + +/* Deprecated OpenCL 1.1 APIs */ +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture2D(cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL +clCreateFromGLTexture3D(cl_context context, + cl_mem_flags flags, + cl_GLenum target, + cl_GLint miplevel, + cl_GLuint texture, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED; + +/* cl_khr_gl_sharing extension */ + +#define cl_khr_gl_sharing 1 + +typedef cl_uint cl_gl_context_info; + +/* Additional Error Codes */ +#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000 + +/* cl_gl_context_info */ +#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006 +#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007 + +/* Additional cl_context_properties */ +#define CL_GL_CONTEXT_KHR 0x2008 +#define CL_EGL_DISPLAY_KHR 0x2009 +#define CL_GLX_DISPLAY_KHR 0x200A +#define CL_WGL_HDC_KHR 0x200B 
+#define CL_CGL_SHAREGROUP_KHR 0x200C + +extern CL_API_ENTRY cl_int CL_API_CALL +clGetGLContextInfoKHR(const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0; + +typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)( + const cl_context_properties * properties, + cl_gl_context_info param_name, + size_t param_value_size, + void * param_value, + size_t * param_value_size_ret); + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_H */ \ No newline at end of file diff --git a/ext/cudart/include/CL/cl_gl_ext.h b/ext/cudart/include/CL/cl_gl_ext.h new file mode 100644 index 0000000000000000000000000000000000000000..83102e3435e93eaa34c99efae3680c913ba3990e --- /dev/null +++ b/ext/cudart/include/CL/cl_gl_ext.h @@ -0,0 +1,44 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +#ifndef __OPENCL_CL_GL_EXT_H +#define __OPENCL_CL_GL_EXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ + #include <OpenCL/cl_gl.h> +#else + #include <CL/cl_gl.h> +#endif + +/* + * cl_khr_gl_event extension + */ +#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D + +extern CL_API_ENTRY cl_event CL_API_CALL +clCreateEventFromGLsyncKHR(cl_context context, + cl_GLsync sync, + cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1; + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_CL_GL_EXT_H */ diff --git a/ext/cudart/include/CL/cl_platform.h b/ext/cudart/include/CL/cl_platform.h new file mode 100644 index 0000000000000000000000000000000000000000..0fba9080bd6cf87a1614c1378a1b9b151efbd21e --- /dev/null +++ b/ext/cudart/include/CL/cl_platform.h @@ -0,0 +1,1414 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ ******************************************************************************/ + +#ifndef __CL_PLATFORM_H +#define __CL_PLATFORM_H + +#ifdef __APPLE__ + /* Contains #defines for AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER below */ + #include <AvailabilityMacros.h> +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) + #define CL_API_ENTRY + #define CL_API_CALL __stdcall + #define CL_CALLBACK __stdcall +#else + #define CL_API_ENTRY + #define CL_API_CALL + #define CL_CALLBACK +#endif + +/* + * Deprecation flags refer to the last version of the header in which the + * feature was not deprecated. + * + * E.g. VERSION_1_1_DEPRECATED means the feature is present in 1.1 without + * deprecation but is deprecated in versions later than 1.1. + */ +#ifdef __APPLE__ + #define CL_EXTENSION_WEAK_LINK __attribute__((weak_import)) + #define CL_API_SUFFIX__VERSION_1_0 AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER + #define CL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_1 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_6_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_7 + + #ifdef AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_8_AND_LATER + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER_BUT_DEPRECATED_IN_MAC_OS_X_VERSION_10_8 + #else + #warning This path should never happen outside of internal operating system development. AvailabilityMacros do not function correctly here! 
+ #define CL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define GCL_API_SUFFIX__VERSION_1_2 AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_2 CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXTENSION_WEAK_LINK AVAILABLE_MAC_OS_X_VERSION_10_7_AND_LATER + #endif + + + +#else + #define CL_EXTENSION_WEAK_LINK + #define CL_API_SUFFIX__VERSION_1_0 + #define CL_EXT_SUFFIX__VERSION_1_0 + #define CL_API_SUFFIX__VERSION_1_1 + #define CL_EXT_SUFFIX__VERSION_1_1 + #define CL_API_SUFFIX__VERSION_1_2 + #define CL_EXT_SUFFIX__VERSION_1_2 + #define CL_API_SUFFIX__VERSION_2_0 + #define CL_EXT_SUFFIX__VERSION_2_0 + #define CL_API_SUFFIX__VERSION_2_1 + #define CL_EXT_SUFFIX__VERSION_2_1 + #define CL_API_SUFFIX__VERSION_2_2 + #define CL_EXT_SUFFIX__VERSION_2_2 + #define CL_API_SUFFIX__VERSION_3_0 + #define CL_EXT_SUFFIX__VERSION_3_0 + #define CL_API_SUFFIX__EXPERIMENTAL + #define CL_EXT_SUFFIX__EXPERIMENTAL + + #ifdef __GNUC__ + #define CL_EXT_SUFFIX_DEPRECATED __attribute__((deprecated)) + #define CL_EXT_PREFIX_DEPRECATED + #elif defined(_WIN32) + #define CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX_DEPRECATED __declspec(deprecated) + #else + #define CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_0_APIS + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_1_APIS + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_1_2_APIS + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_1_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_1_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_0_APIS + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_0_DEPRECATED CL_EXT_PREFIX_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_1_APIS + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_1_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_1_DEPRECATED CL_EXT_PREFIX_DEPRECATED + #endif + + #ifdef CL_USE_DEPRECATED_OPENCL_2_2_APIS + #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED + #else + #define CL_EXT_SUFFIX__VERSION_2_2_DEPRECATED CL_EXT_SUFFIX_DEPRECATED + #define CL_EXT_PREFIX__VERSION_2_2_DEPRECATED CL_EXT_PREFIX_DEPRECATED + #endif +#endif + +#if (defined (_WIN32) && defined(_MSC_VER)) + +/* scalar types */ +typedef signed __int8 cl_char; +typedef unsigned __int8 cl_uchar; +typedef signed __int16 cl_short; +typedef unsigned __int16 cl_ushort; +typedef signed __int32 cl_int; +typedef unsigned __int32 cl_uint; +typedef signed __int64 cl_long; +typedef unsigned __int64 cl_ulong; + +typedef unsigned __int16 
cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 1.1920928955078125e-7f + +#define CL_HALF_DIG 3 +#define CL_HALF_MANT_DIG 11 +#define CL_HALF_MAX_10_EXP +4 +#define CL_HALF_MAX_EXP +16 +#define CL_HALF_MIN_10_EXP -4 +#define CL_HALF_MIN_EXP -13 +#define CL_HALF_RADIX 2 +#define CL_HALF_MAX 65504.0f +#define CL_HALF_MIN 6.103515625e-05f +#define CL_HALF_EPSILON 9.765625e-04f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 1.7976931348623158e+308 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.7182818284590452354 +#define CL_M_LOG2E 1.4426950408889634074 +#define CL_M_LOG10E 0.43429448190325182765 +#define CL_M_LN2 0.69314718055994530942 +#define CL_M_LN10 2.30258509299404568402 +#define CL_M_PI 3.14159265358979323846 +#define CL_M_PI_2 1.57079632679489661923 +#define CL_M_PI_4 0.78539816339744830962 +#define CL_M_1_PI 0.31830988618379067154 +#define CL_M_2_PI 0.63661977236758134308 +#define CL_M_2_SQRTPI 1.12837916709551257390 +#define CL_M_SQRT2 1.41421356237309504880 +#define CL_M_SQRT1_2 0.70710678118654752440 + +#define CL_M_E_F 2.718281828f +#define CL_M_LOG2E_F 1.442695041f +#define CL_M_LOG10E_F 0.434294482f +#define CL_M_LN2_F 0.693147181f +#define CL_M_LN10_F 2.302585093f +#define CL_M_PI_F 3.141592654f +#define CL_M_PI_2_F 1.570796327f +#define CL_M_PI_4_F 0.785398163f +#define CL_M_1_PI_F 0.318309886f +#define CL_M_2_PI_F 0.636619772f +#define CL_M_2_SQRTPI_F 1.128379167f +#define CL_M_SQRT2_F 1.414213562f +#define CL_M_SQRT1_2_F 0.707106781f + +#define CL_NAN (CL_INFINITY - CL_INFINITY) +#define CL_HUGE_VALF ((cl_float) 1e50) +#define CL_HUGE_VAL ((cl_double) 1e500) +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#else + +#include <stdint.h> + +/* scalar types */ +typedef int8_t cl_char; +typedef uint8_t cl_uchar; +typedef int16_t cl_short; +typedef uint16_t cl_ushort; +typedef int32_t cl_int; +typedef uint32_t cl_uint; +typedef int64_t cl_long; +typedef uint64_t cl_ulong; + +typedef uint16_t cl_half; +typedef float cl_float; +typedef double cl_double; + +/* Macro names and corresponding values defined by OpenCL */ +#define CL_CHAR_BIT 8 +#define CL_SCHAR_MAX 127 +#define CL_SCHAR_MIN (-127-1) +#define CL_CHAR_MAX CL_SCHAR_MAX +#define CL_CHAR_MIN CL_SCHAR_MIN +#define CL_UCHAR_MAX 255 +#define CL_SHRT_MAX 32767 +#define CL_SHRT_MIN (-32767-1) +#define CL_USHRT_MAX 65535 +#define CL_INT_MAX 
2147483647 +#define CL_INT_MIN (-2147483647-1) +#define CL_UINT_MAX 0xffffffffU +#define CL_LONG_MAX ((cl_long) 0x7FFFFFFFFFFFFFFFLL) +#define CL_LONG_MIN ((cl_long) -0x7FFFFFFFFFFFFFFFLL - 1LL) +#define CL_ULONG_MAX ((cl_ulong) 0xFFFFFFFFFFFFFFFFULL) + +#define CL_FLT_DIG 6 +#define CL_FLT_MANT_DIG 24 +#define CL_FLT_MAX_10_EXP +38 +#define CL_FLT_MAX_EXP +128 +#define CL_FLT_MIN_10_EXP -37 +#define CL_FLT_MIN_EXP -125 +#define CL_FLT_RADIX 2 +#define CL_FLT_MAX 340282346638528859811704183484516925440.0f +#define CL_FLT_MIN 1.175494350822287507969e-38f +#define CL_FLT_EPSILON 1.1920928955078125e-7f + +#define CL_HALF_DIG 3 +#define CL_HALF_MANT_DIG 11 +#define CL_HALF_MAX_10_EXP +4 +#define CL_HALF_MAX_EXP +16 +#define CL_HALF_MIN_10_EXP -4 +#define CL_HALF_MIN_EXP -13 +#define CL_HALF_RADIX 2 +#define CL_HALF_MAX 65504.0f +#define CL_HALF_MIN 6.103515625e-05f +#define CL_HALF_EPSILON 9.765625e-04f + +#define CL_DBL_DIG 15 +#define CL_DBL_MANT_DIG 53 +#define CL_DBL_MAX_10_EXP +308 +#define CL_DBL_MAX_EXP +1024 +#define CL_DBL_MIN_10_EXP -307 +#define CL_DBL_MIN_EXP -1021 +#define CL_DBL_RADIX 2 +#define CL_DBL_MAX 179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.0 +#define CL_DBL_MIN 2.225073858507201383090e-308 +#define CL_DBL_EPSILON 2.220446049250313080847e-16 + +#define CL_M_E 2.7182818284590452354 +#define CL_M_LOG2E 1.4426950408889634074 +#define CL_M_LOG10E 0.43429448190325182765 +#define CL_M_LN2 0.69314718055994530942 +#define CL_M_LN10 2.30258509299404568402 +#define CL_M_PI 3.14159265358979323846 +#define CL_M_PI_2 1.57079632679489661923 +#define CL_M_PI_4 0.78539816339744830962 +#define CL_M_1_PI 0.31830988618379067154 +#define CL_M_2_PI 0.63661977236758134308 +#define CL_M_2_SQRTPI 1.12837916709551257390 +#define CL_M_SQRT2 1.41421356237309504880 +#define CL_M_SQRT1_2 0.70710678118654752440 + +#define CL_M_E_F 2.718281828f +#define CL_M_LOG2E_F 1.442695041f +#define CL_M_LOG10E_F 0.434294482f +#define CL_M_LN2_F 0.693147181f +#define CL_M_LN10_F 2.302585093f +#define CL_M_PI_F 3.141592654f +#define CL_M_PI_2_F 1.570796327f +#define CL_M_PI_4_F 0.785398163f +#define CL_M_1_PI_F 0.318309886f +#define CL_M_2_PI_F 0.636619772f +#define CL_M_2_SQRTPI_F 1.128379167f +#define CL_M_SQRT2_F 1.414213562f +#define CL_M_SQRT1_2_F 0.707106781f + +#if defined( __GNUC__ ) + #define CL_HUGE_VALF __builtin_huge_valf() + #define CL_HUGE_VAL __builtin_huge_val() + #define CL_NAN __builtin_nanf( "" ) +#else + #define CL_HUGE_VALF ((cl_float) 1e50) + #define CL_HUGE_VAL ((cl_double) 1e500) + float nanf( const char * ); + #define CL_NAN nanf( "" ) +#endif +#define CL_MAXFLOAT CL_FLT_MAX +#define CL_INFINITY CL_HUGE_VALF + +#endif + +#include <stddef.h> + +/* Mirror types to GL types. Mirror types allow us to avoid deciding which 87s to load based on whether we are using GL or GLES here. */ +typedef unsigned int cl_GLuint; +typedef int cl_GLint; +typedef unsigned int cl_GLenum; + +/* + * Vector types + * + * Note: OpenCL requires that all types be naturally aligned. + * This means that vector types must be naturally aligned. + * For example, a vector of four floats must be aligned to + * a 16 byte boundary (calculated as 4 * the natural 4-byte + * alignment of the float). 
The alignment qualifiers here + * will only function properly if your compiler supports them + * and if you don't actively work to defeat them. For example, + * in order for a cl_float4 to be 16 byte aligned in a struct, + * the start of the struct must itself be 16-byte aligned. + * + * Maintaining proper alignment is the user's responsibility. + */ + +/* Define basic vector types */ +#if defined( __VEC__ ) + #if !defined(__clang__) + #include <altivec.h> /* may be omitted depending on compiler. AltiVec spec provides no way to detect whether the header is required. */ + #endif + typedef __vector unsigned char __cl_uchar16; + typedef __vector signed char __cl_char16; + typedef __vector unsigned short __cl_ushort8; + typedef __vector signed short __cl_short8; + typedef __vector unsigned int __cl_uint4; + typedef __vector signed int __cl_int4; + typedef __vector float __cl_float4; + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_UINT4__ 1 + #define __CL_INT4__ 1 + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <xmmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef float __cl_float4 __attribute__((vector_size(16))); + #else + typedef __m128 __cl_float4; + #endif + #define __CL_FLOAT4__ 1 +#endif + +#if defined( __SSE2__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <emmintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar16 __attribute__((vector_size(16))); + typedef cl_char __cl_char16 __attribute__((vector_size(16))); + typedef cl_ushort __cl_ushort8 __attribute__((vector_size(16))); + typedef cl_short __cl_short8 __attribute__((vector_size(16))); + typedef cl_uint __cl_uint4 __attribute__((vector_size(16))); + typedef cl_int __cl_int4 __attribute__((vector_size(16))); + typedef cl_ulong __cl_ulong2 __attribute__((vector_size(16))); + typedef cl_long __cl_long2 __attribute__((vector_size(16))); + typedef cl_double __cl_double2 __attribute__((vector_size(16))); + #else + typedef __m128i __cl_uchar16; + typedef __m128i __cl_char16; + typedef __m128i __cl_ushort8; + typedef __m128i __cl_short8; + typedef __m128i __cl_uint4; + typedef __m128i __cl_int4; + typedef __m128i __cl_ulong2; + typedef __m128i __cl_long2; + typedef __m128d __cl_double2; + #endif + #define __CL_UCHAR16__ 1 + #define __CL_CHAR16__ 1 + #define __CL_USHORT8__ 1 + #define __CL_SHORT8__ 1 + #define __CL_INT4__ 1 + #define __CL_UINT4__ 1 + #define __CL_ULONG2__ 1 + #define __CL_LONG2__ 1 + #define __CL_DOUBLE2__ 1 +#endif + +#if defined( __MMX__ ) + #include <mmintrin.h> + #if defined( __GNUC__ ) + typedef cl_uchar __cl_uchar8 __attribute__((vector_size(8))); + typedef cl_char __cl_char8 __attribute__((vector_size(8))); + typedef cl_ushort __cl_ushort4 __attribute__((vector_size(8))); + typedef cl_short __cl_short4 __attribute__((vector_size(8))); + typedef cl_uint __cl_uint2 __attribute__((vector_size(8))); + typedef cl_int __cl_int2 __attribute__((vector_size(8))); + typedef cl_ulong __cl_ulong1 __attribute__((vector_size(8))); + typedef cl_long __cl_long1 __attribute__((vector_size(8))); + typedef cl_float __cl_float2 __attribute__((vector_size(8))); + #else + typedef __m64 __cl_uchar8; + typedef __m64 __cl_char8; + typedef __m64 __cl_ushort4; + typedef __m64 __cl_short4; + typedef __m64 __cl_uint2; + typedef __m64 __cl_int2; + typedef __m64 __cl_ulong1; + typedef __m64 __cl_long1; + typedef __m64 __cl_float2; + #endif + 
#define __CL_UCHAR8__ 1 + #define __CL_CHAR8__ 1 + #define __CL_USHORT4__ 1 + #define __CL_SHORT4__ 1 + #define __CL_INT2__ 1 + #define __CL_UINT2__ 1 + #define __CL_ULONG1__ 1 + #define __CL_LONG1__ 1 + #define __CL_FLOAT2__ 1 +#endif + +#if defined( __AVX__ ) + #if defined( __MINGW64__ ) + #include <intrin.h> + #else + #include <immintrin.h> + #endif + #if defined( __GNUC__ ) + typedef cl_float __cl_float8 __attribute__((vector_size(32))); + typedef cl_double __cl_double4 __attribute__((vector_size(32))); + #else + typedef __m256 __cl_float8; + typedef __m256d __cl_double4; + #endif + #define __CL_FLOAT8__ 1 + #define __CL_DOUBLE4__ 1 +#endif + +/* Define capabilities for anonymous struct members. */ +#if !defined(__cplusplus) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ +#elif defined( __GNUC__) && ! defined( __STRICT_ANSI__ ) +#define __CL_HAS_ANON_STRUCT__ 1 +#define __CL_ANON_STRUCT__ __extension__ +#elif defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__) + #if _MSC_VER >= 1500 + /* Microsoft Developer Studio 2008 supports anonymous structs, but + * complains by default. */ + #define __CL_HAS_ANON_STRUCT__ 1 + #define __CL_ANON_STRUCT__ + /* Disable warning C4201: nonstandard extension used : nameless + * struct/union */ + #pragma warning( push ) + #pragma warning( disable : 4201 ) + #endif +#else +#define __CL_HAS_ANON_STRUCT__ 0 +#define __CL_ANON_STRUCT__ +#endif + +/* Define alignment keys */ +#if defined( __GNUC__ ) + #define CL_ALIGNED(_x) __attribute__ ((aligned(_x))) +#elif defined( _WIN32) && (_MSC_VER) + /* Alignment keys neutered on windows because MSVC can't swallow function arguments with alignment requirements */ + /* http://msdn.microsoft.com/en-us/library/373ak2y1%28VS.71%29.aspx */ + /* #include <crtdefs.h> */ + /* #define CL_ALIGNED(_x) _CRT_ALIGN(_x) */ + #define CL_ALIGNED(_x) +#else + #warning Need to implement some method to align data here + #define CL_ALIGNED(_x) +#endif + +/* Indicate whether .xyzw, .s0123 and .hi.lo are supported */ +#if __CL_HAS_ANON_STRUCT__ + /* .xyzw and .s0123...{f|F} are supported */ + #define CL_HAS_NAMED_VECTOR_FIELDS 1 + /* .hi and .lo are supported */ + #define CL_HAS_HI_LO_VECTOR_FIELDS 1 +#endif + +/* Define cl_vector types */ + +/* ---- cl_charn ---- */ +typedef union +{ + cl_char CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_char lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2; +#endif +}cl_char2; + +typedef union +{ + cl_char CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_char2 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[2]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4; +#endif +}cl_char4; + +/* cl_char3 is identical in size, alignment and behavior to cl_char4. See section 6.1.5. 
*/ +typedef cl_char4 cl_char3; + +typedef union +{ + cl_char CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_char4 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[4]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[2]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8; +#endif +}cl_char8; + +typedef union +{ + cl_char CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_char x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_char s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_char8 lo, hi; }; +#endif +#if defined( __CL_CHAR2__) + __cl_char2 v2[8]; +#endif +#if defined( __CL_CHAR4__) + __cl_char4 v4[4]; +#endif +#if defined( __CL_CHAR8__ ) + __cl_char8 v8[2]; +#endif +#if defined( __CL_CHAR16__ ) + __cl_char16 v16; +#endif +}cl_char16; + + +/* ---- cl_ucharn ---- */ +typedef union +{ + cl_uchar CL_ALIGNED(2) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uchar lo, hi; }; +#endif +#if defined( __cl_uchar2__) + __cl_uchar2 v2; +#endif +}cl_uchar2; + +typedef union +{ + cl_uchar CL_ALIGNED(4) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uchar2 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[2]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4; +#endif +}cl_uchar4; + +/* cl_uchar3 is identical in size, alignment and behavior to cl_uchar4. See section 6.1.5. 
*/ +typedef cl_uchar4 cl_uchar3; + +typedef union +{ + cl_uchar CL_ALIGNED(8) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uchar4 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[4]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[2]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8; +#endif +}cl_uchar8; + +typedef union +{ + cl_uchar CL_ALIGNED(16) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uchar x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uchar s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uchar8 lo, hi; }; +#endif +#if defined( __CL_UCHAR2__) + __cl_uchar2 v2[8]; +#endif +#if defined( __CL_UCHAR4__) + __cl_uchar4 v4[4]; +#endif +#if defined( __CL_UCHAR8__ ) + __cl_uchar8 v8[2]; +#endif +#if defined( __CL_UCHAR16__ ) + __cl_uchar16 v16; +#endif +}cl_uchar16; + + +/* ---- cl_shortn ---- */ +typedef union +{ + cl_short CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_short lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2; +#endif +}cl_short2; + +typedef union +{ + cl_short CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_short2 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[2]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4; +#endif +}cl_short4; + +/* cl_short3 is identical in size, alignment and behavior to cl_short4. See section 6.1.5. 
*/ +typedef cl_short4 cl_short3; + +typedef union +{ + cl_short CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_short4 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[4]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[2]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8; +#endif +}cl_short8; + +typedef union +{ + cl_short CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_short x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_short s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_short8 lo, hi; }; +#endif +#if defined( __CL_SHORT2__) + __cl_short2 v2[8]; +#endif +#if defined( __CL_SHORT4__) + __cl_short4 v4[4]; +#endif +#if defined( __CL_SHORT8__ ) + __cl_short8 v8[2]; +#endif +#if defined( __CL_SHORT16__ ) + __cl_short16 v16; +#endif +}cl_short16; + + +/* ---- cl_ushortn ---- */ +typedef union +{ + cl_ushort CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ushort lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2; +#endif +}cl_ushort2; + +typedef union +{ + cl_ushort CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ushort2 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[2]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4; +#endif +}cl_ushort4; + +/* cl_ushort3 is identical in size, alignment and behavior to cl_ushort4. See section 6.1.5. 
*/ +typedef cl_ushort4 cl_ushort3; + +typedef union +{ + cl_ushort CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ushort4 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[4]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[2]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8; +#endif +}cl_ushort8; + +typedef union +{ + cl_ushort CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ushort x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ushort s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ushort8 lo, hi; }; +#endif +#if defined( __CL_USHORT2__) + __cl_ushort2 v2[8]; +#endif +#if defined( __CL_USHORT4__) + __cl_ushort4 v4[4]; +#endif +#if defined( __CL_USHORT8__ ) + __cl_ushort8 v8[2]; +#endif +#if defined( __CL_USHORT16__ ) + __cl_ushort16 v16; +#endif +}cl_ushort16; + + +/* ---- cl_halfn ---- */ +typedef union +{ + cl_half CL_ALIGNED(4) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_half lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2; +#endif +}cl_half2; + +typedef union +{ + cl_half CL_ALIGNED(8) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_half2 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[2]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4; +#endif +}cl_half4; + +/* cl_half3 is identical in size, alignment and behavior to cl_half4. See section 6.1.5. 
*/ +typedef cl_half4 cl_half3; + +typedef union +{ + cl_half CL_ALIGNED(16) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_half4 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[4]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[2]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8; +#endif +}cl_half8; + +typedef union +{ + cl_half CL_ALIGNED(32) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_half x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_half s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_half8 lo, hi; }; +#endif +#if defined( __CL_HALF2__) + __cl_half2 v2[8]; +#endif +#if defined( __CL_HALF4__) + __cl_half4 v4[4]; +#endif +#if defined( __CL_HALF8__ ) + __cl_half8 v8[2]; +#endif +#if defined( __CL_HALF16__ ) + __cl_half16 v16; +#endif +}cl_half16; + +/* ---- cl_intn ---- */ +typedef union +{ + cl_int CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_int lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2; +#endif +}cl_int2; + +typedef union +{ + cl_int CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_int2 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[2]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4; +#endif +}cl_int4; + +/* cl_int3 is identical in size, alignment and behavior to cl_int4. See section 6.1.5. 
*/ +typedef cl_int4 cl_int3; + +typedef union +{ + cl_int CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_int4 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[4]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[2]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8; +#endif +}cl_int8; + +typedef union +{ + cl_int CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_int x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_int8 lo, hi; }; +#endif +#if defined( __CL_INT2__) + __cl_int2 v2[8]; +#endif +#if defined( __CL_INT4__) + __cl_int4 v4[4]; +#endif +#if defined( __CL_INT8__ ) + __cl_int8 v8[2]; +#endif +#if defined( __CL_INT16__ ) + __cl_int16 v16; +#endif +}cl_int16; + + +/* ---- cl_uintn ---- */ +typedef union +{ + cl_uint CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_uint lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2; +#endif +}cl_uint2; + +typedef union +{ + cl_uint CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_uint2 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[2]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4; +#endif +}cl_uint4; + +/* cl_uint3 is identical in size, alignment and behavior to cl_uint4. See section 6.1.5. 
*/ +typedef cl_uint4 cl_uint3; + +typedef union +{ + cl_uint CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_uint4 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[4]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[2]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8; +#endif +}cl_uint8; + +typedef union +{ + cl_uint CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_uint x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_uint s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_uint8 lo, hi; }; +#endif +#if defined( __CL_UINT2__) + __cl_uint2 v2[8]; +#endif +#if defined( __CL_UINT4__) + __cl_uint4 v4[4]; +#endif +#if defined( __CL_UINT8__ ) + __cl_uint8 v8[2]; +#endif +#if defined( __CL_UINT16__ ) + __cl_uint16 v16; +#endif +}cl_uint16; + +/* ---- cl_longn ---- */ +typedef union +{ + cl_long CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_long lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2; +#endif +}cl_long2; + +typedef union +{ + cl_long CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_long2 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[2]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4; +#endif +}cl_long4; + +/* cl_long3 is identical in size, alignment and behavior to cl_long4. See section 6.1.5. 
*/ +typedef cl_long4 cl_long3; + +typedef union +{ + cl_long CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_long4 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[4]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[2]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8; +#endif +}cl_long8; + +typedef union +{ + cl_long CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_long x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_long s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_long8 lo, hi; }; +#endif +#if defined( __CL_LONG2__) + __cl_long2 v2[8]; +#endif +#if defined( __CL_LONG4__) + __cl_long4 v4[4]; +#endif +#if defined( __CL_LONG8__ ) + __cl_long8 v8[2]; +#endif +#if defined( __CL_LONG16__ ) + __cl_long16 v16; +#endif +}cl_long16; + + +/* ---- cl_ulongn ---- */ +typedef union +{ + cl_ulong CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_ulong lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2; +#endif +}cl_ulong2; + +typedef union +{ + cl_ulong CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_ulong2 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[2]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4; +#endif +}cl_ulong4; + +/* cl_ulong3 is identical in size, alignment and behavior to cl_ulong4. See section 6.1.5. 
*/ +typedef cl_ulong4 cl_ulong3; + +typedef union +{ + cl_ulong CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_ulong4 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[4]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[2]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8; +#endif +}cl_ulong8; + +typedef union +{ + cl_ulong CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_ulong x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_ulong s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_ulong8 lo, hi; }; +#endif +#if defined( __CL_ULONG2__) + __cl_ulong2 v2[8]; +#endif +#if defined( __CL_ULONG4__) + __cl_ulong4 v4[4]; +#endif +#if defined( __CL_ULONG8__ ) + __cl_ulong8 v8[2]; +#endif +#if defined( __CL_ULONG16__ ) + __cl_ulong16 v16; +#endif +}cl_ulong16; + + +/* --- cl_floatn ---- */ + +typedef union +{ + cl_float CL_ALIGNED(8) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_float lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2; +#endif +}cl_float2; + +typedef union +{ + cl_float CL_ALIGNED(16) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_float2 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[2]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4; +#endif +}cl_float4; + +/* cl_float3 is identical in size, alignment and behavior to cl_float4. See section 6.1.5. 
*/ +typedef cl_float4 cl_float3; + +typedef union +{ + cl_float CL_ALIGNED(32) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_float4 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[4]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[2]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8; +#endif +}cl_float8; + +typedef union +{ + cl_float CL_ALIGNED(64) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_float x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_float s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_float8 lo, hi; }; +#endif +#if defined( __CL_FLOAT2__) + __cl_float2 v2[8]; +#endif +#if defined( __CL_FLOAT4__) + __cl_float4 v4[4]; +#endif +#if defined( __CL_FLOAT8__ ) + __cl_float8 v8[2]; +#endif +#if defined( __CL_FLOAT16__ ) + __cl_float16 v16; +#endif +}cl_float16; + +/* --- cl_doublen ---- */ + +typedef union +{ + cl_double CL_ALIGNED(16) s[2]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1; }; + __CL_ANON_STRUCT__ struct{ cl_double lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2; +#endif +}cl_double2; + +typedef union +{ + cl_double CL_ALIGNED(32) s[4]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3; }; + __CL_ANON_STRUCT__ struct{ cl_double2 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[2]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4; +#endif +}cl_double4; + +/* cl_double3 is identical in size, alignment and behavior to cl_double4. See section 6.1.5. */ +typedef cl_double4 cl_double3; + +typedef union +{ + cl_double CL_ALIGNED(64) s[8]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7; }; + __CL_ANON_STRUCT__ struct{ cl_double4 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[4]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[2]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8; +#endif +}cl_double8; + +typedef union +{ + cl_double CL_ALIGNED(128) s[16]; +#if __CL_HAS_ANON_STRUCT__ + __CL_ANON_STRUCT__ struct{ cl_double x, y, z, w, __spacer4, __spacer5, __spacer6, __spacer7, __spacer8, __spacer9, sa, sb, sc, sd, se, sf; }; + __CL_ANON_STRUCT__ struct{ cl_double s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA, sB, sC, sD, sE, sF; }; + __CL_ANON_STRUCT__ struct{ cl_double8 lo, hi; }; +#endif +#if defined( __CL_DOUBLE2__) + __cl_double2 v2[8]; +#endif +#if defined( __CL_DOUBLE4__) + __cl_double4 v4[4]; +#endif +#if defined( __CL_DOUBLE8__ ) + __cl_double8 v8[2]; +#endif +#if defined( __CL_DOUBLE16__ ) + __cl_double16 v16; +#endif +}cl_double16; + +/* Macro to facilitate debugging + * Usage: + * Place CL_PROGRAM_STRING_DEBUG_INFO on the line before the first line of your source. 
+ * The first line ends with: CL_PROGRAM_STRING_DEBUG_INFO \" + * Each line thereafter of OpenCL C source must end with: \n\ + * The last line ends in "; + * + * Example: + * + * const char *my_program = CL_PROGRAM_STRING_DEBUG_INFO "\ + * kernel void foo( int a, float * b ) \n\ + * { \n\ + * // my comment \n\ + * *b[ get_global_id(0)] = a; \n\ + * } \n\ + * "; + * + * This should correctly set up the line, (column) and file information for your source + * string so you can do source level debugging. + */ +#define __CL_STRINGIFY( _x ) # _x +#define _CL_STRINGIFY( _x ) __CL_STRINGIFY( _x ) +#define CL_PROGRAM_STRING_DEBUG_INFO "#line " _CL_STRINGIFY(__LINE__) " \"" __FILE__ "\" \n\n" + +#ifdef __cplusplus +} +#endif + +#undef __CL_HAS_ANON_STRUCT__ +#undef __CL_ANON_STRUCT__ +#if defined( _WIN32) && defined(_MSC_VER) && ! defined(__STDC__) + #if _MSC_VER >=1500 + #pragma warning( pop ) + #endif +#endif + +#endif /* __CL_PLATFORM_H */ diff --git a/ext/cudart/include/CL/opencl.h b/ext/cudart/include/CL/opencl.h new file mode 100644 index 0000000000000000000000000000000000000000..bd7fa6be57b8df26003c414041683340bcb95404 --- /dev/null +++ b/ext/cudart/include/CL/opencl.h @@ -0,0 +1,40 @@ +/******************************************************************************* + * Copyright (c) 2008-2020 The Khronos Group Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + ******************************************************************************/ + +#ifndef __OPENCL_H +#define __OPENCL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __APPLE__ +#include <OpenCL/cl.h> +#include <OpenCL/cl_gl.h> +#include <OpenCL/cl_gl_ext.h> +#include <OpenCL/cl_ext.h> +#else +#include <CL/cl.h> +#include <CL/cl_gl.h> +#include <CL/cl_gl_ext.h> +#include <CL/cl_ext.h> +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* __OPENCL_H */ diff --git a/ext/cudart/include/builtin_types.h b/ext/cudart/include/builtin_types.h new file mode 100644 index 0000000000000000000000000000000000000000..5247c40807f0dd36a886513ab1bff5d2977364db --- /dev/null +++ b/ext/cudart/include/builtin_types.h @@ -0,0 +1,64 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. 
+ * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "device_types.h" +#if !defined(__CUDACC_RTC__) +#define EXCLUDE_FROM_RTC +#include "driver_types.h" +#undef EXCLUDE_FROM_RTC +#endif /* !__CUDACC_RTC__ */ +#include "surface_types.h" +#include "texture_types.h" +#include "vector_types.h" diff --git a/ext/cudart/include/channel_descriptor.h b/ext/cudart/include/channel_descriptor.h new file mode 100644 index 0000000000000000000000000000000000000000..1d61d2974a4f2b527d04baf586c73c2af86358f4 --- /dev/null +++ b/ext/cudart/include/channel_descriptor.h @@ -0,0 +1,595 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CHANNEL_DESCRIPTOR_H__) +#define __CHANNEL_DESCRIPTOR_H__ + +#if defined(__cplusplus) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +/** + * \addtogroup CUDART_HIGHLEVEL + * + * @{ + */ + +/** + * \brief \hl Returns a channel descriptor using the specified format + * + * Returns a channel descriptor with format \p f and number of bits of each + * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is + * defined as: + * \code + struct cudaChannelFormatDesc { + int x, y, z, w; + enum cudaChannelFormatKind f; + }; + * \endcode + * + * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, + * ::cudaChannelFormatKindUnsigned, cudaChannelFormatKindFloat, + * ::cudaChannelFormatKindSignedNormalized8X1, ::cudaChannelFormatKindSignedNormalized8X2, + * ::cudaChannelFormatKindSignedNormalized8X4, + * ::cudaChannelFormatKindUnsignedNormalized8X1, ::cudaChannelFormatKindUnsignedNormalized8X2, + * ::cudaChannelFormatKindUnsignedNormalized8X4, + * ::cudaChannelFormatKindSignedNormalized16X1, ::cudaChannelFormatKindSignedNormalized16X2, + * ::cudaChannelFormatKindSignedNormalized16X4, + * ::cudaChannelFormatKindUnsignedNormalized16X1, ::cudaChannelFormatKindUnsignedNormalized16X2, + * ::cudaChannelFormatKindUnsignedNormalized16X4 + * or ::cudaChannelFormatKindNV12. + * + * The format is specified by the template specialization. + * + * The template function specializes for the following scalar types: + * char, signed char, unsigned char, short, unsigned short, int, unsigned int, long, unsigned long, and float. 
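+ *
+ * For illustration only (an editor's sketch, not part of the original header), a host-side
+ * call to one of these specializations is typically paired with a CUDA array allocation:
+ * \code
+ * cudaChannelFormatDesc desc = cudaCreateChannelDesc<float4>(); // 32-bit float, 4 channels
+ * cudaArray_t arr;
+ * cudaMallocArray(&arr, &desc, width, height);                  // width/height assumed defined
+ * \endcode
+ *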
+ * The template function specializes for the following vector types: + * char{1|2|4}, uchar{1|2|4}, short{1|2|4}, ushort{1|2|4}, int{1|2|4}, uint{1|2|4}, long{1|2|4}, ulong{1|2|4}, float{1|2|4}. + * The template function specializes for following cudaChannelFormatKind enum values: + * ::cudaChannelFormatKind{Uns|S}ignedNormalized{8|16}X{1|2|4}, and ::cudaChannelFormatKindNV12. + * + * Invoking the function on a type without a specialization defaults to creating a channel format of kind ::cudaChannelFormatKindNone + * + * \return + * Channel descriptor with format \p f + * + * \sa \ref ::cudaCreateChannelDesc(int,int,int,int,cudaChannelFormatKind) "cudaCreateChannelDesc (Low level)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (High level)", + * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, size_t) "cudaBindTexture (High level, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (High level)", + * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (High level)", + * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (High level, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (High level)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (High level)" + */ +template<class T> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf1(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf2(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescHalf4(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char>(void) +{ + int e = (int)sizeof(char) * 8; + +#if defined(_CHAR_UNSIGNED) || defined(__CHAR_UNSIGNED__) + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +#else /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */ + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +#endif /* _CHAR_UNSIGNED || __CHAR_UNSIGNED__ */ +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<signed char>(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc 
cudaCreateChannelDesc<unsigned char>(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char1>(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar1>(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char2>(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar2>(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<char4>(void) +{ + int e = (int)sizeof(signed char) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uchar4>(void) +{ + int e = (int)sizeof(unsigned char) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short>(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned short>(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short1>(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort1>(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short2>(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort2>(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<short4>(void) +{ + int e = (int)sizeof(short) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ushort4>(void) +{ + int e = (int)sizeof(unsigned short) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int>(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned int>(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, 
cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int1>(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint1>(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int2>(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint2>(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<int4>(void) +{ + int e = (int)sizeof(int) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<uint4>(void) +{ + int e = (int)sizeof(unsigned int) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +#if !defined(__LP64__) + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long>(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<unsigned long>(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long1>(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong1>(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long2>(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong2>(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindUnsigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<long4>(void) +{ + int e = (int)sizeof(long) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindSigned); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<ulong4>(void) +{ + int e = (int)sizeof(unsigned long) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindUnsigned); +} + +#endif /* !__LP64__ */ + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float>(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float1>(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float2>(void) +{ + int e = 
(int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<float4>(void) +{ + int e = (int)sizeof(float) * 8; + + return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat); +} + +static __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDescNV12(void) +{ + int e = (int)sizeof(char) * 8; + + return cudaCreateChannelDesc(e, e, e, 0, cudaChannelFormatKindNV12); +} + +template<cudaChannelFormatKind> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc(void) +{ + return cudaCreateChannelDesc(0, 0, 0, 0, cudaChannelFormatKindNone); +} + +/* Signed 8-bit normalized integer formats */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X1>(void) +{ + return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedNormalized8X1); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X2>(void) +{ + return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedNormalized8X2); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized8X4>(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindSignedNormalized8X4); +} + +/* Unsigned 8-bit normalized integer formats */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X1>(void) +{ + return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized8X1); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X2>(void) +{ + return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedNormalized8X2); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized8X4>(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedNormalized8X4); +} + +/* Signed 16-bit normalized integer formats */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X1>(void) +{ + return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindSignedNormalized16X1); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X2>(void) +{ + return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindSignedNormalized16X2); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedNormalized16X4>(void) +{ + return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindSignedNormalized16X4); +} + +/* Unsigned 16-bit normalized integer formats */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X1>(void) +{ + return cudaCreateChannelDesc(16, 0, 0, 0, cudaChannelFormatKindUnsignedNormalized16X1); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X2>(void) +{ + return cudaCreateChannelDesc(16, 16, 0, 0, cudaChannelFormatKindUnsignedNormalized16X2); +} + +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedNormalized16X4>(void) +{ + return cudaCreateChannelDesc(16, 16, 16, 16, cudaChannelFormatKindUnsignedNormalized16X4); +} + +/* 
NV12 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindNV12>(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 0, cudaChannelFormatKindNV12); +} + +/* BC1 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1>(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1); +} + +/* BC1sRGB format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed1SRGB>(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed1SRGB); +} + +/* BC2 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2>(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2); +} + +/* BC2sRGB format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed2SRGB>(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed2SRGB); +} + +/* BC3 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3>(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3); +} + +/* BC3sRGB format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed3SRGB>(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed3SRGB); +} + +/* BC4 unsigned format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed4>(void) +{ + return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed4); +} + +/* BC4 signed format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed4>(void) +{ + return cudaCreateChannelDesc(8, 0, 0, 0, cudaChannelFormatKindSignedBlockCompressed4); +} + +/* BC5 unsigned format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed5>(void) +{ + return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindUnsignedBlockCompressed5); +} + +/* BC5 signed format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed5>(void) +{ + return cudaCreateChannelDesc(8, 8, 0, 0, cudaChannelFormatKindSignedBlockCompressed5); +} + +/* BC6H unsigned format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed6H>(void) +{ + return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindUnsignedBlockCompressed6H); +} + +/* BC6H signed format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindSignedBlockCompressed6H>(void) +{ + return cudaCreateChannelDesc(16, 16, 16, 0, cudaChannelFormatKindSignedBlockCompressed6H); +} + +/* BC7 format */ +template<> __inline__ __host__ cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7>(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7); +} + +/* BC7sRGB format */ +template<> __inline__ __host__ 
cudaChannelFormatDesc cudaCreateChannelDesc<cudaChannelFormatKindUnsignedBlockCompressed7SRGB>(void) +{ + return cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsignedBlockCompressed7SRGB); +} + +#endif /* __cplusplus */ + +/** @} */ +/** @} */ /* END CUDART_TEXTURE_HL */ + +#endif /* !__CHANNEL_DESCRIPTOR_H__ */ diff --git a/ext/cudart/include/common_functions.h b/ext/cudart/include/common_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..5f8ea3d242640f2196b789c7da6c05d2ed1bed3e --- /dev/null +++ b/ext/cudart/include/common_functions.h @@ -0,0 +1,65 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#if defined(_MSC_VER) +#pragma message("common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.") +#else +#warning "common_functions.h is an internal header file and must not be used directly. 
This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead." +#endif +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__ +#endif + +#include "crt/common_functions.h" + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__ +#endif diff --git a/ext/cudart/include/cooperative_groups.h b/ext/cudart/include/cooperative_groups.h new file mode 100644 index 0000000000000000000000000000000000000000..c8823fb79bc78e513861f9e7d3e671bf0effc6b7 --- /dev/null +++ b/ext/cudart/include/cooperative_groups.h @@ -0,0 +1,1828 @@ +/* + * Copyright 1993-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#ifndef _COOPERATIVE_GROUPS_H_ +#define _COOPERATIVE_GROUPS_H_ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#include "cooperative_groups/details/info.h" +#include "cooperative_groups/details/driver_abi.h" +#include "cooperative_groups/details/helpers.h" + +#if defined(_CG_HAS_STL_ATOMICS) +#include <cuda/atomic> +#define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope; +#else +#define _CG_THREAD_SCOPE(scope) +#endif + +_CG_BEGIN_NAMESPACE + +namespace details { + _CG_CONST_DECL unsigned int coalesced_group_id = 1; + _CG_CONST_DECL unsigned int multi_grid_group_id = 2; + _CG_CONST_DECL unsigned int grid_group_id = 3; + _CG_CONST_DECL unsigned int thread_block_id = 4; + _CG_CONST_DECL unsigned int multi_tile_group_id = 5; + _CG_CONST_DECL unsigned int cluster_group_id = 6; +} + +/** + * class thread_group; + * + * Generic thread group type, into which all groups are convertible. + * It acts as a container for all storage necessary for the derived groups, + * and will dispatch the API calls to the correct derived group. This means + * that all derived groups must implement the same interface as thread_group. + */ +class thread_group +{ +protected: + struct group_data { + unsigned int _unused : 1; + unsigned int type : 7, : 0; + }; + + struct gg_data { + details::grid_workspace *gridWs; + }; + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + struct mg_data { + unsigned long long _unused : 1; + unsigned long long type : 7; + unsigned long long handle : 56; + const details::multi_grid::multi_grid_functions *functions; + }; +#endif + + struct tg_data { + unsigned int is_tiled : 1; + unsigned int type : 7; + unsigned int size : 24; + // packed to 4b + unsigned int metaGroupSize : 16; + unsigned int metaGroupRank : 16; + // packed to 8b + unsigned int mask; + // packed to 12b + unsigned int _res; + }; + + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend class thread_block; + + union __align__(8) { + group_data group; + tg_data coalesced; + gg_data grid; +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + mg_data multi_grid; +#endif + } _data; + + _CG_QUALIFIER thread_group operator=(const thread_group& src); + + _CG_QUALIFIER thread_group(unsigned int type) { + _data.group.type = type; + _data.group._unused = false; + } + +#ifdef _CG_CPP11_FEATURES + static_assert(sizeof(tg_data) <= 16, "Failed size check"); + static_assert(sizeof(gg_data) <= 16, "Failed size check"); +# ifdef _CG_ABI_EXPERIMENTAL + static_assert(sizeof(mg_data) <= 16, "Failed size check"); +# endif +#endif + +public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device) + + _CG_QUALIFIER unsigned long long size() const; + _CG_QUALIFIER unsigned long long num_threads() const; + _CG_QUALIFIER unsigned long long thread_rank() const; + _CG_QUALIFIER void sync() const; + _CG_QUALIFIER unsigned int get_type() const { + return _data.group.type; + } + +}; + +template <unsigned int TyId> +struct thread_group_base : public thread_group { + _CG_QUALIFIER thread_group_base() : thread_group(TyId) {} + _CG_STATIC_CONST_DECL unsigned int id = TyId; +}; + +#if defined(_CG_HAS_MULTI_GRID_GROUP) + +/** + * class multi_grid_group; + * + * Threads within this this group are guaranteed to be co-resident on the + * same system, on multiple devices within the same launched kernels. 
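+ *
+ * For illustration only (an editor's sketch, not part of the original header), device code
+ * typically obtains and synchronizes this group as follows:
+ *
+ *   __global__ void my_kernel() {
+ *       cooperative_groups::multi_grid_group mg = cooperative_groups::this_multi_grid();
+ *       // ... per-device work ...
+ *       mg.sync(); // all threads on all participating devices rendezvous here
+ *   }
+ *
+ * (Note that this API is marked deprecated elsewhere in this header.)
+ *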
+ * To use this group, the kernel must have been launched with + * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent), + * and the device must support it (queryable device attribute). + * + * Constructed via this_multi_grid(); + */ + + +# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) +class multi_grid_group; + +// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls +template <typename = void> +__device__ _CG_DEPRECATED multi_grid_group this_multi_grid(); + +class multi_grid_group : public thread_group_base<details::multi_grid_group_id> +{ +private: + template <typename = void> + _CG_QUALIFIER multi_grid_group() { + _data.multi_grid.functions = details::multi_grid::load_grid_intrinsics(); + _data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle(); + } + + friend multi_grid_group this_multi_grid<void>(); + +public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system) + + _CG_QUALIFIER bool is_valid() const { + return (_data.multi_grid.handle != 0); + } + + _CG_QUALIFIER void sync() const { + if (!is_valid()) { + _CG_ABORT(); + } + _data.multi_grid.functions->sync(_data.multi_grid.handle); + } + + _CG_QUALIFIER unsigned long long num_threads() const { + _CG_ASSERT(is_valid()); + return _data.multi_grid.functions->size(_data.multi_grid.handle); + } + + _CG_QUALIFIER unsigned long long size() const { + return num_threads(); + } + + _CG_QUALIFIER unsigned long long thread_rank() const { + _CG_ASSERT(is_valid()); + return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle); + } + + _CG_QUALIFIER unsigned int grid_rank() const { + _CG_ASSERT(is_valid()); + return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle)); + } + + _CG_QUALIFIER unsigned int num_grids() const { + _CG_ASSERT(is_valid()); + return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle)); + } +}; +# else +class multi_grid_group +{ +private: + unsigned long long _handle; + unsigned int _size; + unsigned int _rank; + + friend _CG_QUALIFIER multi_grid_group this_multi_grid(); + + _CG_QUALIFIER multi_grid_group() { + _handle = details::multi_grid::get_intrinsic_handle(); + _size = details::multi_grid::size(_handle); + _rank = details::multi_grid::thread_rank(_handle); + } + +public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system) + + _CG_QUALIFIER _CG_DEPRECATED bool is_valid() const { + return (_handle != 0); + } + + _CG_QUALIFIER _CG_DEPRECATED void sync() const { + if (!is_valid()) { + _CG_ABORT(); + } + details::multi_grid::sync(_handle); + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const { + _CG_ASSERT(is_valid()); + return _size; + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const { + return num_threads(); + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const { + _CG_ASSERT(is_valid()); + return _rank; + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const { + _CG_ASSERT(is_valid()); + return (details::multi_grid::grid_rank(_handle)); + } + + _CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const { + _CG_ASSERT(is_valid()); + return (details::multi_grid::num_grids(_handle)); + } +}; +# endif + +/** + * multi_grid_group this_multi_grid() + * + * Constructs a multi_grid_group + */ +# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) +template <typename> +__device__ +#else +_CG_QUALIFIER +# endif +_CG_DEPRECATED +multi_grid_group this_multi_grid() +{ + return 
multi_grid_group(); +} +#endif + +/** + * class grid_group; + * + * Threads within this this group are guaranteed to be co-resident on the + * same device within the same launched kernel. To use this group, the kernel + * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent), + * and the device must support it (queryable device attribute). + * + * Constructed via this_grid(); + */ +class grid_group : public thread_group_base<details::grid_group_id> +{ + _CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id; + friend _CG_QUALIFIER grid_group this_grid(); + +private: + _CG_QUALIFIER grid_group(details::grid_workspace *gridWs) { + _data.grid.gridWs = gridWs; + } + + public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device) + + _CG_QUALIFIER bool is_valid() const { + return (_data.grid.gridWs != NULL); + } + + _CG_QUALIFIER void sync() const { + if (!is_valid()) { + _CG_ABORT(); + } + details::grid::sync(&_data.grid.gridWs->barrier); + } + + _CG_STATIC_QUALIFIER unsigned long long size() { + return details::grid::size(); + } + + _CG_STATIC_QUALIFIER unsigned long long thread_rank() { + return details::grid::thread_rank(); + } + + _CG_STATIC_QUALIFIER dim3 group_dim() { + return details::grid::grid_dim(); + } + + _CG_STATIC_QUALIFIER unsigned long long num_threads() { + return details::grid::num_threads(); + } + + _CG_STATIC_QUALIFIER dim3 dim_blocks() { + return details::grid::dim_blocks(); + } + + _CG_STATIC_QUALIFIER unsigned long long num_blocks() { + return details::grid::num_blocks(); + } + + _CG_STATIC_QUALIFIER dim3 block_index() { + return details::grid::block_index(); + } + + _CG_STATIC_QUALIFIER unsigned long long block_rank() { + return details::grid::block_rank(); + } + +# if defined(_CG_HAS_CLUSTER_GROUP) + _CG_STATIC_QUALIFIER dim3 dim_clusters() { + return details::grid::dim_clusters(); + } + + _CG_STATIC_QUALIFIER unsigned long long num_clusters() { + return details::grid::num_clusters(); + } + + _CG_STATIC_QUALIFIER dim3 cluster_index() { + return details::grid::cluster_index(); + } + + _CG_STATIC_QUALIFIER unsigned long long cluster_rank() { + return details::grid::cluster_rank(); + } +# endif +}; + +_CG_QUALIFIER grid_group this_grid() { + // Load a workspace from the driver + grid_group gg(details::get_grid_workspace()); +#ifdef _CG_DEBUG + // *all* threads must be available to synchronize + gg.sync(); +#endif // _CG_DEBUG + return gg; +} + +#if defined(_CG_HAS_CLUSTER_GROUP) +/** + * class cluster_group + * + * Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly + * divided along all dimensions to form groups of blocks, each group of which is + * a block cluster. Clustered grids are subject to various restrictions and + * limitations. Primarily, a cluster consists of at most 8 blocks by default + * (although the user is allowed to opt-in to non-standard sizes,) and clustered + * grids are subject to additional occupancy limitations due to per-cluster + * hardware resource consumption. In exchange, a block cluster is guaranteed to + * be a cooperative group, with access to all cooperative group capabilities, as + * well as cluster specific capabilities and accelerations. A cluster_group + * represents a block cluster. 
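+ *
+ * A minimal usage sketch (illustrative only, assuming a cluster launch on hardware
+ * that supports thread block clusters and namespace cg = cooperative_groups; the
+ * kernel name is made up):
+ *
+ *   __global__ void cluster_kernel() {
+ *       cg::cluster_group cluster = cg::this_cluster();
+ *       unsigned int block_in_cluster = cluster.block_rank(); // rank of this block within its cluster
+ *       cluster.sync();                                       // barrier across all blocks of the cluster
+ *   }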
+ * + * Constructed via this_cluster_group(); + */ +class cluster_group : public thread_group_base<details::cluster_group_id> +{ + // Friends + friend _CG_QUALIFIER cluster_group this_cluster(); + + // Disable constructor + _CG_QUALIFIER cluster_group() + { + } + + public: + //_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster) + + // Functionality exposed by the group + _CG_STATIC_QUALIFIER void sync() + { + return details::cluster::sync(); + } + + _CG_STATIC_QUALIFIER void barrier_arrive() + { + return details::cluster::barrier_arrive(); + } + + _CG_STATIC_QUALIFIER void barrier_wait() + { + return details::cluster::barrier_wait(); + } + + _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr) + { + return details::cluster::query_shared_rank(addr); + } + + template <typename T> + _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank) + { + return details::cluster::map_shared_rank(addr, rank); + } + + _CG_STATIC_QUALIFIER dim3 block_index() + { + return details::cluster::block_index(); + } + + _CG_STATIC_QUALIFIER unsigned int block_rank() + { + return details::cluster::block_rank(); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank() + { + return details::cluster::thread_rank(); + } + + _CG_STATIC_QUALIFIER dim3 dim_blocks() + { + return details::cluster::dim_blocks(); + } + + _CG_STATIC_QUALIFIER unsigned int num_blocks() + { + return details::cluster::num_blocks(); + } + + _CG_STATIC_QUALIFIER dim3 dim_threads() + { + return details::cluster::dim_threads(); + } + + _CG_STATIC_QUALIFIER unsigned int num_threads() + { + return details::cluster::num_threads(); + } + + // Legacy aliases + _CG_STATIC_QUALIFIER unsigned int size() + { + return num_threads(); + } +}; + +/* + * cluster_group this_cluster() + * + * Constructs a cluster_group + */ +_CG_QUALIFIER cluster_group this_cluster() +{ + cluster_group cg; +#ifdef _CG_DEBUG + cg.sync(); +#endif + return cg; +} +#endif + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) +namespace details { + + _CG_CONSTEXPR_QUALIFIER unsigned int scratch_sync_memory_size(unsigned int max_block_size) { + // One barrier per possible size of the group rounded up to multiple of 4. + return 8 * sizeof(details::barrier_t); + } + + _CG_CONSTEXPR_QUALIFIER unsigned int scratch_collectives_memory_size(unsigned int communication_size, unsigned int max_block_size) { + // One slot of collectives memory per warp. + return max_block_size / 32 * communication_size; + } + + _CG_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int communication_size, unsigned int max_block_size) { + return scratch_sync_memory_size(max_block_size) + scratch_collectives_memory_size(communication_size, max_block_size); + } + + _CG_CONSTEXPR_QUALIFIER size_t scratch_alignment(unsigned int communication_size) { + return ((communication_size & (communication_size - 1) == 0) && communication_size > 8) ? 
+ communication_size : 8; + } + + _CG_CONST_DECL unsigned int default_tile_communication_size = 8; + _CG_CONST_DECL unsigned int default_max_block_size = 1024; + + struct multi_warp_scratch { + char memory[1]; + }; +} + +class thread_block; +namespace experimental { + template <unsigned int TileCommunicationSize = details::default_tile_communication_size, + unsigned int MaxBlockSize = details::default_max_block_size> + struct __align__(details::scratch_alignment(TileCommunicationSize)) block_tile_memory { + private: + char scratch[details::scratch_size_needed(TileCommunicationSize, MaxBlockSize)]; + + public: + _CG_QUALIFIER void* get_memory() { + return static_cast<void*>(scratch); + } + + _CG_STATIC_QUALIFIER unsigned int get_size() { + return details::scratch_size_needed(TileCommunicationSize, MaxBlockSize); + } + }; + + template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize> + _CG_QUALIFIER thread_block this_thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch); +} +#endif + +/** + * class thread_block + * + * Every GPU kernel is executed by a grid of thread blocks, and threads within + * each block are guaranteed to reside on the same streaming multiprocessor. + * A thread_block represents a thread block whose dimensions are not known until runtime. + * + * Constructed via this_thread_block(); + */ +class thread_block : public thread_group_base<details::thread_block_id> +{ + // Friends + friend _CG_QUALIFIER thread_block this_thread_block(); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz); + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize> + friend _CG_QUALIFIER thread_block experimental::this_thread_block( + experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch); + + const unsigned short communication_size; + const unsigned short max_block_size; + details::multi_warp_scratch* const tile_memory; + + template <unsigned int Size> + friend class __static_size_multi_warp_tile_base; + + template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize> + _CG_QUALIFIER thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch) : + tile_memory(reinterpret_cast<details::multi_warp_scratch*>(&scratch)), + communication_size(TileCommunicationSize), max_block_size(MaxBlockSize) { + if (thread_rank() < details::scratch_sync_memory_size(MaxBlockSize) / sizeof(details::barrier_t)) { + details::barrier_t* barriers = reinterpret_cast<details::barrier_t*>(&tile_memory->memory); + barriers[thread_rank()] = 0; + } + sync(); + } +#endif + + // Disable constructor + _CG_QUALIFIER thread_block() +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + : tile_memory(NULL), communication_size(0), max_block_size(0) +#endif + { } + + // Internal Use + _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const { + const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0); + + // Invalid, immediately fail + if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) { + details::abort(); + return (thread_block()); + } + + unsigned int mask; + unsigned int base_offset = thread_rank() & (~(tilesz - 1)); + unsigned int masklength = min((unsigned int)size() - base_offset, tilesz); + + mask = (unsigned int)(-1) >> (32 - masklength); + mask <<= 
(details::laneid() & ~(tilesz - 1)); + thread_group tile = thread_group(details::coalesced_group_id); + tile._data.coalesced.mask = mask; + tile._data.coalesced.size = __popc(mask); + tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz; + tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz; + tile._data.coalesced.is_tiled = true; + return (tile); + } + + public: + _CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id; + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block) + + _CG_STATIC_QUALIFIER void sync() { + details::cta::sync(); + } + + _CG_STATIC_QUALIFIER unsigned int size() { + return details::cta::size(); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank() { + return details::cta::thread_rank(); + } + + // Additional functionality exposed by the group + _CG_STATIC_QUALIFIER dim3 group_index() { + return details::cta::group_index(); + } + + _CG_STATIC_QUALIFIER dim3 thread_index() { + return details::cta::thread_index(); + } + + _CG_STATIC_QUALIFIER dim3 group_dim() { + return details::cta::block_dim(); + } + + _CG_STATIC_QUALIFIER dim3 dim_threads() { + return details::cta::dim_threads(); + } + + _CG_STATIC_QUALIFIER unsigned int num_threads() { + return details::cta::num_threads(); + } + +}; + +/** + * thread_block this_thread_block() + * + * Constructs a thread_block group + */ +_CG_QUALIFIER thread_block this_thread_block() +{ + return (thread_block()); +} + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) +namespace experimental { + template <unsigned int TileCommunicationSize, unsigned int MaxBlockSize> + _CG_QUALIFIER thread_block this_thread_block(experimental::block_tile_memory<TileCommunicationSize, MaxBlockSize>& scratch) { + return (thread_block(scratch)); + } +} +#endif + +/** + * class coalesced_group + * + * A group representing the current set of converged threads in a warp. + * The size of the group is not guaranteed and it may return a group of + * only one thread (itself). + * + * This group exposes warp-synchronous builtins. 
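+ *
+ * A minimal usage sketch (illustrative only, assuming namespace cg = cooperative_groups;
+ * the kernel and buffer names are made up). A classic use is issuing one atomic per
+ * set of converged threads:
+ *
+ *   __global__ void compact_nonzero(const int *in, int *out, int *count) {
+ *       if (in[threadIdx.x] != 0) {                             // threads may diverge here
+ *           cg::coalesced_group active = cg::coalesced_threads();
+ *           int slot = 0;
+ *           if (active.thread_rank() == 0)
+ *               slot = atomicAdd(count, (int)active.size());    // one atomic per coalesced set
+ *           slot = active.shfl(slot, 0) + active.thread_rank(); // each active thread gets its own slot
+ *           out[slot] = in[threadIdx.x];
+ *       }
+ *   }
+ *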
+ * Constructed via coalesced_threads(); + */ +class coalesced_group : public thread_group_base<details::coalesced_group_id> +{ +private: + friend _CG_QUALIFIER coalesced_group coalesced_threads(); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz); + friend class details::_coalesced_group_data_access; + + _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const { + unsigned int member_pack = 0; + unsigned int member_rank = 0; + for (int bit_idx = 0; bit_idx < 32; bit_idx++) { + unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx); + if (lane_bit) { + if (laneMask & lane_bit) + member_pack |= 1 << member_rank; + member_rank++; + } + } + return (member_pack); + } + + // Internal Use + _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const { + const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0); + + // Invalid, immediately fail + if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) { + details::abort(); + return (coalesced_group(0)); + } + if (size() <= tilesz) { + return (*this); + } + + if ((_data.coalesced.is_tiled == true) && pow2_tilesz) { + unsigned int base_offset = (thread_rank() & (~(tilesz - 1))); + unsigned int masklength = min((unsigned int)size() - base_offset, tilesz); + unsigned int mask = (unsigned int)(-1) >> (32 - masklength); + + mask <<= (details::laneid() & ~(tilesz - 1)); + coalesced_group coalesced_tile = coalesced_group(mask); + coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz; + coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz; + coalesced_tile._data.coalesced.is_tiled = true; + return (coalesced_tile); + } + else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) { + unsigned int mask = 0; + unsigned int member_rank = 0; + int seen_lanes = (thread_rank() / tilesz) * tilesz; + for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) { + unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx); + if (lane_bit) { + if (seen_lanes <= 0 && member_rank < tilesz) { + mask |= lane_bit; + member_rank++; + } + seen_lanes--; + } + } + coalesced_group coalesced_tile = coalesced_group(mask); + // Override parent with the size of this group + coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz; + coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz; + return coalesced_tile; + } + else { + // None in _CG_VERSION 1000 + details::abort(); + } + + return (coalesced_group(0)); + } + + protected: + _CG_QUALIFIER coalesced_group(unsigned int mask) { + _data.coalesced.mask = mask; + _data.coalesced.size = __popc(mask); + _data.coalesced.metaGroupRank = 0; + _data.coalesced.metaGroupSize = 1; + _data.coalesced.is_tiled = false; + } + + _CG_QUALIFIER unsigned int get_mask() const { + return (_data.coalesced.mask); + } + + public: + _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id; + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block) + + _CG_QUALIFIER unsigned int num_threads() const { + return _data.coalesced.size; + } + + _CG_QUALIFIER unsigned int size() const { + return num_threads(); + } + + _CG_QUALIFIER unsigned int thread_rank() const { + return (__popc(_data.coalesced.mask & details::lanemask32_lt())); + } + + // Rank of this group in the upper level of the hierarchy + _CG_QUALIFIER unsigned int meta_group_rank() const { + return _data.coalesced.metaGroupRank; + } + + // 
Total num partitions created out of all CTAs when the group was created + _CG_QUALIFIER unsigned int meta_group_size() const { + return _data.coalesced.metaGroupSize; + } + + _CG_QUALIFIER void sync() const { + __syncwarp(_data.coalesced.mask); + } + +#ifdef _CG_CPP11_FEATURES + template <typename TyElem, typename TyRet = details::remove_qual<TyElem>> + _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const { + unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 : + (size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1)); + + return details::tile::shuffle_dispatch<TyElem>::shfl( + _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32); + } + + template <typename TyElem, typename TyRet = details::remove_qual<TyElem>> + _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const { + if (size() == 32) { + return details::tile::shuffle_dispatch<TyElem>::shfl_down( + _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32); + } + + unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1); + + if (lane >= 32) + lane = details::laneid(); + + return details::tile::shuffle_dispatch<TyElem>::shfl( + _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32); + } + + template <typename TyElem, typename TyRet = details::remove_qual<TyElem>> + _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const { + if (size() == 32) { + return details::tile::shuffle_dispatch<TyElem>::shfl_up( + _CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32); + } + + unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1)); + if (lane >= 32) + lane = details::laneid(); + + return details::tile::shuffle_dispatch<TyElem>::shfl( + _CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32); + } +#else + template <typename TyIntegral> + _CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const { + details::assert_if_not_arithmetic<TyIntegral>(); + unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 : + (size() == 32) ? 
src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1)); + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); + } + + template <typename TyIntegral> + _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const { + details::assert_if_not_arithmetic<TyIntegral>(); + if (size() == 32) { + return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32)); + } + unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1)); + if (lane >= 32) lane = details::laneid(); + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); + } + + template <typename TyIntegral> + _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const { + details::assert_if_not_arithmetic<TyIntegral>(); + if (size() == 32) { + return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32)); + } + unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1); + if (lane >= 32) lane = details::laneid(); + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); + } +#endif + + _CG_QUALIFIER int any(int predicate) const { + return (__ballot_sync(_data.coalesced.mask, predicate) != 0); + } + _CG_QUALIFIER int all(int predicate) const { + return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask); + } + _CG_QUALIFIER unsigned int ballot(int predicate) const { + if (size() == 32) { + return (__ballot_sync(0xFFFFFFFF, predicate)); + } + unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate); + return (_packLanes(lane_ballot)); + } + +#ifdef _CG_HAS_MATCH_COLLECTIVE + + template <typename TyIntegral> + _CG_QUALIFIER unsigned int match_any(TyIntegral val) const { + details::assert_if_not_arithmetic<TyIntegral>(); + if (size() == 32) { + return (__match_any_sync(0xFFFFFFFF, val)); + } + unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val); + return (_packLanes(lane_match)); + } + + template <typename TyIntegral> + _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const { + details::assert_if_not_arithmetic<TyIntegral>(); + if (size() == 32) { + return (__match_all_sync(0xFFFFFFFF, val, &pred)); + } + unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred); + return (_packLanes(lane_match)); + } + +#endif /* !_CG_HAS_MATCH_COLLECTIVE */ + +}; + +_CG_QUALIFIER coalesced_group coalesced_threads() +{ + return (coalesced_group(__activemask())); +} + +namespace details { + template <unsigned int Size> struct verify_thread_block_tile_size; + template <> struct verify_thread_block_tile_size<32> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<16> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<8> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<4> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<2> { typedef void OK; }; + template <> struct verify_thread_block_tile_size<1> { typedef void OK; }; + +#ifdef _CG_CPP11_FEATURES + template <unsigned int Size> + using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>; + + template <unsigned int Size> + using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>; + template <unsigned int Size> + using _is_multi_warp = + _CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>; + + template <unsigned int Size> + using _is_valid_single_warp_tile = + _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>; + template <unsigned int Size> + using 
_is_valid_multi_warp_tile = + _CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>; +#else + template <unsigned int Size> + struct _is_multi_warp { + static const bool value = false; + }; +#endif +} + +template <unsigned int Size> +class __static_size_tile_base +{ +protected: + _CG_STATIC_CONST_DECL unsigned int numThreads = Size; + +public: + _CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block) + + // Rank of thread within tile + _CG_STATIC_QUALIFIER unsigned int thread_rank() { + return (details::cta::thread_rank() & (numThreads - 1)); + } + + // Number of threads within tile + _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() { + return numThreads; + } + + _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() { + return num_threads(); + } +}; + +template <unsigned int Size> +class __static_size_thread_block_tile_base : public __static_size_tile_base<Size> +{ + friend class details::_coalesced_group_data_access; + typedef details::tile::tile_helpers<Size> th; + +#ifdef _CG_CPP11_FEATURES + static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32"); +#else + typedef typename details::verify_thread_block_tile_size<Size>::OK valid; +#endif + using __static_size_tile_base<Size>::numThreads; + _CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF; + + protected: + _CG_STATIC_QUALIFIER unsigned int build_mask() { + unsigned int mask = fullMask; + if (numThreads != 32) { + // [0,31] representing the current active thread in the warp + unsigned int laneId = details::laneid(); + // shift mask according to the partition it belongs to + mask = th::tileMask << (laneId & ~(th::laneMask)); + } + return (mask); + } + +public: + _CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id; + + _CG_STATIC_QUALIFIER void sync() { + __syncwarp(build_mask()); + } + +#ifdef _CG_CPP11_FEATURES + // PTX supported collectives + template <typename TyElem, typename TyRet = details::remove_qual<TyElem>> + _CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const { + return details::tile::shuffle_dispatch<TyElem>::shfl( + _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads); + } + + template <typename TyElem, typename TyRet = details::remove_qual<TyElem>> + _CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const { + return details::tile::shuffle_dispatch<TyElem>::shfl_down( + _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads); + } + + template <typename TyElem, typename TyRet = details::remove_qual<TyElem>> + _CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const { + return details::tile::shuffle_dispatch<TyElem>::shfl_up( + _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads); + } + + template <typename TyElem, typename TyRet = details::remove_qual<TyElem>> + _CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const { + return details::tile::shuffle_dispatch<TyElem>::shfl_xor( + _CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads); + } +#else + template <typename TyIntegral> + _CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const { + details::assert_if_not_arithmetic<TyIntegral>(); + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + + template <typename TyIntegral> + _CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const { + details::assert_if_not_arithmetic<TyIntegral>(); + return 
(__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + + template <typename TyIntegral> + _CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const { + details::assert_if_not_arithmetic<TyIntegral>(); + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + + template <typename TyIntegral> + _CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const { + details::assert_if_not_arithmetic<TyIntegral>(); + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } +#endif //_CG_CPP11_FEATURES + + _CG_QUALIFIER int any(int predicate) const { + unsigned int lane_ballot = __ballot_sync(build_mask(), predicate); + return (lane_ballot != 0); + } + _CG_QUALIFIER int all(int predicate) const { + unsigned int lane_ballot = __ballot_sync(build_mask(), predicate); + return (lane_ballot == build_mask()); + } + _CG_QUALIFIER unsigned int ballot(int predicate) const { + unsigned int lane_ballot = __ballot_sync(build_mask(), predicate); + return (lane_ballot >> (details::laneid() & (~(th::laneMask)))); + } + +#ifdef _CG_HAS_MATCH_COLLECTIVE + template <typename TyIntegral> + _CG_QUALIFIER unsigned int match_any(TyIntegral val) const { + details::assert_if_not_arithmetic<TyIntegral>(); + unsigned int lane_match = __match_any_sync(build_mask(), val); + return (lane_match >> (details::laneid() & (~(th::laneMask)))); + } + + template <typename TyIntegral> + _CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const { + details::assert_if_not_arithmetic<TyIntegral>(); + unsigned int lane_match = __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (details::laneid() & (~(th::laneMask)))); + } +#endif + +}; + +template <unsigned int Size, typename ParentT> +class __static_parent_thread_block_tile_base +{ +public: + // Rank of this group in the upper level of the hierarchy + _CG_STATIC_QUALIFIER unsigned int meta_group_rank() { + return ParentT::thread_rank() / Size; + } + + // Total num partitions created out of all CTAs when the group was created + _CG_STATIC_QUALIFIER unsigned int meta_group_size() { + return (ParentT::size() + Size - 1) / Size; + } +}; + +/** + * class thread_block_tile<unsigned int Size, ParentT = void> + * + * Statically-sized group type, representing one tile of a thread block. + * The only specializations currently supported are those with native + * hardware support (1/2/4/8/16/32) + * + * This group exposes warp-synchronous builtins. 
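+ *
+ * For illustration only (a hedged sketch, assuming namespace cg = cooperative_groups,
+ * a made-up kernel name, and a block size that is a multiple of 16):
+ *
+ *   __global__ void tile_sum(const int *in, int *out) {
+ *       cg::thread_block block = cg::this_thread_block();
+ *       auto tile = cg::tiled_partition<16>(block);      // thread_block_tile<16, thread_block>
+ *       int v = in[block.thread_rank()];
+ *       for (unsigned int offset = tile.size() / 2; offset > 0; offset /= 2)
+ *           v += tile.shfl_xor(v, offset);               // butterfly reduction within the tile
+ *       if (tile.thread_rank() == 0)
+ *           out[tile.meta_group_rank()] = v;             // one partial sum per tile (single block assumed)
+ *   }
+ *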
+ * Can only be constructed via tiled_partition<Size>(ParentT&) + */ + +template <unsigned int Size, typename ParentT = void> +class __single_warp_thread_block_tile : + public __static_size_thread_block_tile_base<Size>, + public __static_parent_thread_block_tile_base<Size, ParentT> +{ + typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT; + friend class details::_coalesced_group_data_access; + +protected: + _CG_QUALIFIER __single_warp_thread_block_tile() { }; + _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { }; + + _CG_STATIC_QUALIFIER unsigned int get_mask() { + return __static_size_thread_block_tile_base<Size>::build_mask(); + } +}; + +template <unsigned int Size> +class __single_warp_thread_block_tile<Size, void> : + public __static_size_thread_block_tile_base<Size>, + public thread_group_base<details::coalesced_group_id> +{ + _CG_STATIC_CONST_DECL unsigned int numThreads = Size; + + template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile; + friend class details::_coalesced_group_data_access; + + typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT; + +protected: + _CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank, unsigned int meta_group_size) { + _data.coalesced.mask = staticSizeBaseT::build_mask(); + _data.coalesced.size = numThreads; + _data.coalesced.metaGroupRank = meta_group_rank; + _data.coalesced.metaGroupSize = meta_group_size; + _data.coalesced.is_tiled = true; + } + + _CG_QUALIFIER unsigned int get_mask() const { + return (_data.coalesced.mask); + } + +public: + using staticSizeBaseT::sync; + using staticSizeBaseT::size; + using staticSizeBaseT::num_threads; + using staticSizeBaseT::thread_rank; + + _CG_QUALIFIER unsigned int meta_group_rank() const { + return _data.coalesced.metaGroupRank; + } + + _CG_QUALIFIER unsigned int meta_group_size() const { + return _data.coalesced.metaGroupSize; + } +}; + +/** + * Outer level API calls + * void sync(GroupT) - see <group_type>.sync() + * void thread_rank(GroupT) - see <group_type>.thread_rank() + * void group_size(GroupT) - see <group_type>.size() + */ +template <class GroupT> +_CG_QUALIFIER void sync(GroupT const &g) +{ + g.sync(); +} + +// TODO: Use a static dispatch to determine appropriate return type +// C++03 is stuck with unsigned long long for now +#ifdef _CG_CPP11_FEATURES +template <class GroupT> +_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) { + return g.thread_rank(); +} + + +template <class GroupT> +_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) { + return g.num_threads(); +} +#else +template <class GroupT> +_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) { + return static_cast<unsigned long long>(g.thread_rank()); +} + + +template <class GroupT> +_CG_QUALIFIER unsigned long long group_size(GroupT const &g) { + return static_cast<unsigned long long>(g.num_threads()); +} +#endif + + +/** + * tiled_partition + * + * The tiled_partition(parent, tilesz) method is a collective operation that + * partitions the parent group into a one-dimensional, row-major, tiling of subgroups. + * + * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will + * be created where threads having identical k = (thread_rank(parent)/tilesz) + * will be members of the same subgroup. 
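+ * For example (illustration only): with size(parent) = 40 and tilesz = 16,
+ * (40 + 16 - 1) / 16 = 3 subgroups are created; threads with thread_rank(parent)
+ * 0-15 share k = 0, 16-31 share k = 1, and 32-39 form a partial subgroup with k = 2.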
+ * + * The implementation may cause the calling thread to wait until all the members + * of the parent group have invoked the operation before resuming execution. + * + * Functionality is limited to power-of-two sized subgorup instances of at most + * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be + * tiled_partition() in _CG_VERSION 1000. + */ +_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz) +{ + if (parent.get_type() == details::coalesced_group_id) { + const coalesced_group *_cg = static_cast<const coalesced_group*>(&parent); + return _cg->_get_tiled_threads(tilesz); + } + else { + const thread_block *_tb = static_cast<const thread_block*>(&parent); + return _tb->_get_tiled_threads(tilesz); + } +} + +// Thread block type overload: returns a basic thread_group for now (may be specialized later) +_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz) +{ + return (parent._get_tiled_threads(tilesz)); +} + +// Coalesced group type overload: retains its ability to stay coalesced +_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz) +{ + return (parent._get_tiled_threads(tilesz)); +} + +namespace details { + template <unsigned int Size, typename ParentT> + class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {}; + + template <unsigned int Size, typename ParentT> + _CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() { + return internal_thread_block_tile<Size, ParentT>(); + } + + template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda> + _CG_QUALIFIER TyVal multi_warp_collectives_helper( + const GroupT& group, + WarpLambda warp_lambda, + InterWarpLambda inter_warp_lambda) { + return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda); + } + + template <typename T, typename GroupT> + _CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) { + return group.template get_scratch_location<T>(warp_id); + } + + template <typename GroupT> + _CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) { + return group.get_sync_location(); + } + +} +/** + * tiled_partition<tilesz> + * + * The tiled_partition<tilesz>(parent) method is a collective operation that + * partitions the parent group into a one-dimensional, row-major, tiling of subgroups. + * + * A total of ((size(parent)/tilesz) subgroups will be created, + * therefore the parent group size must be evenly divisible by the tilesz. + * The allow parent groups are thread_block or thread_block_tile<size>. + * + * The implementation may cause the calling thread to wait until all the members + * of the parent group have invoked the operation before resuming execution. + * + * Functionality is limited to native hardware sizes, 1/2/4/8/16/32. + * The size(parent) must be greater than the template Size parameter + * otherwise the results are undefined. 
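+ *
+ * A minimal sketch (illustrative only, assuming namespace cg = cooperative_groups and
+ * a block of more than 32 threads whose size is a multiple of 32):
+ *
+ *   cg::thread_block block = cg::this_thread_block();
+ *   auto warp = cg::tiled_partition<32>(block);      // one tile per warp
+ *   auto quad = cg::tiled_partition<4>(warp);        // a tile can itself be partitioned
+ *   unsigned int q = quad.meta_group_rank();         // index of this 4-thread tile within its warp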
+ */ + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) +template <unsigned int Size> +class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size> +{ + static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be one of 64/128/256/512"); + + template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda> + friend TyVal details::multi_warp_collectives_helper( + const GroupT& group, + WarpLambda warp_lambda, + InterWarpLambda inter_warp_lambda); + template <typename T, typename GroupT> + friend T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id); + template <typename GroupT> + friend details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group); + template <unsigned int OtherSize> + friend class __static_size_multi_warp_tile_base; + using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>; + using ThisType = __static_size_multi_warp_tile_base<Size>; + _CG_STATIC_CONST_DECL int numWarps = Size / 32; + const unsigned short communication_size; + const unsigned short max_block_size; + +protected: + details::multi_warp_scratch* const tile_memory; + + template <typename GroupT> + _CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) : + tile_memory(g.tile_memory), communication_size(g.communication_size), max_block_size(g.max_block_size) {} + + +private: + _CG_QUALIFIER details::barrier_t* get_sync_location() const { + // Different group sizes use different barriers, all groups of a given size share one barrier. + unsigned int sync_id = details::log2(Size / 64); + return &(reinterpret_cast<details::barrier_t*>(tile_memory->memory)[sync_id]); + } + + template <typename T> + _CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const { + unsigned int sync_mem_size = details::scratch_sync_memory_size(max_block_size); + unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id; + return reinterpret_cast<T*>(&tile_memory->memory[sync_mem_size + scratch_id * communication_size]); + } + + template <typename T> + _CG_QUALIFIER T* get_scratch_location() const { + unsigned int sync_mem_size = details::scratch_sync_memory_size(max_block_size); + unsigned int scratch_id = details::cta::thread_rank() / 32; + return reinterpret_cast<T*>(&tile_memory->memory[sync_mem_size + scratch_id * communication_size]); + } + + template <typename TyVal> + _CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const { + unsigned int src_warp = src / 32; + auto warp = details::tiled_partition_internal<32, ThisType>(); + details::barrier_t* sync_location = get_sync_location(); + + // Get warp slot of the source threads warp. + TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp); + + if (warp.meta_group_rank() == src_warp) { + // Put shuffled value into my warp slot and let my warp arrive at the barrier. + if (thread_rank() == src) { + *warp_scratch_location = val; + } + details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps); + TyVal result = *warp_scratch_location; + details::sync_warps_wait(sync_location, details::cta::thread_rank()); + return result; + } + else { + // Wait for the source warp to arrive on the barrier. 
+ details::sync_warps_wait_for_warps<details::wait_for_specific_warp>( + (details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp), + sync_location, details::cta::thread_rank(), + numWarps); + TyVal result = *warp_scratch_location; + details::sync_warps(sync_location, details::cta::thread_rank(), numWarps); + return result; + } + } + + template <typename TyVal> + _CG_QUALIFIER TyVal shfl_iterative_impl(TyVal val, unsigned int src) const { + auto warp = details::tiled_partition_internal<32, ThisType>(); + + details::copy_channel<numWarps> broadcast_channel{ + get_scratch_location<char>(0), + get_sync_location(), + (size_t) communication_size * numWarps}; + + if (warp.meta_group_rank() == src / 32) { + val = warp.shfl(val, src); + broadcast_channel.template send_value< + TyVal, 32, decltype(broadcast_channel)::send_many_to_many>( + val, warp.thread_rank(), details::cta::thread_rank() / 32); + } + else { + broadcast_channel.template receive_value<TyVal>(val, warp.thread_rank() == 0); + } + sync(); + return val; + } + + template <typename TyVal, typename WarpLambda, typename InterWarpLambda> + _CG_QUALIFIER TyVal collectives_scheme_impl(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const { + auto warp = details::tiled_partition_internal<32, ThisType>(); + details::barrier_t* sync_location = get_sync_location(); + TyVal* warp_scratch_location = get_scratch_location<TyVal>(); + + warp_lambda(warp, warp_scratch_location); + + if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) { + auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>(); + if (subwarp.meta_group_rank() == 0) { + TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank()); + inter_warp_lambda(subwarp, thread_scratch_location); + } + warp.sync(); + details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps); + } + TyVal result = *warp_scratch_location; + warp.sync(); // Added warpsync, if all collectives do sync before writing to reduce_location (they does right now), + // we could delete it. + return result; + } + + template <typename TyVal, typename WarpLambda, typename InterWarpLambda> + _CG_QUALIFIER TyVal collectives_scheme_iterative_impl( + const WarpLambda& warp_lambda, + const InterWarpLambda& inter_warp_lambda) const { + auto warp = details::tiled_partition_internal<32, ThisType>(); + details::barrier_t* sync_location = get_sync_location(); + details::copy_channel<numWarps> final_result_channel{ + get_scratch_location<char>(0), + sync_location, + (size_t) communication_size * numWarps}; + + TyVal warp_result; + warp_lambda(warp, &warp_result); + + if (warp.meta_group_rank() == 0) { + auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>(); + details::copy_channel<numWarps> partial_results_channel{ + get_scratch_location<char>(subwarp.thread_rank()), + sync_location, + (size_t) communication_size}; + + // Thread 0 in subwarp set as inactive to not overwrite warp 0 warp_result. 
+ partial_results_channel.template receive_value<TyVal>( + warp_result, + warp.thread_rank() == 0, + subwarp.thread_rank() != 0 && subwarp.meta_group_rank() == 0); + if (subwarp.meta_group_rank() == 0) { + inter_warp_lambda(subwarp, &warp_result); + } + warp_result = warp.shfl(warp_result, 0); + final_result_channel.template send_value<TyVal, 32, decltype(final_result_channel)::send_many_to_many>( + warp_result, + warp.thread_rank(), + details::cta::thread_rank() / 32); + } + else { + details::copy_channel<numWarps> partial_results_channel{get_scratch_location<char>(), sync_location, (size_t) communication_size}; + partial_results_channel.template send_value<TyVal, 32, decltype(partial_results_channel)::send_many_to_one>( + warp_result, + warp.thread_rank(), + (details::cta::thread_rank() - thread_rank()) / 32); + final_result_channel.template receive_value<TyVal>(warp_result, warp.thread_rank() == 0); + } + sync(); + return warp_result; + } + + template <typename TyVal, typename WarpLambda, typename InterWarpLambda> + _CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const { + if (sizeof(TyVal) > communication_size) { + return collectives_scheme_iterative_impl<TyVal, WarpLambda, InterWarpLambda>(warp_lambda, inter_warp_lambda); + } + else { + return collectives_scheme_impl<TyVal, WarpLambda, InterWarpLambda>(warp_lambda, inter_warp_lambda); + } + } + +public: + _CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id; + + using __static_size_tile_base<Size>::thread_rank; + + template <typename TyVal> + _CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const { + if (sizeof(TyVal) > communication_size) { + return shfl_iterative_impl(val, src); + } + else { + return shfl_impl(val, src); + } + } + + _CG_QUALIFIER void sync() const { + details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps); + } + + _CG_QUALIFIER int any(int predicate) const { + auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) { + *warp_scratch_location = __any_sync(0xFFFFFFFF, predicate); + }; + auto inter_warp_lambda = + [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) { + *thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location); + }; + return collectives_scheme<int>(warp_lambda, inter_warp_lambda); + } + + _CG_QUALIFIER int all(int predicate) const { + auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) { + *warp_scratch_location = __all_sync(0xFFFFFFFF, predicate); + }; + auto inter_warp_lambda = + [] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) { + *thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location); + }; + return collectives_scheme<int>(warp_lambda, inter_warp_lambda); + } +}; + + +template <unsigned int Size, typename ParentT = void> +class __multi_warp_thread_block_tile : + public __static_size_multi_warp_tile_base<Size>, + public __static_parent_thread_block_tile_base<Size, ParentT> +{ + typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT; + typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT; +protected: + _CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) : + __static_size_multi_warp_tile_base<Size>(g) {} +}; + +template <unsigned int Size> +class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size> 
+{ + const unsigned int metaGroupRank; + const unsigned int metaGroupSize; + +protected: + template <unsigned int OtherSize, typename ParentT> + _CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) : + __static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {} + +public: + _CG_QUALIFIER unsigned int meta_group_rank() const { + return metaGroupRank; + } + + _CG_QUALIFIER unsigned int meta_group_size() const { + return metaGroupSize; + } +}; +#endif + +template <unsigned int Size, typename ParentT = void> +class thread_block_tile; + +namespace details { + template <unsigned int Size, typename ParentT, bool IsMultiWarp> + class thread_block_tile_impl; + + template <unsigned int Size, typename ParentT> + class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT> + { + protected: + template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp> + _CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) : + __single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {} + + _CG_QUALIFIER thread_block_tile_impl(const thread_block& g) : + __single_warp_thread_block_tile<Size, ParentT>() {} + }; + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + template <unsigned int Size, typename ParentT> + class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT> + { + protected: + template <typename GroupT> + _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) : + __multi_warp_thread_block_tile<Size, ParentT>(g) {} + }; +#else + template <unsigned int Size, typename ParentT> + class thread_block_tile_impl<Size, ParentT, true> + { + protected: + template <typename GroupT> + _CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {} + }; +#endif +} + +template <unsigned int Size, typename ParentT> +class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value> +{ + friend _CG_QUALIFIER thread_block_tile<1, void> this_thread(); + +protected: + _CG_QUALIFIER thread_block_tile(const ParentT& g) : + details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {} + +public: + _CG_QUALIFIER operator thread_block_tile<Size, void>() const { + return thread_block_tile<Size, void>(*this); + } +}; + +template <unsigned int Size> +class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value> +{ + template <unsigned int, typename ParentT> + friend class thread_block_tile; + +protected: + template <unsigned int OtherSize, typename OtherParentT> + _CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) : + details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {} + +public: + template <typename ParentT> + _CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) : + details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {} +}; + +namespace details { + template <unsigned int Size, typename ParentT> + struct tiled_partition_impl; + + template <unsigned int Size> + struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> { + _CG_QUALIFIER tiled_partition_impl(const thread_block& g) : + thread_block_tile<Size, 
thread_block>(g) {} + }; + + // ParentT = static thread_block_tile<ParentSize, GrandParent> specialization + template <unsigned int Size, unsigned int ParentSize, typename GrandParent> + struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > : + public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > { +#ifdef _CG_CPP11_FEATURES + static_assert(Size < ParentSize, "Tile size bigger or equal to the parent group size"); +#endif + _CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) : + thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {} + }; + +} + +namespace experimental { + template <unsigned int Size, typename ParentT> + _CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g) + { +#if defined(_CG_CPP11_FEATURES) && !defined(_CG_ABI_EXPERIMENTAL) + static_assert(details::_is_single_warp<Size>::value, "_CG_ABI_EXPERIMENTAL needs to be defined" + " before cooperative_groups header is included to enable experimental features"); +#endif + return details::tiled_partition_impl<Size, ParentT>(g); + } + +} + +template <unsigned int Size, typename ParentT> +_CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g) +{ +#ifdef _CG_CPP11_FEATURES + static_assert(details::_is_single_warp<Size>::value, "Tiled partition with Size > 32 is supported only by" + " cooperative_groups::experimental::tiled_partition available with experimental features enabled"); +#endif + return details::tiled_partition_impl<Size, ParentT>(g); +} + +/** + * thread_group this_thread() + * + * Constructs a generic thread_group containing only the calling thread + */ +_CG_QUALIFIER thread_block_tile<1, void> this_thread() +{ + // Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its + // meta group rank and size set to 0 and 1 respectively. + return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block()); +} + +/** + * <group_type>.sync() + * + * Executes a barrier across the group + * + * Implements both a compiler fence and an architectural fence to prevent, + * memory reordering around the barrier. + */ +_CG_QUALIFIER void thread_group::sync() const +{ + switch (_data.group.type) { + case details::coalesced_group_id: + cooperative_groups::sync(*static_cast<const coalesced_group*>(this)); + break; + case details::thread_block_id: + cooperative_groups::sync(*static_cast<const thread_block*>(this)); + break; + case details::grid_group_id: + cooperative_groups::sync(*static_cast<const grid_group*>(this)); + break; +#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + case details::multi_grid_group_id: + cooperative_groups::sync(*static_cast<const multi_grid_group*>(this)); + break; +#endif +#if defined(_CG_HAS_CLUSTER_GROUP) + case details::cluster_group_id: + cooperative_groups::sync(*static_cast<const cluster_group*>(this)); + break; +#endif + default: + break; + } +} + +/** + * <group_type>.size() + * + * Returns the total number of threads in the group. 
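+ *
+ * Because the call dispatches on the stored group type, any concrete group can be
+ * handled through a generic thread_group reference. Illustrative sketch only
+ * (the helper name is made up; assumes namespace cg = cooperative_groups):
+ *
+ *   __device__ unsigned long long group_threads(const cg::thread_group &g) {
+ *       return g.size();   // forwards to the concrete group's size()
+ *   }
+ *   // e.g. group_threads(cg::this_thread_block()) or group_threads(cg::coalesced_threads())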
+ */ +_CG_QUALIFIER unsigned long long thread_group::size() const +{ + unsigned long long size = 0; + switch (_data.group.type) { + case details::coalesced_group_id: + size = cooperative_groups::group_size(*static_cast<const coalesced_group*>(this)); + break; + case details::thread_block_id: + size = cooperative_groups::group_size(*static_cast<const thread_block*>(this)); + break; + case details::grid_group_id: + size = cooperative_groups::group_size(*static_cast<const grid_group*>(this)); + break; +#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + case details::multi_grid_group_id: + size = cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this)); + break; +#endif +#if defined(_CG_HAS_CLUSTER_GROUP) + case details::cluster_group_id: + size = cooperative_groups::group_size(*static_cast<const cluster_group*>(this)); + break; +#endif + default: + break; + } + return size; +} + +/** + * <group_type>.thread_rank() + * + * Returns the linearized rank of the calling thread along the interval [0, size()). + */ +_CG_QUALIFIER unsigned long long thread_group::thread_rank() const +{ + unsigned long long rank = 0; + switch (_data.group.type) { + case details::coalesced_group_id: + rank = cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this)); + break; + case details::thread_block_id: + rank = cooperative_groups::thread_rank(*static_cast<const thread_block*>(this)); + break; + case details::grid_group_id: + rank = cooperative_groups::thread_rank(*static_cast<const grid_group*>(this)); + break; +#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + case details::multi_grid_group_id: + rank = cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this)); + break; +#endif +#if defined(_CG_HAS_CLUSTER_GROUP) + case details::cluster_group_id: + rank = cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this)); + break; +#endif + default: + break; + } + return rank; +} + +_CG_END_NAMESPACE + +#include <cooperative_groups/details/partitioning.h> + +# endif /* ! (__cplusplus, __CUDACC__) */ + +#endif /* !_COOPERATIVE_GROUPS_H_ */ diff --git a/ext/cudart/include/cooperative_groups/details/async.h b/ext/cudart/include/cooperative_groups/details/async.h new file mode 100644 index 0000000000000000000000000000000000000000..1b7dcb2433f2cb7d1ef61290995ac871a901b1e8 --- /dev/null +++ b/ext/cudart/include/cooperative_groups/details/async.h @@ -0,0 +1,452 @@ +/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. 
THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CG_ASYNC_H +#define _CG_ASYNC_H + +#include "helpers.h" +#include "info.h" + +#include <cuda_pipeline.h> + +_CG_BEGIN_NAMESPACE + +namespace details { +// Groups supported by memcpy_async +template <class TyGroup> +struct _async_copy_group_supported : public _CG_STL_NAMESPACE::false_type {}; + +template <unsigned int Sz, typename TyPar> +struct _async_copy_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> + : public _CG_STL_NAMESPACE::true_type {}; +template <> +struct _async_copy_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {}; +template <> +struct _async_copy_group_supported<cooperative_groups::thread_block> : public _CG_STL_NAMESPACE::true_type {}; + +template <class TyGroup> +using async_copy_group_supported = _async_copy_group_supported<details::remove_qual<TyGroup>>; + +// Groups that require optimization +template <class TyGroup> +struct _async_copy_optimize_tile : public _CG_STL_NAMESPACE::false_type {}; + +template <typename TyPar> +struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<1, TyPar>> + : public _CG_STL_NAMESPACE::false_type {}; + +template <unsigned int Sz, typename TyPar> +struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<Sz, TyPar>> + : public _CG_STL_NAMESPACE::true_type {}; + +template <class TyGroup> +using async_copy_optimize_tile = _async_copy_optimize_tile<details::remove_qual<TyGroup>>; + +// SFINAE helpers for tile optimizations +template <class TyGroup> +using enable_tile_optimization = + typename _CG_STL_NAMESPACE::enable_if<async_copy_optimize_tile<TyGroup>::value, void *>::type; + +template <class TyGroup> +using disable_tile_optimization = + typename _CG_STL_NAMESPACE::enable_if<!async_copy_optimize_tile<TyGroup>::value, void *>::type; + +// Segment for punning to aligned types +template <unsigned int N> +struct _Segment { + int _seg[N]; +}; + +// Trivial layout guaranteed-aligned copy-async compatible segments +template <unsigned 
int N> +struct Segment; +template <> +struct __align__(4) Segment<1> : public _Segment<1>{}; +template <> +struct __align__(8) Segment<2> : public _Segment<2>{}; +template <> +struct __align__(16) Segment<4> : public _Segment<4>{}; + +// Interleaved element by element copies from source to dest +template <typename TyGroup, typename TyElem> +_CG_STATIC_QUALIFIER void inline_copy(TyGroup &group, TyElem *__restrict__ dst, const TyElem *__restrict__ src, + size_t count) { + const unsigned int rank = group.thread_rank(); + const unsigned int stride = group.size(); + + for (size_t idx = rank; idx < count; idx += stride) { + dst[idx] = src[idx]; + } +} + +template <typename TyGroup, typename TyElem, enable_tile_optimization<TyGroup> = nullptr> +_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst, + const TyElem *__restrict__ src, size_t count) { + static_assert(async_copy_group_supported<TyGroup>::value, + "Async copy is only supported for groups that represent private shared memory"); + + if (count == 0) { + return; + } + + const bool dstIsNotShared = !__isShared(dst); + const bool srcIsNotGlobal = !__isGlobal(src); + + if (dstIsNotShared || srcIsNotGlobal) { + inline_copy(group, dst, src, count); + return; + } + + const unsigned int stride = group.size(); + const unsigned int rank = group.thread_rank(); + // Efficient copies require warps to operate on the same amount of work at each step. + // remainders are handled in a separate stage to prevent branching + const unsigned int subWarpMask = (stride - 1); + const unsigned int subwarpCopies = (subWarpMask & (unsigned int)count); + const unsigned int maxSubwarpRank = min(rank, subwarpCopies - 1); + + const size_t warpCopies = (count & (~subWarpMask)); + + for (size_t idx = 0; idx < warpCopies; idx += stride) { + size_t _srcIdx = rank + idx; + size_t _dstIdx = rank + idx; + __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem)); + } + + if (subwarpCopies) { + size_t _srcIdx = warpCopies + maxSubwarpRank; + size_t _dstIdx = warpCopies + maxSubwarpRank; + __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem)); + } +} + +template <typename TyGroup, typename TyElem, disable_tile_optimization<TyGroup> = nullptr> +_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst, + const TyElem *__restrict__ src, size_t count) { + static_assert(async_copy_group_supported<TyGroup>::value, + "Async copy is only supported for groups that represent private shared memory"); + + const bool dstIsNotShared = !__isShared(dst); + const bool srcIsNotGlobal = !__isGlobal(src); + + if (dstIsNotShared || srcIsNotGlobal) { + inline_copy(group, dst, src, count); + return; + } + + unsigned int stride = group.size(); + unsigned int rank = group.thread_rank(); + + for (size_t idx = rank; idx < count; idx += stride) { + size_t _srcIdx = idx; + size_t _dstIdx = idx; + __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem)); + } +} + +// Determine best possible alignment given an input and initial conditions +// Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments +template <unsigned int MinAlignment, unsigned int MaxAlignment> +_CG_STATIC_QUALIFIER uint32_t find_best_alignment(void *__restrict__ dst, const void *__restrict__ src) { + // Narrowing conversion intentional + uint32_t base1 = (uint32_t) reinterpret_cast<uintptr_t>(src); + uint32_t base2 = (uint32_t) reinterpret_cast<uintptr_t>(dst); + + uint32_t diff = 
((base1) ^ (base2)) & (MaxAlignment - 1); + + // range [MaxAlignment, alignof(elem)], step: x >> 1 + // over range of possible alignments, choose best available out of range + uint32_t out = MaxAlignment; +#pragma unroll + for (uint32_t alignment = (MaxAlignment >> 1); alignment >= MinAlignment; alignment >>= 1) { + if (alignment & diff) + out = alignment; + } + + return out; +} + +// Determine best possible alignment given an input and initial conditions +// Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments +template <typename TyType, typename TyGroup> +_CG_STATIC_QUALIFIER void copy_like(const TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src, + size_t count) { + const char *src = reinterpret_cast<const char *>(_src); + char *dst = reinterpret_cast<char *>(_dst); + + constexpr uint32_t targetAlignment = (uint32_t)alignof(TyType); + + uint32_t base = (uint32_t) reinterpret_cast<uintptr_t>(src); + uint32_t alignOffset = ((~base) + 1) & (targetAlignment - 1); + + inline_copy(group, dst, src, alignOffset); + count -= alignOffset; + src += alignOffset; + dst += alignOffset; + + // Copy using the best available alignment, async_copy expects n-datums, not bytes + size_t asyncCount = count / sizeof(TyType); + accelerated_async_copy(group, reinterpret_cast<TyType *>(dst), reinterpret_cast<const TyType *>(src), asyncCount); + asyncCount *= sizeof(TyType); + + count -= asyncCount; + src += asyncCount; + dst += asyncCount; + inline_copy(group, dst, src, count); +} + +// We must determine alignment and manually align src/dst ourselves +template <size_t AlignHint> +struct _memcpy_async_align_dispatch { + template <typename TyGroup> + _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ dst, const void *__restrict__ src, size_t count) { + uint32_t alignment = find_best_alignment<AlignHint, 16>(dst, src); + + // Avoid copying the extra bytes if desired copy count is smaller + alignment = count < alignment ? 
AlignHint : alignment; + + switch (alignment) { + default: + case 1: + inline_copy(group, reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src), count); + break; + case 2: + inline_copy(group, reinterpret_cast<short *>(dst), reinterpret_cast<const short *>(src), count >> 1); + break; + case 4: + copy_like<Segment<1>>(group, dst, src, count); + break; + case 8: + copy_like<Segment<2>>(group, dst, src, count); + break; + case 16: + copy_like<Segment<4>>(group, dst, src, count); + break; + } + } +}; + +// Specialization for 4 byte alignments +template <> +struct _memcpy_async_align_dispatch<4> { + template <typename TyGroup> + _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src, + size_t count) { + const Segment<1> *src = reinterpret_cast<const Segment<1> *>(_src); + Segment<1> *dst = reinterpret_cast<Segment<1> *>(_dst); + + // Dispatch straight to aligned LDGSTS calls + accelerated_async_copy(group, dst, src, count / sizeof(*dst)); + } +}; + +// Specialization for 8 byte alignments +template <> +struct _memcpy_async_align_dispatch<8> { + template <typename TyGroup> + _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src, + size_t count) { + const Segment<2> *src = reinterpret_cast<const Segment<2> *>(_src); + Segment<2> *dst = reinterpret_cast<Segment<2> *>(_dst); + + // Dispatch straight to aligned LDGSTS calls + accelerated_async_copy(group, dst, src, count / sizeof(*dst)); + } +}; + +// Alignments over 16 are truncated to 16 and bypass alignment +// This is the highest performing memcpy available +template <> +struct _memcpy_async_align_dispatch<16> { + template <typename TyGroup> + _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src, + size_t count) { + const Segment<4> *src = reinterpret_cast<const Segment<4> *>(_src); + Segment<4> *dst = reinterpret_cast<Segment<4> *>(_dst); + + // Dispatch straight to aligned LDGSTS calls + accelerated_async_copy(group, dst, src, count / sizeof(*dst)); + } +}; + +// byte-wide API +template <size_t Alignment, class TyGroup> +_CG_STATIC_QUALIFIER void _memcpy_async_dispatch_to_aligned_copy(const TyGroup &group, void *__restrict__ _dst, + const void *__restrict__ _src, size_t count) { + static_assert(!(Alignment & (Alignment - 1)), "Known static alignment dispatch must be a power of 2"); + details::_memcpy_async_align_dispatch<Alignment>::copy(group, _dst, _src, count); +} + +// Internal dispatch APIs +// These deduce the alignments and sizes necessary to invoke the underlying copy engine +template <typename Ty> +using is_void = _CG_STL_NAMESPACE::is_same<Ty, void>; + +template <typename Ty> +using enable_if_not_void = typename _CG_STL_NAMESPACE::enable_if<!is_void<Ty>::value, void *>::type; + +template <typename Ty> +using enable_if_void = typename _CG_STL_NAMESPACE::enable_if<is_void<Ty>::value, void *>::type; + +template <typename Ty> +using enable_if_integral = + typename _CG_STL_NAMESPACE::enable_if<_CG_STL_NAMESPACE::is_integral<Ty>::value, void *>::type; + +// byte-wide API using aligned_sized_t +template <class TyGroup, template <size_t> typename Alignment, size_t Hint> +_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, void *__restrict__ _dst, + const void *__restrict__ _src, const Alignment<Hint> &count) { + constexpr size_t _align = (Hint > 16) ? 
16 : Hint; + + details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, (size_t)count); +} + +// byte-wide API using type for aligment +template <class TyGroup, typename TyElem, typename TySize, size_t Hint = alignof(TyElem), + enable_if_not_void<TyElem> = nullptr, enable_if_integral<TySize> = nullptr> +_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst, + const TyElem *__restrict__ _src, const TySize& count) { + constexpr size_t _align = (Hint > 16) ? 16 : Hint; + + details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, count); +} + +// byte-wide API with full alignment deduction required +template <class TyGroup, typename TyElem, typename TySize, enable_if_void<TyElem> = nullptr, + enable_if_integral<TySize> = nullptr> +_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst, + const TyElem *__restrict__ _src, const TySize& count) { + details::_memcpy_async_dispatch_to_aligned_copy<1>(group, _dst, _src, count); +} + +// 1d-datum API +template <class TyGroup, typename TyElem, size_t Hint = alignof(TyElem)> +_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const size_t dstCount, + const TyElem *__restrict__ src, const size_t srcCount) { + constexpr unsigned int _align = Hint; + const size_t totalCount = min(dstCount, srcCount) * sizeof(TyElem); + + details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount); +} + +// 1d-datum API using aligned_size_t +template <class TyGroup, typename TyElem, template <size_t> typename Alignment, size_t Hint> +_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const Alignment<Hint> &dstCount, + const TyElem *__restrict__ src, const Alignment<Hint> &srcCount) { + constexpr unsigned int _align = Hint; + const size_t totalCount = min((size_t)dstCount, (size_t)srcCount) * sizeof(TyElem); + + details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount); +} + +} // namespace details + +/* + * Group submit batch of async-copy to cover contiguous 1D array + * and commit that batch to eventually wait for completion. + */ +template <class TyGroup, typename TyElem, typename TySizeT> +_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ _dst, const TyElem *__restrict__ _src, + const TySizeT &count) { + details::_memcpy_async_bytes(group, _dst, _src, count); + __pipeline_commit(); +} + +/* + * Group submit batch of async-copy to cover contiguous 1D array + * and commit that batch to eventually wait for completion. + * Object counts are in datum sized chunks, not bytes. + */ +template <class TyGroup, class TyElem, typename DstLayout, typename SrcLayout> +_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ dst, const DstLayout &dstLayout, + const TyElem *__restrict__ src, const SrcLayout &srcLayout) { + details::_memcpy_async_datum(group, dst, dstLayout, src, srcLayout); + __pipeline_commit(); +} + +/* Group wait for prior Nth stage of memcpy_async to complete. */ +template <unsigned int Stage, class TyGroup> +_CG_STATIC_QUALIFIER void wait_prior(const TyGroup &group) { + __pipeline_wait_prior(Stage); + group.sync(); +} + +/* Group wait all previously submitted memcpy_async to complete. 
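+ * As an illustration only (the kernel context and the names `smem`, `gmem`,
+ * and `TILE` below are hypothetical placeholders, not part of this header),
+ * a thread block can stage data from global into shared memory and then
+ * wait on it like this:
+ *
+ *   __shared__ int smem[TILE];
+ *   auto block = cooperative_groups::this_thread_block();
+ *   // submit and commit one batch of copies (the size argument is in bytes)
+ *   cooperative_groups::memcpy_async(block, smem, gmem, sizeof(int) * TILE);
+ *   // block until every previously submitted batch has landed in smem
+ *   cooperative_groups::wait(block);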
*/ +template <class TyGroup> +_CG_STATIC_QUALIFIER void wait(const TyGroup &group) { + __pipeline_wait_prior(0); + group.sync(); +} + +/***************** CG APIs including pipeline are deprecated *****************/ + +/* Group submit batch of async-copy to cover of contiguous 1D array + to a pipeline and commit the batch*/ +template <class TyGroup, class TyElem> +_CG_DEPRECATED _CG_STATIC_QUALIFIER void memcpy_async(TyGroup &group, TyElem *dst, size_t dstCount, const TyElem *src, size_t srcCount, + nvcuda::experimental::pipeline &pipe) { + details::_memcpy_async_datum(group, dst, dstCount, src, srcCount); + pipe.commit(); +} + +/* Group wait for prior Nth stage of memcpy_async to complete. */ +template <unsigned int Stage, class TyGroup> +_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait_prior(TyGroup &group, nvcuda::experimental::pipeline &pipe) { + pipe.wait_prior<Stage>(); + group.sync(); +} + +/* Group wait for stage-S of memcpy_async to complete. */ +template <class TyGroup> +_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait(TyGroup &group, nvcuda::experimental::pipeline &pipe, size_t stage) { + pipe.wait(stage); + group.sync(); +} +_CG_END_NAMESPACE + +#endif // _CG_ASYNC_H diff --git a/ext/cudart/include/cooperative_groups/details/coalesced_reduce.h b/ext/cudart/include/cooperative_groups/details/coalesced_reduce.h new file mode 100644 index 0000000000000000000000000000000000000000..c3722fb5c22809027cee66ab05758e477e8ef2bf --- /dev/null +++ b/ext/cudart/include/cooperative_groups/details/coalesced_reduce.h @@ -0,0 +1,108 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CG_COALESCED_REDUCE_H_ +#define _CG_COALESCED_REDUCE_H_ + +#include "info.h" +#include "helpers.h" +#include "cooperative_groups.h" +#include "partitioning.h" +#include "coalesced_scan.h" + +_CG_BEGIN_NAMESPACE + +namespace details { + +template <typename TyVal, typename TyOp> +_CG_QUALIFIER auto coalesced_reduce_to_one(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) { + if (group.size() == 32) { + auto out = val; + for (int offset = group.size() >> 1; offset > 0; offset >>= 1) { + out = op(out, group.shfl_up(out, offset)); + } + return out; + } + else { + auto scan_result = + inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op)); + return scan_result; + } +} + +template <typename TyVal, typename TyOp> +_CG_QUALIFIER auto coalesced_reduce(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) { + auto out = coalesced_reduce_to_one(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op)); + if (group.size() == 32) { + return group.shfl(out, 31); + } + else { + unsigned int group_mask = _coalesced_group_data_access::get_mask(group); + unsigned int last_thread_id = 31 - __clz(group_mask); + return details::tile::shuffle_dispatch<TyVal>::shfl( + _CG_STL_NAMESPACE::forward<TyVal>(out), group_mask, last_thread_id, 32); + } +} + +template <typename TyVal, typename TyOp, unsigned int TySize, typename ParentT> +_CG_QUALIFIER auto coalesced_reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group, + TyVal&& val, + TyOp&& op) -> decltype(op(val, val)) { + auto out = val; + for (int mask = TySize >> 1; mask > 0; mask >>= 1) { + out = op(out, group.shfl_xor(out, mask)); + } + + return out; +} + +} // details + +_CG_END_NAMESPACE + +#endif // _CG_COALESCED_REDUCE_H_ diff --git a/ext/cudart/include/cooperative_groups/details/coalesced_scan.h b/ext/cudart/include/cooperative_groups/details/coalesced_scan.h new file mode 100644 index 0000000000000000000000000000000000000000..383f4bde059dd8daad7d1c56e99152ea7ee28a08 --- /dev/null +++ b/ext/cudart/include/cooperative_groups/details/coalesced_scan.h @@ -0,0 +1,174 @@ +/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CG_COALESCED_SCAN_H_ +#define _CG_COALESCED_SCAN_H_ + +#include "info.h" +#include "helpers.h" +#include "cooperative_groups.h" +#include "partitioning.h" +#include "functional.h" + +_CG_BEGIN_NAMESPACE + +namespace details { + +template <typename TyGroup, typename TyVal, typename TyOp> +_CG_QUALIFIER auto inclusive_scan_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) { + auto out = val; + for (int mask = 1; mask < group.size(); mask <<= 1) { + auto tmp = group.shfl_up(out, mask); + if (mask <= group.thread_rank()) { + out = op(out, tmp); + } + } + + return out; +} + +template <typename TyGroup, typename TyVal, typename TyOp> +_CG_QUALIFIER auto inclusive_scan_non_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) { + const unsigned int groupSize = group.size(); + auto out = val; + + const unsigned int mask = details::_coalesced_group_data_access::get_mask(group); + unsigned int lanemask = details::lanemask32_lt() & mask; + unsigned int srcLane = details::laneid(); + + const unsigned int base = __ffs(mask)-1; /* lane with rank == 0 */ + const unsigned int rank = __popc(lanemask); + + for (unsigned int i = 1, j = 1; i < groupSize; i <<= 1) { + if (i <= rank) { + srcLane -= j; + j = i; /* maximum possible lane */ + + unsigned int begLane = base + rank - i; /* minimum possible lane */ + + /* Next source lane is in the range [ begLane .. srcLane ] + * If begLane < srcLane then do a binary search. 
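 * For example (illustrative values only): with mask = 0b1101 the active
 * lanes are {0, 2, 3}; a caller in lane 3 has rank 2, and for step i = 1
 * the search resolves srcLane to lane 2, the active lane of rank (rank - i).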
+ */ + while (begLane < srcLane) { + const unsigned int halfLane = (begLane + srcLane) >> 1; + const unsigned int halfMask = lanemask >> halfLane; + const unsigned int d = __popc(halfMask); + if (d < i) { + srcLane = halfLane - 1; /* halfLane too large */ + } + else if ((i < d) || !(halfMask & 0x01)) { + begLane = halfLane + 1; /* halfLane too small */ + } + else { + begLane = srcLane = halfLane; /* happen to hit */ + } + } + } + + auto tmp = details::tile::shuffle_dispatch<TyVal>::shfl(out, mask, srcLane, 32); + if (i <= rank) { + out = op(out, tmp); + } + } + return out; +} + +template <unsigned int TySize, typename ParentT, typename TyVal, typename TyOp> +_CG_QUALIFIER auto coalesced_inclusive_scan(const __single_warp_thread_block_tile<TySize, ParentT>& group, + TyVal&& val, + TyOp&& op) -> decltype(op(val, val)) { + return inclusive_scan_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op)); +} + +template <typename TyVal, typename TyOp> +_CG_QUALIFIER auto coalesced_inclusive_scan(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) { + if (group.size() == 32) { + return inclusive_scan_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op)); + } + else { + return inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op)); + } +} + +template <bool IntegralOptimized> +struct scan_choose_convertion; + +template<> +struct scan_choose_convertion<true> { + template <typename TyGroup, typename TyRes, typename TyVal> + _CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) { + return result - val; + } +}; + +template<> +struct scan_choose_convertion<false> { + template <typename TyGroup, typename TyRes, typename TyVal> + _CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) { + auto ret = group.shfl_up(result, 1); + if (group.thread_rank() == 0) { + return {}; + } + else { + return ret; + } + } +}; + +template <typename TyGroup, typename TyRes, typename TyVal, typename TyFn> +_CG_QUALIFIER auto convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) { + using conversion = scan_choose_convertion<_CG_STL_NAMESPACE::is_same<remove_qual<TyFn>, cooperative_groups::plus<remove_qual<TyVal>>>::value + && _CG_STL_NAMESPACE::is_integral<remove_qual<TyVal>>::value>; + return conversion::convert_inclusive_to_exclusive(group, result, _CG_STL_NAMESPACE::forward<TyVal>(val)); +} + +} // details + +_CG_END_NAMESPACE + +#endif // _CG_COALESCED_SCAN_H_ \ No newline at end of file diff --git a/ext/cudart/include/cooperative_groups/details/driver_abi.h b/ext/cudart/include/cooperative_groups/details/driver_abi.h new file mode 100644 index 0000000000000000000000000000000000000000..9c866fcf740beb709a106057d28e8a2a1ac37924 --- /dev/null +++ b/ext/cudart/include/cooperative_groups/details/driver_abi.h @@ -0,0 +1,99 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#ifndef _CG_DRIVER_API_H +#define _CG_DRIVER_API_H + +#include "info.h" + +_CG_BEGIN_NAMESPACE + +namespace details { + template <unsigned int RegId> + _CG_QUALIFIER unsigned int load_env_reg() { + // Abort by default + _CG_ABORT(); + return 0; + } + + template <unsigned int HiReg, unsigned int LoReg> + _CG_QUALIFIER unsigned long long load_env_reg64() { + unsigned long long registerLo = load_env_reg<LoReg>(); + unsigned long long registerHi = load_env_reg<HiReg>(); + + return (registerHi << 32) | registerLo; + } + +// inline PTX for accessing registers requires an immediate for the special reg +# define LOAD_ENVREG(NUMBER) \ + template <> _CG_QUALIFIER unsigned int load_env_reg<NUMBER>() { \ + unsigned int r; \ + asm ("mov.u32 %0, %%envreg" #NUMBER ";" : "=r"(r)); \ + return r; \ + } + + // Instantiate loaders for registers used + LOAD_ENVREG(0); + LOAD_ENVREG(1); + LOAD_ENVREG(2); +# undef LOAD_ENVREG + + struct grid_workspace { + unsigned int wsSize; + unsigned int barrier; + }; + + _CG_QUALIFIER grid_workspace* get_grid_workspace() { + unsigned long long gridWsAbiAddress = load_env_reg64<1, 2>(); + // Interpret the address from envreg 1 and 2 as the driver's grid workspace + return (reinterpret_cast<grid_workspace*>(gridWsAbiAddress)); + } +} +_CG_END_NAMESPACE + +#endif // _CG_DRIVER_API_H diff --git a/ext/cudart/include/cooperative_groups/details/functional.h b/ext/cudart/include/cooperative_groups/details/functional.h new file mode 100644 index 0000000000000000000000000000000000000000..ca17d08ad8407af2ea8967e53a8bdc4b59079c77 --- /dev/null +++ b/ext/cudart/include/cooperative_groups/details/functional.h @@ -0,0 +1,207 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CG_FUNCTIONAL_H +#define _CG_FUNCTIONAL_H + +#include "info.h" +#include "helpers.h" + +#ifdef _CG_CPP11_FEATURES +#ifdef _CG_USE_CUDA_STL +# include <cuda/std/functional> +#endif + +_CG_BEGIN_NAMESPACE + +namespace details { +#ifdef _CG_USE_CUDA_STL + using cuda::std::plus; + using cuda::std::bit_and; + using cuda::std::bit_xor; + using cuda::std::bit_or; +#else + template <typename Ty> struct plus {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 + arg2;}}; + template <typename Ty> struct bit_and {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 & arg2;}}; + template <typename Ty> struct bit_xor {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 ^ arg2;}}; + template <typename Ty> struct bit_or {__device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const {return arg1 | arg2;}}; +#endif // _CG_USE_PLATFORM_STL +} // details + +template <typename Ty> +struct plus : public details::plus<Ty> {}; + +template <typename Ty> +struct less { + __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const { + return (arg2 < arg1) ? arg2 : arg1; + } +}; + +template <typename Ty> +struct greater { + __device__ __forceinline__ Ty operator()(Ty arg1, Ty arg2) const { + return (arg1 < arg2) ? 
arg2 : arg1; + } +}; + +template <typename Ty> +struct bit_and : public details::bit_and<Ty> {}; + +template <typename Ty> +struct bit_xor : public details::bit_xor<Ty> {}; + +template <typename Ty> +struct bit_or : public details::bit_or<Ty> {}; + +#if defined(_CG_HAS_STL_ATOMICS) && defined(_CG_ABI_EXPERIMENTAL) +namespace details { + template <class Ty> + using _atomic_is_type_supported = _CG_STL_NAMESPACE::integral_constant<bool, + _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) == 4 || sizeof(Ty) == 8)>; + + template <typename TyOp> struct _atomic_op_supported : public _CG_STL_NAMESPACE::false_type {}; + template <typename Ty> struct _atomic_op_supported<cooperative_groups::plus<Ty>> : public _atomic_is_type_supported<Ty> {}; + template <typename Ty> struct _atomic_op_supported<cooperative_groups::less<Ty>> : public _atomic_is_type_supported<Ty> {}; + template <typename Ty> struct _atomic_op_supported<cooperative_groups::greater<Ty>> : public _atomic_is_type_supported<Ty> {}; + template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_and<Ty>> : public _atomic_is_type_supported<Ty> {}; + template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_or<Ty>> : public _atomic_is_type_supported<Ty> {}; + template <typename Ty> struct _atomic_op_supported<cooperative_groups::bit_xor<Ty>> : public _atomic_is_type_supported<Ty> {}; + + template<typename TyAtomic, typename TyVal, typename TyOp> + _CG_QUALIFIER remove_qual<TyVal> atomic_cas_fallback(TyAtomic&& atomic, TyVal&& val, TyOp&& op) { + remove_qual<TyVal> old = atomic; + while(!atomic.compare_exchange_weak(old, op(old, val))); + return old; + } + + template<typename TyOp> + struct op_picker; + + template<typename TyVal> + struct op_picker<cooperative_groups::plus<TyVal>> { + template<typename TyAtomic> + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_add(val); + } + }; + + template<typename TyVal> + struct op_picker<cooperative_groups::less<TyVal>> { + template<typename TyAtomic> + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_min(val); + } + }; + + template<typename TyVal> + struct op_picker<cooperative_groups::greater<TyVal>> { + template<typename TyAtomic> + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_max(val); + } + }; + + template<typename TyVal> + struct op_picker<cooperative_groups::bit_and<TyVal>> { + template<typename TyAtomic> + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_and(val); + } + }; + + template<typename TyVal> + struct op_picker<cooperative_groups::bit_xor<TyVal>> { + template<typename TyAtomic> + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_xor(val); + } + }; + + template<typename TyVal> + struct op_picker<cooperative_groups::bit_or<TyVal>> { + template<typename TyAtomic> + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val) { + return atomic.fetch_or(val); + } + }; + + template<bool atomic_supported> + struct atomic_update_dispatch {}; + + template<> + struct atomic_update_dispatch<false> { + template<typename TyAtomic, typename TyVal, typename TyOp> + _CG_STATIC_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) { + return atomic_cas_fallback(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op)); + } + }; + + template<> + struct 
atomic_update_dispatch<true> { + template<typename TyAtomic, typename TyVal, typename TyOp> + _CG_STATIC_QUALIFIER TyVal atomic_update(TyAtomic& atomic, TyVal val, TyOp&& op) { + using dispatch = op_picker<details::remove_qual<TyOp>>; + + return dispatch::atomic_update(atomic, val); + } + }; + + template<typename TyAtomic, typename TyVal, typename TyOp> + _CG_QUALIFIER remove_qual<TyVal> atomic_update(TyAtomic& atomic, TyVal&& val, TyOp&& op) { + using dispatch = atomic_update_dispatch<_atomic_op_supported<details::remove_qual<TyOp>>::value>; + + return dispatch::atomic_update(atomic, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op)); + } +} +#endif + +_CG_END_NAMESPACE + +#endif +#endif //_CG_FUNCTIONAL_H diff --git a/ext/cudart/include/cooperative_groups/details/helpers.h b/ext/cudart/include/cooperative_groups/details/helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..3e4f98b4eb59c68ed61c929b7b7982f9985b12f7 --- /dev/null +++ b/ext/cudart/include/cooperative_groups/details/helpers.h @@ -0,0 +1,707 @@ + /* Copyright 1993-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. 
+ * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _COOPERATIVE_GROUPS_HELPERS_H_ +# define _COOPERATIVE_GROUPS_HELPERS_H_ + +#include "info.h" +#include "sync.h" + +_CG_BEGIN_NAMESPACE + +namespace details { +#ifdef _CG_CPP11_FEATURES + template <typename Ty> struct _is_float_or_half : public _CG_STL_NAMESPACE::is_floating_point<Ty> {}; +# ifdef _CG_HAS_FP16_COLLECTIVE + template <> struct _is_float_or_half<__half> : public _CG_STL_NAMESPACE::true_type {}; + template <> struct _is_float_or_half<__half2> : public _CG_STL_NAMESPACE::true_type {}; +# endif + template <typename Ty> + using is_float_or_half = _is_float_or_half<typename _CG_STL_NAMESPACE::remove_cv<Ty>::type>; + + // Non-STL utility templates + template <typename Ty> + using remove_qual = typename _CG_STL_NAMESPACE::remove_cv<typename _CG_STL_NAMESPACE::remove_reference<Ty>::type>::type; + + template <typename TyLhs, typename TyRhs> + using is_op_type_same = _CG_STL_NAMESPACE::is_same<remove_qual<TyLhs>, remove_qual<TyRhs> + >; +#endif + + template <typename TyTrunc> + _CG_STATIC_QUALIFIER TyTrunc vec3_to_linear(dim3 index, dim3 nIndex) { + return ((TyTrunc)index.z * nIndex.y * nIndex.x) + + ((TyTrunc)index.y * nIndex.x) + + (TyTrunc)index.x; + } + + namespace cta { + + _CG_STATIC_QUALIFIER void sync() + { + __barrier_sync(0); + } + + _CG_STATIC_QUALIFIER unsigned int num_threads() + { + return static_cast<unsigned int>(blockDim.x * blockDim.y * blockDim.z); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank() + { + return vec3_to_linear<unsigned int>(threadIdx, blockDim); + } + + _CG_STATIC_QUALIFIER dim3 group_index() + { + return dim3(blockIdx.x, blockIdx.y, blockIdx.z); + } + + _CG_STATIC_QUALIFIER dim3 thread_index() + { + return dim3(threadIdx.x, threadIdx.y, threadIdx.z); + } + + _CG_STATIC_QUALIFIER dim3 dim_threads() + { + return dim3(blockDim.x, blockDim.y, blockDim.z); + } + + // Legacy aliases + _CG_STATIC_QUALIFIER unsigned int size() + { + return num_threads(); + } + + _CG_STATIC_QUALIFIER dim3 block_dim() + { + return dim_threads(); + } + + }; + + class _coalesced_group_data_access { + public: + // Retrieve mask of coalesced groups + template <typename TyGroup> + _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup &group) { + return group.get_mask(); + } + + // Retrieve mask of tiles + template <template <typename, typename> typename TyGroup, typename Sz, typename Parent> + _CG_STATIC_QUALIFIER unsigned int get_mask(const TyGroup<Sz, Parent> &group) { + return group.build_maks(); + } + + template <typename TyGroup> + _CG_STATIC_QUALIFIER TyGroup construct_from_mask(unsigned int mask) { + return TyGroup(mask); + } + + template <typename TyGroup> + _CG_STATIC_QUALIFIER void modify_meta_group(TyGroup &group, unsigned int mgRank, unsigned int mgSize) { + group._data.coalesced.metaGroupRank = mgRank; + group._data.coalesced.metaGroupSize = mgSize; + } + }; + + namespace tile { + template <unsigned int TileCount, unsigned int TileMask, unsigned int LaneMask, unsigned int ShiftCount> + struct _tile_helpers{ + _CG_STATIC_CONST_DECL unsigned int tileCount = TileCount; + _CG_STATIC_CONST_DECL unsigned int tileMask = TileMask; + _CG_STATIC_CONST_DECL unsigned int laneMask = LaneMask; + _CG_STATIC_CONST_DECL unsigned int shiftCount = ShiftCount; + }; + + template <unsigned int> struct tile_helpers; + template <> struct 
tile_helpers<32> : public _tile_helpers<1, 0xFFFFFFFF, 0x1F, 5> {}; + template <> struct tile_helpers<16> : public _tile_helpers<2, 0x0000FFFF, 0x0F, 4> {}; + template <> struct tile_helpers<8> : public _tile_helpers<4, 0x000000FF, 0x07, 3> {}; + template <> struct tile_helpers<4> : public _tile_helpers<8, 0x0000000F, 0x03, 2> {}; + template <> struct tile_helpers<2> : public _tile_helpers<16, 0x00000003, 0x01, 1> {}; + template <> struct tile_helpers<1> : public _tile_helpers<32, 0x00000001, 0x00, 0> {}; + +#ifdef _CG_CPP11_FEATURES + namespace shfl { + /*********************************************************************************** + * Recursively Sliced Shuffle + * Purpose: + * Slices an input type a number of times into integral types so that shuffles + * are well defined + * Expectations: + * This object *should not* be used from a reinterpret_cast pointer unless + * some alignment guarantees can be met. Use a memcpy to guarantee that loads + * from the integral types stored within are aligned and correct. + **********************************************************************************/ + template <unsigned int count, bool intSized = (count <= sizeof(int))> + struct recursive_sliced_shuffle_helper; + + template <unsigned int count> + struct recursive_sliced_shuffle_helper<count, true> { + int val; + + template <typename TyFn> + _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) { + val = shfl(val); + } + }; + + template <unsigned int count> + struct recursive_sliced_shuffle_helper<count, false> { + int val; + recursive_sliced_shuffle_helper<count - sizeof(int)> next; + + template <typename TyFn> + _CG_QUALIFIER void invoke_shuffle(const TyFn &shfl) { + val = shfl(val); + next.invoke_shuffle(shfl); + } + }; + } + + struct _memory_shuffle { + template <typename TyElem, typename TyShflFn> + _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) { + static_assert(sizeof(TyElem) > 0, "in memory shuffle is not yet implemented"); + return TyElem{}; + } + + template <typename TyElem, typename TyRet = remove_qual<TyElem>> + _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) { + auto shfl = [=](int val) -> int { + return 0; + }; + + return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl); + } + + template <typename TyElem, typename TyRet = remove_qual<TyElem>> + _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) { + auto shfl = [=](int val) -> int { + return 0; + }; + + return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl); + } + + template <typename TyElem, typename TyRet = remove_qual<TyElem>> + _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) { + auto shfl = [=](int val) -> int { + return 0; + }; + + return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl); + } + + template <typename TyElem, typename TyRet = remove_qual<TyElem>> + _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) { + auto shfl = [=](int val) -> int { + return 0; + }; + + return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl); + } + }; + + /*********************************************************************************** + * Intrinsic Device Function Shuffle + * Purpose: + * Uses a shuffle helper that has characteristics best suited for moving + * elements between 
threads + * Expectations: + * Object given will be forced into an l-value type so that it can be used + * with a helper structure that reinterprets the data into intrinsic compatible + * types + * Notes: + * !! TyRet is required so that objects are returned by value and not as + * dangling references depending on the value category of the passed object + **********************************************************************************/ + struct _intrinsic_compat_shuffle { + template <unsigned int count> + using shfl_helper = shfl::recursive_sliced_shuffle_helper<count>; + + template <typename TyElem, typename TyShflFn> + _CG_STATIC_QUALIFIER TyElem _shfl_internal(TyElem elem, const TyShflFn& fn) { + static_assert(__is_trivially_copyable(TyElem), "Type is not compatible with device shuffle"); + shfl_helper<sizeof(TyElem)> helper; + memcpy(&helper, &elem, sizeof(TyElem)); + helper.invoke_shuffle(fn); + memcpy(&elem, &helper, sizeof(TyElem)); + return elem; + } + + template <typename TyElem, typename TyRet = remove_qual<TyElem>> + _CG_STATIC_QUALIFIER TyRet shfl(TyElem&& elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) { + auto shfl = [=](int val) -> int { + return __shfl_sync(gMask, val, srcRank, threads); + }; + + return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl); + } + + template <typename TyElem, typename TyRet = remove_qual<TyElem>> + _CG_STATIC_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) { + auto shfl = [=](int val) -> int { + return __shfl_down_sync(gMask, val, delta, threads); + }; + + return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl); + } + + template <typename TyElem, typename TyRet = remove_qual<TyElem>> + _CG_STATIC_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int gMask, unsigned int delta, unsigned int threads) { + auto shfl = [=](int val) -> int { + return __shfl_up_sync(gMask, val, delta, threads); + }; + + return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl); + } + + template <typename TyElem, typename TyRet = remove_qual<TyElem>> + _CG_STATIC_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int gMask, unsigned int lMask, unsigned int threads) { + auto shfl = [=](int val) -> int { + return __shfl_xor_sync(gMask, val, lMask, threads); + }; + + return _shfl_internal<TyRet>(_CG_STL_NAMESPACE::forward<TyElem>(elem), shfl); + } + }; + + struct _native_shuffle { + template <typename TyElem> + _CG_STATIC_QUALIFIER TyElem shfl( + TyElem elem, unsigned int gMask, unsigned int srcRank, unsigned int threads) { + return static_cast<TyElem>(__shfl_sync(gMask, elem, srcRank, threads)); + } + + template <typename TyElem> + _CG_STATIC_QUALIFIER TyElem shfl_down( + TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) { + return static_cast<TyElem>(__shfl_down_sync(gMask, elem, delta, threads)); + } + + template <typename TyElem> + _CG_STATIC_QUALIFIER TyElem shfl_up( + TyElem elem, unsigned int gMask, unsigned int delta, unsigned int threads) { + return static_cast<TyElem>(__shfl_up_sync(gMask, elem, delta, threads)); + } + + template <typename TyElem> + _CG_STATIC_QUALIFIER TyElem shfl_xor( + TyElem elem, unsigned int gMask, unsigned int lMask, unsigned int threads) { + return static_cast<TyElem>(__shfl_xor_sync(gMask, elem, lMask, threads)); + } + }; + + // Almost all arithmetic types are supported by native shuffle + // Vector types are the exception + template <typename TyElem> + using use_native_shuffle = 
_CG_STL_NAMESPACE::integral_constant< + bool, + _CG_STL_NAMESPACE::is_integral< + remove_qual<TyElem>>::value || + details::is_float_or_half< + remove_qual<TyElem>>::value + >; + + constexpr unsigned long long _MemoryShuffleCutoff = 32; + + template <typename TyElem, + bool IsNative = use_native_shuffle<TyElem>::value, + bool InMem = (sizeof(TyElem) > _MemoryShuffleCutoff)> + struct shuffle_dispatch; + + template <typename TyElem> + struct shuffle_dispatch<TyElem, true, false> : public _native_shuffle {}; + + template <typename TyElem> + struct shuffle_dispatch<TyElem, false, false> : public _intrinsic_compat_shuffle {}; + + template <typename TyElem> + struct shuffle_dispatch<TyElem, false, true> : public _memory_shuffle {}; + +#endif //_CG_CPP11_FEATURES + }; + + namespace multi_grid { + struct multi_grid_functions; + }; + + namespace grid { + _CG_STATIC_QUALIFIER void sync(unsigned int *bar) { + unsigned int expected = gridDim.x * gridDim.y * gridDim.z; + + details::sync_grids(expected, bar); + } + + _CG_STATIC_QUALIFIER unsigned long long num_blocks() + { + // grid.y * grid.z -> [max(65535) * max(65535)] fits within 4b, promote after multiplication + // grid.x * (grid.y * grid.z) -> [max(2^31-1) * max(65535 * 65535)] exceeds 4b, promote before multiplication + return (unsigned long long)gridDim.x * (gridDim.y * gridDim.z); + } + + _CG_STATIC_QUALIFIER unsigned long long num_threads() + { + return num_blocks() * cta::num_threads(); + } + + _CG_STATIC_QUALIFIER unsigned long long block_rank() + { + return vec3_to_linear<unsigned long long>(blockIdx, gridDim); + } + + _CG_STATIC_QUALIFIER unsigned long long thread_rank() + { + return block_rank() * cta::num_threads() + cta::thread_rank(); + } + + _CG_STATIC_QUALIFIER dim3 dim_blocks() + { + return dim3(gridDim.x, gridDim.y, gridDim.z); + } + + _CG_STATIC_QUALIFIER dim3 block_index() + { + return dim3(blockIdx.x, blockIdx.y, blockIdx.z); + } + +#if defined(_CG_HAS_CLUSTER_GROUP) + _CG_STATIC_QUALIFIER dim3 dim_clusters() { + return __clusterGridDimInClusters(); + } + + _CG_STATIC_QUALIFIER unsigned long long num_clusters() { + const dim3 dimClusters = dim_clusters(); + return dimClusters.x * dimClusters.y * dimClusters.z; + } + + _CG_STATIC_QUALIFIER dim3 cluster_index() { + return __clusterIdx(); + } + + _CG_STATIC_QUALIFIER unsigned long long cluster_rank() { + return vec3_to_linear<unsigned long long>(cluster_index(), dim_clusters()); + } +#endif + + // Legacy aliases + _CG_STATIC_QUALIFIER unsigned long long size() + { + return num_threads(); + } + + _CG_STATIC_QUALIFIER dim3 grid_dim() + { + return dim_blocks(); + } + }; + + +#if defined(_CG_HAS_MULTI_GRID_GROUP) + + namespace multi_grid { + _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle() + { + return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid)); + } + + _CG_STATIC_QUALIFIER void sync(const unsigned long long handle) + { + cudaError_t err = cudaCGSynchronize(handle, 0); + } + + _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle) + { + unsigned int numThreads = 0; + cudaCGGetSize(&numThreads, NULL, handle); + return numThreads; + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle) + { + unsigned int threadRank = 0; + cudaCGGetRank(&threadRank, NULL, handle); + return threadRank; + } + + _CG_STATIC_QUALIFIER unsigned int grid_rank(const unsigned long long handle) + { + unsigned int gridRank = 0; + cudaCGGetRank(NULL, &gridRank, handle); + return gridRank; + } + + _CG_STATIC_QUALIFIER unsigned int num_grids(const 
unsigned long long handle) + { + unsigned int numGrids = 0; + cudaCGGetSize(NULL, &numGrids, handle); + return numGrids; + } + +# ifdef _CG_CPP11_FEATURES + struct multi_grid_functions { + decltype(multi_grid::get_intrinsic_handle) *get_intrinsic_handle; + decltype(multi_grid::sync) *sync; + decltype(multi_grid::size) *size; + decltype(multi_grid::thread_rank) *thread_rank; + decltype(multi_grid::grid_rank) *grid_rank; + decltype(multi_grid::num_grids) *num_grids; + }; + + template <typename = void> + _CG_STATIC_QUALIFIER const multi_grid_functions* load_grid_intrinsics() { + __constant__ static const multi_grid_functions mgf { + &multi_grid::get_intrinsic_handle, + &multi_grid::sync, + &multi_grid::size, + &multi_grid::thread_rank, + &multi_grid::grid_rank, + &multi_grid::num_grids + }; + + return &mgf; + } +# endif + }; +#endif + +#if defined(_CG_HAS_CLUSTER_GROUP) + namespace cluster { + + _CG_STATIC_QUALIFIER bool isReal() + { + return __clusterDimIsSpecified(); + } + + _CG_STATIC_QUALIFIER void barrier_arrive() + { + __cluster_barrier_arrive(); + } + + _CG_STATIC_QUALIFIER void barrier_wait() + { + __cluster_barrier_wait(); + } + + _CG_STATIC_QUALIFIER void sync() + { + barrier_arrive(); + barrier_wait(); + } + + _CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr) + { + return __cluster_query_shared_rank(addr); + } + + template <typename T> + _CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank) + { + return static_cast<T*>(__cluster_map_shared_rank(addr, rank)); + } + + _CG_STATIC_QUALIFIER dim3 block_index() + { + return __clusterRelativeBlockIdx(); + } + + _CG_STATIC_QUALIFIER unsigned int block_rank() + { + return __clusterRelativeBlockRank(); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank() + { + return block_rank() * cta::num_threads() + cta::thread_rank(); + } + + _CG_STATIC_QUALIFIER dim3 dim_blocks() + { + return __clusterDim(); + } + + _CG_STATIC_QUALIFIER unsigned int num_blocks() + { + return __clusterSizeInBlocks(); + } + + _CG_STATIC_QUALIFIER dim3 dim_threads() + { + const dim3 dimBlocks = dim_blocks(); + const unsigned int x = dimBlocks.x * blockDim.x; + const unsigned int y = dimBlocks.y * blockDim.y; + const unsigned int z = dimBlocks.z * blockDim.z; + return dim3(x, y, z); + } + + _CG_STATIC_QUALIFIER unsigned int num_threads() + { + return num_blocks() * cta::num_threads(); + } + + }; +#endif + + _CG_STATIC_QUALIFIER unsigned int laneid() + { + unsigned int laneid; + asm ("mov.u32 %0, %%laneid;" : "=r"(laneid)); + return laneid; + } + + _CG_STATIC_QUALIFIER unsigned int lanemask32_eq() + { + unsigned int lanemask32_eq; + asm ("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask32_eq)); + return (lanemask32_eq); + } + + _CG_STATIC_QUALIFIER unsigned int lanemask32_lt() + { + unsigned int lanemask32_lt; + asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt)); + return (lanemask32_lt); + } + + _CG_STATIC_QUALIFIER void abort() + { + _CG_ABORT(); + } + + template <typename Ty> + _CG_QUALIFIER void assert_if_not_arithmetic() { +#ifdef _CG_CPP11_FEATURES + static_assert( + _CG_STL_NAMESPACE::is_integral<Ty>::value || + details::is_float_or_half<Ty>::value, + "Error: Ty is neither integer or float" + ); +#endif + } + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + template <unsigned int numWarps> + struct copy_channel { + char* channel_ptr; + barrier_t* sync_location; + size_t channel_size; + + // One warp sending to all other warps, it has to wait for all other warps. 
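+        // (In both send policies that follow, data is staged through channel_ptr in
+        // chunks of channel_size bytes; sync_location coordinates each hand-off
+        // between the sending and receiving warps.)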
+ struct send_many_to_many { + _CG_STATIC_CONST_DECL wait_for_warps_kind wait_kind = wait_for_all_other_warps; + _CG_STATIC_QUALIFIER void post_iter_release(unsigned int thread_idx, barrier_t* sync_location) { + __syncwarp(0xFFFFFFFF); + details::sync_warps_release(sync_location, thread_idx == 0, cta::thread_rank(), numWarps); + } + }; + + // One warp is receiving and all other warps are sending to that warp, they have to wait for that one warp. + struct send_many_to_one { + _CG_STATIC_CONST_DECL wait_for_warps_kind wait_kind = wait_for_specific_warp; + _CG_STATIC_QUALIFIER void post_iter_release(unsigned int thread_idx, barrier_t* sync_location) { + // Wait for all warps to finish and let the last warp release all threads. + if (details::sync_warps_last_releases(sync_location, cta::thread_rank(), numWarps)) { + details::sync_warps_release(sync_location, thread_idx == 0, cta::thread_rank(), numWarps); + } + } + }; + + template <unsigned int ThreadCnt, size_t ValSize, typename SendDetails> + _CG_QUALIFIER void _send_value_internal(char* val_ptr, unsigned int thread_idx, unsigned int warp_id) { + size_t thread_offset = thread_idx * sizeof(int); + + for (size_t i = 0; i < ValSize; i += channel_size) { + size_t bytes_left = ValSize - i; + size_t copy_chunk = min(bytes_left, channel_size); + + details::sync_warps_wait_for_warps<SendDetails::wait_kind>(warp_id, sync_location, cta::thread_rank(), numWarps); + #pragma unroll 1 + for (size_t j = thread_offset; j < copy_chunk ; j += sizeof(int) * ThreadCnt) { + size_t my_bytes_left = copy_chunk - j; + memcpy(channel_ptr + j, val_ptr + i + j, min(my_bytes_left, sizeof(int))); + } + SendDetails::post_iter_release(thread_idx, sync_location); + } + } + + + template <typename TyVal, unsigned int ThreadCnt, typename SendDetails> + _CG_QUALIFIER void send_value(TyVal& val, unsigned int thread_idx, unsigned int warp_id) { + _send_value_internal<ThreadCnt, sizeof(TyVal), SendDetails>(reinterpret_cast<char*>(&val), thread_idx, warp_id); + } + + template <size_t ValSize> + _CG_QUALIFIER void _receive_value_internal(char* val_ptr, bool warp_master, bool active) { + for (size_t i = 0; i < ValSize; i += channel_size) { + size_t bytes_left = ValSize - i; + details::sync_warps_wait_for_release(sync_location, warp_master, cta::thread_rank(), numWarps); + if (active) { + memcpy(val_ptr + i, channel_ptr, min(bytes_left, channel_size)); + } + } + } + + template <typename TyVal> + _CG_QUALIFIER void receive_value(TyVal& val, bool warp_master, bool active = true) { + _receive_value_internal<sizeof(TyVal)>(reinterpret_cast<char*>(&val), warp_master, active); + } + }; + + _CG_STATIC_QUALIFIER constexpr unsigned int log2(unsigned int x) { + return x == 1 ? 0 : 1 + log2(x / 2); + } +#endif //_CG_CPP11_FEATURES + +}; // !Namespace internal + +_CG_END_NAMESPACE + +#endif /* !_COOPERATIVE_GROUPS_HELPERS_H_ */ diff --git a/ext/cudart/include/cooperative_groups/details/info.h b/ext/cudart/include/cooperative_groups/details/info.h new file mode 100644 index 0000000000000000000000000000000000000000..4afc475dc59f2d638bc99756fa33db799523f518 --- /dev/null +++ b/ext/cudart/include/cooperative_groups/details/info.h @@ -0,0 +1,323 @@ + /* Copyright 1993-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + + + +#ifndef _CG_INFO_H_ +#define _CG_INFO_H_ +/* +** Define: _CG_VERSION +*/ +#define _CG_VERSION 1000 + +/* +** Define: _CG_ABI_VERSION +*/ +#ifndef _CG_ABI_VERSION +# define _CG_ABI_VERSION 1 +#endif + +/* +** Define: _CG_ABI_EXPERIMENTAL +** Desc: If enabled, sets all features enabled (ABI-breaking or experimental) +*/ +#if defined(_CG_ABI_EXPERIMENTAL) +#endif + +#define _CG_CONCAT_INNER(x, y) x ## y +#define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y) +#define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION) + +#define _CG_BEGIN_NAMESPACE \ + namespace cooperative_groups { namespace _CG_NAMESPACE { +#define _CG_END_NAMESPACE \ + }; using namespace _CG_NAMESPACE; }; + +#if (defined(__cplusplus) && (__cplusplus >= 201103L)) || (defined(_MSC_VER) && (_MSC_VER >= 1900)) +# define _CG_CPP11_FEATURES +#endif + +#if !defined(_CG_QUALIFIER) +# define _CG_QUALIFIER __forceinline__ __device__ +#endif +#if !defined(_CG_STATIC_QUALIFIER) +# define _CG_STATIC_QUALIFIER static __forceinline__ __device__ +#endif +#if !defined(_CG_CONSTEXPR_QUALIFIER) +# if defined(_CG_CPP11_FEATURES) +# define _CG_CONSTEXPR_QUALIFIER constexpr __forceinline__ __device__ +# else +# define _CG_CONSTEXPR_QUALIFIER _CG_QUALIFIER +# endif +#endif +#if !defined(_CG_STATIC_CONSTEXPR_QUALIFIER) +# if defined(_CG_CPP11_FEATURES) +# define _CG_STATIC_CONSTEXPR_QUALIFIER static constexpr __forceinline__ __device__ +# else +# define _CG_STATIC_CONSTEXPR_QUALIFIER _CG_STATIC_QUALIFIER +# endif +#endif + +#if defined(_MSC_VER) +# define _CG_DEPRECATED __declspec(deprecated) +#else +# define _CG_DEPRECATED __attribute__((deprecated)) +#endif + +#if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__) +# define _CG_HAS_GRID_GROUP +#endif +#if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__) +# define _CG_HAS_MULTI_GRID_GROUP +#endif +#if (__CUDA_ARCH__ >= 700) || !defined(__CUDA_ARCH__) +# define _CG_HAS_MATCH_COLLECTIVE +#endif +#if ((__CUDA_ARCH__ >= 900) || !defined(__CUDA_ARCH__)) && (defined(__NVCC__) || defined(__CUDACC_RTC__) || defined(_CG_CLUSTER_INTRINSICS_AVAILABLE)) +# define _CG_HAS_CLUSTER_GROUP +#endif +// Has __half and __half2 +// Only usable if you include the cuda_fp16.h extension, and +// _before_ including cooperative_groups.h +#ifdef __CUDA_FP16_TYPES_EXIST__ +# define _CG_HAS_FP16_COLLECTIVE +#endif + +#if (__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__) && (defined(__NVCC__) || defined(__CUDACC_RTC__)) +# define _CG_HAS_OP_REDUX +#endif + +// Include libcu++ where supported. 
+#if defined(_CG_CPP11_FEATURES) && !defined(__QNX__) && !defined(__ibmxl__) && \ + (defined(__NVCC__) || defined(__CUDACC_RTC__)) && \ + (defined(__x86_64__) || defined(__aarch64__) || defined(__ppc64__)|| defined(_M_X64) || defined(_M_ARM64)) && \ + (defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__)) +# define _CG_USE_CUDA_STL +#else +# define _CG_USE_OWN_TRAITS +#endif + +#if defined(_CG_USE_CUDA_STL) && (!defined(__CUDA_ARCH__) || \ + ((!defined(_MSC_VER) && __CUDA_ARCH__ >= 600) || (defined(_MSC_VER) && __CUDA_ARCH__ >= 700))) +# define _CG_HAS_STL_ATOMICS +#endif + +#ifdef _CG_CPP11_FEATURES +// Use cuda::std:: for type_traits +# if defined(_CG_USE_CUDA_STL) +# define _CG_STL_NAMESPACE cuda::std +# include <cuda/std/type_traits> +// Use CG's implementation of type traits +# else +# define _CG_STL_NAMESPACE cooperative_groups::details::templates +# endif +#endif + +#ifdef _CG_CPP11_FEATURES +# define _CG_STATIC_CONST_DECL static constexpr +# define _CG_CONST_DECL constexpr +#else +# define _CG_STATIC_CONST_DECL static const +# define _CG_CONST_DECL const +#endif + +#if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__) +# define _CG_ASM_PTR_CONSTRAINT "r" +#else +# define _CG_ASM_PTR_CONSTRAINT "l" +#endif + +/* +** Define: CG_DEBUG +** What: Enables various runtime safety checks +*/ +#if defined(__CUDACC_DEBUG__) && defined(CG_DEBUG) && !defined(NDEBUG) +# define _CG_DEBUG +#endif + +#if defined(_CG_DEBUG) +# include <assert.h> +# define _CG_ASSERT(x) assert((x)); +# define _CG_ABORT() assert(0); +#else +# define _CG_ASSERT(x) +# define _CG_ABORT() __trap(); +#endif + +#if defined(_CG_CPP11_FEATURES) && !defined(_CG_USE_CUDA_STL) +_CG_BEGIN_NAMESPACE + +namespace details { +namespace templates { + +/** + * Integral constants + **/ +template <typename Ty, Ty Val> +struct integral_constant { + static constexpr Ty value = Val; + typedef Ty type; + + _CG_QUALIFIER constexpr operator type() const noexcept { return value; } + _CG_QUALIFIER constexpr type operator()() const noexcept { return value; } +}; + +typedef integral_constant<bool, true> true_type; +typedef integral_constant<bool, false> false_type; + +/** + * CV Qualifiers + **/ +template <class Ty> struct is_lvalue_reference : public details::templates::false_type {}; +template <class Ty> struct is_lvalue_reference<Ty&> : public details::templates::true_type {}; + +template <class Ty> struct remove_reference {typedef Ty type;}; +template <class Ty> struct remove_reference<Ty&> {typedef Ty type;}; +template <class Ty> struct remove_reference<Ty&&> {typedef Ty type;}; + +template <class Ty> +using remove_reference_t = typename details::templates::remove_reference<Ty>::type; + +template <class Ty> struct remove_const {typedef Ty type;}; +template <class Ty> struct remove_const<const Ty> {typedef Ty type;}; + +template <class Ty> struct remove_volatile {typedef Ty type;}; +template <class Ty> struct remove_volatile<volatile Ty> {typedef Ty type;}; + +template <class Ty> struct remove_cv {typedef typename details::templates::remove_volatile<typename details::templates::remove_const<Ty>::type>::type type;}; + +template <class Ty> +using remove_cv_t = typename details::templates::remove_cv<Ty>::type; + +template <class Ty> +_CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &t) noexcept { + return static_cast<Ty&&>(t); +} + +template <class Ty> +_CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &&t) noexcept { + static_assert(!details::templates::is_lvalue_reference<Ty>::value, "Forwarding an rvalue as an lvalue is not 
allowed."); + return static_cast<Ty&&>(t); +} + +/** + * is_integral + **/ +template <class Ty> struct _is_integral : public details::templates::false_type {}; +template <> struct _is_integral<bool> : public details::templates::true_type {}; +template <> struct _is_integral<char> : public details::templates::true_type {}; +template <> struct _is_integral<unsigned char> : public details::templates::true_type {}; +template <> struct _is_integral<short> : public details::templates::true_type {}; +template <> struct _is_integral<unsigned short> : public details::templates::true_type {}; +template <> struct _is_integral<int> : public details::templates::true_type {}; +template <> struct _is_integral<unsigned int> : public details::templates::true_type {}; +template <> struct _is_integral<long> : public details::templates::true_type {}; +template <> struct _is_integral<long long> : public details::templates::true_type {}; +template <> struct _is_integral<unsigned long> : public details::templates::true_type {}; +template <> struct _is_integral<unsigned long long> : public details::templates::true_type {}; +//Vector type support? + +template <typename Ty> +struct is_integral : public details::templates::_is_integral<typename details::templates::remove_cv<Ty>::type> {}; + +/** + * is_floating_point + **/ +template <class Ty> struct _is_floating_point : public details::templates::false_type {}; +template <> struct _is_floating_point<float> : public details::templates::true_type {}; +template <> struct _is_floating_point<double> : public details::templates::true_type {}; +template <> struct _is_floating_point<long double> : public details::templates::true_type {}; +# ifdef __CUDA_FP16_TYPES_EXIST__ +template <> struct _is_floating_point<__half> : public details::templates::true_type {}; +template <> struct _is_floating_point<__half2> : public details::templates::true_type {}; +# endif +//Vector type support? 
+ +template <typename Ty> +struct is_floating_point : public details::templates::_is_floating_point<typename details::templates::remove_cv<Ty>::type> {}; + +template <class T> +struct is_arithmetic : details::templates::integral_constant< + bool, + details::templates::is_integral<T>::value || + details::templates::is_floating_point<T>::value> {}; + +template <typename Ty, bool = details::templates::is_arithmetic<Ty>::value> +struct _is_unsigned : details::templates::integral_constant<bool, Ty(0) < Ty(-1)> {}; + +template <typename Ty> +struct _is_unsigned<Ty,false> : details::templates::false_type {}; + +template <typename Ty> +struct is_unsigned : _is_unsigned<typename details::templates::remove_cv<Ty>::type> {}; + +/** + * programmatic type traits + **/ +template<bool B, class Ty = void> +struct enable_if {}; + +template<class Ty> +struct enable_if<true, Ty> { typedef Ty type; }; + +template<bool Cond, typename Ty = void> +using enable_if_t = typename details::templates::enable_if<Cond, Ty>::type; + +template<class Ty1, class Ty2> +struct is_same : details::templates::false_type {}; + +template<class Ty> +struct is_same<Ty, Ty> : details::templates::true_type {}; + +} // templates +} // details +_CG_END_NAMESPACE + +#endif // _CG_CPP11_FEATURES + +#endif // _CG_INFO_H_ diff --git a/ext/cudart/include/cooperative_groups/details/partitioning.h b/ext/cudart/include/cooperative_groups/details/partitioning.h new file mode 100644 index 0000000000000000000000000000000000000000..c38418657d149e9527f9a01ce5a9f18e0f2bec61 --- /dev/null +++ b/ext/cudart/include/cooperative_groups/details/partitioning.h @@ -0,0 +1,133 @@ +/* + * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CG_PARTITIONING_H +#define _CG_PARTITIONING_H + +#include "info.h" +#include "helpers.h" + +_CG_BEGIN_NAMESPACE + +namespace details { + + template <typename TyGroup> + _CG_STATIC_QUALIFIER coalesced_group _binary_partition(const TyGroup &tile, bool pred) { + const unsigned int fullMask = ~0u; + + unsigned int thisMask = _coalesced_group_data_access::get_mask(tile); + unsigned int predMask = pred ? 0 : fullMask; + unsigned int setMask = __ballot_sync(thisMask, pred); + + if (setMask == thisMask || setMask == 0) { + coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(thisMask); + _coalesced_group_data_access::modify_meta_group(subTile, 0, 1); + return subTile; + } + else { + unsigned int subMask = thisMask & (setMask ^ predMask); + coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask); + _coalesced_group_data_access::modify_meta_group(subTile, pred, 2); + return subTile; + } + } + +#ifdef _CG_HAS_MATCH_COLLECTIVE + template <typename TyGroup, typename TyPredicate> + _CG_STATIC_QUALIFIER coalesced_group _labeled_partition(const TyGroup &tile, TyPredicate pred) { + unsigned int thisMask = _coalesced_group_data_access::get_mask(tile); + unsigned int thisBias = __ffs(thisMask) - 1; // Subtract 1 to index properly from [1-32] + unsigned int subMask = __match_any_sync(thisMask, pred); + + coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask); + + int leaderLaneId = subTile.shfl(details::laneid(), 0); + + bool isLeader = !subTile.thread_rank(); + unsigned int leaderMask = __ballot_sync(thisMask, isLeader); + unsigned int tileRank = __fns(leaderMask, leaderLaneId, 0) - thisBias; + + _coalesced_group_data_access::modify_meta_group(subTile, tileRank, __popc(leaderMask)); + + return subTile; + } +#endif +}; // namespace details + +_CG_STATIC_QUALIFIER coalesced_group binary_partition(const coalesced_group &tile, bool pred) { + return details::_binary_partition(tile, pred); +} + +template <unsigned int Size, typename ParentT> +_CG_STATIC_QUALIFIER coalesced_group binary_partition(const thread_block_tile<Size, ParentT> &tile, bool pred) { +#ifdef _CG_CPP11_FEATURES + static_assert(Size <= 32, "Binary partition is available only for tiles of size smaller or equal to 32"); +#endif + return details::_binary_partition(tile, pred); +} + + +#if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES) +template <typename TyPredicate> +_CG_STATIC_QUALIFIER coalesced_group labeled_partition(const coalesced_group &tile, TyPredicate pred) { + static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value, "labeled_partition predicate must be an integral type"); + return details::_labeled_partition(tile, pred); +} + +template <typename TyPredicate, unsigned int Size, typename 
ParentT> +_CG_STATIC_QUALIFIER coalesced_group labeled_partition(const thread_block_tile<Size, ParentT> &tile, TyPredicate pred) { + static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value, "labeled_partition predicate must be an integral type"); + static_assert(Size <= 32, "Labeled partition is available only for tiles of size smaller or equal to 32"); + return details::_labeled_partition(tile, pred); +} +#endif + +_CG_END_NAMESPACE + +#endif // _CG_PARTITIONING_H diff --git a/ext/cudart/include/cooperative_groups/details/reduce.h b/ext/cudart/include/cooperative_groups/details/reduce.h new file mode 100644 index 0000000000000000000000000000000000000000..240296133e1a7ef4bcf3249eb965ee101c6020de --- /dev/null +++ b/ext/cudart/include/cooperative_groups/details/reduce.h @@ -0,0 +1,430 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#ifndef _CG_REDUCE_H_ +#define _CG_REDUCE_H_ + +#include "info.h" +#include "helpers.h" +#include "coalesced_reduce.h" +#include "functional.h" +#include "cooperative_groups.h" + +_CG_BEGIN_NAMESPACE + +namespace details { + + template <class Ty> + using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant< + bool, + _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) <= 4)>; + + template <class Ty> + using redux_is_add_supported = _redux_is_add_supported<Ty>; + + // A specialization for 64 bit logical operations is possible + // but for now only accelerate 32 bit bitwise ops + template <class Ty> + using redux_is_logical_supported = redux_is_add_supported<Ty>; + + // Base operator support case + template <class TyOp, class Ty> struct _redux_op_supported : public _CG_STL_NAMESPACE::false_type {}; +#ifdef _CG_HAS_OP_REDUX + template <class Ty> struct _redux_op_supported<cooperative_groups::plus<Ty>, Ty> : public redux_is_add_supported<Ty> {}; + template <class Ty> struct _redux_op_supported<cooperative_groups::less<Ty>, Ty> : public redux_is_add_supported<Ty> {}; + template <class Ty> struct _redux_op_supported<cooperative_groups::greater<Ty>, Ty> : public redux_is_add_supported<Ty> {}; + template <class Ty> struct _redux_op_supported<cooperative_groups::bit_and<Ty>, Ty> : public redux_is_logical_supported<Ty> {}; + template <class Ty> struct _redux_op_supported<cooperative_groups::bit_or<Ty>, Ty> : public redux_is_logical_supported<Ty> {}; + template <class Ty> struct _redux_op_supported<cooperative_groups::bit_xor<Ty>, Ty> : public redux_is_logical_supported<Ty> {}; +#endif + + template <class Ty, template <class> class TyOp> + using redux_op_supported = _redux_op_supported< + typename details::remove_qual<TyOp<Ty>>, + Ty>; + + // Groups smaller than 16 actually have worse performance characteristics when used with redux + // tiles of size 16 and 32 perform the same or better and have better code generation profiles + template <class TyGroup> struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {}; + + template <unsigned int Sz, typename TyPar> + struct _redux_group_optimized<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant< + bool, + (Sz >= 16)> {}; + template <unsigned int Sz, typename TyPar> + struct _redux_group_optimized<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant< + bool, + (Sz >= 16)> {}; + template <> + struct _redux_group_optimized<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {}; + + template <typename TyGroup> + using redux_group_optimized = _redux_group_optimized<details::remove_qual<TyGroup>>; + + template <template <class> class TyOp> + _CG_STATIC_QUALIFIER int pick_redux(int mask, int val); + template <template <class> class TyOp> + _CG_STATIC_QUALIFIER unsigned int pick_redux(int mask, unsigned int val); + +#ifdef _CG_HAS_OP_REDUX + template <> _CG_QUALIFIER int pick_redux<cooperative_groups::plus>(int mask, int val) { + return __reduce_add_sync(mask, val); + } + template <> _CG_QUALIFIER int pick_redux<cooperative_groups::less>(int mask, int val) { + return __reduce_min_sync(mask, val); + } + template <> _CG_QUALIFIER int pick_redux<cooperative_groups::greater>(int mask, int val) { + return __reduce_max_sync(mask, val); + } + template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_and>(int mask, int val) { + return __reduce_and_sync(mask, val); + } + template <> _CG_QUALIFIER int 
pick_redux<cooperative_groups::bit_xor>(int mask, int val) { + return __reduce_xor_sync(mask, val); + } + template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_or>(int mask, int val) { + return __reduce_or_sync(mask, val); + } + + template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::plus>(int mask, unsigned int val) { + return __reduce_add_sync(mask, val); + } + template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::less>(int mask, unsigned int val) { + return __reduce_min_sync(mask, val); + } + template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::greater>(int mask, unsigned int val) { + return __reduce_max_sync(mask, val); + } + template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_and>(int mask, unsigned int val) { + return __reduce_and_sync(mask, val); + } + template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_xor>(int mask, unsigned int val) { + return __reduce_xor_sync(mask, val); + } + template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_or>(int mask, unsigned int val) { + return __reduce_or_sync(mask, val); + } +#endif + + + template <typename TyVal, bool = _CG_STL_NAMESPACE::is_unsigned<TyVal>::value> + struct _accelerated_op; + + // Signed type redux intrinsic dispatch + template <typename TyVal> + struct _accelerated_op<TyVal, false> { + template <template <class> class TyOp> + _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) { + return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<int>(val))); + } + }; + + // Unsigned type redux intrinsic dispatch + template <typename TyVal> + struct _accelerated_op<TyVal, true> { + template <template <class> class TyOp> + _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) { + return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<unsigned int>(val))); + } + }; + + template <typename TyVal> + using accelerated_op = _accelerated_op<TyVal>; + + + template <typename TyVal, typename TyFnInput, typename TyGroup> + class _redux_dispatch { + template <class Ty, template <class> class TyOp> + using _redux_is_usable = _CG_STL_NAMESPACE::integral_constant<bool, + redux_op_supported<Ty, TyOp>::value && + redux_group_optimized<TyGroup>::value>; + + template <class Ty, template <class> class TyOp> + using redux_is_usable = typename _CG_STL_NAMESPACE::enable_if<_redux_is_usable<Ty, TyOp>::value, void>::type*; + + template <class Ty, template <class> class TyOp> + using redux_is_not_usable = typename _CG_STL_NAMESPACE::enable_if<!_redux_is_usable<Ty, TyOp>::value, void>::type*; + + public: + // Dispatch to redux if the combination of op and args are supported + template< + template <class> class TyOp, + redux_is_usable<TyFnInput, TyOp> = nullptr> + _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) { + // Retrieve the mask for the group and dispatch to redux + return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val)); + } + + template< + template <class> class TyOp, + redux_is_usable<TyFnInput, TyOp> = nullptr> + _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) { + // Retrieve the mask for the group and dispatch to redux + return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val)); + } + + // Fallback shuffle sync reduction + 
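/*
 * [Editorial note, not part of the vendored header] Usage sketch for the dispatch above:
 * with _CG_HAS_OP_REDUX (sm_80+), an integral plus/min/max/bitwise reduction over a tile
 * of size >= 16 takes the __reduce_*_sync path selected by redux_op_supported and
 * redux_group_optimized; other combinations fall back to the shuffle-based
 * coalesced_reduce. The kernel below is hypothetical.
 */
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
namespace cg = cooperative_groups;

__global__ void warp_sum_sketch(const int *in, int *block_sums, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int v = (idx < n) ? in[idx] : 0;

    auto warp = cg::tiled_partition<32>(cg::this_thread_block());
    int warp_sum = cg::reduce(warp, v, cg::plus<int>());   // redux-accelerated when available

    if (warp.thread_rank() == 0)
        atomicAdd(&block_sums[blockIdx.x], warp_sum);
}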
template < + template <class> class TyOp, + redux_is_not_usable<TyFnInput, TyOp> = nullptr> + _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) { + //Dispatch to fallback shuffle sync accelerated reduction + return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op)); + } + + }; + + // Group support for reduce. + template <class TyGroup> struct _reduce_group_supported : public _CG_STL_NAMESPACE::false_type {}; + + template <unsigned int Sz, typename TyPar> + struct _reduce_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {}; + template <unsigned int Sz, typename TyPar> + struct _reduce_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {}; + template <> + struct _reduce_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {}; + + template <typename TyGroup> + using reduce_group_supported = _reduce_group_supported<details::remove_qual<TyGroup>>; + + template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup> + _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) { + static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ"); + + using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>; + return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op)); + } + + template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup> + _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) { + static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ"); + + using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>; + return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op)); + } + + + template <typename TyVal, typename TyOp, typename TyGroup> + _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) { + return details::coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op)); + } + + template <unsigned int GroupId> + struct tile_reduce_dispatch; + + template <> + struct tile_reduce_dispatch<details::coalesced_group_id> { + template <typename TyGroup, typename TyVal, typename TyFn> + _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) { + return details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op)); + } + }; + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + template <> + struct tile_reduce_dispatch<details::multi_tile_group_id> { + template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn> + _CG_STATIC_QUALIFIER auto reduce(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) { + using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>; + using TyRet = details::remove_qual<TyVal>; + const unsigned int num_warps = Size / 32; + + auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) { + *warp_scratch_location = + 
details::reduce(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op); + }; + auto inter_warp_lambda = + [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) { + *thread_scratch_location = + details::reduce(subwarp, *thread_scratch_location, _CG_STL_NAMESPACE::forward<TyFn>(op)); + }; + return details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda); + } + }; + + enum class AsyncReduceType { store, update }; + + template <AsyncReduceType TyAsyncReduce> + struct async_reduce_result_handler; + + template<> + struct async_reduce_result_handler<AsyncReduceType::store> { + template<typename TyDst, typename TyVal, typename TyOp> + _CG_STATIC_QUALIFIER void handleResult(TyDst *dst, TyVal& result, TyOp&& op) { + *dst = result; + } + }; + +#if defined(_CG_HAS_STL_ATOMICS) + template<> + struct async_reduce_result_handler<AsyncReduceType::update> { + template<typename TyDst, typename TyVal, typename TyOp> + _CG_STATIC_QUALIFIER void handleResult(TyDst& dst, TyVal& result, TyOp&& op) { + atomic_update(dst, result, _CG_STL_NAMESPACE::forward<TyOp>(op)); + } + }; +#endif + + template <unsigned int GroupId, AsyncReduceType TyAsyncReduce> + struct tile_async_reduce_dispatch; + + template <AsyncReduceType TyAsyncReduce> + struct tile_async_reduce_dispatch<details::coalesced_group_id, TyAsyncReduce> { + template <unsigned int TySize, typename ParentT, typename TyDst, typename TyVal, typename TyFn> + _CG_STATIC_QUALIFIER void reduce(const __single_warp_thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyVal&& val, TyFn&& op) { + // Do regular, in group reduction + auto result = details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op); + + // One thread stores/updates the destination + if (group.thread_rank() == 0) { + async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op)); + } + } + + template <typename TyDst, typename TyVal, typename TyFn> + _CG_STATIC_QUALIFIER void reduce(const coalesced_group& group, TyDst& dst, TyVal&& val, TyFn&& op) { + // Do in group reduction to the last thread + auto result = details::coalesced_reduce_to_one(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op); + + // One thread stores/updates the destination + if (group.thread_rank() == group.size() - 1) { + async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op)); + } + } + }; + + template <AsyncReduceType TyAsyncReduce> + struct tile_async_reduce_dispatch<details::multi_tile_group_id, TyAsyncReduce> { + template <unsigned int TySize, typename ParentT, typename TyDst, typename TyInputVal, typename TyFn> + _CG_STATIC_QUALIFIER void reduce(const thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyInputVal&& val, TyFn&& op) { + using TyVal = remove_qual<TyInputVal>; + const unsigned int num_warps = TySize / 32; + details::barrier_t* sync_location = multi_warp_sync_location_getter(group); + auto warp_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, group.thread_rank() / 32); + + // Do in warp reduce + auto warp = details::tiled_partition_internal<32, thread_block_tile<TySize, ParentT>>(); + *warp_scratch_location = details::reduce(warp, _CG_STL_NAMESPACE::forward<TyInputVal>(val), op); + + // Tile of size num_warps from the last warp to arrive does final reduction step + if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), num_warps)) { + auto subwarp = 
details::tiled_partition_internal<num_warps, decltype(warp)>(); + if (subwarp.meta_group_rank() == 0) { + auto thread_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, subwarp.thread_rank()); + auto thread_val = *thread_scratch_location; + // Release other warps, we read their contribution already. + subwarp.sync(); + details::sync_warps_release(sync_location, subwarp.thread_rank() == 0, details::cta::thread_rank(), num_warps); + TyVal result = details::reduce(subwarp, thread_val, op); + // One thread stores the result or updates the atomic + if (subwarp.thread_rank() == 0) { + async_reduce_result_handler<TyAsyncReduce>::handleResult(dst, result, _CG_STL_NAMESPACE::forward<TyFn>(op)); + } + } + warp.sync(); + } + } + }; +#endif + + template <typename TyGroup, typename TyInputVal, typename TyRetVal> + _CG_QUALIFIER void check_reduce_params() { + static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ"); + static_assert(details::reduce_group_supported<TyGroup>::value, "This group does not exclusively represent a tile"); + }; + + template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal> + _CG_QUALIFIER void check_async_reduce_params() { + check_reduce_params<TyGroup, TyInputVal, TyRetVal>(); + static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ"); + } +} // details + +template <typename TyGroup, typename TyVal, typename TyFn> +_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) { + details::check_reduce_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>(); + + using dispatch = details::tile_reduce_dispatch<TyGroup::_group_id>; + return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op)); +} + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) +namespace experimental { + + #if defined(_CG_HAS_STL_ATOMICS) + template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn> + void _CG_QUALIFIER reduce_update_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) { + details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); + + using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::update>; + dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op)); + } + + template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn> + void _CG_QUALIFIER reduce_update_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) { + details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); + + using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::update>; + dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op)); + } + #endif + + template<typename TyGroup, typename TyVal, typename TyInputVal, typename TyFn> + void _CG_QUALIFIER reduce_store_async(const TyGroup& group, TyVal* dst, TyInputVal&& val, TyFn&& op) { + details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); + + using dispatch = 
details::tile_async_reduce_dispatch<TyGroup::_group_id, details::AsyncReduceType::store>; + dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op)); + } + +} +#endif + +_CG_END_NAMESPACE + +#endif // _CG_REDUCE_H_ diff --git a/ext/cudart/include/cooperative_groups/details/scan.h b/ext/cudart/include/cooperative_groups/details/scan.h new file mode 100644 index 0000000000000000000000000000000000000000..badc50de13a276784ba61397ad13e1dc87cec269 --- /dev/null +++ b/ext/cudart/include/cooperative_groups/details/scan.h @@ -0,0 +1,324 @@ +/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CG_SCAN_H_ +#define _CG_SCAN_H_ + +#include "info.h" +#include "helpers.h" +#include "functional.h" +#include "coalesced_scan.h" + +_CG_BEGIN_NAMESPACE + +namespace details { + + // Group support for scan. 
+ template <class TyGroup> struct _scan_group_supported : public _CG_STL_NAMESPACE::false_type {}; + + template <unsigned int Sz, typename TyPar> + struct _scan_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {}; + template <unsigned int Sz, typename TyPar> + struct _scan_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {}; + template <> + struct _scan_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {}; + + template <typename TyGroup> + using scan_group_supported = _scan_group_supported<details::remove_qual<TyGroup>>; + + template <bool IsIntegralPlus> + struct integral_optimized_scan; + + enum class ScanType { exclusive, inclusive }; + + template <unsigned int GroupId, ScanType TyScan> + struct scan_dispatch; + + template <ScanType TyScan> + struct scan_dispatch<details::coalesced_group_id, TyScan> { + template <typename TyGroup, typename TyVal, typename TyFn> + _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) { + auto scan_result = coalesced_inclusive_scan(group, val, op); + if (TyScan == ScanType::exclusive) { + scan_result = convert_inclusive_to_exclusive(group, + scan_result, + _CG_STL_NAMESPACE::forward<TyVal>(val), + _CG_STL_NAMESPACE::forward<TyFn>(op)); + } + return scan_result; + } + }; + +#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL) + template <ScanType TyScan> + struct scan_dispatch<details::multi_tile_group_id, TyScan> { + template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn> + _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) { + using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>; + using TyRet = details::remove_qual<TyVal>; + const unsigned int num_warps = Size / 32; + // In warp scan result, calculated in warp_lambda + TyRet warp_scan; + + // In warp scan, put sum in the warp_scratch_location + auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) { + warp_scan = + details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op); + if (warp.thread_rank() + 1 == warp.size()) { + *warp_scratch_location = warp_scan; + } + if (TyScan == ScanType::exclusive) { + warp_scan = warp.shfl_up(warp_scan, 1); + } + }; + + // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it + // to its in-warp scan result + auto inter_warp_lambda = + [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) { + auto thread_val = *thread_scratch_location; + auto result = coalesced_inclusive_scan(subwarp, thread_val, op); + *thread_scratch_location = convert_inclusive_to_exclusive(subwarp, result, thread_val, op); + }; + + TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda); + if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) { + return previous_warps_sum; + } + if (warpType::meta_group_rank() == 0) { + return warp_scan; + } + else { + return op(warp_scan, previous_warps_sum); + } + } + }; + +#if defined(_CG_HAS_STL_ATOMICS) + template <unsigned int GroupId, ScanType TyScan> + struct scan_update_dispatch; + + template <ScanType TyScan> + struct scan_update_dispatch<details::coalesced_group_id, TyScan> { + template 
<typename TyGroup, typename TyAtomic, typename TyVal, typename TyFn> + _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) { + details::remove_qual<TyVal> old; + + // Do regular in group scan + auto scan_result = details::coalesced_inclusive_scan(group, val, op); + + // Last thread updates the atomic and distributes its old value to other threads + if (group.thread_rank() == group.size() - 1) { + old = atomic_update(dst, scan_result, _CG_STL_NAMESPACE::forward<TyFn>(op)); + } + old = group.shfl(old, group.size() - 1); + if (TyScan == ScanType::exclusive) { + scan_result = convert_inclusive_to_exclusive(group, scan_result, _CG_STL_NAMESPACE::forward<TyVal>(val), op); + } + scan_result = op(old, scan_result); + return scan_result; + } + }; + + template <ScanType TyScan> + struct scan_update_dispatch<details::multi_tile_group_id, TyScan> { + template <unsigned int Size, typename ParentT, typename TyAtomic, typename TyVal, typename TyFn> + _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) { + using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>; + using TyRet = details::remove_qual<TyVal>; + const unsigned int num_warps = Size / 32; + // In warp scan result, calculated in warp_lambda + TyRet warp_scan; + + // In warp scan, put sum in the warp_scratch_location + auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) { + warp_scan = + details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op); + if (warp.thread_rank() + 1 == warp.size()) { + *warp_scratch_location = warp_scan; + } + if (TyScan == ScanType::exclusive) { + warp_scan = warp.shfl_up(warp_scan, 1); + } + }; + + // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it + // to its in-warp scan result + auto inter_warp_lambda = + [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) { + auto thread_val = *thread_scratch_location; + auto scan_result = details::coalesced_inclusive_scan(subwarp, thread_val, op); + TyRet offset; + // Single thread does the atomic update with sum of all contributions and reads the old value. + if (subwarp.thread_rank() == subwarp.size() - 1) { + offset = details::atomic_update(dst, scan_result, op); + } + offset = subwarp.shfl(offset, subwarp.size() - 1); + scan_result = convert_inclusive_to_exclusive(subwarp, scan_result, thread_val, op); + // Add offset read from the atomic to the scanned warp sum. + // Skipping first thread, since it got defautly constructed value from the conversion, + // it should just return the offset received from the thread that did the atomic update. 
+ if (subwarp.thread_rank() != 0) { + offset = op(scan_result, offset); + } + *thread_scratch_location = offset; + }; + + TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda); + if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) { + return previous_warps_sum; + } + return op(warp_scan, previous_warps_sum); + } + }; +#endif +#endif + + template <typename TyGroup, typename TyInputVal, typename TyRetVal> + _CG_QUALIFIER void check_scan_params() { + static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ"); + static_assert(details::scan_group_supported<TyGroup>::value, "This group does not exclusively represent a tile"); + } + +#if defined(_CG_HAS_STL_ATOMICS) + template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal> + _CG_QUALIFIER void check_scan_update_params() { + check_scan_params<TyGroup, TyInputVal, TyRetVal>(); + static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ"); + } +#endif + +} // details + +template <typename TyGroup, typename TyVal, typename TyFn> +_CG_QUALIFIER auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) { + details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>(); + + using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::inclusive>; + return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op)); +} + +template <typename TyGroup, typename TyVal> +_CG_QUALIFIER details::remove_qual<TyVal> inclusive_scan(const TyGroup& group, TyVal&& val) { + return inclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>()); +} + +template <typename TyGroup, typename TyVal, typename TyFn> +_CG_QUALIFIER auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) { + details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>(); + + using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::exclusive>; + return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op)); +} + +template <typename TyGroup, typename TyVal> +_CG_QUALIFIER details::remove_qual<TyVal> exclusive_scan(const TyGroup& group, TyVal&& val) { + return exclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>()); +} + +#if defined(_CG_HAS_STL_ATOMICS) && defined(_CG_ABI_EXPERIMENTAL) + +namespace experimental { + template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn> + _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) { + details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); + + using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>; + return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op)); + } + + template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco> + _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco> & dst, TyInputVal&& val) { + return inclusive_scan_update(group, dst, 
_CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>()); + } + + template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn> + _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) { + details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); + + using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>; + return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op)); + } + + template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco> + _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val) { + return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>()); + } + + template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn> + _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) { + details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); + + using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>; + return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op)); + } + + template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco> + _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco> & dst, TyInputVal&& val) { + return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>()); + } + + template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn> + _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) { + details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>(); + + using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>; + return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op)); + } + + template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco> + _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val) { + return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>()); + } +} + +#endif + +_CG_END_NAMESPACE + +#endif // _CG_SCAN_H_ diff --git a/ext/cudart/include/cooperative_groups/details/sync.h b/ext/cudart/include/cooperative_groups/details/sync.h new file mode 100644 index 0000000000000000000000000000000000000000..af144a68a877a00f662e2f6c36417f284e47d337 --- /dev/null +++ b/ext/cudart/include/cooperative_groups/details/sync.h @@ -0,0 +1,276 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. 
and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
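// [Editor's note] The public scan API that ends just above (cooperative_groups/scan.h details)
// is easiest to read next to a usage sketch. The kernel below is a minimal, hypothetical
// example and is not part of the vendored headers: it uses cg::exclusive_scan on a 32-thread
// tile to turn per-thread keep-flags into write offsets. The names compact_positive and
// out_count are illustrative assumptions.
#include <cooperative_groups.h>
#include <cooperative_groups/scan.h>

namespace cg = cooperative_groups;

// Compact the positive elements of `in` into `out`; `out_count` receives the output size.
__global__ void compact_positive(const int* in, int* out, int* out_count, int n) {
    cg::thread_block block = cg::this_thread_block();
    auto tile = cg::tiled_partition<32>(block);

    int idx  = blockIdx.x * blockDim.x + threadIdx.x;
    int keep = (idx < n && in[idx] > 0) ? 1 : 0;

    // Exclusive prefix sum of `keep` across the tile gives each thread its local offset.
    int offset = cg::exclusive_scan(tile, keep, cg::plus<int>());

    // The last rank knows the tile total (its offset + its own flag); it reserves the range.
    int base = 0;
    if (tile.thread_rank() == tile.size() - 1) {
        base = atomicAdd(out_count, offset + keep);
    }
    base = tile.shfl(base, tile.size() - 1);  // broadcast the reserved base to the whole tile

    if (keep) {
        out[base + offset] = in[idx];
    }
}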
+ */ + +#ifndef _CG_GRID_H +#define _CG_GRID_H + +#include "info.h" + +_CG_BEGIN_NAMESPACE + +namespace details +{ + +typedef unsigned int barrier_t; + +_CG_STATIC_QUALIFIER bool bar_has_flipped(unsigned int old_arrive, unsigned int current_arrive) { + return (((old_arrive ^ current_arrive) & 0x80000000) != 0); +} + +_CG_STATIC_QUALIFIER void bar_flush(volatile unsigned int *addr) { +#if __CUDA_ARCH__ < 700 + __threadfence(); +#else + unsigned int val; + asm volatile("ld.acquire.gpu.u32 %0,[%1];" : "=r"(val) : _CG_ASM_PTR_CONSTRAINT((unsigned int*)addr) : "memory"); + // Avoids compiler warnings from unused variable val + (void)(val = val); +#endif +} + +_CG_STATIC_QUALIFIER unsigned int atomic_add(volatile unsigned int *addr, unsigned int val) { + unsigned int old; +#if __CUDA_ARCH__ < 700 + old = atomicAdd((unsigned int*)addr, val); +#else + asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;" : "=r"(old) : _CG_ASM_PTR_CONSTRAINT((unsigned int*)addr), "r"(val) : "memory"); +#endif + return old; +} + +_CG_STATIC_QUALIFIER void sync_grids(unsigned int expected, volatile barrier_t *arrived) { + bool cta_master = (threadIdx.x + threadIdx.y + threadIdx.z == 0); + bool gpu_master = (blockIdx.x + blockIdx.y + blockIdx.z == 0); + + __syncthreads(); + + if (cta_master) { + unsigned int nb = 1; + if (gpu_master) { + nb = 0x80000000 - (expected - 1); + } + + __threadfence(); + + unsigned int oldArrive; + oldArrive = atomic_add(arrived, nb); + + while (!bar_has_flipped(oldArrive, *arrived)); + + //flush barrier upon leaving + bar_flush((unsigned int*)arrived); + } + + __syncthreads(); +} + +/* - Multi warp groups synchronization routines - */ + +// Get synchronization bit mask of my thread_block_tile of size num_warps. Thread ranks 0..31 have the first bit assigned to them, +// thread ranks 32..63 second etc +// Bit masks are unique for each group, groups of the same size will have the same number of bits set, but on different positions +_CG_STATIC_QUALIFIER unsigned int get_group_mask(unsigned int thread_rank, unsigned int num_warps) { + return num_warps == 32 ? ~0 : ((1 << num_warps) - 1) << (num_warps * (thread_rank / (num_warps * 32))); +} + +// Default blocking sync. +_CG_STATIC_QUALIFIER void sync_warps(volatile barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) { + unsigned int warp_id = thread_rank / 32; + bool warp_master = (thread_rank % 32 == 0); + unsigned int warp_bit = 1 << warp_id; + unsigned int group_mask = get_group_mask(thread_rank, num_warps); + + __syncwarp(0xFFFFFFFF); + + if (warp_master) { + unsigned int old = atomicOr((unsigned int *)arrived, warp_bit); + if (((old | warp_bit) & group_mask) == group_mask) { + atomicAnd((unsigned int *)arrived, ~group_mask); + } + else { + while(*arrived & warp_bit); + } + } + + __syncwarp(0xFFFFFFFF); +} + +// Blocking sync, except the last arriving warp, that releases other warps, returns to do other stuff first. +// Warp returning true from this function needs to call sync_warps_release. 
+_CG_STATIC_QUALIFIER bool sync_warps_last_releases(volatile barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) { + unsigned int warp_id = thread_rank / 32; + bool warp_master = (thread_rank % 32 == 0); + unsigned int warp_bit = 1 << warp_id; + unsigned int group_mask = get_group_mask(thread_rank, num_warps); + + __syncwarp(0xFFFFFFFF); + + unsigned int old = 0; + if (warp_master) { + old = atomicOr((unsigned int *)arrived, warp_bit); + } + old = __shfl_sync(0xFFFFFFFF, old, 0); + if (((old | warp_bit) & group_mask) == group_mask) { + return true; + } + while(*arrived & warp_bit); + + return false; +} + +// Release my group from the barrier. +_CG_STATIC_QUALIFIER void sync_warps_release(volatile barrier_t *arrived, bool is_master, unsigned int thread_rank, unsigned int num_warps) { + unsigned int group_mask = get_group_mask(thread_rank, num_warps); + if (is_master) { + atomicAnd((unsigned int *)arrived, ~group_mask); + } +} + +// Arrive at my group barrier, but don't block or release the barrier, even if every one arrives. +// sync_warps_release needs to be called by some warp after this one to reset the barrier. +_CG_STATIC_QUALIFIER void sync_warps_arrive(volatile barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) { + unsigned int warp_id = thread_rank / 32; + bool warp_master = (thread_rank % 32 == 0); + unsigned int warp_bit = 1 << warp_id; + unsigned int group_mask = get_group_mask(thread_rank, num_warps); + + __syncwarp(0xFFFFFFFF); + + if (warp_master) { + unsigned int old = atomicOr((unsigned int *)arrived, warp_bit); + } + __syncwarp(0xFFFFFFFF); +} + +// Arrive at my group barrier, but don't block. Last arriving warp immediately releases the barrier. +_CG_STATIC_QUALIFIER void sync_warps_arrive_release(volatile barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) { + unsigned int warp_id = thread_rank / 32; + bool warp_master = (thread_rank % 32 == 0); + unsigned int warp_bit = 1 << warp_id; + unsigned int group_mask = get_group_mask(thread_rank, num_warps); + + __syncwarp(0xFFFFFFFF); + + if (warp_master) { + unsigned int old = atomicOr((unsigned int *)arrived, warp_bit); + if (((old | warp_bit) & group_mask) == group_mask) { + atomicAnd((unsigned int *)arrived, ~group_mask); + } + } + __syncwarp(0xFFFFFFFF); +} + +// Wait for my warp to be released from the barrier. Warp must have arrived first. +_CG_STATIC_QUALIFIER void sync_warps_wait(volatile barrier_t *arrived, unsigned int thread_rank) { + unsigned int warp_id = thread_rank / 32; + unsigned int warp_bit = 1 << warp_id; + + while(*arrived & warp_bit); + __syncwarp(0xFFFFFFFF); +} + +// Arrive at my group barrier and block. Barrier is not released, even if every warp arrives. +// sync_warps_release needs to be called by some warp after this one. +_CG_STATIC_QUALIFIER void sync_warps_wait_for_release( + volatile barrier_t *arrived, + bool is_master, + unsigned int thread_rank, + unsigned int num_warps) { + + unsigned int warp_id = thread_rank / 32; + unsigned int warp_bit = 1 << warp_id; + + __syncwarp(0xFFFFFFFF); + + if (is_master) { + atomicOr((unsigned int *)arrived, warp_bit); + while(*arrived & warp_bit); + } + + __syncwarp(0xFFFFFFFF); +} + +enum wait_for_warps_kind { + wait_for_all_other_warps, + wait_for_specific_warp +}; + +// Wait for a combinantion of warps specified by Kind parameter to arrive at the group barrier. +// This function does not arrive at the barrier. 
+template <wait_for_warps_kind Kind> +_CG_QUALIFIER void sync_warps_wait_for_warps( + unsigned int wait_warp_id, volatile barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps); + +template <> +_CG_QUALIFIER void sync_warps_wait_for_warps<wait_for_all_other_warps>( + unsigned int wait_warp_id, + volatile barrier_t *arrived, + unsigned int thread_rank, + unsigned int num_warps) { + + unsigned int wait_mask = get_group_mask(thread_rank, num_warps); + wait_mask &= ~(1 << wait_warp_id); + while((*arrived & wait_mask) != wait_mask); +} + +template <> +_CG_QUALIFIER void sync_warps_wait_for_warps<wait_for_specific_warp>( + unsigned int wait_warp_id, + volatile barrier_t *arrived, + unsigned int thread_rank, + unsigned int num_warps) { + + unsigned int wait_mask = 1 << wait_warp_id; + while((*arrived & wait_mask) != wait_mask); +} + +} // details + +_CG_END_NAMESPACE + +#endif // _CG_GRID_H diff --git a/ext/cudart/include/cooperative_groups/memcpy_async.h b/ext/cudart/include/cooperative_groups/memcpy_async.h new file mode 100644 index 0000000000000000000000000000000000000000..50b907d9a1fe45cdc411891a20d8fd035118e5be --- /dev/null +++ b/ext/cudart/include/cooperative_groups/memcpy_async.h @@ -0,0 +1,62 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. 
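// [Editor's note] details/sync.h above provides the barrier machinery (sync_grids, sync_warps)
// behind the public group sync() calls. A minimal, hypothetical sketch of the grid-wide case,
// not part of the vendored headers: it assumes a cooperative launch via
// cudaLaunchCooperativeKernel on a device reporting cooperativeLaunch support, with all blocks
// resident; a plain <<<...>>> launch would make grid.sync() invalid.
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

__global__ void two_phase(float* partial, float* result, unsigned int n) {
    cg::grid_group grid = cg::this_grid();
    unsigned long long idx = grid.thread_rank();

    if (idx < n) {
        partial[idx] = 0.5f * (float)idx;   // phase 1: every thread writes its slot
    }

    grid.sync();  // grid-wide barrier: no thread proceeds until all blocks have arrived

    if (idx < n) {
        result[idx] = partial[idx] + partial[n - 1 - idx];  // phase 2 may read any phase-1 write
    }
}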
+ * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _COOPERATIVE_GROUPS_MEMCPY_ASYNC +#define _COOPERATIVE_GROUPS_MEMCPY_ASYNC + +#include "../cooperative_groups.h" +#include "details/info.h" + +#ifdef _CG_CPP11_FEATURES +# include "details/async.h" +#else +# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \ + -std=c++11 compiler option. +#endif + +#endif // _COOPERATIVE_GROUPS_MEMCPY_ASYNC diff --git a/ext/cudart/include/cooperative_groups/reduce.h b/ext/cudart/include/cooperative_groups/reduce.h new file mode 100644 index 0000000000000000000000000000000000000000..3c87d780db0b437f1ae06e0ef8d60137233795c0 --- /dev/null +++ b/ext/cudart/include/cooperative_groups/reduce.h @@ -0,0 +1,63 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
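// [Editor's note] cooperative_groups/memcpy_async.h, which ends just above, only pulls in
// details/async.h; a usage sketch makes the intent clearer. The kernel below is a minimal,
// hypothetical example (not part of the vendored headers): the whole block cooperatively
// stages a tile of global memory into dynamic shared memory and waits for the copy before
// using it. The name staged_sum is an illustrative assumption.
#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>

namespace cg = cooperative_groups;

__global__ void staged_sum(const int* in, int* out, int tile_elems) {
    extern __shared__ int tile[];                  // sized at launch: sizeof(int) * tile_elems
    cg::thread_block block = cg::this_thread_block();

    const int* src = in + blockIdx.x * tile_elems;

    // All threads participate in the asynchronous block-wide copy into shared memory.
    cg::memcpy_async(block, tile, src, sizeof(int) * tile_elems);
    cg::wait(block);                               // block until the staged data has landed

    int tid = threadIdx.x;
    if (tid < tile_elems) {
        atomicAdd(out, tile[tid]);                 // operate on the shared-memory copy
    }
}
// Launched, e.g., as: staged_sum<<<num_tiles, threads, sizeof(int) * tile_elems>>>(in, out, tile_elems);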
+ */ + +#ifndef _COOPERATIVE_GROUPS_REDUCE_H +#define _COOPERATIVE_GROUPS_REDUCE_H + +#include "../cooperative_groups.h" +#include "details/info.h" + +#ifdef _CG_CPP11_FEATURES +# include "details/reduce.h" +#else +# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \ + -std=c++11 compiler option. +#endif + + +#endif //_COOPERATIVE_GROUPS_REDUCE_H diff --git a/ext/cudart/include/cooperative_groups/scan.h b/ext/cudart/include/cooperative_groups/scan.h new file mode 100644 index 0000000000000000000000000000000000000000..9bc27078028318ada00cbcccd052e0d6cc930cfe --- /dev/null +++ b/ext/cudart/include/cooperative_groups/scan.h @@ -0,0 +1,63 @@ +/* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _COOPERATIVE_GROUPS_SCAN_H +#define _COOPERATIVE_GROUPS_SCAN_H + +#include "../cooperative_groups.h" +#include "details/info.h" + +#ifdef _CG_CPP11_FEATURES +# include "details/scan.h" +#else +# error This file requires compiler support for the ISO C++ 2011 standard. 
This support must be enabled with the \ + -std=c++11 compiler option. +#endif + + +#endif //_COOPERATIVE_GROUPS_SCAN_H diff --git a/ext/cudart/include/cuComplex.h b/ext/cudart/include/cuComplex.h new file mode 100644 index 0000000000000000000000000000000000000000..7b167111b0b387a5279da6749d946560e1c42c1b --- /dev/null +++ b/ext/cudart/include/cuComplex.h @@ -0,0 +1,348 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
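// [Editor's note] cooperative_groups/reduce.h and scan.h, which both end just above, are thin
// wrappers around details/reduce.h and details/scan.h. A minimal, hypothetical reduce sketch,
// not part of the vendored headers: each 32-thread tile sums its values with cg::reduce and
// rank 0 folds the partial sum into a global total. The name tile_sum is an illustrative
// assumption.
#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>

namespace cg = cooperative_groups;

__global__ void tile_sum(const float* in, float* total, int n) {
    cg::thread_block block = cg::this_thread_block();
    auto tile = cg::tiled_partition<32>(block);

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    float v = (idx < n) ? in[idx] : 0.0f;

    // Every thread in the tile receives the same reduced value.
    float tile_total = cg::reduce(tile, v, cg::plus<float>());

    if (tile.thread_rank() == 0) {
        atomicAdd(total, tile_total);
    }
}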
+ */ + +#if !defined(CU_COMPLEX_H_) +#define CU_COMPLEX_H_ + +#if !defined(__CUDACC_RTC__) +#if defined(__GNUC__) +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))) +#pragma GCC diagnostic ignored "-Wunused-function" +#endif +#endif +#endif + +/* When trying to include C header file in C++ Code extern "C" is required + * But the Standard QNX headers already have ifdef extern in them when compiling C++ Code + * extern "C" cannot be nested + * Hence keep the header out of extern "C" block + */ + +#if !defined(__CUDACC__) +#include <math.h> /* import fabsf, sqrt */ +#endif /* !defined(__CUDACC__) */ + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +#include "vector_types.h" + +typedef float2 cuFloatComplex; + +__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x) +{ + return x.x; +} + +__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x) +{ + return x.y; +} + +__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex + (float r, float i) +{ + cuFloatComplex res; + res.x = r; + res.y = i; + return res; +} + +__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x) +{ + return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x)); +} +__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x, + cuFloatComplex y) +{ + return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y), + cuCimagf(x) + cuCimagf(y)); +} + +__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x, + cuFloatComplex y) +{ + return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y), + cuCimagf(x) - cuCimagf(y)); +} + +/* This implementation could suffer from intermediate overflow even though + * the final result would be in range. However, various implementations do + * not guard against this (presumably to avoid losing performance), so we + * don't do it either to stay competitive. + */ +__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x, + cuFloatComplex y) +{ + cuFloatComplex prod; + prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) - + (cuCimagf(x) * cuCimagf(y)), + (cuCrealf(x) * cuCimagf(y)) + + (cuCimagf(x) * cuCrealf(y))); + return prod; +} + +/* This implementation guards against intermediate underflow and overflow + * by scaling. Such guarded implementations are usually the default for + * complex library implementations, with some also offering an unguarded, + * faster version. + */ +__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x, + cuFloatComplex y) +{ + cuFloatComplex quot; + float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y)); + float oos = 1.0f / s; + float ars = cuCrealf(x) * oos; + float ais = cuCimagf(x) * oos; + float brs = cuCrealf(y) * oos; + float bis = cuCimagf(y) * oos; + s = (brs * brs) + (bis * bis); + oos = 1.0f / s; + quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos, + ((ais * brs) - (ars * bis)) * oos); + return quot; +} + +/* + * We would like to call hypotf(), but it's not available on all platforms. + * This discrete implementation guards against intermediate underflow and + * overflow by scaling. Otherwise we would lose half the exponent range. + * There are various ways of doing guarded computation. For now chose the + * simplest and fastest solution, however this may suffer from inaccuracies + * if sqrt and division are not IEEE compliant. 
+ */ +__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x) +{ + float a = cuCrealf(x); + float b = cuCimagf(x); + float v, w, t; + a = fabsf(a); + b = fabsf(b); + if (a > b) { + v = a; + w = b; + } else { + v = b; + w = a; + } + t = w / v; + t = 1.0f + t * t; + t = v * sqrtf(t); + if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) { + t = v + w; + } + return t; +} + +/* Double precision */ +typedef double2 cuDoubleComplex; + +__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x) +{ + return x.x; +} + +__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x) +{ + return x.y; +} + +__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex + (double r, double i) +{ + cuDoubleComplex res; + res.x = r; + res.y = i; + return res; +} + +__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x) +{ + return make_cuDoubleComplex (cuCreal(x), -cuCimag(x)); +} + +__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x, + cuDoubleComplex y) +{ + return make_cuDoubleComplex (cuCreal(x) + cuCreal(y), + cuCimag(x) + cuCimag(y)); +} + +__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x, + cuDoubleComplex y) +{ + return make_cuDoubleComplex (cuCreal(x) - cuCreal(y), + cuCimag(x) - cuCimag(y)); +} + +/* This implementation could suffer from intermediate overflow even though + * the final result would be in range. However, various implementations do + * not guard against this (presumably to avoid losing performance), so we + * don't do it either to stay competitive. + */ +__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x, + cuDoubleComplex y) +{ + cuDoubleComplex prod; + prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) - + (cuCimag(x) * cuCimag(y)), + (cuCreal(x) * cuCimag(y)) + + (cuCimag(x) * cuCreal(y))); + return prod; +} + +/* This implementation guards against intermediate underflow and overflow + * by scaling. Such guarded implementations are usually the default for + * complex library implementations, with some also offering an unguarded, + * faster version. + */ +__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x, + cuDoubleComplex y) +{ + cuDoubleComplex quot; + double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y))); + double oos = 1.0 / s; + double ars = cuCreal(x) * oos; + double ais = cuCimag(x) * oos; + double brs = cuCreal(y) * oos; + double bis = cuCimag(y) * oos; + s = (brs * brs) + (bis * bis); + oos = 1.0 / s; + quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos, + ((ais * brs) - (ars * bis)) * oos); + return quot; +} + +/* This implementation guards against intermediate underflow and overflow + * by scaling. Otherwise we would lose half the exponent range. There are + * various ways of doing guarded computation. For now chose the simplest + * and fastest solution, however this may suffer from inaccuracies if sqrt + * and division are not IEEE compliant. 
+ */ +__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x) +{ + double a = cuCreal(x); + double b = cuCimag(x); + double v, w, t; + a = fabs(a); + b = fabs(b); + if (a > b) { + v = a; + w = b; + } else { + v = b; + w = a; + } + t = w / v; + t = 1.0 + t * t; + t = v * sqrt(t); + if ((v == 0.0) || + (v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) { + t = v + w; + } + return t; +} + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +/* aliases */ +typedef cuFloatComplex cuComplex; +__host__ __device__ static __inline__ cuComplex make_cuComplex (float x, + float y) +{ + return make_cuFloatComplex (x, y); +} + +/* float-to-double promotion */ +__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble + (cuFloatComplex c) +{ + return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c)); +} + +__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat +(cuDoubleComplex c) +{ + return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c)); +} + + +__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d) +{ + float real_res; + float imag_res; + + real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d); + imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d); + + real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res; + imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res; + + return make_cuComplex(real_res, imag_res); +} + +__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d) +{ + double real_res; + double imag_res; + + real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d); + imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d); + + real_res = -(cuCimag(x) * cuCimag(y)) + real_res; + imag_res = (cuCimag(x) * cuCreal(y)) + imag_res; + + return make_cuDoubleComplex(real_res, imag_res); +} + +#endif /* !defined(CU_COMPLEX_H_) */ diff --git a/ext/cudart/include/cuda.h b/ext/cudart/include/cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..3d82e525f49cad8776af65024f6bdf140dd91833 --- /dev/null +++ b/ext/cudart/include/cuda.h @@ -0,0 +1,20295 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
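// [Editor's note] cuComplex.h, which ends just above, is a header-only set of __host__ __device__
// helpers, so the same calls work from host code and from kernels. A minimal, hypothetical
// host-side sketch (not part of the vendored headers), to be compiled as a .cu file with nvcc:
#include <cuComplex.h>
#include <stdio.h>

int main(void) {
    cuFloatComplex a = make_cuFloatComplex(3.0f,  4.0f);   // 3 + 4i
    cuFloatComplex b = make_cuFloatComplex(1.0f, -2.0f);   // 1 - 2i

    cuFloatComplex p = cuCmulf(a, b);   // (3 + 4i)(1 - 2i) = 11 - 2i
    cuFloatComplex q = cuCdivf(a, b);   // scaled, overflow-guarded division
    float mag = cuCabsf(a);             // |3 + 4i| = 5, using the guarded magnitude above

    printf("p = %f%+fi  q = %f%+fi  |a| = %f\n",
           cuCrealf(p), cuCimagf(p), cuCrealf(q), cuCimagf(q), mag);
    return 0;
}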
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef __cuda_cuda_h__ +#define __cuda_cuda_h__ + + + +#include <stdlib.h> +#ifdef _MSC_VER +typedef unsigned __int32 cuuint32_t; +typedef unsigned __int64 cuuint64_t; +#else +#include <stdint.h> +typedef uint32_t cuuint32_t; +typedef uint64_t cuuint64_t; +#endif + +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif + +#if defined(CUDA_FORCE_API_VERSION) +#error "CUDA_FORCE_API_VERSION is no longer supported." 
+#endif + +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define __CUDA_API_PER_THREAD_DEFAULT_STREAM + #define __CUDA_API_PTDS(api) api ## _ptds + #define __CUDA_API_PTSZ(api) api ## _ptsz +#else + #define __CUDA_API_PTDS(api) api + #define __CUDA_API_PTSZ(api) api +#endif + +#define cuDeviceTotalMem cuDeviceTotalMem_v2 +#define cuCtxCreate cuCtxCreate_v2 +#define cuCtxCreate_v3 cuCtxCreate_v3 +#define cuModuleGetGlobal cuModuleGetGlobal_v2 +#define cuMemGetInfo cuMemGetInfo_v2 +#define cuMemAlloc cuMemAlloc_v2 +#define cuMemAllocPitch cuMemAllocPitch_v2 +#define cuMemFree cuMemFree_v2 +#define cuMemGetAddressRange cuMemGetAddressRange_v2 +#define cuMemAllocHost cuMemAllocHost_v2 +#define cuMemHostGetDevicePointer cuMemHostGetDevicePointer_v2 +#define cuMemcpyHtoD __CUDA_API_PTDS(cuMemcpyHtoD_v2) +#define cuMemcpyDtoH __CUDA_API_PTDS(cuMemcpyDtoH_v2) +#define cuMemcpyDtoD __CUDA_API_PTDS(cuMemcpyDtoD_v2) +#define cuMemcpyDtoA __CUDA_API_PTDS(cuMemcpyDtoA_v2) +#define cuMemcpyAtoD __CUDA_API_PTDS(cuMemcpyAtoD_v2) +#define cuMemcpyHtoA __CUDA_API_PTDS(cuMemcpyHtoA_v2) +#define cuMemcpyAtoH __CUDA_API_PTDS(cuMemcpyAtoH_v2) +#define cuMemcpyAtoA __CUDA_API_PTDS(cuMemcpyAtoA_v2) +#define cuMemcpyHtoAAsync __CUDA_API_PTSZ(cuMemcpyHtoAAsync_v2) +#define cuMemcpyAtoHAsync __CUDA_API_PTSZ(cuMemcpyAtoHAsync_v2) +#define cuMemcpy2D __CUDA_API_PTDS(cuMemcpy2D_v2) +#define cuMemcpy2DUnaligned __CUDA_API_PTDS(cuMemcpy2DUnaligned_v2) +#define cuMemcpy3D __CUDA_API_PTDS(cuMemcpy3D_v2) +#define cuMemcpyHtoDAsync __CUDA_API_PTSZ(cuMemcpyHtoDAsync_v2) +#define cuMemcpyDtoHAsync __CUDA_API_PTSZ(cuMemcpyDtoHAsync_v2) +#define cuMemcpyDtoDAsync __CUDA_API_PTSZ(cuMemcpyDtoDAsync_v2) +#define cuMemcpy2DAsync __CUDA_API_PTSZ(cuMemcpy2DAsync_v2) +#define cuMemcpy3DAsync __CUDA_API_PTSZ(cuMemcpy3DAsync_v2) +#define cuMemsetD8 __CUDA_API_PTDS(cuMemsetD8_v2) +#define cuMemsetD16 __CUDA_API_PTDS(cuMemsetD16_v2) +#define cuMemsetD32 __CUDA_API_PTDS(cuMemsetD32_v2) +#define cuMemsetD2D8 __CUDA_API_PTDS(cuMemsetD2D8_v2) +#define cuMemsetD2D16 __CUDA_API_PTDS(cuMemsetD2D16_v2) +#define cuMemsetD2D32 __CUDA_API_PTDS(cuMemsetD2D32_v2) +#define cuArrayCreate cuArrayCreate_v2 +#define cuArrayGetDescriptor cuArrayGetDescriptor_v2 +#define cuArray3DCreate cuArray3DCreate_v2 +#define cuArray3DGetDescriptor cuArray3DGetDescriptor_v2 +#define cuTexRefSetAddress cuTexRefSetAddress_v2 +#define cuTexRefGetAddress cuTexRefGetAddress_v2 +#define cuGraphicsResourceGetMappedPointer cuGraphicsResourceGetMappedPointer_v2 +#define cuCtxDestroy cuCtxDestroy_v2 +#define cuCtxPopCurrent cuCtxPopCurrent_v2 +#define cuCtxPushCurrent cuCtxPushCurrent_v2 +#define cuStreamDestroy cuStreamDestroy_v2 +#define cuEventDestroy cuEventDestroy_v2 +#define cuTexRefSetAddress2D cuTexRefSetAddress2D_v3 +#define cuLinkCreate cuLinkCreate_v2 +#define cuLinkAddData cuLinkAddData_v2 +#define cuLinkAddFile cuLinkAddFile_v2 +#define cuMemHostRegister cuMemHostRegister_v2 +#define cuGraphicsResourceSetMapFlags cuGraphicsResourceSetMapFlags_v2 +#define cuStreamBeginCapture __CUDA_API_PTSZ(cuStreamBeginCapture_v2) +#define cuDevicePrimaryCtxRelease cuDevicePrimaryCtxRelease_v2 +#define cuDevicePrimaryCtxReset cuDevicePrimaryCtxReset_v2 +#define cuDevicePrimaryCtxSetFlags cuDevicePrimaryCtxSetFlags_v2 +#define cuDeviceGetUuid_v2 cuDeviceGetUuid_v2 +#define cuIpcOpenMemHandle cuIpcOpenMemHandle_v2 +#define cuGraphInstantiate cuGraphInstantiate_v2 + +#if defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define cuMemcpy 
__CUDA_API_PTDS(cuMemcpy) + #define cuMemcpyAsync __CUDA_API_PTSZ(cuMemcpyAsync) + #define cuMemcpyPeer __CUDA_API_PTDS(cuMemcpyPeer) + #define cuMemcpyPeerAsync __CUDA_API_PTSZ(cuMemcpyPeerAsync) + #define cuMemcpy3DPeer __CUDA_API_PTDS(cuMemcpy3DPeer) + #define cuMemcpy3DPeerAsync __CUDA_API_PTSZ(cuMemcpy3DPeerAsync) + #define cuMemPrefetchAsync __CUDA_API_PTSZ(cuMemPrefetchAsync) + + #define cuMemsetD8Async __CUDA_API_PTSZ(cuMemsetD8Async) + #define cuMemsetD16Async __CUDA_API_PTSZ(cuMemsetD16Async) + #define cuMemsetD32Async __CUDA_API_PTSZ(cuMemsetD32Async) + #define cuMemsetD2D8Async __CUDA_API_PTSZ(cuMemsetD2D8Async) + #define cuMemsetD2D16Async __CUDA_API_PTSZ(cuMemsetD2D16Async) + #define cuMemsetD2D32Async __CUDA_API_PTSZ(cuMemsetD2D32Async) + + #define cuStreamGetPriority __CUDA_API_PTSZ(cuStreamGetPriority) + #define cuStreamGetFlags __CUDA_API_PTSZ(cuStreamGetFlags) + #define cuStreamGetCtx __CUDA_API_PTSZ(cuStreamGetCtx) + #define cuStreamWaitEvent __CUDA_API_PTSZ(cuStreamWaitEvent) + #define cuStreamEndCapture __CUDA_API_PTSZ(cuStreamEndCapture) + #define cuStreamIsCapturing __CUDA_API_PTSZ(cuStreamIsCapturing) + #define cuStreamGetCaptureInfo __CUDA_API_PTSZ(cuStreamGetCaptureInfo) + #define cuStreamGetCaptureInfo_v2 __CUDA_API_PTSZ(cuStreamGetCaptureInfo_v2) + #define cuStreamUpdateCaptureDependencies __CUDA_API_PTSZ(cuStreamUpdateCaptureDependencies) + #define cuStreamAddCallback __CUDA_API_PTSZ(cuStreamAddCallback) + #define cuStreamAttachMemAsync __CUDA_API_PTSZ(cuStreamAttachMemAsync) + #define cuStreamQuery __CUDA_API_PTSZ(cuStreamQuery) + #define cuStreamSynchronize __CUDA_API_PTSZ(cuStreamSynchronize) + #define cuEventRecord __CUDA_API_PTSZ(cuEventRecord) + #define cuEventRecordWithFlags __CUDA_API_PTSZ(cuEventRecordWithFlags) + #define cuLaunchKernel __CUDA_API_PTSZ(cuLaunchKernel) + #define cuLaunchKernelEx __CUDA_API_PTSZ(cuLaunchKernelEx) + #define cuLaunchHostFunc __CUDA_API_PTSZ(cuLaunchHostFunc) + #define cuGraphicsMapResources __CUDA_API_PTSZ(cuGraphicsMapResources) + #define cuGraphicsUnmapResources __CUDA_API_PTSZ(cuGraphicsUnmapResources) + + #define cuStreamWriteValue32 __CUDA_API_PTSZ(cuStreamWriteValue32) + #define cuStreamWaitValue32 __CUDA_API_PTSZ(cuStreamWaitValue32) + #define cuStreamWriteValue64 __CUDA_API_PTSZ(cuStreamWriteValue64) + #define cuStreamWaitValue64 __CUDA_API_PTSZ(cuStreamWaitValue64) + #define cuStreamBatchMemOp __CUDA_API_PTSZ(cuStreamBatchMemOp) + #define cuStreamWriteValue32_v2 __CUDA_API_PTSZ(cuStreamWriteValue32_v2) + #define cuStreamWaitValue32_v2 __CUDA_API_PTSZ(cuStreamWaitValue32_v2) + #define cuStreamWriteValue64_v2 __CUDA_API_PTSZ(cuStreamWriteValue64_v2) + #define cuStreamWaitValue64_v2 __CUDA_API_PTSZ(cuStreamWaitValue64_v2) + #define cuStreamBatchMemOp_v2 __CUDA_API_PTSZ(cuStreamBatchMemOp_v2) + + #define cuLaunchCooperativeKernel __CUDA_API_PTSZ(cuLaunchCooperativeKernel) + + #define cuSignalExternalSemaphoresAsync __CUDA_API_PTSZ(cuSignalExternalSemaphoresAsync) + #define cuWaitExternalSemaphoresAsync __CUDA_API_PTSZ(cuWaitExternalSemaphoresAsync) + + #define cuGraphUpload __CUDA_API_PTSZ(cuGraphUpload) + #define cuGraphLaunch __CUDA_API_PTSZ(cuGraphLaunch) + #define cuStreamCopyAttributes __CUDA_API_PTSZ(cuStreamCopyAttributes) + #define cuStreamGetAttribute __CUDA_API_PTSZ(cuStreamGetAttribute) + #define cuStreamSetAttribute __CUDA_API_PTSZ(cuStreamSetAttribute) + #define cuMemMapArrayAsync __CUDA_API_PTSZ(cuMemMapArrayAsync) + + #define cuMemFreeAsync __CUDA_API_PTSZ(cuMemFreeAsync) + #define cuMemAllocAsync 
__CUDA_API_PTSZ(cuMemAllocAsync) + #define cuMemAllocFromPoolAsync __CUDA_API_PTSZ(cuMemAllocFromPoolAsync) +#endif + +/** + * \file cuda.h + * \brief Header file for the CUDA Toolkit application programming interface. + * + * \file cudaGL.h + * \brief Header file for the OpenGL interoperability functions of the + * low-level CUDA driver application programming interface. + * + * \file cudaD3D9.h + * \brief Header file for the Direct3D 9 interoperability functions of the + * low-level CUDA driver application programming interface. + */ + +/** + * \defgroup CUDA_TYPES Data types used by CUDA driver + * @{ + */ + +/** + * CUDA API version number + */ +#define CUDA_VERSION 11080 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * CUDA device pointer + * CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. + */ +#if defined(_WIN64) || defined(__LP64__) +typedef unsigned long long CUdeviceptr_v2; +#else +typedef unsigned int CUdeviceptr_v2; +#endif +typedef CUdeviceptr_v2 CUdeviceptr; /**< CUDA device pointer */ + +typedef int CUdevice_v1; /**< CUDA device */ +typedef CUdevice_v1 CUdevice; /**< CUDA device */ +typedef struct CUctx_st *CUcontext; /**< CUDA context */ +typedef struct CUmod_st *CUmodule; /**< CUDA module */ +typedef struct CUfunc_st *CUfunction; /**< CUDA function */ +typedef struct CUarray_st *CUarray; /**< CUDA array */ +typedef struct CUmipmappedArray_st *CUmipmappedArray; /**< CUDA mipmapped array */ +typedef struct CUtexref_st *CUtexref; /**< CUDA texture reference */ +typedef struct CUsurfref_st *CUsurfref; /**< CUDA surface reference */ +typedef struct CUevent_st *CUevent; /**< CUDA event */ +typedef struct CUstream_st *CUstream; /**< CUDA stream */ +typedef struct CUgraphicsResource_st *CUgraphicsResource; /**< CUDA graphics interop resource */ +typedef unsigned long long CUtexObject_v1; /**< An opaque value that represents a CUDA texture object */ +typedef CUtexObject_v1 CUtexObject; /**< An opaque value that represents a CUDA texture object */ +typedef unsigned long long CUsurfObject_v1; /**< An opaque value that represents a CUDA surface object */ +typedef CUsurfObject_v1 CUsurfObject; /**< An opaque value that represents a CUDA surface object */ +typedef struct CUextMemory_st *CUexternalMemory; /**< CUDA external memory */ +typedef struct CUextSemaphore_st *CUexternalSemaphore; /**< CUDA external semaphore */ +typedef struct CUgraph_st *CUgraph; /**< CUDA graph */ +typedef struct CUgraphNode_st *CUgraphNode; /**< CUDA graph node */ +typedef struct CUgraphExec_st *CUgraphExec; /**< CUDA executable graph */ +typedef struct CUmemPoolHandle_st *CUmemoryPool; /**< CUDA memory pool */ +typedef struct CUuserObject_st *CUuserObject; /**< CUDA user object for graphs */ + +#ifndef CU_UUID_HAS_BEEN_DEFINED +#define CU_UUID_HAS_BEEN_DEFINED +typedef struct CUuuid_st { /**< CUDA definition of UUID */ + char bytes[16]; +} CUuuid; +#endif + +/** + * CUDA IPC handle size + */ +#define CU_IPC_HANDLE_SIZE 64 + +/** + * CUDA IPC event handle + */ +typedef struct CUipcEventHandle_st { + char reserved[CU_IPC_HANDLE_SIZE]; +} CUipcEventHandle_v1; +typedef CUipcEventHandle_v1 CUipcEventHandle; + +/** + * CUDA IPC mem handle + */ +typedef struct CUipcMemHandle_st { + char reserved[CU_IPC_HANDLE_SIZE]; +} CUipcMemHandle_v1; +typedef CUipcMemHandle_v1 CUipcMemHandle; + +/** + * CUDA Ipc Mem Flags + */ +typedef enum CUipcMem_flags_enum { + CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1 /**< Automatically enable peer access between 
remote devices as needed */ +} CUipcMem_flags; + + +/** + * CUDA Mem Attach Flags + */ +typedef enum CUmemAttach_flags_enum { + CU_MEM_ATTACH_GLOBAL = 0x1, /**< Memory can be accessed by any stream on any device */ + CU_MEM_ATTACH_HOST = 0x2, /**< Memory cannot be accessed by any stream on any device */ + CU_MEM_ATTACH_SINGLE = 0x4 /**< Memory can only be accessed by a single stream on the associated device */ +} CUmemAttach_flags; + +/** + * Context creation flags + */ +typedef enum CUctx_flags_enum { + CU_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + CU_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + CU_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + CU_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ + CU_CTX_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling + * \deprecated This flag was deprecated as of CUDA 4.0 + * and was replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. */ + CU_CTX_SCHED_MASK = 0x07, + CU_CTX_MAP_HOST = 0x08, /**< \deprecated This flag was deprecated as of CUDA 11.0 + * and it no longer has any effect. All contexts + * as of CUDA 3.2 behave as though the flag is enabled. */ + CU_CTX_LMEM_RESIZE_TO_MAX = 0x10, /**< Keep local memory allocation after launch */ + CU_CTX_FLAGS_MASK = 0x1f +} CUctx_flags; + +/** + * Event sched flags + */ +typedef enum CUevent_sched_flags_enum { + CU_EVENT_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + CU_EVENT_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + CU_EVENT_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + CU_EVENT_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ +} CUevent_sched_flags; + +/** + * NVCL event scheduling flags + */ +typedef enum cl_event_flags_enum { + NVCL_EVENT_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + NVCL_EVENT_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + NVCL_EVENT_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + NVCL_EVENT_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ +} cl_event_flags; + +/** + * NVCL context scheduling flags + */ +typedef enum cl_context_flags_enum { + NVCL_CTX_SCHED_AUTO = 0x00, /**< Automatic scheduling */ + NVCL_CTX_SCHED_SPIN = 0x01, /**< Set spin as default scheduling */ + NVCL_CTX_SCHED_YIELD = 0x02, /**< Set yield as default scheduling */ + NVCL_CTX_SCHED_BLOCKING_SYNC = 0x04, /**< Set blocking synchronization as default scheduling */ +} cl_context_flags; + + +/** + * Stream creation flags + */ +typedef enum CUstream_flags_enum { + CU_STREAM_DEFAULT = 0x0, /**< Default stream flag */ + CU_STREAM_NON_BLOCKING = 0x1 /**< Stream does not synchronize with stream 0 (the NULL stream) */ +} CUstream_flags; + +/** + * Legacy stream handle + * + * Stream handle that can be passed as a CUstream to use an implicit stream + * with legacy synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define CU_STREAM_LEGACY ((CUstream)0x1) + +/** + * Per-thread stream handle + * + * Stream handle that can be passed as a CUstream to use an implicit stream + * with per-thread synchronization behavior. 
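// [Editor's note] The stream creation flags and the CU_STREAM_LEGACY / CU_STREAM_PER_THREAD
// pseudo-handles documented just above are easiest to see in a short driver-API sketch. The
// code below is a minimal, hypothetical host-side example (not part of the vendored headers);
// error handling is reduced to a single CHECK macro, which is an illustrative assumption.
#include <cuda.h>
#include <stdio.h>

#define CHECK(call) do { CUresult err_ = (call); if (err_ != CUDA_SUCCESS) { \
    fprintf(stderr, "CUDA driver error %d at %s:%d\n", (int)err_, __FILE__, __LINE__); } } while (0)

int main(void) {
    CUdevice  dev;
    CUcontext ctx;
    CUstream  stream;

    CHECK(cuInit(0));
    CHECK(cuDeviceGet(&dev, 0));
    CHECK(cuCtxCreate(&ctx, 0, dev));

    // CU_STREAM_NON_BLOCKING: work in this stream never synchronizes with the NULL stream.
    CHECK(cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING));

    // ... enqueue work on `stream` here; per the documentation above, CU_STREAM_LEGACY and
    // CU_STREAM_PER_THREAD can be passed as a CUstream to select the implicit default stream
    // with legacy or per-thread synchronization behavior.

    CHECK(cuStreamSynchronize(stream));
    CHECK(cuStreamDestroy(stream));
    CHECK(cuCtxDestroy(ctx));
    return 0;
}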
+ * + * See details of the \link_sync_behavior + */ +#define CU_STREAM_PER_THREAD ((CUstream)0x2) + +/** + * Event creation flags + */ +typedef enum CUevent_flags_enum { + CU_EVENT_DEFAULT = 0x0, /**< Default event flag */ + CU_EVENT_BLOCKING_SYNC = 0x1, /**< Event uses blocking synchronization */ + CU_EVENT_DISABLE_TIMING = 0x2, /**< Event will not record timing data */ + CU_EVENT_INTERPROCESS = 0x4 /**< Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set */ +} CUevent_flags; + +/** + * Event record flags + */ +typedef enum CUevent_record_flags_enum { + CU_EVENT_RECORD_DEFAULT = 0x0, /**< Default event record flag */ + CU_EVENT_RECORD_EXTERNAL = 0x1 /**< When using stream capture, create an event record node + * instead of the default behavior. This flag is invalid + * when used outside of capture. */ +} CUevent_record_flags; + +/** + * Event wait flags + */ +typedef enum CUevent_wait_flags_enum { + CU_EVENT_WAIT_DEFAULT = 0x0, /**< Default event wait flag */ + CU_EVENT_WAIT_EXTERNAL = 0x1 /**< When using stream capture, create an event wait node + * instead of the default behavior. This flag is invalid + * when used outside of capture.*/ +} CUevent_wait_flags; + +/** + * Flags for ::cuStreamWaitValue32 and ::cuStreamWaitValue64 + */ +typedef enum CUstreamWaitValue_flags_enum { + CU_STREAM_WAIT_VALUE_GEQ = 0x0, /**< Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit + values). Note this is a cyclic comparison which ignores wraparound. + (Default behavior.) */ + CU_STREAM_WAIT_VALUE_EQ = 0x1, /**< Wait until *addr == value. */ + CU_STREAM_WAIT_VALUE_AND = 0x2, /**< Wait until (*addr & value) != 0. */ + CU_STREAM_WAIT_VALUE_NOR = 0x3, /**< Wait until ~(*addr | value) != 0. Support for this operation can be + queried with ::cuDeviceGetAttribute() and + ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR.*/ + CU_STREAM_WAIT_VALUE_FLUSH = 1<<30 /**< Follow the wait operation with a flush of outstanding remote writes. This + means that, if a remote write operation is guaranteed to have reached the + device before the wait can be satisfied, that write is guaranteed to be + visible to downstream device work. The device is permitted to reorder + remote writes internally. For example, this flag would be required if + two remote writes arrive in a defined order, the wait is satisfied by the + second write, and downstream work needs to observe the first write. + Support for this operation is restricted to selected platforms and can be + queried with ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES.*/ +} CUstreamWaitValue_flags; + +/** + * Flags for ::cuStreamWriteValue32 + */ +typedef enum CUstreamWriteValue_flags_enum { + CU_STREAM_WRITE_VALUE_DEFAULT = 0x0, /**< Default behavior */ + CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1 /**< Permits the write to be reordered with writes which were issued + before it, as a performance optimization. Normally, + ::cuStreamWriteValue32 will provide a memory fence before the + write, which has similar semantics to + __threadfence_system() but is scoped to the stream + rather than a CUDA thread. + This flag is not supported in the v2 API. 
*/ +} CUstreamWriteValue_flags; + +/** + * Operations for ::cuStreamBatchMemOp + */ +typedef enum CUstreamBatchMemOpType_enum { + CU_STREAM_MEM_OP_WAIT_VALUE_32 = 1, /**< Represents a ::cuStreamWaitValue32 operation */ + CU_STREAM_MEM_OP_WRITE_VALUE_32 = 2, /**< Represents a ::cuStreamWriteValue32 operation */ + CU_STREAM_MEM_OP_WAIT_VALUE_64 = 4, /**< Represents a ::cuStreamWaitValue64 operation */ + CU_STREAM_MEM_OP_WRITE_VALUE_64 = 5, /**< Represents a ::cuStreamWriteValue64 operation */ + CU_STREAM_MEM_OP_BARRIER = 6, /**< Insert a memory barrier of the specified type */ + CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3 /**< This has the same effect as ::CU_STREAM_WAIT_VALUE_FLUSH, but as a + standalone operation. */ +} CUstreamBatchMemOpType; + +/** + * Flags for ::cuStreamMemoryBarrier + */ +typedef enum CUstreamMemoryBarrier_flags_enum { + CU_STREAM_MEMORY_BARRIER_TYPE_SYS = 0x0, /**< System-wide memory barrier. */ + CU_STREAM_MEMORY_BARRIER_TYPE_GPU = 0x1 /**< Limit memory barrier scope to the GPU. */ +} CUstreamMemoryBarrier_flags; + +/** + * Per-operation parameters for ::cuStreamBatchMemOp + */ +typedef union CUstreamBatchMemOpParams_union { + CUstreamBatchMemOpType operation; + struct CUstreamMemOpWaitValueParams_st { + CUstreamBatchMemOpType operation; + CUdeviceptr address; + union { + cuuint32_t value; + cuuint64_t value64; + }; + unsigned int flags; + CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ + } waitValue; + struct CUstreamMemOpWriteValueParams_st { + CUstreamBatchMemOpType operation; + CUdeviceptr address; + union { + cuuint32_t value; + cuuint64_t value64; + }; + unsigned int flags; + CUdeviceptr alias; /**< For driver internal use. Initial value is unimportant. */ + } writeValue; + struct CUstreamMemOpFlushRemoteWritesParams_st { + CUstreamBatchMemOpType operation; + unsigned int flags; + } flushRemoteWrites; + struct CUstreamMemOpMemoryBarrierParams_st { /**< Only supported in the _v2 API */ + CUstreamBatchMemOpType operation; + unsigned int flags; + } memoryBarrier; + cuuint64_t pad[6]; +} CUstreamBatchMemOpParams_v1; +typedef CUstreamBatchMemOpParams_v1 CUstreamBatchMemOpParams; + +typedef struct CUDA_BATCH_MEM_OP_NODE_PARAMS_st { + CUcontext ctx; + unsigned int count; + CUstreamBatchMemOpParams *paramArray; + unsigned int flags; +} CUDA_BATCH_MEM_OP_NODE_PARAMS; + +/** + * Occupancy calculator flag + */ +typedef enum CUoccupancy_flags_enum { + CU_OCCUPANCY_DEFAULT = 0x0, /**< Default behavior */ + CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1 /**< Assume global caching is enabled and cannot be automatically turned off */ +} CUoccupancy_flags; + +/** + * Flags for ::cuStreamUpdateCaptureDependencies + */ +typedef enum CUstreamUpdateCaptureDependencies_flags_enum { + CU_STREAM_ADD_CAPTURE_DEPENDENCIES = 0x0, /**< Add new nodes to the dependency set */ + CU_STREAM_SET_CAPTURE_DEPENDENCIES = 0x1 /**< Replace the dependency set with the new nodes */ +} CUstreamUpdateCaptureDependencies_flags; + +/** + * Array formats + */ +typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, /**< Unsigned 8-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, /**< Unsigned 16-bit integers */ + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, /**< Unsigned 32-bit integers */ + CU_AD_FORMAT_SIGNED_INT8 = 0x08, /**< Signed 8-bit integers */ + CU_AD_FORMAT_SIGNED_INT16 = 0x09, /**< Signed 16-bit integers */ + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */ + CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */ + CU_AD_FORMAT_FLOAT = 
0x20, /**< 32-bit floating point */ + CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */ + CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */ + CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, /**< 1 channel signed 8-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, /**< 2 channel signed 8-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, /**< 4 channel signed 8-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, /**< 1 channel signed 16-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT16X2 = 0xca, /**< 2 channel signed 16-bit normalized integer */ + CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, /**< 4 channel signed 16-bit normalized integer */ + CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */ + CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/ + CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */ + CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/ + CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */ + CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/ + CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */ + CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */ + CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */ + CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */ + CU_AD_FORMAT_BC6H_UF16 = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */ + CU_AD_FORMAT_BC6H_SF16 = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */ + CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */ + CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */ +} CUarray_format; + +/** + * Texture reference addressing modes + */ +typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, /**< Wrapping address mode */ + CU_TR_ADDRESS_MODE_CLAMP = 1, /**< Clamp to edge address mode */ + CU_TR_ADDRESS_MODE_MIRROR = 2, /**< Mirror address mode */ + CU_TR_ADDRESS_MODE_BORDER = 3 /**< Border address mode */ +} CUaddress_mode; + +/** + * Texture reference filtering modes + */ +typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, /**< Point filter mode */ + CU_TR_FILTER_MODE_LINEAR = 1 /**< Linear filter mode */ +} CUfilter_mode; + +/** + * Device properties + */ +typedef enum CUdevice_attribute_enum { + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 
1, /**< Maximum number of threads per block */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2, /**< Maximum block dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3, /**< Maximum block dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4, /**< Maximum block dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5, /**< Maximum grid dimension X */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6, /**< Maximum grid dimension Y */ + CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7, /**< Maximum grid dimension Z */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8, /**< Maximum shared memory available per block in bytes */ + CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = 8, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ + CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10, /**< Warp size in threads */ + CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11, /**< Maximum pitch in bytes allowed by memory copies */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12, /**< Maximum number of 32-bit registers available per block */ + CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = 12, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK */ + CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13, /**< Typical clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, /**< Alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15, /**< Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT. */ + CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16, /**< Number of multiprocessors on device */ + CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17, /**< Specifies whether there is a run time limit on kernels */ + CU_DEVICE_ATTRIBUTE_INTEGRATED = 18, /**< Device is integrated with host memory */ + CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19, /**< Device can map host memory into CUDA address space */ + CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20, /**< Compute mode (See ::CUcomputemode for details) */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21, /**< Maximum 1D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22, /**< Maximum 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23, /**< Maximum 2D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24, /**< Maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25, /**< Maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26, /**< Maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27, /**< Maximum 2D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28, /**< Maximum 2D layered texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29, /**< Maximum layers in a 2D layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = 27, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = 28, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = 29, /**< Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS */ + CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30, /**< Alignment requirement for surfaces */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31, /**< Device can possibly execute multiple kernels concurrently */ + 
CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32, /**< Device has ECC support enabled */ + CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33, /**< PCI bus ID of the device */ + CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34, /**< PCI device ID of the device */ + CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35, /**< Device is using TCC driver model */ + CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36, /**< Peak memory clock frequency in kilohertz */ + CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37, /**< Global memory bus width in bits */ + CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38, /**< Size of L2 cache in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39, /**< Maximum resident threads per multiprocessor */ + CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40, /**< Number of asynchronous engines */ + CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41, /**< Device shares a unified address space with the host */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42, /**< Maximum 1D layered texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43, /**< Maximum layers in a 1D layered texture */ + CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER = 44, /**< Deprecated, do not use. */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45, /**< Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46, /**< Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47, /**< Alternate maximum 3D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48, /**< Alternate maximum 3D texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49, /**< Alternate maximum 3D texture depth */ + CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50, /**< PCI domain ID of the device */ + CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51, /**< Pitch alignment requirement for textures */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52, /**< Maximum cubemap texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53, /**< Maximum cubemap layered texture width/height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54, /**< Maximum layers in a cubemap layered texture */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55, /**< Maximum 1D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56, /**< Maximum 2D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57, /**< Maximum 2D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58, /**< Maximum 3D surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59, /**< Maximum 3D surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60, /**< Maximum 3D surface depth */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61, /**< Maximum 1D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62, /**< Maximum layers in a 1D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63, /**< Maximum 2D layered surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64, /**< Maximum 2D layered surface height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65, /**< Maximum layers in a 2D layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66, /**< Maximum cubemap surface width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67, /**< Maximum cubemap layered surface width */ + 
CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68, /**< Maximum layers in a cubemap layered surface */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69, /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70, /**< Maximum 2D linear texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71, /**< Maximum 2D linear texture height */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72, /**< Maximum 2D linear texture pitch in bytes */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73, /**< Maximum mipmapped 2D texture width */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74, /**< Maximum mipmapped 2D texture height */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, /**< Major compute capability version number */ + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, /**< Minor compute capability version number */ + CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77, /**< Maximum mipmapped 1D texture width */ + CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78, /**< Device supports stream priorities */ + CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79, /**< Device supports caching globals in L1 */ + CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80, /**< Device supports caching locals in L1 */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81, /**< Maximum shared memory available per multiprocessor in bytes */ + CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ + CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83, /**< Device can allocate managed memory on this system */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84, /**< Device is on a multi-GPU board */ + CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85, /**< Unique id for a group of devices on the same multi-GPU board */ + CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86, /**< Link between the device and the host supports native atomic operations (this is a placeholder attribute, and is not supported on any current hardware)*/ + CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ + CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89, /**< Device can coherently access managed memory concurrently with the CPU */ + CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90, /**< Device supports compute preemption. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS = 92, /**< ::cuStreamBatchMemOp and related APIs are supported. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS = 93, /**< 64-bit operations are supported in ::cuStreamBatchMemOp and related APIs. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR = 94, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported. 
*/ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH = 95, /**< Device supports launching cooperative kernels via ::cuLaunchCooperativeKernel */ + CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH = 96, /**< Deprecated, ::cuLaunchCooperativeKernelMultiDevice is deprecated. */ + CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN = 97, /**< Maximum optin shared memory per block */ + CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES = 98, /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. See \ref CUDA_MEMOP for additional details. */ + CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ + CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES = 100, /**< Device accesses pageable memory via the host's page tables. */ + CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST = 101, /**< The host can directly access managed memory on the device without migration. */ + CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = 102, /**< Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED*/ + CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */ + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ + CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */ + CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, /**< Maximum number of blocks per multiprocessor */ + CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, /**< Device supports compression of memory */ + CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, /**< Maximum L2 persisting lines capacity setting in bytes. */ + CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE = 109, /**< Maximum value of CUaccessPolicyWindow::num_bytes. 
*/ + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */ + CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, /**< Shared memory reserved by CUDA driver per block in bytes */ + CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */ + CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, /**< Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */ + CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, /**< External timeline semaphore interop is supported on the device */ + CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */ + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */ + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS = 117, /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum */ + CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING = 118, /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. */ + CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES = 119, /**< Handle types supported with mempool based IPC */ + CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH = 120, /**< Indicates device supports cluster launch */ + CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED = 121, /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */ + CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2 = 122, /**< 64-bit operations are supported in ::cuStreamBatchMemOp_v2 and related v2 MemOp APIs. */ + CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2 = 123, /**< ::CU_STREAM_WAIT_VALUE_NOR is supported by v2 MemOp APIs. */ + CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED = 124, /**< Device supports buffer sharing with dma_buf mechanism. 
*/ + CU_DEVICE_ATTRIBUTE_MAX +} CUdevice_attribute; + +/** + * Legacy device properties + */ +typedef struct CUdevprop_st { + int maxThreadsPerBlock; /**< Maximum number of threads per block */ + int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ + int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ + int sharedMemPerBlock; /**< Shared memory available per block in bytes */ + int totalConstantMemory; /**< Constant memory available on device in bytes */ + int SIMDWidth; /**< Warp size in threads */ + int memPitch; /**< Maximum pitch in bytes allowed by memory copies */ + int regsPerBlock; /**< 32-bit registers available per block */ + int clockRate; /**< Clock frequency in kilohertz */ + int textureAlign; /**< Alignment requirement for textures */ +} CUdevprop_v1; +typedef CUdevprop_v1 CUdevprop; + +/** + * Pointer information + */ +typedef enum CUpointer_attribute_enum { + CU_POINTER_ATTRIBUTE_CONTEXT = 1, /**< The ::CUcontext on which a pointer was allocated or registered */ + CU_POINTER_ATTRIBUTE_MEMORY_TYPE = 2, /**< The ::CUmemorytype describing the physical location of a pointer */ + CU_POINTER_ATTRIBUTE_DEVICE_POINTER = 3, /**< The address at which a pointer's memory may be accessed on the device */ + CU_POINTER_ATTRIBUTE_HOST_POINTER = 4, /**< The address at which a pointer's memory may be accessed on the host */ + CU_POINTER_ATTRIBUTE_P2P_TOKENS = 5, /**< A pair of tokens for use with the nv-p2p.h Linux kernel interface */ + CU_POINTER_ATTRIBUTE_SYNC_MEMOPS = 6, /**< Synchronize every synchronous memory operation initiated on this region */ + CU_POINTER_ATTRIBUTE_BUFFER_ID = 7, /**< A process-wide unique ID for an allocated memory region*/ + CU_POINTER_ATTRIBUTE_IS_MANAGED = 8, /**< Indicates if the pointer points to managed memory */ + CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL = 9, /**< A device ordinal of a device on which a pointer was allocated or registered */ + CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = 10, /**< 1 if this pointer maps to an allocation that is suitable for ::cudaIpcGetMemHandle, 0 otherwise **/ + CU_POINTER_ATTRIBUTE_RANGE_START_ADDR = 11, /**< Starting address for this requested pointer */ + CU_POINTER_ATTRIBUTE_RANGE_SIZE = 12, /**< Size of the address range for this requested pointer */ + CU_POINTER_ATTRIBUTE_MAPPED = 13, /**< 1 if this pointer is in a valid address range that is mapped to a backing allocation, 0 otherwise **/ + CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES = 14, /**< Bitmask of allowed ::CUmemAllocationHandleType for this allocation **/ + CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE = 15, /**< 1 if the memory this pointer is referencing can be used with the GPUDirect RDMA API **/ + CU_POINTER_ATTRIBUTE_ACCESS_FLAGS = 16, /**< Returns the access flags the device associated with the current context has on the corresponding memory referenced by the pointer given */ + CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE = 17 /**< Returns the mempool handle for the allocation if it was allocated from a mempool. Otherwise returns NULL. 
**/ + , + CU_POINTER_ATTRIBUTE_MAPPING_SIZE = 18, /**< Size of the actual underlying mapping that the pointer belongs to **/ + CU_POINTER_ATTRIBUTE_MAPPING_BASE_ADDR = 19, /**< The start address of the mapping that the pointer belongs to **/ + CU_POINTER_ATTRIBUTE_MEMORY_BLOCK_ID = 20 /**< A process-wide unique id corresponding to the physical allocation the pointer belongs to **/ +} CUpointer_attribute; + +/** + * Function properties + */ +typedef enum CUfunction_attribute_enum { + /** + * The maximum number of threads per block, beyond which a launch of the + * function would fail. This number depends on both the function and the + * device on which the function is currently loaded. + */ + CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + + /** + * The size in bytes of statically-allocated shared memory required by + * this function. This does not include dynamically-allocated shared + * memory requested by the user at runtime. + */ + CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + + /** + * The size in bytes of user-allocated constant memory required by this + * function. + */ + CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + + /** + * The size in bytes of local memory used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + + /** + * The number of registers used by each thread of this function. + */ + CU_FUNC_ATTRIBUTE_NUM_REGS = 4, + + /** + * The PTX virtual architecture version for which the function was + * compiled. This value is the major PTX version * 10 + the minor PTX + * version, so a PTX version 1.3 function would return the value 13. + * Note that this may return the undefined value of 0 for cubins + * compiled prior to CUDA 3.0. + */ + CU_FUNC_ATTRIBUTE_PTX_VERSION = 5, + + /** + * The binary architecture version for which the function was compiled. + * This value is the major binary version * 10 + the minor binary version, + * so a binary version 1.3 function would return the value 13. Note that + * this will return a value of 10 for legacy cubins that do not have a + * properly-encoded binary architecture version. + */ + CU_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + + /** + * The attribute to indicate whether the function has been compiled with + * user specified option "-Xptxas --dlcm=ca" set . + */ + CU_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, + + /** + * The maximum size in bytes of dynamically-allocated shared memory that can be used by + * this function. If the user-specified dynamic shared memory size is larger than this + * value, the launch will fail. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, + + /** + * On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total shared memory. + * Refer to ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, + + /** + * If this attribute is set, the kernel must launch with a valid cluster + * size specified. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET = 10, + + /** + * The required cluster width in blocks. The values must either all be 0 or + * all be positive. The validity of the cluster dimensions is otherwise + * checked at launch time. + * + * If the value is set during compile time, it cannot be set at runtime. 
+ * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH = 11, + + /** + * The required cluster height in blocks. The values must either all be 0 or + * all be positive. The validity of the cluster dimensions is otherwise + * checked at launch time. + * + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT = 12, + + /** + * The required cluster depth in blocks. The values must either all be 0 or + * all be positive. The validity of the cluster dimensions is otherwise + * checked at launch time. + * + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH = 13, + + /** + * Whether the function can be launched with non-portable cluster size. 1 is + * allowed, 0 is disallowed. A non-portable cluster size may only function + * on the specific SKUs the program is tested on. The launch might fail if + * the program is run on a different hardware platform. + * + * CUDA API provides cudaOccupancyMaxActiveClusters to assist with checking + * whether the desired size can be launched on the current device. + * + * Portable Cluster Size + * + * A portable cluster size is guaranteed to be functional on all compute + * capabilities higher than the target compute capability. The portable + * cluster size for sm_90 is 8 blocks per cluster. This value may increase + * for future compute capabilities. + * + * The specific hardware unit may support higher cluster sizes that’s not + * guaranteed to be portable. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED = 14, + + /** + * The block scheduling policy of a function. The value type is + * CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy. + * See ::cuFuncSetAttribute + */ + CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 15, + + CU_FUNC_ATTRIBUTE_MAX +} CUfunction_attribute; + +/** + * Function cache configurations + */ +typedef enum CUfunc_cache_enum { + CU_FUNC_CACHE_PREFER_NONE = 0x00, /**< no preference for shared memory or L1 (default) */ + CU_FUNC_CACHE_PREFER_SHARED = 0x01, /**< prefer larger shared memory and smaller L1 cache */ + CU_FUNC_CACHE_PREFER_L1 = 0x02, /**< prefer larger L1 cache and smaller shared memory */ + CU_FUNC_CACHE_PREFER_EQUAL = 0x03 /**< prefer equal sized L1 cache and shared memory */ +} CUfunc_cache; + +/** + * Shared memory configurations + */ +typedef enum CUsharedconfig_enum { + CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE = 0x00, /**< set default shared memory bank size */ + CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE = 0x01, /**< set shared memory bank width to four bytes */ + CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE = 0x02 /**< set shared memory bank width to eight bytes */ +} CUsharedconfig; + +/** + * Shared memory carveout configurations. 
These may be passed to ::cuFuncSetAttribute + */ +typedef enum CUshared_carveout_enum { + CU_SHAREDMEM_CARVEOUT_DEFAULT = -1, /**< No preference for shared memory or L1 (default) */ + CU_SHAREDMEM_CARVEOUT_MAX_SHARED = 100, /**< Prefer maximum available shared memory, minimum L1 cache */ + CU_SHAREDMEM_CARVEOUT_MAX_L1 = 0 /**< Prefer maximum available L1 cache, minimum shared memory */ +} CUshared_carveout; + +/** + * Memory types + */ +typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, /**< Host memory */ + CU_MEMORYTYPE_DEVICE = 0x02, /**< Device memory */ + CU_MEMORYTYPE_ARRAY = 0x03, /**< Array memory */ + CU_MEMORYTYPE_UNIFIED = 0x04 /**< Unified device or host memory */ +} CUmemorytype; + +/** + * Compute Modes + */ +typedef enum CUcomputemode_enum { + CU_COMPUTEMODE_DEFAULT = 0, /**< Default compute mode (Multiple contexts allowed per device) */ + CU_COMPUTEMODE_PROHIBITED = 2, /**< Compute-prohibited mode (No contexts can be created on this device at this time) */ + CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3 /**< Compute-exclusive-process mode (Only one context used by a single process can be present on this device at a time) */ +} CUcomputemode; + +/** + * Memory advise values + */ +typedef enum CUmem_advise_enum { + CU_MEM_ADVISE_SET_READ_MOSTLY = 1, /**< Data will mostly be read and only occassionally be written to */ + CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */ + CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3, /**< Set the preferred location for the data as the specified device */ + CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data */ + CU_MEM_ADVISE_SET_ACCESSED_BY = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */ + CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6 /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */ +} CUmem_advise; + +typedef enum CUmem_range_attribute_enum { + CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1, /**< Whether the range will mostly be read and only occassionally be written to */ + CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2, /**< The preferred location of the range */ + CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */ + CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4 /**< The last location to which the range was prefetched */ +} CUmem_range_attribute; + +/** + * Online compiler and linker options + */ +typedef enum CUjit_option_enum +{ + /** + * Max number of registers that a thread may use.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_MAX_REGISTERS = 0, + + /** + * IN: Specifies minimum number of threads per block to target compilation + * for\n + * OUT: Returns the number of threads the compiler actually targeted. + * This restricts the resource utilization fo the compiler (e.g. max + * registers) such that a block with the given number of threads should be + * able to launch based on register limitations. 
Note, this option does not + * currently take into account any other resource limitations, such as + * shared memory utilization.\n + * Cannot be combined with ::CU_JIT_TARGET.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_THREADS_PER_BLOCK, + + /** + * Overwrites the option value with the total wall clock time, in + * milliseconds, spent in the compiler and linker\n + * Option type: float\n + * Applies to: compiler and linker + */ + CU_JIT_WALL_TIME, + + /** + * Pointer to a buffer in which to print any log messages + * that are informational in nature (the buffer size is specified via + * option ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + + /** + * Pointer to a buffer in which to print any log messages that + * reflect errors (the buffer size is specified via option + * ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES)\n + * Option type: char *\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER, + + /** + * IN: Log buffer size in bytes. Log messages will be capped at this size + * (including null terminator)\n + * OUT: Amount of log buffer filled with messages\n + * Option type: unsigned int\n + * Applies to: compiler and linker + */ + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + + /** + * Level of optimizations to apply to generated code (0 - 4), with 4 + * being the default and highest level of optimizations.\n + * Option type: unsigned int\n + * Applies to: compiler only + */ + CU_JIT_OPTIMIZATION_LEVEL, + + /** + * No option value required. Determines the target based on the current + * attached context (default)\n + * Option type: No option value needed\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET_FROM_CUCONTEXT, + + /** + * Target is chosen based on supplied ::CUjit_target. Cannot be + * combined with ::CU_JIT_THREADS_PER_BLOCK.\n + * Option type: unsigned int for enumerated type ::CUjit_target\n + * Applies to: compiler and linker + */ + CU_JIT_TARGET, + + /** + * Specifies choice of fallback strategy if matching cubin is not found. + * Choice is based on supplied ::CUjit_fallback. This option cannot be + * used with cuLink* APIs as the linker requires exact matches.\n + * Option type: unsigned int for enumerated type ::CUjit_fallback\n + * Applies to: compiler only + */ + CU_JIT_FALLBACK_STRATEGY, + + /** + * Specifies whether to create debug information in output (-g) + * (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_GENERATE_DEBUG_INFO, + + /** + * Generate verbose log messages (0: false, default)\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_LOG_VERBOSE, + + /** + * Generate line number information (-lineinfo) (0: false, default)\n + * Option type: int\n + * Applies to: compiler only + */ + CU_JIT_GENERATE_LINE_INFO, + + /** + * Specifies whether to enable caching explicitly (-dlcm) \n + * Choice is based on supplied ::CUjit_cacheMode_enum.\n + * Option type: unsigned int for enumerated type ::CUjit_cacheMode_enum\n + * Applies to: compiler only + */ + CU_JIT_CACHE_MODE, + + /** + * \deprecated + * This jit option is deprecated and should not be used. 
+ */ + CU_JIT_NEW_SM3X_OPT, + + /** + * This jit option is used for internal purpose only. + */ + CU_JIT_FAST_COMPILE, + + /** + * Array of device symbol names that will be relocated to the corresponing + * host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n + * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n + * When loding a device module, driver will relocate all encountered + * unresolved symbols to the host addresses.\n + * It is only allowed to register symbols that correspond to unresolved + * global variables.\n + * It is illegal to register the same device symbol at multiple addresses.\n + * Option type: const char **\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_NAMES, + + /** + * Array of host addresses that will be used to relocate corresponding + * device symbols stored in ::CU_JIT_GLOBAL_SYMBOL_NAMES.\n + * Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n + * Option type: void **\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_ADDRESSES, + + /** + * Number of entries in ::CU_JIT_GLOBAL_SYMBOL_NAMES and + * ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES arrays.\n + * Option type: unsigned int\n + * Applies to: dynamic linker only + */ + CU_JIT_GLOBAL_SYMBOL_COUNT, + + /** + * Enable link-time optimization (-dlto) for device code (Disabled by default).\n + * This option is not supported on 32-bit platforms.\n + * Option type: int\n + * Applies to: compiler and linker + */ + CU_JIT_LTO, + + /** + * Control single-precision denormals (-ftz) support (0: false, default). + * 1 : flushes denormal values to zero + * 0 : preserves denormal values + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + */ + CU_JIT_FTZ, + + /** + * Control single-precision floating-point division and reciprocals + * (-prec-div) support (1: true, default). + * 1 : Enables the IEEE round-to-nearest mode + * 0 : Enables the fast approximation mode + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + */ + CU_JIT_PREC_DIV, + + /** + * Control single-precision floating-point square root + * (-prec-sqrt) support (1: true, default). + * 1 : Enables the IEEE round-to-nearest mode + * 0 : Enables the fast approximation mode + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + */ + CU_JIT_PREC_SQRT, + + /** + * Enable/Disable the contraction of floating-point multiplies + * and adds/subtracts into floating-point multiply-add (-fma) + * operations (1: Enable, default; 0: Disable). + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + */ + CU_JIT_FMA, + + /** + * Array of kernel names that should be preserved at link time while others + * can be removed.\n + * Must contain ::CU_JIT_REFERENCED_KERNEL_COUNT entries.\n + * Note that kernel names can be mangled by the compiler in which case the + * mangled name needs to be specified.\n + * Wildcard "*" can be used to represent zero or more characters instead of + * specifying the full or mangled name.\n + * It is important to note that the wildcard "*" is also added implicitly. + * For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and + * thus preserve all kernels with those names. 
This can be avoided by providing + * a more specific name like "barfoobaz".\n + * Option type: const char **\n + * Applies to: dynamic linker only + */ + CU_JIT_REFERENCED_KERNEL_NAMES, + + /** + * Number of entries in ::CU_JIT_REFERENCED_KERNEL_NAMES array.\n + * Option type: unsigned int\n + * Applies to: dynamic linker only + */ + CU_JIT_REFERENCED_KERNEL_COUNT, + + /** + * Array of variable names (__device__ and/or __constant__) that should be + * preserved at link time while others can be removed.\n + * Must contain ::CU_JIT_REFERENCED_VARIABLE_COUNT entries.\n + * Note that variable names can be mangled by the compiler in which case the + * mangled name needs to be specified.\n + * Wildcard "*" can be used to represent zero or more characters instead of + * specifying the full or mangled name.\n + * It is important to note that the wildcard "*" is also added implicitly. + * For example, specifying "foo" will match "foobaz", "barfoo", "barfoobaz" and + * thus preserve all variables with those names. This can be avoided by providing + * a more specific name like "barfoobaz".\n + * Option type: const char **\n + * Applies to: link-time optimization specified with CU_JIT_LTO + */ + CU_JIT_REFERENCED_VARIABLE_NAMES, + + /** + * Number of entries in ::CU_JIT_REFERENCED_VARIABLE_NAMES array.\n + * Option type: unsigned int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + */ + CU_JIT_REFERENCED_VARIABLE_COUNT, + + /** + * This option serves as a hint to enable the JIT compiler/linker + * to remove constant (__constant__) and device (__device__) variables + * unreferenced in device code (Disabled by default).\n + * Note that host references to constant and device variables using APIs like + * ::cuModuleGetGlobal() with this option specified may result in undefined behavior unless + * the variables are explicitly specified using ::CU_JIT_REFERENCED_VARIABLE_NAMES.\n + * Option type: int\n + * Applies to: link-time optimization specified with CU_JIT_LTO + */ + CU_JIT_OPTIMIZE_UNUSED_DEVICE_VARIABLES, + + CU_JIT_NUM_OPTIONS + +} CUjit_option; + +/** + * Online compilation targets + */ +typedef enum CUjit_target_enum +{ + CU_TARGET_COMPUTE_20 = 20, /**< Compute device class 2.0 */ + CU_TARGET_COMPUTE_21 = 21, /**< Compute device class 2.1 */ + CU_TARGET_COMPUTE_30 = 30, /**< Compute device class 3.0 */ + CU_TARGET_COMPUTE_32 = 32, /**< Compute device class 3.2 */ + CU_TARGET_COMPUTE_35 = 35, /**< Compute device class 3.5 */ + CU_TARGET_COMPUTE_37 = 37, /**< Compute device class 3.7 */ + CU_TARGET_COMPUTE_50 = 50, /**< Compute device class 5.0 */ + CU_TARGET_COMPUTE_52 = 52, /**< Compute device class 5.2 */ + CU_TARGET_COMPUTE_53 = 53, /**< Compute device class 5.3 */ + CU_TARGET_COMPUTE_60 = 60, /**< Compute device class 6.0.*/ + CU_TARGET_COMPUTE_61 = 61, /**< Compute device class 6.1.*/ + CU_TARGET_COMPUTE_62 = 62, /**< Compute device class 6.2.*/ + CU_TARGET_COMPUTE_70 = 70, /**< Compute device class 7.0.*/ + CU_TARGET_COMPUTE_72 = 72, /**< Compute device class 7.2.*/ + CU_TARGET_COMPUTE_75 = 75, /**< Compute device class 7.5.*/ + CU_TARGET_COMPUTE_80 = 80, /**< Compute device class 8.0.*/ + CU_TARGET_COMPUTE_86 = 86, /**< Compute device class 8.6.*/ + CU_TARGET_COMPUTE_87 = 87, /**< Compute device class 8.7.*/ + CU_TARGET_COMPUTE_89 = 89, /**< Compute device class 8.9.*/ + CU_TARGET_COMPUTE_90 = 90 /**< Compute device class 9.0.*/ +} CUjit_target; + +/** + * Cubin matching fallback strategies + */ +typedef enum CUjit_fallback_enum +{ + CU_PREFER_PTX = 0, /**< Prefer to 
compile ptx if exact binary match not found */ + + CU_PREFER_BINARY /**< Prefer to fall back to compatible binary code if exact match not found */ + +} CUjit_fallback; + +/** + * Caching modes for dlcm + */ +typedef enum CUjit_cacheMode_enum +{ + CU_JIT_CACHE_OPTION_NONE = 0, /**< Compile with no -dlcm flag specified */ + CU_JIT_CACHE_OPTION_CG, /**< Compile with L1 cache disabled */ + CU_JIT_CACHE_OPTION_CA /**< Compile with L1 cache enabled */ +} CUjit_cacheMode; + +/** + * Device code formats + */ +typedef enum CUjitInputType_enum +{ + /** + * Compiled device-class-specific device code\n + * Applicable options: none + */ + CU_JIT_INPUT_CUBIN = 0, + + /** + * PTX source code\n + * Applicable options: PTX compiler options + */ + CU_JIT_INPUT_PTX, + + /** + * Bundle of multiple cubins and/or PTX of some device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_FATBINARY, + + /** + * Host object with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_OBJECT, + + /** + * Archive of host objects with embedded device code\n + * Applicable options: PTX compiler options, ::CU_JIT_FALLBACK_STRATEGY + */ + CU_JIT_INPUT_LIBRARY, + + /** + * High-level intermediate code for link-time optimization\n + * Applicable options: NVVM compiler options, PTX compiler options + */ + CU_JIT_INPUT_NVVM, + + CU_JIT_NUM_INPUT_TYPES +} CUjitInputType; + +typedef struct CUlinkState_st *CUlinkState; + +/** + * Flags to register a graphics resource + */ +typedef enum CUgraphicsRegisterFlags_enum { + CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00, + CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02, + CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04, + CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08 +} CUgraphicsRegisterFlags; + +/** + * Flags for mapping and unmapping interop resources + */ +typedef enum CUgraphicsMapResourceFlags_enum { + CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02 +} CUgraphicsMapResourceFlags; + +/** + * Array indices for cube faces + */ +typedef enum CUarray_cubemap_face_enum { + CU_CUBEMAP_FACE_POSITIVE_X = 0x00, /**< Positive X face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_X = 0x01, /**< Negative X face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Y = 0x02, /**< Positive Y face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Y = 0x03, /**< Negative Y face of cubemap */ + CU_CUBEMAP_FACE_POSITIVE_Z = 0x04, /**< Positive Z face of cubemap */ + CU_CUBEMAP_FACE_NEGATIVE_Z = 0x05 /**< Negative Z face of cubemap */ +} CUarray_cubemap_face; + +/** + * Limits + */ +typedef enum CUlimit_enum { + CU_LIMIT_STACK_SIZE = 0x00, /**< GPU thread stack size */ + CU_LIMIT_PRINTF_FIFO_SIZE = 0x01, /**< GPU printf FIFO size */ + CU_LIMIT_MALLOC_HEAP_SIZE = 0x02, /**< GPU malloc heap size */ + CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03, /**< GPU device runtime launch synchronize depth */ + CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04, /**< GPU device runtime pending launch count */ + CU_LIMIT_MAX_L2_FETCH_GRANULARITY = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). 
This is a hint */ + CU_LIMIT_PERSISTING_L2_CACHE_SIZE = 0x06, /**< A size in bytes for L2 persisting lines cache size */ + CU_LIMIT_MAX +} CUlimit; + +/** + * Resource types + */ +typedef enum CUresourcetype_enum { + CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */ + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */ + CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */ + CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */ +} CUresourcetype; + +#ifdef _WIN32 +#define CUDA_CB __stdcall +#else +#define CUDA_CB +#endif + +/** + * CUDA host function + * \param userData Argument value passed to the function + */ +typedef void (CUDA_CB *CUhostFn)(void *userData); + +/** + * Specifies performance hint with ::CUaccessPolicyWindow for hitProp and missProp members. + */ +typedef enum CUaccessProperty_enum { + CU_ACCESS_PROPERTY_NORMAL = 0, /**< Normal cache persistence. */ + CU_ACCESS_PROPERTY_STREAMING = 1, /**< Streaming access is less likely to persit from cache. */ + CU_ACCESS_PROPERTY_PERSISTING = 2 /**< Persisting access is more likely to persist in cache.*/ +} CUaccessProperty; + +/** + * Specifies an access policy for a window, a contiguous extent of memory + * beginning at base_ptr and ending at base_ptr + num_bytes. + * num_bytes is limited by CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE. + * Partition into many segments and assign segments such that: + * sum of "hit segments" / window == approx. ratio. + * sum of "miss segments" / window == approx 1-ratio. + * Segments and ratio specifications are fitted to the capabilities of + * the architecture. + * Accesses in a hit segment apply the hitProp access policy. + * Accesses in a miss segment apply the missProp access policy. + */ +typedef struct CUaccessPolicyWindow_st { + void *base_ptr; /**< Starting address of the access policy window. CUDA driver may align it. */ + size_t num_bytes; /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */ + float hitRatio; /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */ + CUaccessProperty hitProp; /**< ::CUaccessProperty set for hit. */ + CUaccessProperty missProp; /**< ::CUaccessProperty set for miss. Must be either NORMAL or STREAMING */ +} CUaccessPolicyWindow_v1; +typedef CUaccessPolicyWindow_v1 CUaccessPolicyWindow; + +/** + * GPU kernel node parameters + */ +typedef struct CUDA_KERNEL_NODE_PARAMS_st { + CUfunction func; /**< Kernel to launch */ + unsigned int gridDimX; /**< Width of grid in blocks */ + unsigned int gridDimY; /**< Height of grid in blocks */ + unsigned int gridDimZ; /**< Depth of grid in blocks */ + unsigned int blockDimX; /**< X dimension of each thread block */ + unsigned int blockDimY; /**< Y dimension of each thread block */ + unsigned int blockDimZ; /**< Z dimension of each thread block */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + void **kernelParams; /**< Array of pointers to kernel parameters */ + void **extra; /**< Extra options */ +} CUDA_KERNEL_NODE_PARAMS_v1; +typedef CUDA_KERNEL_NODE_PARAMS_v1 CUDA_KERNEL_NODE_PARAMS; + +/** + * Memset node parameters + */ +typedef struct CUDA_MEMSET_NODE_PARAMS_st { + CUdeviceptr dst; /**< Destination device pointer */ + size_t pitch; /**< Pitch of destination device pointer. Unused if height is 1 */ + unsigned int value; /**< Value to be set */ + unsigned int elementSize; /**< Size of each element in bytes. Must be 1, 2, or 4. 
*/ + size_t width; /**< Width of the row in elements */ + size_t height; /**< Number of rows */ +} CUDA_MEMSET_NODE_PARAMS_v1; +typedef CUDA_MEMSET_NODE_PARAMS_v1 CUDA_MEMSET_NODE_PARAMS; + +/** + * Host node parameters + */ +typedef struct CUDA_HOST_NODE_PARAMS_st { + CUhostFn fn; /**< The function to call when the node executes */ + void* userData; /**< Argument to pass to the function */ +} CUDA_HOST_NODE_PARAMS_v1; +typedef CUDA_HOST_NODE_PARAMS_v1 CUDA_HOST_NODE_PARAMS; + +/** + * Graph node types + */ +typedef enum CUgraphNodeType_enum { + CU_GRAPH_NODE_TYPE_KERNEL = 0, /**< GPU kernel node */ + CU_GRAPH_NODE_TYPE_MEMCPY = 1, /**< Memcpy node */ + CU_GRAPH_NODE_TYPE_MEMSET = 2, /**< Memset node */ + CU_GRAPH_NODE_TYPE_HOST = 3, /**< Host (executable) node */ + CU_GRAPH_NODE_TYPE_GRAPH = 4, /**< Node which executes an embedded graph */ + CU_GRAPH_NODE_TYPE_EMPTY = 5, /**< Empty (no-op) node */ + CU_GRAPH_NODE_TYPE_WAIT_EVENT = 6, /**< External event wait node */ + CU_GRAPH_NODE_TYPE_EVENT_RECORD = 7, /**< External event record node */ + CU_GRAPH_NODE_TYPE_EXT_SEMAS_SIGNAL = 8, /**< External semaphore signal node */ + CU_GRAPH_NODE_TYPE_EXT_SEMAS_WAIT = 9, /**< External semaphore wait node */ + CU_GRAPH_NODE_TYPE_MEM_ALLOC = 10,/**< Memory Allocation Node */ + CU_GRAPH_NODE_TYPE_MEM_FREE = 11 /**< Memory Free Node */ + , + CU_GRAPH_NODE_TYPE_BATCH_MEM_OP = 12 /**< Batch MemOp Node */ +} CUgraphNodeType; + +typedef enum CUsynchronizationPolicy_enum { + CU_SYNC_POLICY_AUTO = 1, + CU_SYNC_POLICY_SPIN = 2, + CU_SYNC_POLICY_YIELD = 3, + CU_SYNC_POLICY_BLOCKING_SYNC = 4 +} CUsynchronizationPolicy; + +/** + * Cluster scheduling policies. These may be passed to ::cuFuncSetAttribute + */ +typedef enum CUclusterSchedulingPolicy_enum { + CU_CLUSTER_SCHEDULING_POLICY_DEFAULT = 0, /**< the default policy */ + CU_CLUSTER_SCHEDULING_POLICY_SPREAD = 1, /**< spread the blocks within a cluster to the SMs */ + CU_CLUSTER_SCHEDULING_POLICY_LOAD_BALANCING = 2 /**< allow the hardware to load-balance the blocks in a cluster to the SMs */ +} CUclusterSchedulingPolicy; + +typedef enum CUlaunchAttributeID_enum { + CU_LAUNCH_ATTRIBUTE_IGNORE = 0 /**< Ignored entry, for convenient composition */ + , CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1 /**< Valid for streams, graph nodes, launches. */ + , CU_LAUNCH_ATTRIBUTE_COOPERATIVE = 2 /**< Valid for graph nodes, launches. */ + , CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3 /**< Valid for streams. */ + , CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION = 4 /**< Valid for graph nodes, launches. */ + , CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5 /**< Valid for graph nodes, launches. */ + , CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION = 6 /**< Valid for launches. Setting + programmaticStreamSerializationAllowed to non-0 + signals that the kernel will use programmatic + means to resolve its stream dependency, so that + the CUDA runtime should opportunistically allow + the grid's execution to overlap with the previous + kernel in the stream, if that kernel requests the + overlap. */ + , CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT = 7 /**< Valid for launches. Event recorded through this + launch attribute is guaranteed to only trigger + after all block in the associated kernel trigger + the event. A block can trigger the event through + PTX griddepcontrol.launch_dependents. A trigger + can also be inserted at the beginning of each + block's execution if triggerAtBlockStart is set to + non-0. 
Note that dependents (including the CPU + thread calling cuEventSynchronize()) are not + guaranteed to observe the release precisely when + it is released. For example, cuEventSynchronize() + may only observe the event trigger long after the + associated kernel has completed. This recording + type is primarily meant for establishing + programmatic dependency between device tasks. The + event supplied must not be an interprocess or + interop event. The event must disable timing + (i.e. created with ::CU_EVENT_DISABLE_TIMING flag + set). */ + , CU_LAUNCH_ATTRIBUTE_PRIORITY = 8 /**< Valid for graph nodes. */ +} CUlaunchAttributeID; + +typedef union CUlaunchAttributeValue_union { + char pad[64]; /**< Pad to 64 bytes */ + CUaccessPolicyWindow accessPolicyWindow; + int cooperative; + CUsynchronizationPolicy syncPolicy; + struct { + unsigned int x; + unsigned int y; + unsigned int z; + } clusterDim; + CUclusterSchedulingPolicy clusterSchedulingPolicyPreference; + int programmaticStreamSerializationAllowed; + struct { + CUevent event; + int flags; /* Does not accept ::CU_EVENT_RECORD_EXTERNAL */ + int triggerAtBlockStart; + } programmaticEvent; + int priority; +} CUlaunchAttributeValue; + +typedef struct CUlaunchAttribute_st { + CUlaunchAttributeID id; + char pad[8 - sizeof(CUlaunchAttributeID)]; + CUlaunchAttributeValue value; +} CUlaunchAttribute; + +typedef struct CUlaunchConfig_st { + unsigned int gridDimX; /**< Width of grid in blocks */ + unsigned int gridDimY; /**< Height of grid in blocks */ + unsigned int gridDimZ; /**< Depth of grid in blocks */ + unsigned int blockDimX; /**< X dimension of each thread block */ + unsigned int blockDimY; /**< Y dimension of each thread block */ + unsigned int blockDimZ; /**< Z dimension of each thread block */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + CUstream hStream; /**< Stream identifier */ + CUlaunchAttribute *attrs; /**< nullable if numAttrs == 0 */ + unsigned int numAttrs; /**< number of attributes populated in attrs */ +} CUlaunchConfig; + +/** + * Graph kernel node Attributes + */ +typedef CUlaunchAttributeID CUkernelNodeAttrID; +#define CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW +#define CU_KERNEL_NODE_ATTRIBUTE_COOPERATIVE CU_LAUNCH_ATTRIBUTE_COOPERATIVE +#define CU_KERNEL_NODE_ATTRIBUTE_CLUSTER_DIMENSION CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION +#define CU_KERNEL_NODE_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE +#define CU_KERNEL_NODE_ATTRIBUTE_PRIORITY CU_LAUNCH_ATTRIBUTE_PRIORITY + +/** + * Graph kernel node attributes union, used with ::cuKernelNodeSetAttribute/::cuKernelNodeGetAttribute + */ +typedef CUlaunchAttributeValue CUkernelNodeAttrValue_v1; +typedef CUkernelNodeAttrValue_v1 CUkernelNodeAttrValue; + +/** + * Possible stream capture statuses returned by ::cuStreamIsCapturing + */ +typedef enum CUstreamCaptureStatus_enum { + CU_STREAM_CAPTURE_STATUS_NONE = 0, /**< Stream is not capturing */ + CU_STREAM_CAPTURE_STATUS_ACTIVE = 1, /**< Stream is actively capturing */ + CU_STREAM_CAPTURE_STATUS_INVALIDATED = 2 /**< Stream is part of a capture sequence that + has been invalidated, but not terminated */ +} CUstreamCaptureStatus; + +/** + * Possible modes for stream capture thread interactions. 
For more details see + * ::cuStreamBeginCapture and ::cuThreadExchangeStreamCaptureMode + */ +typedef enum CUstreamCaptureMode_enum { + CU_STREAM_CAPTURE_MODE_GLOBAL = 0, + CU_STREAM_CAPTURE_MODE_THREAD_LOCAL = 1, + CU_STREAM_CAPTURE_MODE_RELAXED = 2 +} CUstreamCaptureMode; + +/** + * Stream Attributes + */ +typedef CUlaunchAttributeID CUstreamAttrID; +#define CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW +#define CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY + +/** + * Stream attributes union, used with ::cuStreamSetAttribute/::cuStreamGetAttribute + */ +typedef CUlaunchAttributeValue CUstreamAttrValue_v1; +typedef CUstreamAttrValue_v1 CUstreamAttrValue; + +/** + * Flags to specify search options. For more details see ::cuGetProcAddress + */ +typedef enum CUdriverProcAddress_flags_enum { + CU_GET_PROC_ADDRESS_DEFAULT = 0, /**< Default search mode for driver symbols. */ + CU_GET_PROC_ADDRESS_LEGACY_STREAM = 1 << 0, /**< Search for legacy versions of driver symbols. */ + CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM = 1 << 1 /**< Search for per-thread versions of driver symbols. */ +} CUdriverProcAddress_flags; + +/** + * Execution Affinity Types + */ +typedef enum CUexecAffinityType_enum { + CU_EXEC_AFFINITY_TYPE_SM_COUNT = 0, /**< Create a context with limited SMs. */ + CU_EXEC_AFFINITY_TYPE_MAX +} CUexecAffinityType; + +/** + * Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT + */ +typedef struct CUexecAffinitySmCount_st { + unsigned int val; /**< The number of SMs the context is limited to use. */ +} CUexecAffinitySmCount_v1; +typedef CUexecAffinitySmCount_v1 CUexecAffinitySmCount; + +/** + * Execution Affinity Parameters + */ +typedef struct CUexecAffinityParam_st { + CUexecAffinityType type; + union { + CUexecAffinitySmCount smCount; /** Value for ::CU_EXEC_AFFINITY_TYPE_SM_COUNT */ + } param; +} CUexecAffinityParam_v1; +typedef CUexecAffinityParam_v1 CUexecAffinityParam; + +/** + * Error codes + */ +typedef enum cudaError_enum { + /** + * The API call returned with no errors. In the case of query calls, this + * also means that the operation being queried is complete (see + * ::cuEventQuery() and ::cuStreamQuery()). + */ + CUDA_SUCCESS = 0, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + CUDA_ERROR_INVALID_VALUE = 1, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + CUDA_ERROR_OUT_OF_MEMORY = 2, + + /** + * This indicates that the CUDA driver has not been initialized with + * ::cuInit() or that initialization has failed. + */ + CUDA_ERROR_NOT_INITIALIZED = 3, + + /** + * This indicates that the CUDA driver is in the process of shutting down. + */ + CUDA_ERROR_DEINITIALIZED = 4, + + /** + * This indicates profiler is not initialized for this run. This can + * happen when the application is running with external profiling tools + * like visual profiler. + */ + CUDA_ERROR_PROFILER_DISABLED = 5, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to attempt to enable/disable the profiling via ::cuProfilerStart or + * ::cuProfilerStop without initialization. + */ + CUDA_ERROR_PROFILER_NOT_INITIALIZED = 6, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cuProfilerStart() when profiling is already enabled. 
+ */ + CUDA_ERROR_PROFILER_ALREADY_STARTED = 7, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cuProfilerStop() when profiling is already disabled. + */ + CUDA_ERROR_PROFILER_ALREADY_STOPPED = 8, + + /** + * This indicates that the CUDA driver that the application has loaded is a + * stub library. Applications that run with the stub rather than a real + * driver loaded will result in CUDA API returning this error. + */ + CUDA_ERROR_STUB_LIBRARY = 34, + + /** + * This indicates that requested CUDA device is unavailable at the current + * time. Devices are often unavailable due to use of + * ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS or ::CU_COMPUTEMODE_PROHIBITED. + */ + CUDA_ERROR_DEVICE_UNAVAILABLE = 46, + + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + CUDA_ERROR_NO_DEVICE = 100, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device or that the action requested is + * invalid for the specified device. + */ + CUDA_ERROR_INVALID_DEVICE = 101, + + /** + * This error indicates that the Grid license is not applied. + */ + CUDA_ERROR_DEVICE_NOT_LICENSED = 102, + + /** + * This indicates that the device kernel image is invalid. This can also + * indicate an invalid CUDA module. + */ + CUDA_ERROR_INVALID_IMAGE = 200, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + CUDA_ERROR_INVALID_CONTEXT = 201, + + /** + * This indicated that the context being supplied as a parameter to the + * API call was already the active context. + * \deprecated + * This error return is deprecated as of CUDA 3.2. It is no longer an + * error to attempt to push the active context via ::cuCtxPushCurrent(). + */ + CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202, + + /** + * This indicates that a map or register operation has failed. + */ + CUDA_ERROR_MAP_FAILED = 205, + + /** + * This indicates that an unmap or unregister operation has failed. + */ + CUDA_ERROR_UNMAP_FAILED = 206, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + CUDA_ERROR_ARRAY_IS_MAPPED = 207, + + /** + * This indicates that the resource is already mapped. + */ + CUDA_ERROR_ALREADY_MAPPED = 208, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. + */ + CUDA_ERROR_NO_BINARY_FOR_GPU = 209, + + /** + * This indicates that a resource has already been acquired. + */ + CUDA_ERROR_ALREADY_ACQUIRED = 210, + + /** + * This indicates that a resource is not mapped. + */ + CUDA_ERROR_NOT_MAPPED = 211, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + CUDA_ERROR_NOT_MAPPED_AS_ARRAY = 212, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. 
+ */ + CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + CUDA_ERROR_ECC_UNCORRECTABLE = 214, + + /** + * This indicates that the ::CUlimit passed to the API call is not + * supported by the active device. + */ + CUDA_ERROR_UNSUPPORTED_LIMIT = 215, + + /** + * This indicates that the ::CUcontext passed to the API call can + * only be bound to a single CPU thread at a time but is already + * bound to a CPU thread. + */ + CUDA_ERROR_CONTEXT_ALREADY_IN_USE = 216, + + /** + * This indicates that peer access is not supported across the given + * devices. + */ + CUDA_ERROR_PEER_ACCESS_UNSUPPORTED = 217, + + /** + * This indicates that a PTX JIT compilation failed. + */ + CUDA_ERROR_INVALID_PTX = 218, + + /** + * This indicates an error with OpenGL or DirectX context. + */ + CUDA_ERROR_INVALID_GRAPHICS_CONTEXT = 219, + + /** + * This indicates that an uncorrectable NVLink error was detected during the + * execution. + */ + CUDA_ERROR_NVLINK_UNCORRECTABLE = 220, + + /** + * This indicates that the PTX JIT compiler library was not found. + */ + CUDA_ERROR_JIT_COMPILER_NOT_FOUND = 221, + + /** + * This indicates that the provided PTX was compiled with an unsupported toolchain. + */ + + CUDA_ERROR_UNSUPPORTED_PTX_VERSION = 222, + + /** + * This indicates that the PTX JIT compilation was disabled. + */ + CUDA_ERROR_JIT_COMPILATION_DISABLED = 223, + + /** + * This indicates that the ::CUexecAffinityType passed to the API call is not + * supported by the active device. + */ + CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224, + + /** + * This indicates that the device kernel source is invalid. This includes + * compilation/linker errors encountered in device code or user error. + */ + CUDA_ERROR_INVALID_SOURCE = 300, + + /** + * This indicates that the file specified was not found. + */ + CUDA_ERROR_FILE_NOT_FOUND = 301, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND = 302, + + /** + * This indicates that initialization of a shared object failed. + */ + CUDA_ERROR_SHARED_OBJECT_INIT_FAILED = 303, + + /** + * This indicates that an OS call failed. + */ + CUDA_ERROR_OPERATING_SYSTEM = 304, + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::CUstream and ::CUevent. + */ + CUDA_ERROR_INVALID_HANDLE = 400, + + /** + * This indicates that a resource required by the API call is not in a + * valid state to perform the requested operation. + */ + CUDA_ERROR_ILLEGAL_STATE = 401, + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, driver function names, texture names, + * and surface names. + */ + CUDA_ERROR_NOT_FOUND = 500, + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::CUDA_SUCCESS (which indicates completion). Calls that + * may return this value include ::cuEventQuery() and ::cuStreamQuery(). + */ + CUDA_ERROR_NOT_READY = 600, + + /** + * While executing a kernel, the device encountered a + * load or store instruction on an invalid memory address. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. 
+ */ + CUDA_ERROR_ILLEGAL_ADDRESS = 700, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. This error usually indicates that the user has + * attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register + * count. Passing arguments of the wrong size (i.e. a 64-bit pointer + * when a 32-bit int is expected) is equivalent to passing too many + * arguments and can also result in this error. + */ + CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device attribute + * ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_LAUNCH_TIMEOUT = 702, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, + + /** + * This error indicates that a call to ::cuCtxEnablePeerAccess() is + * trying to re-enable peer access to a context which has already + * had peer access to it enabled. + */ + CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED = 704, + + /** + * This error indicates that ::cuCtxDisablePeerAccess() is + * trying to disable peer access which has not been enabled yet + * via ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_PEER_ACCESS_NOT_ENABLED = 705, + + /** + * This error indicates that the primary context for the specified device + * has already been initialized. + */ + CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE = 708, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy, or is a primary context which + * has not yet been initialized. + */ + CUDA_ERROR_CONTEXT_IS_DESTROYED = 709, + + /** + * A device-side assert triggered during kernel execution. The context + * cannot be used anymore, and must be destroyed. All existing device + * memory allocations from this context are invalid and must be + * reconstructed if the program is to continue using CUDA. + */ + CUDA_ERROR_ASSERT = 710, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cuCtxEnablePeerAccess(). + */ + CUDA_ERROR_TOO_MANY_PEERS = 711, + + /** + * This error indicates that the memory range passed to ::cuMemHostRegister() + * has already been registered. + */ + CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED = 712, + + /** + * This error indicates that the pointer passed to ::cuMemHostUnregister() + * does not correspond to any currently registered memory region. + */ + CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED = 713, + + /** + * While executing a kernel, the device encountered a stack error. + * This can be due to stack corruption or exceeding the stack size limit. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_HARDWARE_STACK_ERROR = 714, + + /** + * While executing a kernel, the device encountered an illegal instruction. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. 
To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_ILLEGAL_INSTRUCTION = 715, + + /** + * While executing a kernel, the device encountered a load or store instruction + * on a memory address which is not aligned. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_MISALIGNED_ADDRESS = 716, + + /** + * While executing a kernel, the device encountered an instruction + * which can only operate on memory locations in certain address spaces + * (global, shared, or local), but was supplied a memory address not + * belonging to an allowed address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_INVALID_ADDRESS_SPACE = 717, + + /** + * While executing a kernel, the device program counter wrapped its address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_INVALID_PC = 718, + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. Less common cases can be system specific - more + * information about these cases can be found in the system specific user guide. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + CUDA_ERROR_LAUNCH_FAILED = 719, + + /** + * This error indicates that the number of blocks launched per grid for a kernel that was + * launched via either ::cuLaunchCooperativeKernel or ::cuLaunchCooperativeKernelMultiDevice + * exceeds the maximum number of blocks as allowed by ::cuOccupancyMaxActiveBlocksPerMultiprocessor + * or ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. + */ + CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE = 720, + + /** + * This error indicates that the attempted operation is not permitted. + */ + CUDA_ERROR_NOT_PERMITTED = 800, + + /** + * This error indicates that the attempted operation is not supported + * on the current system or device. + */ + CUDA_ERROR_NOT_SUPPORTED = 801, + + /** + * This error indicates that the system is not yet ready to start any CUDA + * work. To continue using CUDA, verify the system configuration is in a + * valid state and all required driver daemons are actively running. + * More information about this error can be found in the system specific + * user guide. + */ + CUDA_ERROR_SYSTEM_NOT_READY = 802, + + /** + * This error indicates that there is a mismatch between the versions of + * the display driver and the CUDA driver. Refer to the compatibility documentation + * for supported versions. + */ + CUDA_ERROR_SYSTEM_DRIVER_MISMATCH = 803, + + /** + * This error indicates that the system was upgraded to run with forward compatibility + * but the visible hardware detected by CUDA does not support this configuration. 
+ * Refer to the compatibility documentation for the supported hardware matrix or ensure
+ * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
+ * environment variable.
+ */
+ CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE = 804,
+
+ /**
+ * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
+ */
+ CUDA_ERROR_MPS_CONNECTION_FAILED = 805,
+
+ /**
+ * This error indicates that the remote procedure call between the MPS server and the MPS client failed.
+ */
+ CUDA_ERROR_MPS_RPC_FAILURE = 806,
+
+ /**
+ * This error indicates that the MPS server is not ready to accept new MPS client requests.
+ * This error can be returned when the MPS server is in the process of recovering from a fatal failure.
+ */
+ CUDA_ERROR_MPS_SERVER_NOT_READY = 807,
+
+ /**
+ * This error indicates that the hardware resources required to create MPS client have been exhausted.
+ */
+ CUDA_ERROR_MPS_MAX_CLIENTS_REACHED = 808,
+
+ /**
+ * This error indicates that the hardware resources required to support device connections have been exhausted.
+ */
+ CUDA_ERROR_MPS_MAX_CONNECTIONS_REACHED = 809,
+
+ /**
+ * This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.
+ */
+ CUDA_ERROR_MPS_CLIENT_TERMINATED = 810,
+
+ /**
+ * This error indicates that the operation is not permitted when
+ * the stream is capturing.
+ */
+ CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED = 900,
+
+ /**
+ * This error indicates that the current capture sequence on the stream
+ * has been invalidated due to a previous error.
+ */
+ CUDA_ERROR_STREAM_CAPTURE_INVALIDATED = 901,
+
+ /**
+ * This error indicates that the operation would have resulted in a merge
+ * of two independent capture sequences.
+ */
+ CUDA_ERROR_STREAM_CAPTURE_MERGE = 902,
+
+ /**
+ * This error indicates that the capture was not initiated in this stream.
+ */
+ CUDA_ERROR_STREAM_CAPTURE_UNMATCHED = 903,
+
+ /**
+ * This error indicates that the capture sequence contains a fork that was
+ * not joined to the primary stream.
+ */
+ CUDA_ERROR_STREAM_CAPTURE_UNJOINED = 904,
+
+ /**
+ * This error indicates that a dependency would have been created which
+ * crosses the capture sequence boundary. Only implicit in-stream ordering
+ * dependencies are allowed to cross the boundary.
+ */
+ CUDA_ERROR_STREAM_CAPTURE_ISOLATION = 905,
+
+ /**
+ * This error indicates a disallowed implicit dependency on a current capture
+ * sequence from cudaStreamLegacy.
+ */
+ CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = 906,
+
+ /**
+ * This error indicates that the operation is not permitted on an event which
+ * was last recorded in a capturing stream.
+ */
+ CUDA_ERROR_CAPTURED_EVENT = 907,
+
+ /**
+ * A stream capture sequence not initiated with the ::CU_STREAM_CAPTURE_MODE_RELAXED
+ * argument to ::cuStreamBeginCapture was passed to ::cuStreamEndCapture in a
+ * different thread.
+ */
+ CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD = 908,
+
+ /**
+ * This error indicates that the timeout specified for the wait operation has lapsed.
+ */
+ CUDA_ERROR_TIMEOUT = 909,
+
+ /**
+ * This error indicates that the graph update was not performed because it included
+ * changes which violated constraints specific to instantiated graph update.
+ */
+ CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE = 910,
+
+ /**
+ * This indicates that an async error has occurred in a device outside of CUDA.
+ * If CUDA was waiting for an external device's signal before consuming shared data,
+ * the external device signaled an error indicating that the data is not valid for
+ * consumption. This leaves the process in an inconsistent state and any further CUDA
+ * work will return the same error. To continue using CUDA, the process must be
+ * terminated and relaunched.
+ */
+ CUDA_ERROR_EXTERNAL_DEVICE = 911,
+
+ /**
+ * Indicates a kernel launch error due to cluster misconfiguration.
+ */
+ CUDA_ERROR_INVALID_CLUSTER_SIZE = 912,
+
+ /**
+ * This indicates that an unknown internal error has occurred.
+ */
+ CUDA_ERROR_UNKNOWN = 999
+} CUresult;
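+
+/**
+ * Usage sketch (illustrative only): most driver API entry points report failure
+ * solely through their ::CUresult return value, so callers typically wrap every
+ * call and translate the code with ::cuGetErrorName / ::cuGetErrorString. The
+ * check() helper and the names used below are application-side conventions,
+ * not something this header provides.
+ *
+ * \code
+ * #include <cuda.h>
+ * #include <stdio.h>
+ *
+ * // Application-side helper: print a readable message for a failing driver
+ * // API call and report whether it succeeded.
+ * static int check(CUresult result, const char *what) {
+ *     if (result == CUDA_SUCCESS)
+ *         return 1;
+ *     const char *name = NULL;
+ *     const char *desc = NULL;
+ *     cuGetErrorName(result, &name);
+ *     cuGetErrorString(result, &desc);
+ *     fprintf(stderr, "%s failed: %s (%s)\n", what,
+ *             name ? name : "unknown", desc ? desc : "no description");
+ *     return 0;
+ * }
+ *
+ * // Minimal driver API bring-up using the helper.
+ * int main(void) {
+ *     CUdevice device;
+ *     CUcontext context;
+ *     if (!check(cuInit(0), "cuInit"))
+ *         return 1;
+ *     if (!check(cuDeviceGet(&device, 0), "cuDeviceGet"))
+ *         return 1;
+ *     if (!check(cuCtxCreate(&context, 0, device), "cuCtxCreate"))
+ *         return 1;
+ *     // ... real work ...
+ *     check(cuCtxDestroy(context), "cuCtxDestroy");
+ *     return 0;
+ * }
+ * \endcode
+ *
+ * Note that ::CUDA_ERROR_NOT_READY from ::cuEventQuery or ::cuStreamQuery is an
+ * expected "still running" status rather than a failure, so polling calls are
+ * usually handled outside a helper like this.
+ */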
+
+/**
+ * P2P Attributes
+ */
+typedef enum CUdevice_P2PAttribute_enum {
+ CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01, /**< A relative value indicating the performance of the link between two devices */
+ CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02, /**< P2P Access is enabled */
+ CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03, /**< Atomic operation over the link supported */
+ CU_DEVICE_P2P_ATTRIBUTE_ACCESS_ACCESS_SUPPORTED = 0x04, /**< \deprecated use CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED instead */
+ CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED = 0x04 /**< Accessing CUDA arrays over the link supported */
+} CUdevice_P2PAttribute;
+
+/**
+ * CUDA stream callback
+ * \param hStream The stream the callback was added to, as passed to ::cuStreamAddCallback. May be NULL.
+ * \param status ::CUDA_SUCCESS or any persistent error on the stream.
+ * \param userData User parameter provided at registration.
+ */
+typedef void (CUDA_CB *CUstreamCallback)(CUstream hStream, CUresult status, void *userData);
+
+/**
+ * Block size to per-block dynamic shared memory mapping for a certain
+ * kernel \param blockSize Block size of the kernel.
+ *
+ * \return The dynamic shared memory needed by a block.
+ */
+typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize);
+
+/**
+ * If set, host memory is portable between CUDA contexts.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_PORTABLE 0x01
+
+/**
+ * If set, host memory is mapped into CUDA address space and
+ * ::cuMemHostGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_DEVICEMAP 0x02
+
+/**
+ * If set, host memory is allocated as write-combined - fast to write,
+ * faster to DMA, slow to read except via SSE4 streaming load instruction
+ * (MOVNTDQA).
+ * Flag for ::cuMemHostAlloc()
+ */
+#define CU_MEMHOSTALLOC_WRITECOMBINED 0x04
+
+/**
+ * If set, host memory is portable between CUDA contexts.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_PORTABLE 0x01
+
+/**
+ * If set, host memory is mapped into CUDA address space and
+ * ::cuMemHostGetDevicePointer() may be called on the host pointer.
+ * Flag for ::cuMemHostRegister()
+ */
+#define CU_MEMHOSTREGISTER_DEVICEMAP 0x02
+
+/**
+ * If set, the passed memory pointer is treated as pointing to some
+ * memory-mapped I/O space, e.g. belonging to a third-party PCIe device.
+ * On Windows the flag is a no-op.
+ * On Linux that memory is marked as non cache-coherent for the GPU and
+ * is expected to be physically contiguous. It may return
+ * ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user,
+ * ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions.
+ * On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED
+ * is returned.
+ * Flag for ::cuMemHostRegister() + */ +#define CU_MEMHOSTREGISTER_IOMEMORY 0x04 + +/** +* If set, the passed memory pointer is treated as pointing to memory that is +* considered read-only by the device. On platforms without +* ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is +* required in order to register memory mapped to the CPU as read-only. Support +* for the use of this flag can be queried from the device attribute +* ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with +* a current context associated with a device that does not have this attribute +* set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED. +*/ +#define CU_MEMHOSTREGISTER_READ_ONLY 0x08 + +/** + * 2D memory copy parameters + */ +typedef struct CUDA_MEMCPY2D_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + + size_t WidthInBytes; /**< Width of 2D memory copy in bytes */ + size_t Height; /**< Height of 2D memory copy */ +} CUDA_MEMCPY2D_v2; +typedef CUDA_MEMCPY2D_v2 CUDA_MEMCPY2D; + +/** + * 3D memory copy parameters + */ +typedef struct CUDA_MEMCPY3D_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + size_t srcZ; /**< Source Z */ + size_t srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + void *reserved0; /**< Must be NULL */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + size_t dstZ; /**< Destination Z */ + size_t dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + void *reserved1; /**< Must be NULL */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ + size_t Height; /**< Height of 3D memory copy */ + size_t Depth; /**< Depth of 3D memory copy */ +} CUDA_MEMCPY3D_v2; +typedef CUDA_MEMCPY3D_v2 CUDA_MEMCPY3D; + +/** + * 3D memory cross-context copy parameters + */ +typedef struct CUDA_MEMCPY3D_PEER_st { + size_t srcXInBytes; /**< Source X in bytes */ + size_t srcY; /**< Source Y */ + size_t srcZ; /**< Source Z */ + size_t srcLOD; /**< 
Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + CUcontext srcContext; /**< Source context (ignored with srcMemoryType is ::CU_MEMORYTYPE_ARRAY) */ + size_t srcPitch; /**< Source pitch (ignored when src is array) */ + size_t srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + size_t dstXInBytes; /**< Destination X in bytes */ + size_t dstY; /**< Destination Y */ + size_t dstZ; /**< Destination Z */ + size_t dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + CUcontext dstContext; /**< Destination context (ignored with dstMemoryType is ::CU_MEMORYTYPE_ARRAY) */ + size_t dstPitch; /**< Destination pitch (ignored when dst is array) */ + size_t dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + size_t WidthInBytes; /**< Width of 3D memory copy in bytes */ + size_t Height; /**< Height of 3D memory copy */ + size_t Depth; /**< Depth of 3D memory copy */ +} CUDA_MEMCPY3D_PEER_v1; +typedef CUDA_MEMCPY3D_PEER_v1 CUDA_MEMCPY3D_PEER; + +/** + * Array descriptor + */ +typedef struct CUDA_ARRAY_DESCRIPTOR_st +{ + size_t Width; /**< Width of array */ + size_t Height; /**< Height of array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ +} CUDA_ARRAY_DESCRIPTOR_v2; +typedef CUDA_ARRAY_DESCRIPTOR_v2 CUDA_ARRAY_DESCRIPTOR; + +/** + * 3D array descriptor + */ +typedef struct CUDA_ARRAY3D_DESCRIPTOR_st +{ + size_t Width; /**< Width of 3D array */ + size_t Height; /**< Height of 3D array */ + size_t Depth; /**< Depth of 3D array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + unsigned int Flags; /**< Flags */ +} CUDA_ARRAY3D_DESCRIPTOR_v2; +typedef CUDA_ARRAY3D_DESCRIPTOR_v2 CUDA_ARRAY3D_DESCRIPTOR; + +/** + * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers + */ +#define CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL 0x1 + +/** + * CUDA array sparse properties + */ +typedef struct CUDA_ARRAY_SPARSE_PROPERTIES_st { + struct { + unsigned int width; /**< Width of sparse tile in elements */ + unsigned int height; /**< Height of sparse tile in elements */ + unsigned int depth; /**< Depth of sparse tile in elements */ + } tileExtent; + + /** + * First mip level at which the mip tail begins. + */ + unsigned int miptailFirstLevel; + /** + * Total size of the mip tail. 
+ */ + unsigned long long miptailSize; + /** + * Flags will either be zero or ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL + */ + unsigned int flags; + unsigned int reserved[4]; +} CUDA_ARRAY_SPARSE_PROPERTIES_v1; +typedef CUDA_ARRAY_SPARSE_PROPERTIES_v1 CUDA_ARRAY_SPARSE_PROPERTIES; + +/** + * CUDA array memory requirements + */ +typedef struct CUDA_ARRAY_MEMORY_REQUIREMENTS_st { + size_t size; /**< Total required memory size */ + size_t alignment; /**< alignment requirement */ + unsigned int reserved[4]; +} CUDA_ARRAY_MEMORY_REQUIREMENTS_v1; +typedef CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 CUDA_ARRAY_MEMORY_REQUIREMENTS; + +/** + * CUDA Resource descriptor + */ +typedef struct CUDA_RESOURCE_DESC_st +{ + CUresourcetype resType; /**< Resource type */ + + union { + struct { + CUarray hArray; /**< CUDA array */ + } array; + struct { + CUmipmappedArray hMipmappedArray; /**< CUDA mipmapped array */ + } mipmap; + struct { + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t sizeInBytes; /**< Size in bytes */ + } linear; + struct { + CUdeviceptr devPtr; /**< Device pointer */ + CUarray_format format; /**< Array format */ + unsigned int numChannels; /**< Channels per array element */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t pitchInBytes; /**< Pitch between two rows in bytes */ + } pitch2D; + struct { + int reserved[32]; + } reserved; + } res; + + unsigned int flags; /**< Flags (must be zero) */ +} CUDA_RESOURCE_DESC_v1; +typedef CUDA_RESOURCE_DESC_v1 CUDA_RESOURCE_DESC; + +/** + * Texture descriptor + */ +typedef struct CUDA_TEXTURE_DESC_st { + CUaddress_mode addressMode[3]; /**< Address modes */ + CUfilter_mode filterMode; /**< Filter mode */ + unsigned int flags; /**< Flags */ + unsigned int maxAnisotropy; /**< Maximum anisotropy ratio */ + CUfilter_mode mipmapFilterMode; /**< Mipmap filter mode */ + float mipmapLevelBias; /**< Mipmap level bias */ + float minMipmapLevelClamp; /**< Mipmap minimum level clamp */ + float maxMipmapLevelClamp; /**< Mipmap maximum level clamp */ + float borderColor[4]; /**< Border Color */ + int reserved[12]; +} CUDA_TEXTURE_DESC_v1; +typedef CUDA_TEXTURE_DESC_v1 CUDA_TEXTURE_DESC; + +/** + * Resource view format + */ +typedef enum CUresourceViewFormat_enum +{ + CU_RES_VIEW_FORMAT_NONE = 0x00, /**< No resource view format (use underlying resource format) */ + CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01, /**< 1 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02, /**< 2 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03, /**< 4 channel unsigned 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04, /**< 1 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05, /**< 2 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06, /**< 4 channel signed 8-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07, /**< 1 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08, /**< 2 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09, /**< 4 channel unsigned 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a, /**< 1 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b, /**< 2 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c, /**< 4 channel signed 16-bit integers */ + CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + 
CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10, /**< 1 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11, /**< 2 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12, /**< 4 channel signed 32-bit integers */ + CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13, /**< 1 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14, /**< 2 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15, /**< 4 channel 16-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16, /**< 1 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17, /**< 2 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18, /**< 4 channel 32-bit floating point */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19, /**< Block compressed 1 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a, /**< Block compressed 2 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b, /**< Block compressed 3 */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c, /**< Block compressed 4 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d, /**< Block compressed 4 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e, /**< Block compressed 5 unsigned */ + CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f, /**< Block compressed 5 signed */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20, /**< Block compressed 6 unsigned half-float */ + CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21, /**< Block compressed 6 signed half-float */ + CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22 /**< Block compressed 7 */ +} CUresourceViewFormat; + +/** + * Resource view descriptor + */ +typedef struct CUDA_RESOURCE_VIEW_DESC_st +{ + CUresourceViewFormat format; /**< Resource view format */ + size_t width; /**< Width of the resource view */ + size_t height; /**< Height of the resource view */ + size_t depth; /**< Depth of the resource view */ + unsigned int firstMipmapLevel; /**< First defined mipmap level */ + unsigned int lastMipmapLevel; /**< Last defined mipmap level */ + unsigned int firstLayer; /**< First layer index */ + unsigned int lastLayer; /**< Last layer index */ + unsigned int reserved[16]; +} CUDA_RESOURCE_VIEW_DESC_v1; +typedef CUDA_RESOURCE_VIEW_DESC_v1 CUDA_RESOURCE_VIEW_DESC; + +/** + * GPU Direct v3 tokens + */ +typedef struct CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_st { + unsigned long long p2pToken; + unsigned int vaSpaceToken; +} CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1; +typedef CUDA_POINTER_ATTRIBUTE_P2P_TOKENS_v1 CUDA_POINTER_ATTRIBUTE_P2P_TOKENS; + +/** +* Access flags that specify the level of access the current context's device has +* on the memory referenced. +*/ +typedef enum CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS_enum { + CU_POINTER_ATTRIBUTE_ACCESS_FLAG_NONE = 0x0, /**< No access, meaning the device cannot access this memory at all, thus must be staged through accessible memory in order to complete certain operations */ + CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READ = 0x1, /**< Read-only access, meaning writes to this memory are considered invalid accesses and thus return error in that case. 
*/ + CU_POINTER_ATTRIBUTE_ACCESS_FLAG_READWRITE = 0x3 /**< Read-write access, the device has full read-write access to the memory */ +} CUDA_POINTER_ATTRIBUTE_ACCESS_FLAGS; + +/** + * Kernel launch parameters + */ +typedef struct CUDA_LAUNCH_PARAMS_st { + CUfunction function; /**< Kernel to launch */ + unsigned int gridDimX; /**< Width of grid in blocks */ + unsigned int gridDimY; /**< Height of grid in blocks */ + unsigned int gridDimZ; /**< Depth of grid in blocks */ + unsigned int blockDimX; /**< X dimension of each thread block */ + unsigned int blockDimY; /**< Y dimension of each thread block */ + unsigned int blockDimZ; /**< Z dimension of each thread block */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + CUstream hStream; /**< Stream identifier */ + void **kernelParams; /**< Array of pointers to kernel parameters */ +} CUDA_LAUNCH_PARAMS_v1; +typedef CUDA_LAUNCH_PARAMS_v1 CUDA_LAUNCH_PARAMS; + +/** + * External memory handle types + */ +typedef enum CUexternalMemoryHandleType_enum { + /** + * Handle is an opaque file descriptor + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, + /** + * Handle is an opaque shared NT handle + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + /** + * Handle is a D3D12 heap object + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, + /** + * Handle is a D3D12 committed resource + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, + /** + * Handle is a shared NT handle to a D3D11 resource + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, + /** + * Handle is a globally shared handle to a D3D11 resource + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, + /** + * Handle is an NvSciBuf object + */ + CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 +} CUexternalMemoryHandleType; + +/** + * Indicates that the external memory object is a dedicated resource + */ +#define CUDA_EXTERNAL_MEMORY_DEDICATED 0x1 + +/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS + * contains this flag, it indicates that signaling an external semaphore object + * should skip performing appropriate memory synchronization operations over all + * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, + * which otherwise are performed by default to ensure data coherency with other + * importers of the same NvSciBuf memory objects. + */ +#define CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC 0x01 + +/** When the \p flags parameter of ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS + * contains this flag, it indicates that waiting on an external semaphore object + * should skip performing appropriate memory synchronization operations over all + * the external memory objects that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, + * which otherwise are performed by default to ensure data coherency with other + * importers of the same NvSciBuf memory objects. + */ +#define CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC 0x02 + +/** + * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, + * it indicates that application needs signaler specific NvSciSyncAttr + * to be filled by ::cuDeviceGetNvSciSyncAttributes. 
+ */ +#define CUDA_NVSCISYNC_ATTR_SIGNAL 0x1 + +/** + * When \p flags of ::cuDeviceGetNvSciSyncAttributes is set to this, + * it indicates that application needs waiter specific NvSciSyncAttr + * to be filled by ::cuDeviceGetNvSciSyncAttributes. + */ +#define CUDA_NVSCISYNC_ATTR_WAIT 0x2 +/** + * External memory handle descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { + /** + * Type of the handle + */ + CUexternalMemoryHandleType type; + union { + /** + * File descriptor referencing the memory object. Valid + * when type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD + */ + int fd; + /** + * Win32 handle referencing the semaphore object. Valid when + * type is one of the following: + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE + * - ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT + * Exactly one of 'handle' and 'name' must be non-NULL. If + * type is one of the following: + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid memory object. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + /** + * A handle representing an NvSciBuf Object. Valid when type + * is ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF + */ + const void *nvSciBufObject; + } handle; + /** + * Size of the memory allocation + */ + unsigned long long size; + /** + * Flags must either be zero or ::CUDA_EXTERNAL_MEMORY_DEDICATED + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1; +typedef CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 CUDA_EXTERNAL_MEMORY_HANDLE_DESC; + +/** + * External memory buffer descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { + /** + * Offset into the memory object where the buffer's base is + */ + unsigned long long offset; + /** + * Size of the buffer + */ + unsigned long long size; + /** + * Flags reserved for future use. Must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1; +typedef CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 CUDA_EXTERNAL_MEMORY_BUFFER_DESC; + +/** + * External memory mipmap descriptor + */ +typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { + /** + * Offset into the memory object where the base level of the + * mipmap chain is. 
+ */ + unsigned long long offset; + /** + * Format, dimension and type of base level of the mipmap chain + */ + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + /** + * Total number of levels in the mipmap chain + */ + unsigned int numLevels; + unsigned int reserved[16]; +} CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1; +typedef CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; + +/** + * External semaphore handle types + */ +typedef enum CUexternalSemaphoreHandleType_enum { + /** + * Handle is an opaque file descriptor + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, + /** + * Handle is an opaque shared NT handle + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + /** + * Handle is a shared NT handle referencing a D3D12 fence object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, + /** + * Handle is a shared NT handle referencing a D3D11 fence object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, + /** + * Opaque handle to NvSciSync Object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, + /** + * Handle is a shared NT handle referencing a D3D11 keyed mutex object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, + /** + * Handle is a globally shared handle referencing a D3D11 keyed mutex object + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, + /** + * Handle is an opaque file descriptor referencing a timeline semaphore + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, + /** + * Handle is an opaque shared NT handle referencing a timeline semaphore + */ + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 +} CUexternalSemaphoreHandleType; + +/** + * External semaphore handle descriptor + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { + /** + * Type of the handle + */ + CUexternalSemaphoreHandleType type; + union { + /** + * File descriptor referencing the semaphore object. Valid + * when type is one of the following: + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD + */ + int fd; + /** + * Win32 handle referencing the semaphore object. Valid when + * type is one of the following: + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 + * Exactly one of 'handle' and 'name' must be non-NULL. If + * type is one of the following: + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * - ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid synchronization primitive. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + /** + * Valid NvSciSyncObj. Must be non NULL + */ + const void* nvSciSyncObj; + } handle; + /** + * Flags reserved for the future. Must be zero. 
+ */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1; +typedef CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; + +/** + * External semaphore signal parameters + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be signaled + */ + unsigned long long value; + } fence; + union { + /** + * Pointer to NvSciSyncFence. Valid if ::CUexternalSemaphoreHandleType + * is of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. + */ + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /** + * Value of key to release the mutex with + */ + unsigned long long key; + } keyedMutex; + unsigned int reserved[12]; + } params; + /** + * Only when ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to + * signal a ::CUexternalSemaphore of type + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which indicates + * that while signaling the ::CUexternalSemaphore, no memory synchronization + * operations should be performed for any external memory object imported + * as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. + * For all other types of ::CUexternalSemaphore, flags must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1; +typedef CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS; + +/** + * External semaphore wait parameters + */ +typedef struct CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be waited on + */ + unsigned long long value; + } fence; + /** + * Pointer to NvSciSyncFence. Valid if CUexternalSemaphoreHandleType + * is of type CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC. + */ + union { + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /** + * Value of key to acquire the mutex with + */ + unsigned long long key; + /** + * Timeout in milliseconds to wait to acquire the mutex + */ + unsigned int timeoutMs; + } keyedMutex; + unsigned int reserved[10]; + } params; + /** + * Only when ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on + * a ::CUexternalSemaphore of type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, + * the valid flag is ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC + * which indicates that while waiting for the ::CUexternalSemaphore, no memory + * synchronization operations should be performed for any external memory + * object imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. + * For all other types of ::CUexternalSemaphore, flags must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +} CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1; +typedef CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS; + +/** + * Semaphore signal node parameters + */ +typedef struct CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st { + CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ + const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS* paramsArray; /**< Array of external semaphore signal parameters. */ + unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. 
*/ +} CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1; +typedef CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 CUDA_EXT_SEM_SIGNAL_NODE_PARAMS; + +/** + * Semaphore wait node parameters + */ +typedef struct CUDA_EXT_SEM_WAIT_NODE_PARAMS_st { + CUexternalSemaphore* extSemArray; /**< Array of external semaphore handles. */ + const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS* paramsArray; /**< Array of external semaphore wait parameters. */ + unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ +} CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1; +typedef CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 CUDA_EXT_SEM_WAIT_NODE_PARAMS; + +typedef unsigned long long CUmemGenericAllocationHandle_v1; +typedef CUmemGenericAllocationHandle_v1 CUmemGenericAllocationHandle; + +/** + * Flags for specifying particular handle types + */ +typedef enum CUmemAllocationHandleType_enum { + CU_MEM_HANDLE_TYPE_NONE = 0x0, /**< Does not allow any export mechanism. > */ + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR = 0x1, /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */ + CU_MEM_HANDLE_TYPE_WIN32 = 0x2, /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */ + CU_MEM_HANDLE_TYPE_WIN32_KMT = 0x4, /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */ + CU_MEM_HANDLE_TYPE_MAX = 0x7FFFFFFF +} CUmemAllocationHandleType; + +/** + * Specifies the memory protection flags for mapping. + */ +typedef enum CUmemAccess_flags_enum { + CU_MEM_ACCESS_FLAGS_PROT_NONE = 0x0, /**< Default, make the address range not accessible */ + CU_MEM_ACCESS_FLAGS_PROT_READ = 0x1, /**< Make the address range read accessible */ + CU_MEM_ACCESS_FLAGS_PROT_READWRITE = 0x3, /**< Make the address range read-write accessible */ + CU_MEM_ACCESS_FLAGS_PROT_MAX = 0x7FFFFFFF +} CUmemAccess_flags; + +/** + * Specifies the type of location + */ +typedef enum CUmemLocationType_enum { + CU_MEM_LOCATION_TYPE_INVALID = 0x0, + CU_MEM_LOCATION_TYPE_DEVICE = 0x1, /**< Location is a device location, thus id is a device ordinal */ + CU_MEM_LOCATION_TYPE_MAX = 0x7FFFFFFF +} CUmemLocationType; + +/** +* Defines the allocation types available +*/ +typedef enum CUmemAllocationType_enum { + CU_MEM_ALLOCATION_TYPE_INVALID = 0x0, + + /** This allocation type is 'pinned', i.e. cannot migrate from its current + * location while the application is actively using it + */ + CU_MEM_ALLOCATION_TYPE_PINNED = 0x1, + CU_MEM_ALLOCATION_TYPE_MAX = 0x7FFFFFFF +} CUmemAllocationType; + +/** +* Flag for requesting different optimal and required granularities for an allocation. 
+*/
+typedef enum CUmemAllocationGranularity_flags_enum {
+ CU_MEM_ALLOC_GRANULARITY_MINIMUM = 0x0, /**< Minimum required granularity for allocation */
+ CU_MEM_ALLOC_GRANULARITY_RECOMMENDED = 0x1 /**< Recommended granularity for allocation for best performance */
+} CUmemAllocationGranularity_flags;
+
+/**
+* Specifies the handle type for address range
+*/
+typedef enum CUmemRangeHandleType_enum
+{
+ CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD = 0x1,
+ CU_MEM_RANGE_HANDLE_TYPE_MAX = 0x7FFFFFFF
+} CUmemRangeHandleType;
+
+/**
+ * Sparse subresource types
+ */
+typedef enum CUarraySparseSubresourceType_enum {
+ CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0,
+ CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1
+} CUarraySparseSubresourceType;
+
+/**
+ * Memory operation types
+ */
+typedef enum CUmemOperationType_enum {
+ CU_MEM_OPERATION_TYPE_MAP = 1,
+ CU_MEM_OPERATION_TYPE_UNMAP = 2
+} CUmemOperationType;
+
+/**
+ * Memory handle types
+ */
+typedef enum CUmemHandleType_enum {
+ CU_MEM_HANDLE_TYPE_GENERIC = 0
+} CUmemHandleType;
+
+/**
+ * Specifies the CUDA array or CUDA mipmapped array memory mapping information
+ */
+typedef struct CUarrayMapInfo_st {
+ CUresourcetype resourceType; /**< Resource type */
+
+ union {
+ CUmipmappedArray mipmap;
+ CUarray array;
+ } resource;
+
+ CUarraySparseSubresourceType subresourceType; /**< Sparse subresource type */
+
+ union {
+ struct {
+ unsigned int level; /**< For CUDA mipmapped arrays must be a valid mipmap level. For CUDA arrays must be zero */
+ unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */
+ unsigned int offsetX; /**< Starting X offset in elements */
+ unsigned int offsetY; /**< Starting Y offset in elements */
+ unsigned int offsetZ; /**< Starting Z offset in elements */
+ unsigned int extentWidth; /**< Width in elements */
+ unsigned int extentHeight; /**< Height in elements */
+ unsigned int extentDepth; /**< Depth in elements */
+ } sparseLevel;
+ struct {
+ unsigned int layer; /**< For CUDA layered arrays must be a valid layer index. Otherwise, must be zero */
+ unsigned long long offset; /**< Offset within mip tail */
+ unsigned long long size; /**< Extent in bytes */
+ } miptail;
+ } subresource;
+
+ CUmemOperationType memOperationType; /**< Memory operation type */
+ CUmemHandleType memHandleType; /**< Memory handle type */
+
+ union {
+ CUmemGenericAllocationHandle memHandle;
+ } memHandle;
+
+ unsigned long long offset; /**< Offset within the memory */
+ unsigned int deviceBitMask; /**< Device ordinal bit mask */
+ unsigned int flags; /**< flags for future use, must be zero now. */
+ unsigned int reserved[2]; /**< Reserved for future use, must be zero now. */
+} CUarrayMapInfo_v1;
+typedef CUarrayMapInfo_v1 CUarrayMapInfo;
+
+/**
+ * Specifies a memory location.
+ */
+typedef struct CUmemLocation_st {
+ CUmemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */
+ int id; /**< identifier for a given ::CUmemLocationType. */
+} CUmemLocation_v1;
+typedef CUmemLocation_v1 CUmemLocation;
+
+/**
+ * Specifies compression attribute for an allocation.
+ */
+typedef enum CUmemAllocationCompType_enum {
+ CU_MEM_ALLOCATION_COMP_NONE = 0x0, /**< Allocating non-compressible memory */
+ CU_MEM_ALLOCATION_COMP_GENERIC = 0x1 /**< Allocating compressible memory */
+} CUmemAllocationCompType;
+
+/**
+ * This flag if set indicates that the memory will be used as a tile pool.
+ */
+#define CU_MEM_CREATE_USAGE_TILE_POOL 0x1
+
+/**
+* Specifies the allocation properties for an allocation.
+*/
+typedef struct CUmemAllocationProp_st {
+ /** Allocation type */
+ CUmemAllocationType type;
+ /** requested ::CUmemAllocationHandleType */
+ CUmemAllocationHandleType requestedHandleTypes;
+ /** Location of allocation */
+ CUmemLocation location;
+ /**
+ * Windows-specific POBJECT_ATTRIBUTES required when
+ * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This object attributes structure
+ * includes security attributes that define
+ * the scope of which exported allocations may be transferred to other
+ * processes. In all other cases, this field is required to be zero.
+ */
+ void *win32HandleMetaData;
+ struct {
+ /**
+ * Allocation hint for requesting compressible memory.
+ * On devices that support Compute Data Compression, compressible
+ * memory can be used to accelerate accesses to data with unstructured
+ * sparsity and other compressible data patterns. Applications are
+ * expected to query allocation property of the handle obtained with
+ * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to
+ * validate if the obtained allocation is compressible or not. Note that
+ * compressed memory may not be mappable on all devices.
+ */
+ unsigned char compressionType;
+ unsigned char gpuDirectRDMACapable;
+ /** Bitmask indicating intended usage for this allocation */
+ unsigned short usage;
+ unsigned char reserved[4];
+ } allocFlags;
+} CUmemAllocationProp_v1;
+typedef CUmemAllocationProp_v1 CUmemAllocationProp;
+
+/**
+ * Memory access descriptor
+ */
+typedef struct CUmemAccessDesc_st {
+ CUmemLocation location; /**< Location on which the request is to change its accessibility */
+ CUmemAccess_flags flags; /**< ::CUmemProt accessibility flags to set on the request */
+} CUmemAccessDesc_v1;
+typedef CUmemAccessDesc_v1 CUmemAccessDesc;
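+
+/**
+ * Usage sketch (illustrative only): a minimal physical-allocation round trip
+ * with the virtual memory management API, assuming a context is current on
+ * device 0 and that the application checks every ::CUresult. The 1 MiB request
+ * size is arbitrary; real code rounds sizes up to the granularity reported by
+ * ::cuMemGetAllocationGranularity, as shown.
+ *
+ * \code
+ * CUmemAllocationProp prop = {0};
+ * prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+ * prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ * prop.location.id = 0;                        // device ordinal
+ *
+ * size_t granularity = 0;
+ * cuMemGetAllocationGranularity(&granularity, &prop,
+ *                               CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+ * size_t size = ((1 << 20) + granularity - 1) / granularity * granularity;
+ *
+ * CUmemGenericAllocationHandle handle;
+ * CUdeviceptr ptr;
+ * cuMemCreate(&handle, size, &prop, 0);        // physical allocation
+ * cuMemAddressReserve(&ptr, size, 0, 0, 0);    // virtual address range
+ * cuMemMap(ptr, size, 0, handle, 0);           // back the range with the allocation
+ *
+ * CUmemAccessDesc access = {0};                // grant read/write to device 0
+ * access.location = prop.location;
+ * access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+ * cuMemSetAccess(ptr, size, &access, 1);
+ *
+ * // ... use ptr as a regular device pointer ...
+ *
+ * cuMemUnmap(ptr, size);
+ * cuMemRelease(handle);
+ * cuMemAddressFree(ptr, size);
+ * \endcode
+ */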
+
+typedef enum CUgraphExecUpdateResult_enum {
+ CU_GRAPH_EXEC_UPDATE_SUCCESS = 0x0, /**< The update succeeded */
+ CU_GRAPH_EXEC_UPDATE_ERROR = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */
+ CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED = 0x2, /**< The update failed because the topology changed */
+ CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED = 0x3, /**< The update failed because a node type changed */
+ CU_GRAPH_EXEC_UPDATE_ERROR_FUNCTION_CHANGED = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */
+ CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED = 0x5, /**< The update failed because the parameters changed in a way that is not supported */
+ CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED = 0x6, /**< The update failed because something about the node is not supported */
+ CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE = 0x7, /**< The update failed because the function of a kernel node changed in an unsupported way */
+ CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED = 0x8 /**< The update failed because the node attributes changed in a way that is not supported */
+} CUgraphExecUpdateResult;
+
+/**
+ * CUDA memory pool attributes
+ */
+typedef enum CUmemPool_attribute_enum {
+ /**
+ * (value type = int)
+ * Allow cuMemAllocAsync to use memory asynchronously freed
+ * in other streams as long as a stream ordering dependency
+ * of the allocating stream on the free action exists.
+ * CUDA events and null stream interactions can create the required
+ * stream ordered dependencies. (default enabled)
+ */
+ CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1,
+
+ /**
+ * (value type = int)
+ * Allow reuse of already completed frees when there is no dependency
+ * between the free and allocation. (default enabled)
+ */
+ CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
+
+ /**
+ * (value type = int)
+ * Allow cuMemAllocAsync to insert new stream dependencies
+ * in order to establish the stream ordering required to reuse
+ * a piece of memory released by cuMemFreeAsync (default enabled).
+ */
+ CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
+
+ /**
+ * (value type = cuuint64_t)
+ * Amount of reserved memory in bytes to hold onto before trying
+ * to release memory back to the OS. When more than the release
+ * threshold bytes of memory are held by the memory pool, the
+ * allocator will try to release memory back to the OS on the
+ * next call to stream, event or context synchronize. (default 0)
+ */
+ CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+
+ /**
+ * (value type = cuuint64_t)
+ * Amount of backing memory currently allocated for the mempool.
+ */
+ CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT,
+
+ /**
+ * (value type = cuuint64_t)
+ * High watermark of backing memory allocated for the mempool since the
+ * last time it was reset. High watermark can only be reset to zero.
+ */
+ CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,
+
+ /**
+ * (value type = cuuint64_t)
+ * Amount of memory from the pool that is currently in use by the application.
+ */
+ CU_MEMPOOL_ATTR_USED_MEM_CURRENT,
+
+ /**
+ * (value type = cuuint64_t)
+ * High watermark of the amount of memory from the pool that was in use by the application since
+ * the last time it was reset. High watermark can only be reset to zero.
+ */
+ CU_MEMPOOL_ATTR_USED_MEM_HIGH
+} CUmemPool_attribute;
+
+/**
+ * Specifies the properties of allocations made from the pool.
+ */
+typedef struct CUmemPoolProps_st {
+ CUmemAllocationType allocType; /**< Allocation type. Currently must be specified as CU_MEM_ALLOCATION_TYPE_PINNED */
+ CUmemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */
+ CUmemLocation location; /**< Location where allocations should reside. */
+ /**
+ * Windows-specific LPSECURITYATTRIBUTES required when
+ * ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute defines
+ * the scope of which exported allocations may be transferred to other
+ * processes. In all other cases, this field is required to be zero.
+ */
+ void *win32SecurityAttributes;
+ unsigned char reserved[64]; /**< reserved for future use, must be 0 */
+} CUmemPoolProps_v1;
+typedef CUmemPoolProps_v1 CUmemPoolProps;
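+
+/**
+ * Usage sketch (illustrative only): creating an explicit pool on device 0,
+ * raising its release threshold so freed memory is retained across
+ * synchronization points, and allocating from it on a stream. Assumes a
+ * current context, a previously created 'stream', and application-side error
+ * checking; the 64 MiB threshold is an arbitrary choice.
+ *
+ * \code
+ * CUmemPoolProps props = {0};
+ * props.allocType = CU_MEM_ALLOCATION_TYPE_PINNED;
+ * props.handleTypes = CU_MEM_HANDLE_TYPE_NONE;
+ * props.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ * props.location.id = 0;
+ *
+ * CUmemoryPool pool;
+ * cuMemPoolCreate(&pool, &props);
+ *
+ * cuuint64_t threshold = 64ull << 20;                      // keep up to 64 MiB cached
+ * cuMemPoolSetAttribute(pool, CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold);
+ *
+ * CUdeviceptr dptr;
+ * cuMemAllocFromPoolAsync(&dptr, 1 << 20, pool, stream);   // 'stream' created elsewhere
+ * // ... enqueue work that uses dptr on 'stream' ...
+ * cuMemFreeAsync(dptr, stream);                            // returns memory to the pool
+ *
+ * cuMemPoolDestroy(pool);                                  // after all work has completed
+ * \endcode
+ */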
+
+/**
+ * Opaque data for exporting a pool allocation
+ */
+typedef struct CUmemPoolPtrExportData_st {
+ unsigned char reserved[64];
+} CUmemPoolPtrExportData_v1;
+typedef CUmemPoolPtrExportData_v1 CUmemPoolPtrExportData;
+
+/**
+ * Memory allocation node parameters
+ */
+typedef struct CUDA_MEM_ALLOC_NODE_PARAMS_st {
+ /**
+ * in: location where the allocation should reside (specified in ::location).
+ * ::handleTypes must be ::CU_MEM_HANDLE_TYPE_NONE. IPC is not supported.
+ */
+ CUmemPoolProps poolProps;
+ const CUmemAccessDesc *accessDescs; /**< in: array of memory access descriptors. Used to describe peer GPU access */
+ size_t accessDescCount; /**< in: number of memory access descriptors. Must not exceed the number of GPUs.
*/ + size_t bytesize; /**< in: size in bytes of the requested allocation */ + CUdeviceptr dptr; /**< out: address of the allocation returned by CUDA */ +} CUDA_MEM_ALLOC_NODE_PARAMS; + +typedef enum CUgraphMem_attribute_enum { + /** + * (value type = cuuint64_t) + * Amount of memory, in bytes, currently associated with graphs + */ + CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT, + + /** + * (value type = cuuint64_t) + * High watermark of memory, in bytes, associated with graphs since the + * last time it was reset. High watermark can only be reset to zero. + */ + CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, + + /** + * (value type = cuuint64_t) + * Amount of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + */ + CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT, + + /** + * (value type = cuuint64_t) + * High watermark of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + */ + CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH +} CUgraphMem_attribute; + +/** + * If set, each kernel launched as part of ::cuLaunchCooperativeKernelMultiDevice only + * waits for prior work in the stream corresponding to that GPU to complete before the + * kernel begins execution. + */ +#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC 0x01 + +/** + * If set, any subsequent work pushed in a stream that participated in a call to + * ::cuLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on + * the GPU corresponding to that stream to complete before it begins execution. + */ +#define CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC 0x02 + +/** + * If set, the CUDA array is a collection of layers, where each layer is either a 1D + * or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number + * of layers, not the depth of a 3D array. + */ +#define CUDA_ARRAY3D_LAYERED 0x01 + +/** + * Deprecated, use CUDA_ARRAY3D_LAYERED + */ +#define CUDA_ARRAY3D_2DARRAY 0x01 + +/** + * This flag must be set in order to bind a surface reference + * to the CUDA array + */ +#define CUDA_ARRAY3D_SURFACE_LDST 0x02 + +/** + * If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The + * width of such a CUDA array must be equal to its height, and Depth must be six. + * If ::CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps + * and Depth must be a multiple of six. + */ +#define CUDA_ARRAY3D_CUBEMAP 0x04 + +/** + * This flag must be set in order to perform texture gather operations + * on a CUDA array. + */ +#define CUDA_ARRAY3D_TEXTURE_GATHER 0x08 + +/** + * This flag if set indicates that the CUDA + * array is a DEPTH_TEXTURE. + */ +#define CUDA_ARRAY3D_DEPTH_TEXTURE 0x10 + +/** + * This flag indicates that the CUDA array may be bound as a color target + * in an external graphics API + */ +#define CUDA_ARRAY3D_COLOR_ATTACHMENT 0x20 + +/** + * This flag if set indicates that the CUDA array or CUDA mipmapped array + * is a sparse CUDA array or CUDA mipmapped array respectively + */ +#define CUDA_ARRAY3D_SPARSE 0x40 + +/** + * This flag if set indicates that the CUDA array or CUDA mipmapped array + * will allow deferred memory mapping + */ +#define CUDA_ARRAY3D_DEFERRED_MAPPING 0x80 + +/** + * Override the texref format with a format inferred from the array. + * Flag for ::cuTexRefSetArray() + */ +#define CU_TRSA_OVERRIDE_FORMAT 0x01 + +/** + * Read the texture as integers rather than promoting the values to floats + * in the range [0,1]. 
+ * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() + */ +#define CU_TRSF_READ_AS_INTEGER 0x01 + +/** + * Use normalized texture coordinates in the range [0,1) instead of [0,dim). + * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() + */ +#define CU_TRSF_NORMALIZED_COORDINATES 0x02 + +/** + * Perform sRGB->linear conversion during texture read. + * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() + */ +#define CU_TRSF_SRGB 0x10 + + /** + * Disable any trilinear filtering optimizations. + * Flag for ::cuTexRefSetFlags() and ::cuTexObjectCreate() + */ +#define CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION 0x20 + +/** + * Enable seamless cube map filtering. + * Flag for ::cuTexObjectCreate() + */ +#define CU_TRSF_SEAMLESS_CUBEMAP 0x40 + +/** + * C++ compile time constant for CU_LAUNCH_PARAM_END + */ +#define CU_LAUNCH_PARAM_END_AS_INT 0x00 + +/** + * End of array terminator for the \p extra parameter to + * ::cuLaunchKernel + */ +#define CU_LAUNCH_PARAM_END ((void*)CU_LAUNCH_PARAM_END_AS_INT) + +/** + * C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_POINTER + */ +#define CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT 0x01 + +/** + * Indicator that the next value in the \p extra parameter to + * ::cuLaunchKernel will be a pointer to a buffer containing all kernel + * parameters used for launching kernel \p f. This buffer needs to + * honor all alignment/padding requirements of the individual parameters. + * If ::CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the + * \p extra array, then ::CU_LAUNCH_PARAM_BUFFER_POINTER will have no + * effect. + */ +#define CU_LAUNCH_PARAM_BUFFER_POINTER ((void*)CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT) + +/** + * C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_SIZE + */ +#define CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT 0x02 + +/** + * Indicator that the next value in the \p extra parameter to + * ::cuLaunchKernel will be a pointer to a size_t which contains the + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER. + * It is required that ::CU_LAUNCH_PARAM_BUFFER_POINTER also be specified + * in the \p extra array if the value associated with + * ::CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. + */ +#define CU_LAUNCH_PARAM_BUFFER_SIZE ((void*)CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT) + +/** + * For texture references loaded into the module, use default texunit from + * texture reference. + */ +#define CU_PARAM_TR_DEFAULT -1 + +/** + * Device that represents the CPU + */ +#define CU_DEVICE_CPU ((CUdevice)-1) + +/** + * Device that represents an invalid device + */ +#define CU_DEVICE_INVALID ((CUdevice)-2) + +/** + * Bitmasks for ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS + */ +typedef enum CUflushGPUDirectRDMAWritesOptions_enum { + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST = 1<<0, /**< ::cuFlushGPUDirectRDMAWrites() and its CUDA Runtime API counterpart are supported on the device. */ + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_MEMOPS = 1<<1 /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the device. */ +} CUflushGPUDirectRDMAWritesOptions; + +/** + * Platform native ordering for GPUDirect RDMA writes + */ +typedef enum CUGPUDirectRDMAWritesOrdering_enum { + CU_GPU_DIRECT_RDMA_WRITES_ORDERING_NONE = 0, /**< The device does not natively support ordering of remote writes. ::cuFlushGPUDirectRDMAWrites() can be leveraged if supported. 
*/ + CU_GPU_DIRECT_RDMA_WRITES_ORDERING_OWNER = 100, /**< Natively, the device can consistently consume remote writes, although other CUDA devices may not. */ + CU_GPU_DIRECT_RDMA_WRITES_ORDERING_ALL_DEVICES = 200 /**< Any CUDA device in the system can consistently consume remote writes to this device. */ +} CUGPUDirectRDMAWritesOrdering; + +/** + * The scopes for ::cuFlushGPUDirectRDMAWrites + */ +typedef enum CUflushGPUDirectRDMAWritesScope_enum { + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */ + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_ALL_DEVICES = 200 /**< Blocks until remote writes are visible to all CUDA device contexts. */ +} CUflushGPUDirectRDMAWritesScope; + +/** + * The targets for ::cuFlushGPUDirectRDMAWrites + */ +typedef enum CUflushGPUDirectRDMAWritesTarget_enum { + CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX = 0 /**< Sets the target for ::cuFlushGPUDirectRDMAWrites() to the currently active CUDA device context. */ +} CUflushGPUDirectRDMAWritesTarget; + +/** + * The additional write options for ::cuGraphDebugDotPrint + */ +typedef enum CUgraphDebugDot_flags_enum { + CU_GRAPH_DEBUG_DOT_FLAGS_VERBOSE = 1<<0, /** Output all debug data as if every debug flag is enabled */ + CU_GRAPH_DEBUG_DOT_FLAGS_RUNTIME_TYPES = 1<<1, /** Use CUDA Runtime structures for output */ + CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS = 1<<2, /** Adds CUDA_KERNEL_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS = 1<<3, /** Adds CUDA_MEMCPY3D values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS = 1<<4, /** Adds CUDA_MEMSET_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS = 1<<5, /** Adds CUDA_HOST_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS = 1<<6, /** Adds CUevent handle from record and wait nodes to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS = 1<<7, /** Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS = 1<<8, /** Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES = 1<<9, /** Adds CUkernelNodeAttrValue values to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES = 1<<10, /** Adds node handles and every kernel function handle to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_MEM_ALLOC_NODE_PARAMS = 1<<11, /** Adds memory alloc node parameters to output */ + CU_GRAPH_DEBUG_DOT_FLAGS_MEM_FREE_NODE_PARAMS = 1<<12 /** Adds memory free node parameters to output */ + , + CU_GRAPH_DEBUG_DOT_FLAGS_BATCH_MEM_OP_NODE_PARAMS = 1<<13 /** Adds batch mem op node parameters to output */ +} CUgraphDebugDot_flags; + +/** + * Flags for user objects for graphs + */ +typedef enum CUuserObject_flags_enum { + CU_USER_OBJECT_NO_DESTRUCTOR_SYNC = 1 /**< Indicates the destructor execution is not synchronized by any CUDA handle. */ +} CUuserObject_flags; + +/** + * Flags for retaining user object references for graphs + */ +typedef enum CUuserObjectRetain_flags_enum { + CU_GRAPH_USER_OBJECT_MOVE = 1 /**< Transfer references from the caller rather than creating new references. */ +} CUuserObjectRetain_flags; + +/** + * Flags for instantiating a graph + */ +typedef enum CUgraphInstantiate_flags_enum { + CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH = 1 /**< Automatically free memory allocated in a graph before relaunching. 
*/ + , CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY = 8 /**< Run the graph using the per-node priority attributes rather than the + priority of the stream it is launched into. */ +} CUgraphInstantiate_flags; + +/** @} */ /* END CUDA_TYPES */ + +#if defined(__GNUC__) + #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) + #pragma GCC visibility push(default) + #endif +#endif + +#ifdef _WIN32 +#define CUDAAPI __stdcall +#else +#define CUDAAPI +#endif + +/** + * \defgroup CUDA_ERROR Error Handling + * + * ___MANBRIEF___ error handling functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the error handling functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Gets the string description of an error code + * + * Sets \p *pStr to the address of a NULL-terminated string description + * of the error code \p error. + * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE + * will be returned and \p *pStr will be set to the NULL address. + * + * \param error - Error code to convert to string + * \param pStr - Address of the string pointer. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::CUresult, + * ::cudaGetErrorString + */ +CUresult CUDAAPI cuGetErrorString(CUresult error, const char **pStr); + +/** + * \brief Gets the string representation of an error code enum name + * + * Sets \p *pStr to the address of a NULL-terminated string representation + * of the name of the enum error code \p error. + * If the error code is not recognized, ::CUDA_ERROR_INVALID_VALUE + * will be returned and \p *pStr will be set to the NULL address. + * + * \param error - Error code to convert to string + * \param pStr - Address of the string pointer. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::CUresult, + * ::cudaGetErrorName + */ +CUresult CUDAAPI cuGetErrorName(CUresult error, const char **pStr); + +/** @} */ /* END CUDA_ERROR */ + +/** + * \defgroup CUDA_INITIALIZE Initialization + * + * ___MANBRIEF___ initialization functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the initialization functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Initialize the CUDA driver API + * + * Initializes the driver API and must be called before any other function from + * the driver API. Currently, the \p Flags parameter must be 0. If ::cuInit() + * has not been called, any function from the driver API will return + * ::CUDA_ERROR_NOT_INITIALIZED. + * + * \param Flags - Initialization flag for CUDA. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_SYSTEM_DRIVER_MISMATCH, + * ::CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE + * \notefnerr + */ +CUresult CUDAAPI cuInit(unsigned int Flags); + +/** @} */ /* END CUDA_INITIALIZE */ + +/** + * \defgroup CUDA_VERSION Version Management + * + * ___MANBRIEF___ version management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the version management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns the latest CUDA version supported by driver + * + * Returns in \p *driverVersion the version of CUDA supported by + * the driver. 
The version is returned as + * (1000 × major + 10 × minor). For example, CUDA 9.2 + * would be represented by 9020. + * + * This function automatically returns ::CUDA_ERROR_INVALID_VALUE if + * \p driverVersion is NULL. + * + * \param driverVersion - Returns the CUDA driver version + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cudaDriverGetVersion, + * ::cudaRuntimeGetVersion + */ +CUresult CUDAAPI cuDriverGetVersion(int *driverVersion); + +/** @} */ /* END CUDA_VERSION */ + +/** + * \defgroup CUDA_DEVICE Device Management + * + * ___MANBRIEF___ device management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the device management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns a handle to a compute device + * + * Returns in \p *device a device handle given an ordinal in the range <b>[0, + * ::cuDeviceGetCount()-1]</b>. + * + * \param device - Returned device handle + * \param ordinal - Device number to get handle for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGetLuid, + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport + */ +CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal); + +/** + * \brief Returns the number of compute-capable devices + * + * Returns in \p *count the number of devices with compute capability greater + * than or equal to 2.0 that are available for execution. If there is no such + * device, ::cuDeviceGetCount() returns 0. + * + * \param count - Returned number of compute-capable devices + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGetLuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, + * ::cudaGetDeviceCount + */ +CUresult CUDAAPI cuDeviceGetCount(int *count); + +/** + * \brief Returns an identifer string for the device + * + * Returns an ASCII string identifying the device \p dev in the NULL-terminated + * string pointed to by \p name. \p len specifies the maximum length of the + * string that may be returned. + * + * \param name - Returned identifier string for the device + * \param len - Maximum length of string to store in \p name + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetUuid, + * ::cuDeviceGetLuid, + * ::cuDeviceGetCount, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev); + +/** + * \brief Return an UUID for the device + * + * Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will + * supplant this version in 12.0, which is retained for minor version compatibility. 
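+ *
+ * An illustrative usage sketch (an added example, not part of the original
+ * header; error checking is omitted and device ordinal 0 is an assumption):
+ * \code
+     CUdevice dev;
+     CUuuid uuid;
+     cuDeviceGet(&dev, 0);
+     cuDeviceGetUuid(&uuid, dev);
+     // uuid.bytes now holds the 16-byte device identifier
+ * \endcode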
+ * + * Returns 16-octets identifying the device \p dev in the structure + * pointed to by the \p uuid. + * + * \param uuid - Returned UUID + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetUuid_v2 + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetLuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev); + +/** + * \brief Return an UUID for the device (11.4+) + * + * Returns 16-octets identifying the device \p dev in the structure + * pointed to by the \p uuid. If the device is in MIG mode, returns its + * MIG UUID which uniquely identifies the subscribed MIG compute instance. + * + * \param uuid - Returned UUID + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetLuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev); + +/** + * \brief Return an LUID and device node mask for the device + * + * Return identifying information (\p luid and \p deviceNodeMask) to allow + * matching device with graphics APIs. + * + * \param luid - Returned LUID + * \param deviceNodeMask - Returned device node mask + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev); + +/** + * \brief Returns the total amount of memory on the device + * + * Returns in \p *bytes the total amount of memory available on the device + * \p dev in bytes. + * + * \param bytes - Returned memory available on device in bytes + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceGetExecAffinitySupport, + * ::cudaMemGetInfo + */ +CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev); + +/** + * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size. + * + * Returns in \p maxWidthInElements the maximum number of texture elements allocatable in a 1D linear texture + * for given \p format and \p numChannels. + * + * \param maxWidthInElements - Returned maximum number of texture elements allocatable for given \p format and \p numChannels. + * \param format - Texture format. + * \param numChannels - Number of channels per texture element.
+ * \param dev - Device handle. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cudaMemGetInfo, + * ::cuDeviceTotalMem + */ +CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev); + +/** + * \brief Returns information about the device + * + * Returns in \p *pi the integer value of the attribute \p attrib on device + * \p dev. The supported attributes are: + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per + * block; + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid + * - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of + * shared memory available to a thread block in bytes + * - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for + * __constant__ variables in a CUDA C kernel in bytes + * - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads + * - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the + * memory copy functions that involve memory regions allocated through + * ::cuMemAllocPitch() + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D + * texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width + * for a 1D texture bound to linear memory + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum + * mipmapped 1D texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D + * texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D + * texture height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width + * for a 2D texture bound to linear memory + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height + * for a 2D texture bound to linear memory + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch + * in bytes for a 2D texture bound to linear memory + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum + * mipmapped 2D texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum + * mipmapped 2D texture height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D + * texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D + * texture height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D + * texture depth + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE: + * Alternate maximum 3D texture width, 0 if no alternate + * maximum 3D texture size is supported + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE: + * Alternate maximum 3D texture height, 0 if no alternate + * maximum 3D texture size is supported + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE: + * Alternate maximum 3D texture depth, 0 if no alternate + * maximum 3D texture size is 
supported + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH: + * Maximum cubemap texture width or height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH: + * Maximum 1D layered texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS: + * Maximum layers in a 1D layered texture + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH: + * Maximum 2D layered texture width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT: + * Maximum 2D layered texture height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS: + * Maximum layers in a 2D layered texture + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH: + * Maximum cubemap layered texture width or height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS: + * Maximum layers in a cubemap layered texture + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH: + * Maximum 1D surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH: + * Maximum 2D surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT: + * Maximum 2D surface height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH: + * Maximum 3D surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT: + * Maximum 3D surface height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH: + * Maximum 3D surface depth + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH: + * Maximum 1D layered surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS: + * Maximum layers in a 1D layered surface + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH: + * Maximum 2D layered surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT: + * Maximum 2D layered surface height + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS: + * Maximum layers in a 2D layered surface + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH: + * Maximum cubemap surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH: + * Maximum cubemap layered surface width + * - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS: + * Maximum layers in a cubemap layered surface + * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit + * registers available to a thread block + * - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture + * base addresses aligned to ::textureAlign bytes do not need an offset + * applied to texture fetches + * - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement + * for 2D texture references bound to pitched memory + * - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy + * memory between host and device while executing a kernel, or 0 if not + * - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on + * the device + * - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit + * for kernels executed on the device, or 0 if not + * - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the + * memory subsystem, or 0 if not + * - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host + * memory into the CUDA address space, or 0 if not + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently + * in. 
Available modes are as follows: + * - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and + * can have multiple CUDA contexts present at a single time. + * - ::CU_COMPUTEMODE_PROHIBITED: Compute-prohibited mode - Device is + * prohibited from creating new CUDA contexts. + * - ::CU_COMPUTEMODE_EXCLUSIVE_PROCESS: Compute-exclusive-process mode - Device + * can have only one context used by a single process at a time. + * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS: 1 if the device supports + * executing multiple kernels within the same context simultaneously, or 0 if + * not. It is not guaranteed that multiple kernels will be resident + * on the device concurrently so this feature should not be relied upon for + * correctness. + * - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the + * device, 0 if error correction is disabled or not supported by the device + * - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device + * - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier + * of the device + * - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device + * - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC + * is only available on Tesla hardware running Windows Vista or later + * - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits + * - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache + * - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor + * - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with + * the host, or 0 if not + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number + * - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals + * in L1 cache, 0 if caching globals in L1 cache is not supported by the device + * - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals + * in L1 cache, 0 if caching locals in L1 cache is not supported by the device + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of + * shared memory available to a multiprocessor in bytes; this amount is shared + * by all thread blocks simultaneously resident on a multiprocessor + * - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit + * registers available to a multiprocessor; this number is shared by all thread + * blocks simultaneously resident on a multiprocessor + * - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory + * on this system, 0 if allocating managed memory is not supported by the device on this system. + * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not. + * - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID: Unique identifier for a group of devices + * associated with the same board. Devices on the same multi-GPU board will share the same identifier. + * - ::CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED: 1 if Link between the device and the host + * supports native atomic operations. 
+ * - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance + * (in floating-point operations per second) to double precision performance. + * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing + * pageable memory without calling cudaHostRegister on it. + * - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory + * concurrently with the CPU. + * - ::CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED: Device supports Compute Preemption. + * - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered + * memory at the same virtual address as the CPU. + * - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size + * supported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call. + * For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES + * - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's + * page tables. + * - ::CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST: The host can directly access managed memory on the device without migration. + * - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED: Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs + * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate + * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate + * - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate + * - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor + * - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate + * - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes + * - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes + * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate. + * - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes + * - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.
+ * - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU + * - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs + * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) + * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum + * - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here. + * - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC + * - ::CU_DEVICE_ATTRIBUTE_DEFERRED_MAPPING_CUDA_ARRAY_SUPPORTED: Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays. + * + * \param pi - Returned device attribute value + * \param attrib - Device attribute to query + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem, + * ::cuDeviceGetExecAffinitySupport, + * ::cudaDeviceGetAttribute, + * ::cudaGetDeviceProperties + */ +CUresult CUDAAPI cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev); + +/** + * \brief Return NvSciSync attributes that this device can support. + * + * Returns in \p nvSciSyncAttrList, the properties of NvSciSync that + * this CUDA device, \p dev can support. The returned \p nvSciSyncAttrList + * can be used to create an NvSciSync object that matches this device's capabilities. + * + * If NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is + * already set this API will return ::CUDA_ERROR_INVALID_VALUE. + * + * The applications should set \p nvSciSyncAttrList to a valid + * NvSciSyncAttrList failing which this API will return + * ::CUDA_ERROR_INVALID_HANDLE. + * + * The \p flags controls how applications intends to use + * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are: + * - ::CUDA_NVSCISYNC_ATTR_SIGNAL, specifies that the applications intends to + * signal an NvSciSync on this CUDA device. + * - ::CUDA_NVSCISYNC_ATTR_WAIT, specifies that the applications intends to + * wait on an NvSciSync on this CUDA device. + * + * At least one of these flags must be set, failing which the API + * returns ::CUDA_ERROR_INVALID_VALUE. Both the flags are orthogonal + * to one another: a developer may set both these flags that allows to + * set both wait and signal specific attributes in the same \p nvSciSyncAttrList. + * + * \param nvSciSyncAttrList - Return NvSciSync attributes supported. + * \param dev - Valid Cuda Device to get NvSciSync attributes for. + * \param flags - flags describing NvSciSync usage. 
+ * + * \return + * + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa + * ::cuImportExternalSemaphore, + * ::cuDestroyExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags); + +/** + * \brief Sets the current memory pool of a device + * + * The memory pool must be local to the specified device. + * ::cuMemAllocAsync allocates from the current mempool of the provided stream's device. + * By default, a device's current memory pool is its default memory pool. + * + * \note Use ::cuMemAllocFromPoolAsync to specify asynchronous allocations from a device different + * than the one the stream runs on. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolDestroy, ::cuMemAllocFromPoolAsync + */ +CUresult CUDAAPI cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool); + +/** + * \brief Gets the current mempool for a device + * + * Returns the last pool provided to ::cuDeviceSetMemPool for this device + * or the device's default memory pool if ::cuDeviceSetMemPool has never been called. + * By default the current mempool is the default mempool for a device. + * Otherwise the returned pool must have been set with ::cuDeviceSetMemPool. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate, ::cuDeviceSetMemPool + */ +CUresult CUDAAPI cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev); + +/** + * \brief Returns the default mempool of a device + * + * The default mempool of a device contains device memory from that device. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuMemAllocAsync, ::cuMemPoolTrimTo, ::cuMemPoolGetAttribute, ::cuMemPoolSetAttribute, cuMemPoolSetAccess, ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev); + +/** + * \brief Blocks until remote writes are visible to the specified scope + * + * Blocks until GPUDirect RDMA writes to the target context via mappings + * created through APIs like nvidia_p2p_get_pages (see + * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are + * visible to the specified scope. + * + * If the scope equals or lies within the scope indicated by + * ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING, the call + * will be a no-op and can be safely omitted for performance. This can be + * determined by comparing the numerical values between the two enums, with + * smaller scopes having smaller values. + * + * Users may query support for this API via + * ::CU_DEVICE_ATTRIBUTE_FLUSH_FLUSH_GPU_DIRECT_RDMA_OPTIONS. 
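+ *
+ * An illustrative sketch (an added example, not part of the original header;
+ * \p dev is assumed to be a previously obtained ::CUdevice, and the support
+ * query uses the ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS
+ * bitmask listed under ::cuDeviceGetAttribute):
+ * \code
+     int flushOptions = 0;
+     cuDeviceGetAttribute(&flushOptions,
+                          CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS, dev);
+     if (flushOptions & CU_FLUSH_GPU_DIRECT_RDMA_WRITES_OPTION_HOST) {
+         cuFlushGPUDirectRDMAWrites(CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TARGET_CURRENT_CTX,
+                                    CU_FLUSH_GPU_DIRECT_RDMA_WRITES_TO_OWNER);
+     }
+ * \endcode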
+ * + * \param target - The target of the operation, see ::CUflushGPUDirectRDMAWritesTarget + * \param scope - The scope of the operation, see ::CUflushGPUDirectRDMAWritesScope + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + */ +CUresult CUDAAPI cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope); + +/** @} */ /* END CUDA_DEVICE */ + +/** + * \defgroup CUDA_DEVICE_DEPRECATED Device Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated device management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the device management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns properties for a selected device + * + * \deprecated + * + * This function was deprecated as of CUDA 5.0 and replaced by ::cuDeviceGetAttribute(). + * + * Returns in \p *prop the properties of device \p dev. The ::CUdevprop + * structure is defined as: + * + * \code + typedef struct CUdevprop_st { + int maxThreadsPerBlock; + int maxThreadsDim[3]; + int maxGridSize[3]; + int sharedMemPerBlock; + int totalConstantMemory; + int SIMDWidth; + int memPitch; + int regsPerBlock; + int clockRate; + int textureAlign + } CUdevprop; + * \endcode + * where: + * + * - ::maxThreadsPerBlock is the maximum number of threads per block; + * - ::maxThreadsDim[3] is the maximum sizes of each dimension of a block; + * - ::maxGridSize[3] is the maximum sizes of each dimension of a grid; + * - ::sharedMemPerBlock is the total amount of shared memory available per + * block in bytes; + * - ::totalConstantMemory is the total amount of constant memory available on + * the device in bytes; + * - ::SIMDWidth is the warp size; + * - ::memPitch is the maximum pitch allowed by the memory copy functions that + * involve memory regions allocated through ::cuMemAllocPitch(); + * - ::regsPerBlock is the total number of registers available per block; + * - ::clockRate is the clock frequency in kilohertz; + * - ::textureAlign is the alignment requirement; texture base addresses that + * are aligned to ::textureAlign bytes do not need an offset applied to + * texture fetches. + * + * \param prop - Returned properties of device + * \param dev - Device to get properties for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevice dev); + +/** + * \brief Returns the compute capability of the device + * + * \deprecated + * + * This function was deprecated as of CUDA 5.0 and its functionality superceded + * by ::cuDeviceGetAttribute(). + * + * Returns in \p *major and \p *minor the major and minor revision numbers that + * define the compute capability of the device \p dev. 
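+ *
+ * As an illustrative sketch (an added example, not part of the original
+ * header; \p dev is a previously obtained device handle), the same values
+ * can be queried with the non-deprecated ::cuDeviceGetAttribute():
+ * \code
+     int major = 0, minor = 0;
+     cuDeviceGetAttribute(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
+     cuDeviceGetAttribute(&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
+ * \endcode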
+ * + * \param major - Major revision number + * \param minor - Minor revision number + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuDeviceComputeCapability(int *major, int *minor, CUdevice dev); + +/** @} */ /* END CUDA_DEVICE_DEPRECATED */ + +/** + * \defgroup CUDA_PRIMARY_CTX Primary Context Management + * + * ___MANBRIEF___ primary context management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the primary context management functions of the low-level + * CUDA driver application programming interface. + * + * The primary context is unique per device and shared with the CUDA runtime API. + * These functions allow integration with other libraries using CUDA. + * + * @{ + */ + +/** + * \brief Retain the primary context on the GPU + * + * Retains the primary context on the device. + * Once the user successfully retains the primary context, the primary context + * will be active and available to the user until the user releases it + * with ::cuDevicePrimaryCtxRelease() or resets it with ::cuDevicePrimaryCtxReset(). + * Unlike ::cuCtxCreate() the newly retained context is not pushed onto the stack. + * + * Retaining the primary context for the first time will fail with ::CUDA_ERROR_UNKNOWN + * if the compute mode of the device is ::CU_COMPUTEMODE_PROHIBITED. The function + * ::cuDeviceGetAttribute() can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to + * determine the compute mode of the device. + * The <i>nvidia-smi</i> tool can be used to set the compute mode for + * devices. Documentation for <i>nvidia-smi</i> can be obtained by passing a + * -h option to it. + * + * Please note that the primary context always supports pinned allocations. Other + * flags can be specified by ::cuDevicePrimaryCtxSetFlags(). + * + * \param pctx - Returned context handle of the new context + * \param dev - Device for which primary context is requested + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRelease, + * ::cuDevicePrimaryCtxSetFlags, + * ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev); + +/** + * \brief Release the primary context on the GPU + * + * Releases the primary context interop on the device. + * A retained context should always be released once the user is done using + * it. The context is automatically reset once the last reference to it is + * released. This behavior is different when the primary context was retained + * by the CUDA runtime from CUDA 4.0 and earlier. In this case, the primary + * context remains always active. 
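+ *
+ * A typical retain/use/release pairing might look like the following sketch
+ * (an added example, not part of the original header; error handling is
+ * omitted and \p dev is assumed to be a valid ::CUdevice):
+ * \code
+     CUcontext ctx;
+     cuDevicePrimaryCtxRetain(&ctx, dev);
+     cuCtxPushCurrent(ctx);
+     // ... issue work against the primary context ...
+     cuCtxPopCurrent(NULL);
+     cuDevicePrimaryCtxRelease(dev);
+ * \endcode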
+ * + * Releasing a primary context that has not been previously retained will + * fail with ::CUDA_ERROR_INVALID_CONTEXT. + * + * Please note that unlike ::cuCtxDestroy() this method does not pop the context + * from stack in any circumstances. + * + * \param dev - Device which primary context is released + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRetain, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); + +/** + * \brief Set flags for the primary context + * + * Sets the flags for the primary context on the device overwriting perviously + * set ones. + * + * The three LSBs of the \p flags parameter can be used to control how the OS + * thread, which owns the CUDA context at the time of an API call, interacts + * with the OS scheduler when waiting for results from the GPU. Only one of + * the scheduling flags can be set when creating a context. + * + * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + * results from the GPU. This can decrease latency when waiting for the GPU, + * but may lower the performance of CPU threads if they are performing work in + * parallel with the CUDA thread. + * + * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for + * results from the GPU. This can increase latency when waiting for the GPU, + * but can increase the performance of CPU threads performing work in parallel + * with the GPU. + * + * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. + * + * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. <br> + * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was + * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. + * + * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, + * uses a heuristic based on the number of active CUDA contexts in the + * process \e C and the number of logical processors in the system \e P. If + * \e C > \e P, then CUDA will yield to other OS threads when waiting for + * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while + * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). + * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on + * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC + * for low-powered devices. + * + * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory + * after resizing local memory for a kernel. This can prevent thrashing by + * local memory allocations when launching many kernels with high local + * memory usage at the cost of potentially increased memory usage. <br> + * <b>Deprecated:</b> This flag is deprecated and the behavior enabled + * by this flag is now the default and cannot be disabled. 
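+ *
+ * As a brief sketch (an added example, not part of the original header; \p dev
+ * is assumed valid), the flags are typically set before the primary context
+ * is retained:
+ * \code
+     cuDevicePrimaryCtxSetFlags(dev, CU_CTX_SCHED_BLOCKING_SYNC);
+     CUcontext ctx;
+     cuDevicePrimaryCtxRetain(&ctx, dev);
+ * \endcode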
+ * + * \param dev - Device for which the primary context flags are set + * \param flags - New flags for the device + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRetain, + * ::cuDevicePrimaryCtxGetState, + * ::cuCtxCreate, + * ::cuCtxGetFlags, + * ::cudaSetDeviceFlags + */ +CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); + +/** + * \brief Get the state of the primary context + * + * Returns in \p *flags the flags for the primary context of \p dev, and in + * \p *active whether it is active. See ::cuDevicePrimaryCtxSetFlags for flag + * values. + * + * \param dev - Device to get primary context flags for + * \param flags - Pointer to store flags + * \param active - Pointer to store context state; 0 = inactive, 1 = active + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa + * ::cuDevicePrimaryCtxSetFlags, + * ::cuCtxGetFlags, + * ::cudaGetDeviceFlags + */ +CUresult CUDAAPI cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active); + +/** + * \brief Destroy all allocations and reset all state on the primary context + * + * Explicitly destroys and cleans up all resources associated with the current + * device in the current process. + * + * Note that it is responsibility of the calling function to ensure that no + * other module in the process is using the device any more. For that reason + * it is recommended to use ::cuDevicePrimaryCtxRelease() in most cases. + * However it is safe for other modules to call ::cuDevicePrimaryCtxRelease() + * even after resetting the device. + * Resetting the primary context does not release it, an application that has + * retained the primary context should explicitly release its usage. + * + * \param dev - Device for which primary context is destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE + * \notefnerr + * + * \sa ::cuDevicePrimaryCtxRetain, + * ::cuDevicePrimaryCtxRelease, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceReset + */ +CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); + +/** @} */ /* END CUDA_PRIMARY_CTX */ + +/** + * \brief Returns information about the execution affinity support of the device. + * + * Returns in \p *pi whether execution affinity type \p type is supported by device \p dev. 
+ * The supported types are: + * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: 1 if context with limited SMs is supported by the device, + * or 0 if not; + * + * \param pi - 1 if the execution affinity type \p type is supported by the device, or 0 if not + * \param type - Execution affinity type to query + * \param dev - Device handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGetAttribute, + * ::cuDeviceGetCount, + * ::cuDeviceGetName, + * ::cuDeviceGetUuid, + * ::cuDeviceGet, + * ::cuDeviceTotalMem + */ +CUresult CUDAAPI cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev); + +/** + * \defgroup CUDA_CTX Context Management + * + * ___MANBRIEF___ context management functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the context management functions of the low-level + * CUDA driver application programming interface. + * + * Please note that some functions are described in + * \ref CUDA_PRIMARY_CTX "Primary Context Management" section. + * + * @{ + */ + +/** + * \brief Create a CUDA context + * + * \note In most cases it is recommended to use ::cuDevicePrimaryCtxRetain. + * + * Creates a new CUDA context and associates it with the calling thread. The + * \p flags parameter is described below. The context is created with a usage + * count of 1 and the caller of ::cuCtxCreate() must call ::cuCtxDestroy() or + * when done using the context. If a context is already current to the thread, + * it is supplanted by the newly created context and may be restored by a subsequent + * call to ::cuCtxPopCurrent(). + * + * The three LSBs of the \p flags parameter can be used to control how the OS + * thread, which owns the CUDA context at the time of an API call, interacts + * with the OS scheduler when waiting for results from the GPU. Only one of + * the scheduling flags can be set when creating a context. + * + * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for + * results from the GPU. This can decrease latency when waiting for the GPU, + * but may lower the performance of CPU threads if they are performing work in + * parallel with the CUDA thread. + * + * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for + * results from the GPU. This can increase latency when waiting for the GPU, + * but can increase the performance of CPU threads performing work in parallel + * with the GPU. + * + * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. + * + * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the GPU to finish work. <br> + * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was + * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC. + * + * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero, + * uses a heuristic based on the number of active CUDA contexts in the + * process \e C and the number of logical processors in the system \e P. If + * \e C > \e P, then CUDA will yield to other OS threads when waiting for + * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while + * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN). 
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ * the compute mode for devices.
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
+ *
+ * \param pctx - Returned context handle of the new context
+ * \param flags - Context creation flags
+ * \param dev - Device to create context on
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_DEVICE,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetCacheConfig,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize
+ */
+CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev);
+
+/**
+ * \brief Create a CUDA context with execution affinity
+ *
+ * Creates a new CUDA context with execution affinity and associates it with
+ * the calling thread. The \p paramsArray and \p flags parameters are described below.
+ * The context is created with a usage count of 1 and the caller of ::cuCtxCreate() must
+ * call ::cuCtxDestroy() when done using the context. If a context is already
+ * current to the thread, it is supplanted by the newly created context and may
+ * be restored by a subsequent call to ::cuCtxPopCurrent().
+ *
+ * The type and the amount of execution resource the context can use are limited by \p paramsArray
+ * and \p numParams. The \p paramsArray is an array of \p CUexecAffinityParam and the \p numParams
+ * describes the size of the array. If two \p CUexecAffinityParam in the array have the same type,
+ * the latter execution affinity parameter overrides the former execution affinity parameter.
+ * The supported execution affinity types are:
+ * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT limits the portion of SMs that the context can use. The portion
+ * of SMs is specified as the number of SMs via \p CUexecAffinitySmCount. This limit will be internally
+ * rounded up to the next hardware-supported amount. Hence, it is imperative to query the actual execution
+ * affinity of the context via \p cuCtxGetExecAffinity after context creation. Currently, this attribute
+ * is only supported under Volta+ MPS.
+ *
+ * The three LSBs of the \p flags parameter can be used to control how the OS
+ * thread, which owns the CUDA context at the time of an API call, interacts
+ * with the OS scheduler when waiting for results from the GPU. Only one of
+ * the scheduling flags can be set when creating a context.
+ *
+ * - ::CU_CTX_SCHED_SPIN: Instruct CUDA to actively spin when waiting for
+ * results from the GPU. This can decrease latency when waiting for the GPU,
+ * but may lower the performance of CPU threads if they are performing work in
+ * parallel with the CUDA thread.
+ *
+ * - ::CU_CTX_SCHED_YIELD: Instruct CUDA to yield its thread when waiting for
+ * results from the GPU. This can increase latency when waiting for the GPU,
+ * but can increase the performance of CPU threads performing work in parallel
+ * with the GPU.
+ *
+ * - ::CU_CTX_SCHED_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work.
+ *
+ * - ::CU_CTX_BLOCKING_SYNC: Instruct CUDA to block the CPU thread on a
+ * synchronization primitive when waiting for the GPU to finish work. <br>
+ * <b>Deprecated:</b> This flag was deprecated as of CUDA 4.0 and was
+ * replaced with ::CU_CTX_SCHED_BLOCKING_SYNC.
+ *
+ * - ::CU_CTX_SCHED_AUTO: The default value if the \p flags parameter is zero,
+ * uses a heuristic based on the number of active CUDA contexts in the
+ * process \e C and the number of logical processors in the system \e P. If
+ * \e C > \e P, then CUDA will yield to other OS threads when waiting for
+ * the GPU (::CU_CTX_SCHED_YIELD), otherwise CUDA will not yield while
+ * waiting for results and actively spin on the processor (::CU_CTX_SCHED_SPIN).
+ * Additionally, on Tegra devices, ::CU_CTX_SCHED_AUTO uses a heuristic based on
+ * the power profile of the platform and may choose ::CU_CTX_SCHED_BLOCKING_SYNC
+ * for low-powered devices.
+ *
+ * - ::CU_CTX_MAP_HOST: Instruct CUDA to support mapped pinned allocations.
+ * This flag must be set in order to allocate pinned host memory that is
+ * accessible to the GPU.
+ *
+ * - ::CU_CTX_LMEM_RESIZE_TO_MAX: Instruct CUDA to not reduce local memory
+ * after resizing local memory for a kernel. This can prevent thrashing by
+ * local memory allocations when launching many kernels with high local
+ * memory usage at the cost of potentially increased memory usage. <br>
+ * <b>Deprecated:</b> This flag is deprecated and the behavior enabled
+ * by this flag is now the default and cannot be disabled.
+ * Instead, the per-thread stack size can be controlled with ::cuCtxSetLimit().
+ *
+ * Context creation will fail with ::CUDA_ERROR_UNKNOWN if the compute mode of
+ * the device is ::CU_COMPUTEMODE_PROHIBITED. The function ::cuDeviceGetAttribute()
+ * can be used with ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE to determine the
+ * compute mode of the device. The <i>nvidia-smi</i> tool can be used to set
+ * the compute mode for devices.
+ * Documentation for <i>nvidia-smi</i> can be obtained by passing a
+ * -h option to it.
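+ *
+ * For illustration, a context limited to a share of the device's SMs might be
+ * created as follows (a minimal sketch, not from the original header: the SM
+ * count of 8 is an arbitrary assumption, \p dev is a previously obtained device
+ * handle, error checking is omitted, and the field names follow the
+ * ::CUexecAffinityParam declaration earlier in this header):
+ * \code
+     CUexecAffinityParam affinity;
+     affinity.type = CU_EXEC_AFFINITY_TYPE_SM_COUNT;
+     affinity.param.smCount.val = 8;              // requested SM budget (assumed value)
+     CUcontext ctx;
+     cuCtxCreate_v3(&ctx, &affinity, 1, 0, dev);  // flags = 0: CU_CTX_SCHED_AUTO
+     // Read back the granted value, which may have been rounded up:
+     cuCtxGetExecAffinity(&affinity, CU_EXEC_AFFINITY_TYPE_SM_COUNT);
+ * \endcode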
+ * + * \param pctx - Returned context handle of the new context + * \param paramsArray - Execution affinity parameters + * \param numParams - Number of execution affinity parameters + * \param flags - Context creation flags + * \param dev - Device to create context on + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::CUexecAffinityParam + */ +CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev); + +/** + * \brief Destroy a CUDA context + * + * Destroys the CUDA context specified by \p ctx. The context \p ctx will be + * destroyed regardless of how many threads it is current to. + * It is the responsibility of the calling function to ensure that no API + * call issues using \p ctx while ::cuCtxDestroy() is executing. + * + * Destroys and cleans up all resources associated with the context. + * It is the caller's responsibility to ensure that the context or its resources + * are not accessed or passed in subsequent API calls and doing so will result in undefined behavior. + * These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent, + * ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref, + * ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore. + * + * If \p ctx is current to the calling thread then \p ctx will also be + * popped from the current thread's context stack (as though ::cuCtxPopCurrent() + * were called). If \p ctx is current to other threads, then \p ctx will + * remain current to those threads, and attempting to access \p ctx from + * those threads will result in the error ::CUDA_ERROR_CONTEXT_IS_DESTROYED. + * + * \param ctx - Context to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); + +/** + * \brief Pushes a context on the current CPU thread + * + * Pushes the given context \p ctx onto the CPU thread's stack of current + * contexts. The specified context becomes the CPU thread's current context, so + * all CUDA functions that operate on the current context are affected. + * + * The previous current context may be made current again by calling + * ::cuCtxDestroy() or ::cuCtxPopCurrent(). 
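+ *
+ * As a minimal usage sketch (illustrative only; \p ctx is assumed to be a context
+ * created earlier and error checking is omitted):
+ * \code
+     cuCtxPushCurrent(ctx);     // ctx becomes current to this thread
+     // ... driver API calls that should operate on ctx ...
+     CUcontext popped;
+     cuCtxPopCurrent(&popped);  // popped == ctx; the previous context is current again
+ * \endcode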
+ * + * \param ctx - Context to push + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); + +/** + * \brief Pops the current CUDA context from the current CPU thread. + * + * Pops the current CUDA context from the CPU thread and passes back the + * old context handle in \p *pctx. That context may then be made current + * to a different CPU thread by calling ::cuCtxPushCurrent(). + * + * If a context was current to the CPU thread before ::cuCtxCreate() or + * ::cuCtxPushCurrent() was called, this function makes that context current to + * the CPU thread again. + * + * \param pctx - Returned popped context handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); + +/** + * \brief Binds the specified CUDA context to the calling CPU thread + * + * Binds the specified CUDA context to the calling CPU thread. + * If \p ctx is NULL then the CUDA context previously bound to the + * calling CPU thread is unbound and ::CUDA_SUCCESS is returned. + * + * If there exists a CUDA context stack on the calling CPU thread, this + * will replace the top of that stack with \p ctx. + * If \p ctx is NULL then this will be equivalent to popping the top + * of the calling CPU thread's CUDA context stack (or a no-op if the + * calling CPU thread's CUDA context stack is empty). + * + * \param ctx - Context to bind to the calling CPU thread + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa + * ::cuCtxGetCurrent, + * ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cudaSetDevice + */ +CUresult CUDAAPI cuCtxSetCurrent(CUcontext ctx); + +/** + * \brief Returns the CUDA context bound to the calling CPU thread. + * + * Returns in \p *pctx the CUDA context bound to the calling CPU thread. + * If no context is bound to the calling CPU thread then \p *pctx is + * set to NULL and ::CUDA_SUCCESS is returned. + * + * \param pctx - Returned context handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * \notefnerr + * + * \sa + * ::cuCtxSetCurrent, + * ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cudaGetDevice + */ +CUresult CUDAAPI cuCtxGetCurrent(CUcontext *pctx); + +/** + * \brief Returns the device ID for the current context + * + * Returns in \p *device the ordinal of the current context's device. 
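+ *
+ * A minimal sketch (illustrative only; error checking is omitted):
+ * \code
+     CUdevice dev;
+     char name[256];
+     cuCtxGetDevice(&dev);                          // device of the current context
+     cuDeviceGetName(name, (int)sizeof(name), dev); // e.g. for logging
+ * \endcode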
+ * + * \param device - Returned device ID for the current context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaGetDevice + */ +CUresult CUDAAPI cuCtxGetDevice(CUdevice *device); + +/** + * \brief Returns the flags for the current context + * + * Returns in \p *flags the flags of the current context. See ::cuCtxCreate + * for flag values. + * + * \param flags - Pointer to store flags of current context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetCurrent, + * ::cuCtxGetDevice, + * ::cuCtxGetLimit, + * ::cuCtxGetSharedMemConfig, + * ::cuCtxGetStreamPriorityRange, + * ::cudaGetDeviceFlags + */ +CUresult CUDAAPI cuCtxGetFlags(unsigned int *flags); + +/** + * \brief Block for a context's tasks to complete + * + * Blocks until the device has completed all preceding requested tasks. + * ::cuCtxSynchronize() returns an error if one of the preceding tasks failed. + * If the context was created with the ::CU_CTX_SCHED_BLOCKING_SYNC flag, the + * CPU thread will block until the GPU context has finished its work. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cudaDeviceSynchronize + */ +CUresult CUDAAPI cuCtxSynchronize(void); + +/** + * \brief Set resource limits + * + * Setting \p limit to \p value is a request by the application to update + * the current limit maintained by the context. The driver is free to + * modify the requested value to meet h/w requirements (this could be + * clamping to minimum or maximum values, rounding up to nearest element + * size, etc). The application can use ::cuCtxGetLimit() to find out exactly + * what the limit has been set to. + * + * Setting each ::CUlimit has its own specific restrictions, so each is + * discussed here. + * + * - ::CU_LIMIT_STACK_SIZE controls the stack size in bytes of each GPU thread. + * The driver automatically increases the per-thread stack size + * for each kernel launch as needed. This size isn't reset back to the + * original value after each launch. Setting this value will take effect + * immediately, and if necessary, the device will block until all preceding + * requested tasks are complete. + * + * - ::CU_LIMIT_PRINTF_FIFO_SIZE controls the size in bytes of the FIFO used + * by the ::printf() device system call. Setting ::CU_LIMIT_PRINTF_FIFO_SIZE + * must be performed before launching any kernel that uses the ::printf() + * device system call, otherwise ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * - ::CU_LIMIT_MALLOC_HEAP_SIZE controls the size in bytes of the heap used + * by the ::malloc() and ::free() device system calls. 
Setting
+ * ::CU_LIMIT_MALLOC_HEAP_SIZE must be performed before launching any kernel
+ * that uses the ::malloc() or ::free() device system calls, otherwise
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH controls the maximum nesting depth of
+ * a grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting
+ * this limit must be performed before any launch of a kernel that uses the
+ * device runtime and calls ::cudaDeviceSynchronize() above the default sync
+ * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail
+ * with error code ::cudaErrorSyncDepthExceeded if the limitation is
+ * violated. This limit can be set smaller than the default or up to the maximum
+ * launch depth of 24. When setting this limit, keep in mind that additional
+ * levels of sync depth require the driver to reserve large amounts of device
+ * memory which can no longer be used for user allocations. If these
+ * reservations of device memory fail, ::cuCtxSetLimit() will return
+ * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
+ * This limit is only applicable to devices of compute capability 3.5 and
+ * higher. Attempting to set this limit on devices of compute capability less
+ * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
+ * returned.
+ *
+ * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT controls the maximum number of
+ * outstanding device runtime launches that can be made from the current
+ * context. A grid is outstanding from the point of launch up until the grid
+ * is known to have been completed. Device runtime launches which violate
+ * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when
+ * ::cudaGetLastError() is called after launch. If more pending launches than
+ * the default (2048 launches) are needed for a module using the device
+ * runtime, this limit can be increased. Keep in mind that being able to
+ * sustain additional pending launches will require the driver to reserve
+ * larger amounts of device memory upfront which can no longer be used for
+ * allocations. If these reservations fail, ::cuCtxSetLimit() will return
+ * ::CUDA_ERROR_OUT_OF_MEMORY, and the limit can be reset to a lower value.
+ * This limit is only applicable to devices of compute capability 3.5 and
+ * higher. Attempting to set this limit on devices of compute capability less
+ * than 3.5 will result in the error ::CUDA_ERROR_UNSUPPORTED_LIMIT being
+ * returned.
+ *
+ * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity.
+ * Values can range from 0B to 128B. This is purely a performance hint and
+ * it can be ignored or clamped depending on the platform.
+ *
+ * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls size in bytes available for
+ * persisting L2 cache. This is purely a performance hint and it can be
+ * ignored or clamped depending on the platform.
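+ *
+ * As a minimal sketch (illustrative only; the 8 MB heap size is an arbitrary
+ * assumption), a limit is typically set and then read back to observe any
+ * clamping or rounding applied by the driver:
+ * \code
+     size_t heapSize = 0;
+     cuCtxSetLimit(CU_LIMIT_MALLOC_HEAP_SIZE, 8 * 1024 * 1024); // request an 8 MB device heap
+     cuCtxGetLimit(&heapSize, CU_LIMIT_MALLOC_HEAP_SIZE);       // value actually in effect
+ * \endcode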
+ * + * \param limit - Limit to set + * \param value - Size of limit + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_LIMIT, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSynchronize, + * ::cudaDeviceSetLimit + */ +CUresult CUDAAPI cuCtxSetLimit(CUlimit limit, size_t value); + +/** + * \brief Returns resource limits + * + * Returns in \p *pvalue the current size of \p limit. The supported + * ::CUlimit values are: + * - ::CU_LIMIT_STACK_SIZE: stack size in bytes of each GPU thread. + * - ::CU_LIMIT_PRINTF_FIFO_SIZE: size in bytes of the FIFO used by the + * ::printf() device system call. + * - ::CU_LIMIT_MALLOC_HEAP_SIZE: size in bytes of the heap used by the + * ::malloc() and ::free() device system calls. + * - ::CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH: maximum grid depth at which a thread + * can issue the device runtime call ::cudaDeviceSynchronize() to wait on + * child grid launches to complete. + * - ::CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT: maximum number of outstanding + * device runtime launches that can be made from this context. + * - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY: L2 cache fetch granularity. + * - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE: Persisting L2 cache size in bytes + * + * \param limit - Limit to query + * \param pvalue - Returned size of limit + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_LIMIT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceGetLimit + */ +CUresult CUDAAPI cuCtxGetLimit(size_t *pvalue, CUlimit limit); + +/** + * \brief Returns the preferred cache configuration for the current context. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this function returns through \p pconfig the preferred cache configuration + * for the current context. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute functions. + * + * This will return a \p pconfig of ::CU_FUNC_CACHE_PREFER_NONE on devices + * where the size of the L1 cache and shared memory are fixed. 
+ * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param pconfig - Returned cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceGetCacheConfig + */ +CUresult CUDAAPI cuCtxGetCacheConfig(CUfunc_cache *pconfig); + +/** + * \brief Sets the preferred cache configuration for the current context. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p config the preferred cache configuration for + * the current context. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute the function. Any function preference + * set via ::cuFuncSetCacheConfig() will be preferred over this context-wide + * setting. Setting the context-wide cache configuration to + * ::CU_FUNC_CACHE_PREFER_NONE will cause subsequent kernel launches to prefer + * to not change the cache configuration unless required to launch the kernel. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param config - Requested cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cuFuncSetCacheConfig, + * ::cudaDeviceSetCacheConfig + */ +CUresult CUDAAPI cuCtxSetCacheConfig(CUfunc_cache config); + +/** + * \brief Returns the current shared memory configuration for the current context. + * + * This function will return in \p pConfig the current size of shared memory banks + * in the current context. On devices with configurable shared memory banks, + * ::cuCtxSetSharedMemConfig can be used to change this setting, so that all + * subsequent kernel launches will by default use the new bank size. 
When
+ * ::cuCtxGetSharedMemConfig is called on devices without configurable shared
+ * memory, it will return the fixed bank size of the hardware.
+ *
+ * The returned bank configurations can be either:
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: shared memory bank width is
+ * four bytes.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: shared memory bank width is
+ * eight bytes.
+ *
+ * \param pConfig - returned shared memory configuration
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceGetSharedMemConfig
+ */
+CUresult CUDAAPI cuCtxGetSharedMemConfig(CUsharedconfig *pConfig);
+
+/**
+ * \brief Sets the shared memory configuration for the current context.
+ *
+ * On devices with configurable shared memory banks, this function will set
+ * the context's shared memory bank size which is used for subsequent kernel
+ * launches.
+ *
+ * Changing the shared memory configuration between launches may insert a device
+ * side synchronization point between those launches.
+ *
+ * Changing the shared memory bank size will not increase shared memory usage
+ * or affect occupancy of kernels, but may have major effects on performance.
+ * Larger bank sizes will allow for greater potential bandwidth to shared memory,
+ * but will change what kinds of accesses to shared memory will result in bank
+ * conflicts.
+ *
+ * This function will do nothing on devices with fixed shared memory bank size.
+ *
+ * The supported bank configurations are:
+ * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: set bank width to the default initial
+ * setting (currently, four bytes).
+ * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to
+ * be natively four bytes.
+ * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to
+ * be natively eight bytes.
+ *
+ * \param config - requested shared memory configuration
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuCtxCreate,
+ * ::cuCtxDestroy,
+ * ::cuCtxGetApiVersion,
+ * ::cuCtxGetCacheConfig,
+ * ::cuCtxGetDevice,
+ * ::cuCtxGetFlags,
+ * ::cuCtxGetLimit,
+ * ::cuCtxPopCurrent,
+ * ::cuCtxPushCurrent,
+ * ::cuCtxSetLimit,
+ * ::cuCtxSynchronize,
+ * ::cuCtxGetSharedMemConfig,
+ * ::cuFuncSetCacheConfig,
+ * ::cudaDeviceSetSharedMemConfig
+ */
+CUresult CUDAAPI cuCtxSetSharedMemConfig(CUsharedconfig config);
+
+/**
+ * \brief Gets the context's API version.
+ *
+ * Returns a version number in \p version corresponding to the capabilities of
+ * the context (e.g. 3010 or 3020), which library developers can use to direct
+ * callers to a specific API version. If \p ctx is NULL, returns the API version
+ * used to create the currently bound context.
+ *
+ * Note that new API versions are only introduced when context capabilities are
+ * changed that break binary compatibility, so the API version and driver version
+ * may be different.
For example, it is valid for the API version to be 3020 while + * the driver version is 4020. + * + * \param ctx - Context to check + * \param version - Pointer to version + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +CUresult CUDAAPI cuCtxGetApiVersion(CUcontext ctx, unsigned int *version); + +/** + * \brief Returns numerical values that correspond to the least and + * greatest stream priorities. + * + * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond + * to the least and greatest stream priorities respectively. Stream priorities + * follow a convention where lower numbers imply greater priorities. The range of + * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. + * If the user attempts to create a stream with a priority value that is + * outside the meaningful range as specified by this API, the priority is + * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority + * respectively. See ::cuStreamCreateWithPriority for details on creating a + * priority stream. + * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value + * is not desired. + * + * This function will return '0' in both \p *leastPriority and \p *greatestPriority if + * the current context's device does not support stream priorities + * (see ::cuDeviceGetAttribute). + * + * \param leastPriority - Pointer to an int in which the numerical value for least + * stream priority is returned + * \param greatestPriority - Pointer to an int in which the numerical value for greatest + * stream priority is returned + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize, + * ::cudaDeviceGetStreamPriorityRange + */ +CUresult CUDAAPI cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority); + +/** + * \brief Resets all persisting lines in cache to normal status. + * + * ::cuCtxResetPersistingL2Cache Resets all persisting lines in cache to normal + * status. Takes effect on function return. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuCtxResetPersistingL2Cache(void); + +/** + * \brief Returns the execution affinity setting for the current context. + * + * Returns in \p *pExecAffinity the current value of \p type. The supported + * ::CUexecAffinityType values are: + * - ::CU_EXEC_AFFINITY_TYPE_SM_COUNT: number of SMs the context is limited to use. 
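+ *
+ * For illustration (a minimal sketch; error checking is omitted and the field
+ * names follow the ::CUexecAffinityParam declaration earlier in this header):
+ * \code
+     CUexecAffinityParam affinity;
+     if (cuCtxGetExecAffinity(&affinity, CU_EXEC_AFFINITY_TYPE_SM_COUNT) == CUDA_SUCCESS) {
+         unsigned int smCount = affinity.param.smCount.val; // SMs this context may use
+     }
+ * \endcode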
+ * + * \param type - Execution affinity type to query + * \param pExecAffinity - Returned execution affinity + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY + * \notefnerr + * + * \sa + * ::CUexecAffinityParam + */ +CUresult CUDAAPI cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type); + + +/** @} */ /* END CUDA_CTX */ + +/** + * \defgroup CUDA_CTX_DEPRECATED Context Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated context management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated context management functions of the low-level + * CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Increment a context's usage-count + * + * \deprecated + * + * Note that this function is deprecated and should not be used. + * + * Increments the usage count of the context and passes back a context handle + * in \p *pctx that must be passed to ::cuCtxDetach() when the application is + * done with the context. ::cuCtxAttach() fails if there is no context current + * to the thread. + * + * Currently, the \p flags parameter must be 0. + * + * \param pctx - Returned context handle of the current context + * \param flags - Context attach flags (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxDetach, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuCtxAttach(CUcontext *pctx, unsigned int flags); + +/** + * \brief Decrement a context's usage-count + * + * \deprecated + * + * Note that this function is deprecated and should not be used. + * + * Decrements the usage count of the context \p ctx, and destroys the context + * if the usage count goes to 0. The context must be a handle that was passed + * back by ::cuCtxCreate() or ::cuCtxAttach(), and must be current to the + * calling thread. + * + * \param ctx - Context to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxCreate, + * ::cuCtxDestroy, + * ::cuCtxGetApiVersion, + * ::cuCtxGetCacheConfig, + * ::cuCtxGetDevice, + * ::cuCtxGetFlags, + * ::cuCtxGetLimit, + * ::cuCtxPopCurrent, + * ::cuCtxPushCurrent, + * ::cuCtxSetCacheConfig, + * ::cuCtxSetLimit, + * ::cuCtxSynchronize + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuCtxDetach(CUcontext ctx); + +/** @} */ /* END CUDA_CTX_DEPRECATED */ + + +/** + * \defgroup CUDA_MODULE Module Management + * + * ___MANBRIEF___ module management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the module management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Loads a compute module + * + * Takes a filename \p fname and loads the corresponding module \p module into + * the current context. 
The CUDA driver API does not attempt to lazily + * allocate the resources needed by a module; if the memory for functions and + * data (constant and global) needed by the module cannot be allocated, + * ::cuModuleLoad() fails. The file should be a \e cubin file as output by + * \b nvcc, or a \e PTX file either as output by \b nvcc or handwritten, or + * a \e fatbin file as output by \b nvcc from toolchain 4.0 or later. + * + * \param module - Returned module + * \param fname - Filename of module to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_FILE_NOT_FOUND, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoad(CUmodule *module, const char *fname); + +/** + * \brief Load a module's data + * + * Takes a pointer \p image and loads the corresponding module \p module into + * the current context. The pointer may be obtained by mapping a \e cubin or + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin + * object into the executable resources and using operating system calls such + * as Windows \c FindResource() to obtain the pointer. + * + * \param module - Returned module + * \param image - Module data to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadData(CUmodule *module, const void *image); + +/** + * \brief Load a module's data with options + * + * Takes a pointer \p image and loads the corresponding module \p module into + * the current context. The pointer may be obtained by mapping a \e cubin or + * \e PTX or \e fatbin file, passing a \e cubin or \e PTX or \e fatbin file + * as a NULL-terminated text string, or incorporating a \e cubin or \e fatbin + * object into the executable resources and using operating system calls such + * as Windows \c FindResource() to obtain the pointer. Options are passed as + * an array via \p options and any corresponding parameters are passed in + * \p optionValues. The number of total options is supplied via \p numOptions. + * Any outputs will be returned via \p optionValues. 
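+ *
+ * As a minimal sketch (illustrative only; \p ptxImage is assumed to be a
+ * NULL-terminated PTX string produced elsewhere), an error log buffer can be
+ * requested through the options array:
+ * \code
+     char errorLog[4096];
+     CUjit_option opts[] = { CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES };
+     void *optVals[]     = { errorLog, (void *)(size_t)sizeof(errorLog) };
+     CUmodule mod;
+     if (cuModuleLoadDataEx(&mod, ptxImage, 2, opts, optVals) != CUDA_SUCCESS) {
+         // errorLog now contains the JIT diagnostics
+     }
+ * \endcode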
+ * + * \param module - Returned module + * \param image - Module data to load + * \param numOptions - Number of options + * \param options - Options for JIT + * \param optionValues - Option values for JIT + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Load a module's data + * + * Takes a pointer \p fatCubin and loads the corresponding module \p module + * into the current context. The pointer represents a <i>fat binary</i> object, + * which is a collection of different \e cubin and/or \e PTX files, all + * representing the same device code, but compiled and optimized for different + * architectures. + * + * Prior to CUDA 4.0, there was no documented API for constructing and using + * fat binary objects by programmers. Starting with CUDA 4.0, fat binary + * objects can be constructed by providing the <i>-fatbin option</i> to \b nvcc. + * More information can be found in the \b nvcc document. + * + * \param module - Returned module + * \param fatCubin - Fat binary to load + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU, + * ::CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin); + +/** + * \brief Unloads a module + * + * Unloads a module \p hmod from the current context. 
+ * + * \param hmod - Module to unload + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_destroy_ub + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary + */ +CUresult CUDAAPI cuModuleUnload(CUmodule hmod); + +/** + * CUDA Lazy Loading status + */ +typedef enum CUmoduleLoadingMode_enum { + CU_MODULE_EAGER_LOADING = 0x1, /**< Lazy Kernel Loading is not enabled */ + CU_MODULE_LAZY_LOADING = 0x2, /**< Lazy Kernel Loading is enabled */ +} CUmoduleLoadingMode; + +/** + * \brief Query lazy loading mode + * + * Returns lazy loading mode + * Module loading mode is controlled by CUDA_MODULE_LOADING env variable + * + * \param mode - Returns the lazy loading mode + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \notefnerr + * + * \sa + * ::cuModuleLoad, + */ +CUresult CUDAAPI cuModuleGetLoadingMode(CUmoduleLoadingMode *mode); + +/** + * \brief Returns a function handle + * + * Returns in \p *hfunc the handle of the function of name \p name located in + * module \p hmod. If no function of that name exists, ::cuModuleGetFunction() + * returns ::CUDA_ERROR_NOT_FOUND. + * + * \param hfunc - Returned function handle + * \param hmod - Module to retrieve function from + * \param name - Name of function to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload + */ +CUresult CUDAAPI cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); + +/** + * \brief Returns a global pointer from a module + * + * Returns in \p *dptr and \p *bytes the base pointer and size of the + * global of name \p name located in module \p hmod. If no variable of that name + * exists, ::cuModuleGetGlobal() returns ::CUDA_ERROR_NOT_FOUND. Both + * parameters \p dptr and \p bytes are optional. If one of them is + * NULL, it is ignored. + * + * \param dptr - Returned global device pointer + * \param bytes - Returned global size in bytes + * \param hmod - Module to retrieve global from + * \param name - Name of global to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa ::cuModuleGetFunction, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload, + * ::cudaGetSymbolAddress, + * ::cudaGetSymbolSize + */ +CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name); + +/** + * \brief Returns a handle to a texture reference + * + * Returns in \p *pTexRef the handle of the texture reference of name \p name + * in the module \p hmod. If no texture reference of that name exists, + * ::cuModuleGetTexRef() returns ::CUDA_ERROR_NOT_FOUND. This texture reference + * handle should not be destroyed, since it will be destroyed when the module + * is unloaded. 
+ * + * \param pTexRef - Returned texture reference + * \param hmod - Module to retrieve texture reference from + * \param name - Name of texture reference to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa + * ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetSurfRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload, + * ::cudaGetTextureReference + */ +CUresult CUDAAPI cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name); + +/** + * \brief Returns a handle to a surface reference + * + * Returns in \p *pSurfRef the handle of the surface reference of name \p name + * in the module \p hmod. If no surface reference of that name exists, + * ::cuModuleGetSurfRef() returns ::CUDA_ERROR_NOT_FOUND. + * + * \param pSurfRef - Returned surface reference + * \param hmod - Module to retrieve surface reference from + * \param name - Name of surface reference to retrieve + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + * \sa + * ::cuModuleGetFunction, + * ::cuModuleGetGlobal, + * ::cuModuleGetTexRef, + * ::cuModuleLoad, + * ::cuModuleLoadData, + * ::cuModuleLoadDataEx, + * ::cuModuleLoadFatBinary, + * ::cuModuleUnload, + * ::cudaGetSurfaceReference + */ +CUresult CUDAAPI cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name); + +/** + * \brief Creates a pending JIT linker invocation. + * + * If the call is successful, the caller owns the returned CUlinkState, which + * should eventually be destroyed with ::cuLinkDestroy. The + * device code machine size (32 or 64 bit) will match the calling application. + * + * Both linker and compiler options may be specified. Compiler options will + * be applied to inputs to this linker action which must be compiled from PTX. + * The options ::CU_JIT_WALL_TIME, + * ::CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, and ::CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES + * will accumulate data until the CUlinkState is destroyed. + * + * \p optionValues must remain valid for the life of the CUlinkState if output + * options are used. No other references to inputs are maintained after this + * call returns. + * + * \param numOptions Size of options arrays + * \param options Array of linker and compiler options + * \param optionValues Array of option values, each cast to void * + * \param stateOut On success, this will contain a CUlinkState to specify + * and complete this action + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_JIT_COMPILER_NOT_FOUND + * \notefnerr + * + * \sa ::cuLinkAddData, + * ::cuLinkAddFile, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); + +/** + * \brief Add an input to a pending linker invocation + * + * Ownership of \p data is retained by the caller. No reference is retained to any + * inputs after this call returns. 
+ * + * This method accepts only compiler options, which are used if the data must + * be compiled from PTX, and does not accept any of + * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, + * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. + * + * \param state A pending linker action. + * \param type The type of the input data. + * \param data The input data. PTX must be NULL-terminated. + * \param size The length of the input data. + * \param name An optional name for this input in log messages. + * \param numOptions Size of options. + * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate). + * \param optionValues Array of option values, each cast to void *. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU + * + * \sa ::cuLinkCreate, + * ::cuLinkAddFile, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, + unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Add a file input to a pending linker invocation + * + * No reference is retained to any inputs after this call returns. + * + * This method accepts only compiler options, which are used if the input + * must be compiled from PTX, and does not accept any of + * ::CU_JIT_WALL_TIME, ::CU_JIT_INFO_LOG_BUFFER, ::CU_JIT_ERROR_LOG_BUFFER, + * ::CU_JIT_TARGET_FROM_CUCONTEXT, or ::CU_JIT_TARGET. + * + * This method is equivalent to invoking ::cuLinkAddData on the contents + * of the file. + * + * \param state A pending linker action + * \param type The type of the input data + * \param path Path to the input file + * \param numOptions Size of options + * \param options Options to be applied only for this input (overrides options from ::cuLinkCreate) + * \param optionValues Array of option values, each cast to void * + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_FILE_NOT_FOUND + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_PTX, + * ::CUDA_ERROR_UNSUPPORTED_PTX_VERSION, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NO_BINARY_FOR_GPU + * + * \sa ::cuLinkCreate, + * ::cuLinkAddData, + * ::cuLinkComplete, + * ::cuLinkDestroy + */ +CUresult CUDAAPI +cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, + unsigned int numOptions, CUjit_option *options, void **optionValues); + +/** + * \brief Complete a pending linker invocation + * + * Completes the pending linker action and returns the cubin image for the linked + * device code, which can be used with ::cuModuleLoadData. The cubin is owned by + * \p state, so it should be loaded before \p state is destroyed via ::cuLinkDestroy. + * This call does not destroy \p state. 
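+ *
+ * The typical sequence is sketched below (illustrative only; \p ptx and
+ * \p ptxSize are assumed inputs, with \p ptxSize including the NULL terminator,
+ * and error checking is omitted). Note that the cubin is loaded before the link
+ * state is destroyed:
+ * \code
+     CUlinkState ls;
+     void *cubin; size_t cubinSize;
+     CUmodule mod;
+     cuLinkCreate(0, NULL, NULL, &ls);
+     cuLinkAddData(ls, CU_JIT_INPUT_PTX, ptx, ptxSize, "my_kernels", 0, NULL, NULL);
+     cuLinkComplete(ls, &cubin, &cubinSize);
+     cuModuleLoadData(&mod, cubin); // load while the image is still owned by ls
+     cuLinkDestroy(ls);             // safe once the module has been loaded
+ * \endcode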
+ *
+ * \param state A pending linker invocation
+ * \param cubinOut On success, this will point to the output image
+ * \param sizeOut Optional parameter to receive the size of the generated image
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ *
+ * \sa ::cuLinkCreate,
+ * ::cuLinkAddData,
+ * ::cuLinkAddFile,
+ * ::cuLinkDestroy,
+ * ::cuModuleLoadData
+ */
+CUresult CUDAAPI
+cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut);
+
+/**
+ * \brief Destroys state for a JIT linker invocation.
+ *
+ * \param state State object for the linker invocation
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ *
+ * \sa ::cuLinkCreate
+ */
+CUresult CUDAAPI
+cuLinkDestroy(CUlinkState state);
+
+/** @} */ /* END CUDA_MODULE */
+
+
+/**
+ * \defgroup CUDA_MEM Memory Management
+ *
+ * ___MANBRIEF___ memory management functions of the low-level CUDA driver API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the memory management functions of the low-level CUDA
+ * driver application programming interface.
+ *
+ * @{
+ */
+
+/**
+ * \brief Gets free and total memory
+ *
+ * Returns in \p *total the total amount of memory available to the current context.
+ * Returns in \p *free the amount of memory on the device that is free according to the OS.
+ * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
+ * In a multi-tenant situation, the free estimate returned is subject to a race condition:
+ * an allocation or free performed by a different process, or by a different thread in the
+ * same process, between the time the free memory is estimated and the time it is reported
+ * can cause the reported value to deviate from the actual free memory.
+ *
+ * The integrated GPU on Tegra shares memory with the CPU and other components
+ * of the SoC. The free and total values returned by the API exclude
+ * the SWAP memory space maintained by the OS on some platforms.
+ * The OS may move some of the memory pages into the swap area as the GPU or
+ * CPU allocate or access memory. See the Tegra app note on how to calculate
+ * total and free memory on Tegra.
+ *
+ * \param free - Returned free memory in bytes
+ * \param total - Returned total memory in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemGetInfo
+ */
+CUresult CUDAAPI cuMemGetInfo(size_t *free, size_t *total);
+
+/**
+ * \brief Allocates device memory
+ *
+ * Allocates \p bytesize bytes of linear memory on the device and returns in
+ * \p *dptr a pointer to the allocated memory. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared.
If \p bytesize + * is 0, ::cuMemAlloc() returns ::CUDA_ERROR_INVALID_VALUE. + * + * \param dptr - Returned device pointer + * \param bytesize - Requested allocation size in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMalloc + */ +CUresult CUDAAPI cuMemAlloc(CUdeviceptr *dptr, size_t bytesize); + +/** + * \brief Allocates pitched device memory + * + * Allocates at least \p WidthInBytes * \p Height bytes of linear memory on + * the device and returns in \p *dptr a pointer to the allocated memory. The + * function may pad the allocation to ensure that corresponding pointers in + * any given row will continue to meet the alignment requirements for + * coalescing as the address is updated from row to row. \p ElementSizeBytes + * specifies the size of the largest reads and writes that will be performed + * on the memory range. \p ElementSizeBytes may be 4, 8 or 16 (since coalesced + * memory transactions are not possible on other data sizes). If + * \p ElementSizeBytes is smaller than the actual read/write size of a kernel, + * the kernel will run correctly, but possibly at reduced speed. The pitch + * returned in \p *pPitch by ::cuMemAllocPitch() is the width in bytes of the + * allocation. The intended usage of pitch is as a separate parameter of the + * allocation, used to compute addresses within the 2D array. Given the row + * and column of an array element of type \b T, the address is computed as: + * \code + T* pElement = (T*)((char*)BaseAddress + Row * Pitch) + Column; + * \endcode + * + * The pitch returned by ::cuMemAllocPitch() is guaranteed to work with + * ::cuMemcpy2D() under all circumstances. For allocations of 2D arrays, it is + * recommended that programmers consider performing pitch allocations using + * ::cuMemAllocPitch(). Due to alignment restrictions in the hardware, this is + * especially true if the application will be performing 2D memory copies + * between different regions of device memory (whether linear memory or CUDA + * arrays). + * + * The byte alignment of the pitch returned by ::cuMemAllocPitch() is guaranteed + * to match or exceed the alignment requirement for texture binding with + * ::cuTexRefSetAddress2D(). 
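+ *
+ * For illustration (a minimal sketch; a 1024 x 768 array of 4-byte elements is an
+ * arbitrary assumption and error checking is omitted):
+ * \code
+     CUdeviceptr d2d;
+     size_t pitch;
+     cuMemAllocPitch(&d2d, &pitch, 1024 * sizeof(float), 768, sizeof(float));
+     // Element (Row, Column) lives at d2d + Row * pitch + Column * sizeof(float).
+     cuMemFree(d2d);
+ * \endcode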
+ *
+ * \param dptr - Returned device pointer
+ * \param pPitch - Returned pitch of allocation in bytes
+ * \param WidthInBytes - Requested allocation width in bytes
+ * \param Height - Requested allocation height in rows
+ * \param ElementSizeBytes - Size of largest reads/writes for range
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMallocPitch
+ */
+CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
+
+/**
+ * \brief Frees device memory
+ *
+ * Frees the memory space pointed to by \p dptr, which must have been returned
+ * by a previous call to one of the following memory allocation APIs - ::cuMemAlloc(),
+ * ::cuMemAllocPitch(), ::cuMemAllocManaged(), ::cuMemAllocAsync(), ::cuMemAllocFromPoolAsync()
+ *
+ * Note - This API will not perform any implicit synchronization when the pointer was allocated with
+ * ::cuMemAllocAsync or ::cuMemAllocFromPoolAsync. Callers must ensure that all accesses to the
+ * pointer have completed before invoking ::cuMemFree. For best performance and memory reuse, users
+ * should use ::cuMemFreeAsync to free memory allocated via the stream ordered memory allocator.
+ *
+ * \param dptr - Pointer to memory to free
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemAllocManaged, ::cuMemAllocAsync, ::cuMemAllocFromPoolAsync,
+ * ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, ::cuMemcpy3D, ::cuMemcpy3DAsync,
+ * ::cuMemcpyAtoA, ::cuMemcpyAtoD, ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ * ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA,
+ * ::cuMemcpyHtoAAsync, ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, ::cuMemFreeAsync,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaFree
+ */
+CUresult CUDAAPI cuMemFree(CUdeviceptr dptr);
+
+/**
+ * \brief Get information on memory allocations
+ *
+ * Returns the base address in \p *pbase and size in \p *psize of the
+ * allocation by ::cuMemAlloc() or ::cuMemAllocPitch() that contains the input
+ * pointer \p dptr. Both parameters \p pbase and \p psize are optional. If one
+ * of them is NULL, it is ignored.
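+ *
+ * As a minimal sketch (illustrative only; \p dptr is assumed to point somewhere
+ * inside an existing ::cuMemAlloc() allocation):
+ * \code
+     CUdeviceptr base;
+     size_t size;
+     cuMemGetAddressRange(&base, &size, dptr); // allocation containing dptr
+ * \endcode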
+ * + * \param pbase - Returned base address + * \param psize - Returned size of device memory allocation + * \param dptr - Device pointer to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32 + */ +CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr); + +/** + * \brief Allocates page-locked host memory + * + * Allocates \p bytesize bytes of host memory that is page-locked and + * accessible to the device. The driver tracks the virtual memory ranges + * allocated with this function and automatically accelerates calls to + * functions such as ::cuMemcpy(). Since the memory can be accessed directly by + * the device, it can be read or written with much higher bandwidth than + * pageable memory obtained with functions such as ::malloc(). Allocating + * excessive amounts of memory with ::cuMemAllocHost() may degrade system + * performance, since it reduces the amount of memory available to the system + * for paging. As a result, this function is best used sparingly to allocate + * staging areas for data exchange between host and device. + * + * Note all host memory allocated using ::cuMemHostAlloc() will automatically + * be immediately accessible to all contexts on all devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). + * The device pointer that may be used to access this host memory from those + * contexts is always equal to the returned host pointer \p *pp. + * See \ref CUDA_UNIFIED for additional details. 
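+ *
+ * \par
+ * Editorial note: minimal usage sketch added for this repository (not part of
+ * the original NVIDIA header), assuming a current CUDA context; error
+ * handling is abbreviated and \p h_staging is an illustrative name.
+ * \code
+   void *h_staging = NULL;
+   if (cuMemAllocHost(&h_staging, 1 << 20) == CUDA_SUCCESS) {   // 1 MiB of pinned memory
+       // ... use h_staging as a staging buffer for host<->device transfers ...
+       cuMemFreeHost(h_staging);
+   }
+ * \endcode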
+ * + * \param pp - Returned host pointer to page-locked memory + * \param bytesize - Requested allocation size in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMallocHost + */ +CUresult CUDAAPI cuMemAllocHost(void **pp, size_t bytesize); + +/** + * \brief Frees page-locked host memory + * + * Frees the memory space pointed to by \p p, which must have been returned by + * a previous call to ::cuMemAllocHost(). + * + * \param p - Pointer to memory to free + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFreeHost + */ +CUresult CUDAAPI cuMemFreeHost(void *p); + +/** + * \brief Allocates page-locked host memory + * + * Allocates \p bytesize bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cuMemcpyHtoD(). Since the memory can be accessed directly by the device, + * it can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * pinned memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to allocate staging areas for data exchange between + * host and device. + * + * The \p Flags parameter enables different options to be specified that + * affect the allocation, as follows. + * + * - ::CU_MEMHOSTALLOC_PORTABLE: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * + * - ::CU_MEMHOSTALLOC_DEVICEMAP: Maps the allocation into the CUDA address + * space. 
The device pointer to the memory may be obtained by calling + * ::cuMemHostGetDevicePointer(). + * + * - ::CU_MEMHOSTALLOC_WRITECOMBINED: Allocates the memory as write-combined + * (WC). WC memory can be transferred across the PCI Express bus more + * quickly on some system configurations, but cannot be read efficiently by + * most CPUs. WC memory is a good option for buffers that will be written by + * the CPU and read by the GPU via mapped pinned memory or host->device + * transfers. + * + * All of these flags are orthogonal to one another: a developer may allocate + * memory that is portable, mapped and/or write-combined with no restrictions. + * + * The ::CU_MEMHOSTALLOC_DEVICEMAP flag may be specified on CUDA contexts for + * devices that do not support mapped pinned memory. The failure is deferred + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into + * other CUDA contexts via the ::CU_MEMHOSTALLOC_PORTABLE flag. + * + * The memory allocated by this function must be freed with ::cuMemFreeHost(). + * + * Note all host memory allocated using ::cuMemHostAlloc() will automatically + * be immediately accessible to all contexts on all devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING). + * Unless the flag ::CU_MEMHOSTALLOC_WRITECOMBINED is specified, the device pointer + * that may be used to access this host memory from those contexts is always equal + * to the returned host pointer \p *pp. If the flag ::CU_MEMHOSTALLOC_WRITECOMBINED + * is specified, then the function ::cuMemHostGetDevicePointer() must be used + * to query the device pointer, even if the context supports unified addressing. + * See \ref CUDA_UNIFIED for additional details. + * + * \param pp - Returned host pointer to page-locked memory + * \param bytesize - Requested allocation size in bytes + * \param Flags - Flags for allocation request + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaHostAlloc + */ +CUresult CUDAAPI cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags); + +/** + * \brief Passes back device pointer of mapped pinned memory + * + * Passes back the device pointer \p pdptr corresponding to the mapped, pinned + * host buffer \p p allocated by ::cuMemHostAlloc. + * + * ::cuMemHostGetDevicePointer() will fail if the ::CU_MEMHOSTALLOC_DEVICEMAP + * flag was not specified at the time the memory was allocated, or if the + * function is called on a GPU that does not support mapped pinned memory. 
+ * + * For devices that have a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory + * can also be accessed from the device using the host pointer \p p. + * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not + * match the original host pointer \p p and depends on the devices visible to the + * application. If all devices visible to the application have a non-zero value for the + * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer() + * will match the original pointer \p p. If any device visible to the application + * has a zero value for the device attribute, the device pointer returned by + * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p, + * but it will be suitable for use on all devices provided Unified Virtual Addressing + * is enabled. In such systems, it is valid to access the memory using either pointer + * on devices that have a non-zero value for the device attribute. Note however that + * such devices should access the memory using only one of the two pointers and not both. + * + * \p Flags provides for future releases. For now, it must be set to 0. + * + * \param pdptr - Returned device pointer + * \param p - Host pointer + * \param Flags - Options (must be 0) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaHostGetDevicePointer + */ +CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags); + +/** + * \brief Passes back flags that were used for a pinned allocation + * + * Passes back the flags \p pFlags that were specified when allocating + * the pinned host buffer \p p allocated by ::cuMemHostAlloc. + * + * ::cuMemHostGetFlags() will fail if the pointer does not reside in + * an allocation performed by ::cuMemAllocHost() or ::cuMemHostAlloc(). + * + * \param pFlags - Returned flags word + * \param p - Host pointer + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuMemAllocHost, + * ::cuMemHostAlloc, + * ::cudaHostGetFlags + */ +CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p); + +/** + * \brief Allocates memory that will be automatically managed by the Unified Memory system + * + * Allocates \p bytesize bytes of managed memory on the device and returns in + * \p *dptr a pointer to the allocated memory. If the device doesn't support + * allocating managed memory, ::CUDA_ERROR_NOT_SUPPORTED is returned. 
Support
+ * for managed memory can be queried using the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p bytesize
+ * is 0, ::cuMemAllocManaged returns ::CUDA_ERROR_INVALID_VALUE. The pointer
+ * is valid on the CPU and on all GPUs in the system that support managed memory.
+ * All accesses to this pointer must obey the Unified Memory programming model.
+ *
+ * \p flags specifies the default stream association for this allocation.
+ * \p flags must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST. If
+ * ::CU_MEM_ATTACH_GLOBAL is specified, then this memory is accessible from
+ * any stream on any device. If ::CU_MEM_ATTACH_HOST is specified, then the
+ * allocation should not be accessed from devices that have a zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS; an explicit call to
+ * ::cuStreamAttachMemAsync will be required to enable access on such devices.
+ *
+ * If the association is later changed via ::cuStreamAttachMemAsync to
+ * a single stream, the default association as specified during ::cuMemAllocManaged
+ * is restored when that stream is destroyed. For __managed__ variables, the
+ * default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a
+ * stream is an asynchronous operation, and as a result, the change to default
+ * association won't happen until all work in the stream has completed.
+ *
+ * Memory allocated with ::cuMemAllocManaged should be released with ::cuMemFree.
+ *
+ * Device memory oversubscription is possible for GPUs that have a non-zero value for the
+ * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Managed memory on
+ * such GPUs may be evicted from device memory to host memory at any time by the Unified
+ * Memory driver in order to make room for other allocations.
+ *
+ * In a multi-GPU system where all GPUs have a non-zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, managed memory may not be populated when this
+ * API returns and instead may be populated on access. In such systems, managed memory can
+ * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
+ * maintain data locality and prevent excessive page faults to the extent possible. The application
+ * can also guide the driver about memory usage patterns via ::cuMemAdvise. The application
+ * can also explicitly migrate memory to a desired processor's memory via
+ * ::cuMemPrefetchAsync.
+ *
+ * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS and all the GPUs have peer-to-peer support
+ * with each other, the physical storage for managed memory is created on the GPU which is active
+ * at the time ::cuMemAllocManaged is called. All other GPUs will reference the data at reduced
+ * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
+ * memory among such GPUs.
+ *
+ * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
+ * where the value of the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+ * is zero for at least one of those GPUs, the location chosen for physical storage of managed
+ * memory is system-dependent.
+ * - On Linux, the location chosen will be device memory as long as the current set of active + * contexts are on devices that either have peer-to-peer support with each other or have a + * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * If there is an active context on a GPU that does not have a non-zero value for that device + * attribute and it does not have peer-to-peer support with the other devices that have active + * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. + * Note that this means that managed memory that is located in device memory is migrated to + * host memory if a new context is created on a GPU that doesn't have a non-zero value for + * the device attribute and does not support peer-to-peer with at least one of the other devices + * that has an active context. This in turn implies that context creation may fail if there is + * insufficient host memory to migrate all managed allocations. + * - On Windows, the physical storage is always created in 'zero-copy' or host memory. + * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these + * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to + * restrict CUDA to only use those GPUs that have peer-to-peer support. + * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a + * non-zero value to force the driver to always use device memory for physical storage. + * When this environment variable is set to a non-zero value, all contexts created in + * that process on devices that support managed memory have to be peer-to-peer compatible + * with each other. Context creation will fail if a context is created on a device that + * supports managed memory and is not peer-to-peer compatible with any of the other + * managed memory supporting devices on which contexts were previously created, even if + * those contexts have been destroyed. These environment variables are described + * in the CUDA programming guide under the "CUDA environment variables" section. + * - On ARM, managed memory is not available on discrete gpu with Drive PX-2. 
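+ *
+ * \par
+ * Editorial note: minimal usage sketch added for this repository (not part of
+ * the original NVIDIA header). It assumes a current CUDA context on a device
+ * that reports ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, that <stdint.h> is
+ * available for \p uintptr_t, and it abbreviates error handling.
+ * \code
+   CUdeviceptr d_managed;
+   if (cuMemAllocManaged(&d_managed, 1 << 20, CU_MEM_ATTACH_GLOBAL) == CUDA_SUCCESS) {
+       // On supporting systems the same address is valid on the CPU and all GPUs.
+       float *h_view = (float *)(uintptr_t)d_managed;
+       h_view[0] = 1.0f;              // direct CPU access to managed memory
+       cuMemFree(d_managed);
+   }
+ * \endcode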
+ * + * \param dptr - Returned device pointer + * \param bytesize - Requested allocation size in bytes + * \param flags - Must be one of ::CU_MEM_ATTACH_GLOBAL or ::CU_MEM_ATTACH_HOST + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cuDeviceGetAttribute, ::cuStreamAttachMemAsync, + * ::cudaMallocManaged + */ +CUresult CUDAAPI cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags); + +/** + * \brief Returns a handle to a compute device + * + * Returns in \p *device a device handle given a PCI bus ID string. + * + * \param dev - Returned device handle + * + * \param pciBusId - String in one of the following forms: + * [domain]:[bus]:[device].[function] + * [domain]:[bus]:[device] + * [bus]:[device].[function] + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGet, + * ::cuDeviceGetAttribute, + * ::cuDeviceGetPCIBusId, + * ::cudaDeviceGetByPCIBusId + */ +CUresult CUDAAPI cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId); + +/** + * \brief Returns a PCI Bus Id string for the device + * + * Returns an ASCII string identifying the device \p dev in the NULL-terminated + * string pointed to by \p pciBusId. \p len specifies the maximum length of the + * string that may be returned. + * + * \param pciBusId - Returned identifier string for the device in the following format + * [domain]:[bus]:[device].[function] + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. + * pciBusId should be large enough to store 13 characters including the NULL-terminator. + * + * \param len - Maximum length of string to store in \p name + * + * \param dev - Device to get identifier string for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuDeviceGet, + * ::cuDeviceGetAttribute, + * ::cuDeviceGetByPCIBusId, + * ::cudaDeviceGetPCIBusId + */ +CUresult CUDAAPI cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev); + +/** + * \brief Gets an interprocess handle for a previously allocated event + * + * Takes as input a previously allocated event. This event must have been + * created with the ::CU_EVENT_INTERPROCESS and ::CU_EVENT_DISABLE_TIMING + * flags set. 
This opaque handle may be copied into other processes and + * opened with ::cuIpcOpenEventHandle to allow efficient hardware + * synchronization between GPU work in different processes. + * + * After the event has been opened in the importing process, + * ::cuEventRecord, ::cuEventSynchronize, ::cuStreamWaitEvent and + * ::cuEventQuery may be used in either process. Performing operations + * on the imported event after the exported event has been freed + * with ::cuEventDestroy will result in undefined behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param pHandle - Pointer to a user allocated CUipcEventHandle + * in which to return the opaque event handle + * \param event - Event allocated with ::CU_EVENT_INTERPROCESS and + * ::CU_EVENT_DISABLE_TIMING flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuEventCreate, + * ::cuEventDestroy, + * ::cuEventSynchronize, + * ::cuEventQuery, + * ::cuStreamWaitEvent, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcGetEventHandle + */ +CUresult CUDAAPI cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event); + +/** + * \brief Opens an interprocess event handle for use in the current process + * + * Opens an interprocess event handle exported from another process with + * ::cuIpcGetEventHandle. This function returns a ::CUevent that behaves like + * a locally created event with the ::CU_EVENT_DISABLE_TIMING flag specified. + * This event must be freed with ::cuEventDestroy. + * + * Performing operations on the imported event after the exported event has + * been freed with ::cuEventDestroy will result in undefined behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param phEvent - Returns the imported event + * \param handle - Interprocess handle to open + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuEventCreate, + * ::cuEventDestroy, + * ::cuEventSynchronize, + * ::cuEventQuery, + * ::cuStreamWaitEvent, + * ::cuIpcGetEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcOpenEventHandle + */ +CUresult CUDAAPI cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle); + +/** + * \brief Gets an interprocess memory handle for an existing device memory + * allocation + * + * Takes a pointer to the base of an existing device memory allocation created + * with ::cuMemAlloc and exports it for use in another process. This is a + * lightweight operation and may be called multiple times on an allocation + * without adverse effects. + * + * If a region of memory is freed with ::cuMemFree and a subsequent call + * to ::cuMemAlloc returns memory with the same device address, + * ::cuIpcGetMemHandle will return a unique handle for the + * new memory. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. 
+ * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param pHandle - Pointer to user allocated ::CUipcMemHandle to return + * the handle in. + * \param dptr - Base pointer to previously allocated device memory + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcOpenMemHandle, + * ::cuIpcCloseMemHandle, + * ::cudaIpcGetMemHandle + */ +CUresult CUDAAPI cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr); + +/** + * \brief Opens an interprocess memory handle exported from another process + * and returns a device pointer usable in the local process. + * + * Maps memory exported from another process with ::cuIpcGetMemHandle into + * the current device address space. For contexts on different devices + * ::cuIpcOpenMemHandle can attempt to enable peer access between the + * devices as if the user called ::cuCtxEnablePeerAccess. This behavior is + * controlled by the ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS flag. + * ::cuDeviceCanAccessPeer can determine if a mapping is possible. + * + * Contexts that may open ::CUipcMemHandles are restricted in the following way. + * ::CUipcMemHandles from each ::CUdevice in a given process may only be opened + * by one ::CUcontext per ::CUdevice per other process. + * + * If the memory handle has already been opened by the current context, the + * reference count on the handle is incremented by 1 and the existing device pointer + * is returned. + * + * Memory returned from ::cuIpcOpenMemHandle must be freed with + * ::cuIpcCloseMemHandle. + * + * Calling ::cuMemFree on an exported memory region before calling + * ::cuIpcCloseMemHandle in the importing context will result in undefined + * behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param pdptr - Returned device pointer + * \param handle - ::CUipcMemHandle to open + * \param Flags - Flags for this operation. Must be specified as ::CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_TOO_MANY_PEERS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \note No guarantees are made about the address returned in \p *pdptr. + * In particular, multiple processes may not receive the same address for the same \p handle. + * + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcCloseMemHandle, + * ::cuCtxEnablePeerAccess, + * ::cuDeviceCanAccessPeer, + * ::cudaIpcOpenMemHandle + */ +CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); + +/** + * \brief Attempts to close memory mapped with ::cuIpcOpenMemHandle + * + * Decrements the reference count of the memory returned by ::cuIpcOpenMemHandle by 1. + * When the reference count reaches 0, this API unmaps the memory. The original allocation + * in the exporting process as well as imported mappings in other processes + * will be unaffected. + * + * Any resources used to enable peer access will be freed if this is the + * last mapping using them. 
+ * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux and Windows operating systems. + * IPC functionality on Windows is restricted to GPUs in TCC mode + * + * \param dptr - Device pointer returned by ::cuIpcOpenMemHandle + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_MAP_FAILED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \sa + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuIpcGetEventHandle, + * ::cuIpcOpenEventHandle, + * ::cuIpcGetMemHandle, + * ::cuIpcOpenMemHandle, + * ::cudaIpcCloseMemHandle + */ +CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr); + +/** + * \brief Registers an existing host memory range for use by CUDA + * + * Page-locks the memory range specified by \p p and \p bytesize and maps it + * for the device(s) as specified by \p Flags. This memory range also is added + * to the same tracking mechanism as ::cuMemHostAlloc to automatically accelerate + * calls to functions such as ::cuMemcpyHtoD(). Since the memory can be accessed + * directly by the device, it can be read or written with much higher bandwidth + * than pageable memory that has not been registered. Page-locking excessive + * amounts of memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to register staging areas for data exchange between + * host and device. + * + * The \p Flags parameter enables different options to be specified that + * affect the allocation, as follows. + * + * - ::CU_MEMHOSTREGISTER_PORTABLE: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * + * - ::CU_MEMHOSTREGISTER_DEVICEMAP: Maps the allocation into the CUDA address + * space. The device pointer to the memory may be obtained by calling + * ::cuMemHostGetDevicePointer(). + * + * - ::CU_MEMHOSTREGISTER_IOMEMORY: The pointer is treated as pointing to some + * I/O memory space, e.g. the PCI Express resource of a 3rd party device. + * + * - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory + * that is considered read-only by the device. On platforms without + * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is + * required in order to register memory mapped to the CPU as read-only. Support + * for the use of this flag can be queried from the device attribute + * ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with + * a current context associated with a device that does not have this attribute + * set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED. + * + * All of these flags are orthogonal to one another: a developer may page-lock + * memory that is portable or mapped with no restrictions. + * + * The ::CU_MEMHOSTREGISTER_DEVICEMAP flag may be specified on CUDA contexts for + * devices that do not support mapped pinned memory. The failure is deferred + * to ::cuMemHostGetDevicePointer() because the memory may be mapped into + * other CUDA contexts via the ::CU_MEMHOSTREGISTER_PORTABLE flag. + * + * For devices that have a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM, the memory + * can also be accessed from the device using the host pointer \p p. 
+ *
+ * The device pointer returned by ::cuMemHostGetDevicePointer() may or may not
+ * match the original host pointer \p p and depends on the devices visible to the
+ * application. If all devices visible to the application have a non-zero value for the
+ * device attribute, the device pointer returned by ::cuMemHostGetDevicePointer()
+ * will match the original pointer \p p. If any device visible to the application
+ * has a zero value for the device attribute, the device pointer returned by
+ * ::cuMemHostGetDevicePointer() will not match the original host pointer \p p,
+ * but it will be suitable for use on all devices provided Unified Virtual Addressing
+ * is enabled. In such systems, it is valid to access the memory using either pointer
+ * on devices that have a non-zero value for the device attribute. Note however that
+ * such devices should access the memory using only one of the two pointers and not both.
+ *
+ * The memory page-locked by this function must be unregistered with
+ * ::cuMemHostUnregister().
+ *
+ * \param p - Host pointer to memory to page-lock
+ * \param bytesize - Size in bytes of the address range to page-lock
+ * \param Flags - Flags for allocation request
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED,
+ * ::CUDA_ERROR_NOT_PERMITTED,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemHostUnregister,
+ * ::cuMemHostGetFlags,
+ * ::cuMemHostGetDevicePointer,
+ * ::cudaHostRegister
+ */
+CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags);
+
+/**
+ * \brief Unregisters a memory range that was registered with cuMemHostRegister.
+ *
+ * Unmaps the memory range whose base address is specified by \p p, and makes
+ * it pageable again.
+ *
+ * The base address must be the same one specified to ::cuMemHostRegister().
+ *
+ * \param p - Host pointer to memory to unregister
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED,
+ * \notefnerr
+ *
+ * \sa
+ * ::cuMemHostRegister,
+ * ::cudaHostUnregister
+ */
+CUresult CUDAAPI cuMemHostUnregister(void *p);
+
+/**
+ * \brief Copies memory
+ *
+ * Copies data between two pointers.
+ * \p dst and \p src are base pointers of the destination and source, respectively.
+ * \p ByteCount specifies the number of bytes to copy.
+ * Note that this function infers the type of the transfer (host to host, host to
+ * device, device to device, or device to host) from the pointer values. This
+ * function is only allowed in contexts which support unified addressing.
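+ *
+ * \par
+ * Editorial note: minimal usage sketch added for this repository (not part of
+ * the original NVIDIA header); it requires unified addressing, a current CUDA
+ * context and <stdint.h>, and abbreviates error handling.
+ * \code
+   float h_src[256] = {0};
+   CUdeviceptr d_dst;
+   cuMemAlloc(&d_dst, sizeof(h_src));
+   // Under unified addressing a host pointer can be passed as a CUdeviceptr;
+   // the copy direction is inferred from the two pointer values.
+   cuMemcpy(d_dst, (CUdeviceptr)(uintptr_t)h_src, sizeof(h_src));
+   cuMemFree(d_dst);
+ * \endcode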
+ *
+ * \param dst - Destination unified virtual address space pointer
+ * \param src - Source unified virtual address space pointer
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ * \note_memcpy
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMemcpy,
+ * ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol
+ */
+CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount);
+
+/**
+ * \brief Copies device memory between two contexts
+ *
+ * Copies from device memory in one context to device memory in another
+ * context. \p dstDevice is the base device pointer of the destination memory
+ * and \p dstContext is the destination context. \p srcDevice is the base
+ * device pointer of the source memory and \p srcContext is the source context.
+ * \p ByteCount specifies the number of bytes to copy.
+ *
+ * \param dstDevice - Destination device pointer
+ * \param dstContext - Destination context
+ * \param srcDevice - Source device pointer
+ * \param srcContext - Source context
+ * \param ByteCount - Size of memory copy in bytes
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ * \note_sync
+ *
+ * \sa ::cuMemcpyDtoD, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync,
+ * ::cuMemcpy3DPeerAsync,
+ * ::cudaMemcpyPeer
+ */
+CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount);
+
+/**
+ * \brief Copies memory from Host to Device
+ *
+ * Copies from host memory to device memory. \p dstDevice and \p srcHost are
+ * the base addresses of the destination and source, respectively. \p ByteCount
+ * specifies the number of bytes to copy.
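+ *
+ * \par
+ * Editorial note: minimal usage sketch added for this repository (not part of
+ * the original NVIDIA header), assuming a current CUDA context; error
+ * handling is abbreviated and the names are illustrative.
+ * \code
+   float h_data[256];
+   for (int i = 0; i < 256; ++i) h_data[i] = (float)i;
+   CUdeviceptr d_data;
+   cuMemAlloc(&d_data, sizeof(h_data));
+   cuMemcpyHtoD(d_data, h_data, sizeof(h_data));   // synchronous host -> device copy
+   // ... launch work that reads d_data ...
+   cuMemFree(d_data);
+ * \endcode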
+ * + * \param dstDevice - Destination device pointer + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyToSymbol + */ +CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + +/** + * \brief Copies memory from Device to Host + * + * Copies from device to host memory. \p dstHost and \p srcDevice specify the + * base pointers of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstHost - Destination host pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyFromSymbol + */ +CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Device to Device + * + * Copies from device memory to device memory. \p dstDevice and \p srcDevice + * are the base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. 
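+ *
+ * \par
+ * Editorial note: minimal usage sketch added for this repository (not part of
+ * the original NVIDIA header), assuming a current CUDA context; error
+ * handling is abbreviated.
+ * \code
+   CUdeviceptr d_src, d_dst;
+   cuMemAlloc(&d_src, 1 << 16);
+   cuMemAlloc(&d_dst, 1 << 16);
+   cuMemcpyDtoD(d_dst, d_src, 1 << 16);   // copy 64 KiB within device memory
+   cuMemFree(d_src);
+   cuMemFree(d_dst);
+ * \endcode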
+ * + * \param dstDevice - Destination device pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy, + * ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol + */ +CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Device to Array + * + * Copies from device memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting index of the destination data. + * \p srcDevice specifies the base pointer of the source. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyToArray + */ +CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); + +/** + * \brief Copies memory from Array to Device + * + * Copies from one 1D CUDA array to device memory. \p dstDevice specifies the + * base pointer of the destination and must be naturally aligned with the CUDA + * array elements. \p srcArray and \p srcOffset specify the CUDA array handle + * and the offset in bytes into the array where the copy is to begin. + * \p ByteCount specifies the number of bytes to copy and must be evenly + * divisible by the array element size. 
+ * + * \param dstDevice - Destination device pointer + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyFromArray + */ +CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); + +/** + * \brief Copies memory from Host to Array + * + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting offset in bytes of the destination + * data. \p pSrc specifies the base address of the source. \p ByteCount specifies + * the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyToArray + */ +CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); + +/** + * \brief Copies memory from Array to Host + * + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base + * pointer of the destination. \p srcArray and \p srcOffset specify the CUDA + * array handle and starting offset in bytes of the source data. + * \p ByteCount specifies the number of bytes to copy. 
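+ *
+ * \par
+ * Editorial note: minimal usage sketch added for this repository (not part of
+ * the original NVIDIA header) showing a host -> array -> host round trip
+ * through a 1D CUDA array; it assumes a current CUDA context and abbreviates
+ * error handling.
+ * \code
+   float h_in[64], h_out[64];
+   for (int i = 0; i < 64; ++i) h_in[i] = (float)i;
+
+   CUDA_ARRAY_DESCRIPTOR desc = {0};
+   desc.Width = 64;                    // 64 elements
+   desc.Height = 0;                    // height 0 -> one-dimensional array
+   desc.Format = CU_AD_FORMAT_FLOAT;
+   desc.NumChannels = 1;
+
+   CUarray arr;
+   cuArrayCreate(&arr, &desc);
+   cuMemcpyHtoA(arr, 0, h_in, sizeof(h_in));    // host -> array
+   cuMemcpyAtoH(h_out, arr, 0, sizeof(h_out));  // array -> host
+   cuArrayDestroy(arr);
+ * \endcode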
+ * + * \param dstHost - Destination device pointer + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyFromArray + */ +CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); + +/** + * \brief Copies memory from Array to Array + * + * Copies from one 1D CUDA array to another. \p dstArray and \p srcArray + * specify the handles of the destination and source CUDA arrays for the copy, + * respectively. \p dstOffset and \p srcOffset specify the destination and + * source offsets in bytes into the CUDA arrays. \p ByteCount is the number of + * bytes to be copied. The size of the elements in the CUDA arrays need not be + * the same format, but the elements must be the same size; and count must be + * evenly divisible by that size. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpyArrayToArray + */ +CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); + +/** + * \brief Copies memory for 2D arrays + * + * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
+ * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are + * ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are + * ignored. + * + * - ::srcXInBytes and ::srcY specify the base address of the source data for + * the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - ::dstXInBytes and ::dstY specify the base address of the destination data + * for the copy. 
+ * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of + * the 2D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * + * \par + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies + * (device to device, CUDA array to device, CUDA array to CUDA array), + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run + * significantly slower in the cases where ::cuMemcpy2D() would have returned + * an error code. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, + * ::cudaMemcpy2DFromArray + */ +CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D *pCopy); + +/** + * \brief Copies memory for 2D arrays + * + * Perform a 2D memory copy according to the parameters specified in \p pCopy. 
+ * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice and ::srcPitch are + * ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data and the bytes per + * row to apply. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are + * ignored. + * + * - ::srcXInBytes and ::srcY specify the base address of the source data for + * the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - ::dstXInBytes and ::dstY specify the base address of the destination data + * for the copy. 
+ * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes and ::Height specify the width (in bytes) and height of + * the 2D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * + * \par + * ::cuMemcpy2D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back + * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies + * (device to device, CUDA array to device, CUDA array to CUDA array), + * ::cuMemcpy2D() may fail for pitches not computed by ::cuMemAllocPitch(). + * ::cuMemcpy2DUnaligned() does not have this restriction, but may run + * significantly slower in the cases where ::cuMemcpy2D() would have returned + * an error code. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, + * ::cudaMemcpy2DFromArray + */ +CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy); + +/** + * \brief Copies memory for 3D arrays + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. 
The ::CUDA_MEMCPY3D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY3D_st { + + unsigned int srcXInBytes, srcY, srcZ; + unsigned int srcLOD; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; // ignored when src is array + unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 + + unsigned int dstXInBytes, dstY, dstZ; + unsigned int dstLOD; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; // ignored when dst is array + unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 + + unsigned int WidthInBytes; + unsigned int Height; + unsigned int Depth; + } CUDA_MEMCPY3D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and + * ::srcHeight specify the (host) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and + * ::srcHeight specify the (device) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and + * ::srcHeight are ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data, the bytes per row, + * and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data, the bytes per + * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and + * ::dstHeight are ignored. + * + * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source + * data for the copy. 
+ * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - dstXInBytes, ::dstY and ::dstZ specify the base address of the + * destination data for the copy. + * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height + * and depth of the 3D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * - If specified, ::srcHeight must be greater than or equal to ::Height + + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. + * + * \par + * ::cuMemcpy3D() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). + * + * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be + * set to 0. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMemcpy3D + */ +CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D *pCopy); + +/** + * \brief Copies memory between contexts + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure + * for documentation of its parameters. + * + * \param pCopy - Parameters for the memory copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_sync + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpy3DPeer + */ +CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); + +/** + * \brief Copies memory asynchronously + * + * Copies data between two pointers. 
+ * \p dst and \p src are base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. + * Note that this function infers the type of the transfer (host to host, host to + * device, device to device, or device to host) from the pointer values. This + * function is only allowed in contexts which support unified addressing. + * + * \param dst - Destination unified virtual address space pointer + * \param src - Source unified virtual address space pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies device memory between two contexts asynchronously. + * + * Copies from device memory in one context to device memory in another + * context. \p dstDevice is the base device pointer of the destination memory + * and \p dstContext is the destination context. \p srcDevice is the base + * device pointer of the source memory and \p srcContext is the source pointer. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstDevice - Destination device pointer + * \param dstContext - Destination context + * \param srcDevice - Source device pointer + * \param srcContext - Source context + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpy3DPeer, ::cuMemcpyDtoDAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpyPeerAsync + */ +CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Host to Device + * + * Copies from host memory to device memory. \p dstDevice and \p srcHost are + * the base addresses of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. 
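+ *
+ * \par
+ * A minimal usage sketch (illustrative only; the names d_data, h_data and
+ * stream are hypothetical and error checking is omitted):
+ * \code
+  float h_data[256];
+  CUdeviceptr d_data;
+  CUstream stream;
+  cuStreamCreate(&stream, CU_STREAM_DEFAULT);
+  cuMemAlloc(&d_data, sizeof(h_data));
+  // the copy overlaps host work only if h_data is page-locked
+  // (see cuMemAllocHost / cuMemHostAlloc)
+  cuMemcpyHtoDAsync(d_data, h_data, sizeof(h_data), stream);
+  cuStreamSynchronize(stream);  // h_data may be safely reused afterwards
+ * \endcode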
+ * + * \param dstDevice - Destination device pointer + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync + */ +CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Device to Host + * + * Copies from device to host memory. \p dstHost and \p srcDevice specify the + * base pointers of the destination and source, respectively. \p ByteCount + * specifies the number of bytes to copy. + * + * \param dstHost - Destination host pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Device to Device + * + * Copies from device memory to device memory. \p dstDevice and \p srcDevice + * are the base pointers of the destination and source, respectively. + * \p ByteCount specifies the number of bytes to copy. 
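+ *
+ * \par
+ * A brief sketch, assuming d_dst and d_src were allocated with ::cuMemAlloc()
+ * in the current context and stream was created with ::cuStreamCreate():
+ * \code
+  // copy numBytes bytes between two device buffers without blocking the host
+  cuMemcpyDtoDAsync(d_dst, d_src, numBytes, stream);
+  // work submitted to the same stream afterwards is ordered after the copy
+ * \endcode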
+ * + * \param dstDevice - Destination device pointer + * \param srcDevice - Source device pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyAsync, + * ::cudaMemcpyToSymbolAsync, + * ::cudaMemcpyFromSymbolAsync + */ +CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Host to Array + * + * Copies from host memory to a 1D CUDA array. \p dstArray and \p dstOffset + * specify the CUDA array handle and starting offset in bytes of the + * destination data. \p srcHost specifies the base address of the source. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstArray - Destination array + * \param dstOffset - Offset in bytes of destination array + * \param srcHost - Source host pointer + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyToArrayAsync + */ +CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory from Array to Host + * + * Copies from one 1D CUDA array to host memory. \p dstHost specifies the base + * pointer of the destination. 
\p srcArray and \p srcOffset specify the CUDA + * array handle and starting offset in bytes of the source data. + * \p ByteCount specifies the number of bytes to copy. + * + * \param dstHost - Destination pointer + * \param srcArray - Source array + * \param srcOffset - Offset in bytes of source array + * \param ByteCount - Size of memory copy in bytes + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * \note_memcpy + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpyFromArrayAsync + */ +CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); + +/** + * \brief Copies memory for 2D arrays + * + * Perform a 2D memory copy according to the parameters specified in \p pCopy. + * The ::CUDA_MEMCPY2D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY2D_st { + unsigned int srcXInBytes, srcY; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; + unsigned int dstXInBytes, dstY; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; + unsigned int WidthInBytes; + unsigned int Height; + } CUDA_MEMCPY2D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost and ::srcPitch + * specify the (host) base address of the source data and the bytes per row to + * apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice and ::srcPitch + * specify the (device) base address of the source data and the bytes per row + * to apply. ::srcArray is ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. 
::srcHost, ::srcDevice and ::srcPitch are
+ * ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch
+ * specify the (unified virtual address space) base address of the destination
+ * data and the bytes per row to apply. ::dstArray is ignored.
+ * This value may be used only if unified addressing is supported in the calling
+ * context.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch
+ * specify the (host) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch
+ * specify the (device) base address of the destination data and the bytes per
+ * row to apply. ::dstArray is ignored.
+ *
+ * \par
+ * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the
+ * handle of the destination data. ::dstHost, ::dstDevice and ::dstPitch are
+ * ignored.
+ *
+ * - ::srcXInBytes and ::srcY specify the base address of the source data for
+ * the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* Start = (void*)((char*)srcHost+srcY*srcPitch + srcXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr Start = srcDevice+srcY*srcPitch+srcXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::dstXInBytes and ::dstY specify the base address of the destination data
+ * for the copy.
+ *
+ * \par
+ * For host pointers, the starting address is
+ * \code
+  void* dstStart = (void*)((char*)dstHost+dstY*dstPitch + dstXInBytes);
+ * \endcode
+ *
+ * \par
+ * For device pointers, the starting address is
+ * \code
+  CUdeviceptr dstStart = dstDevice+dstY*dstPitch+dstXInBytes;
+ * \endcode
+ *
+ * \par
+ * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array
+ * element size.
+ *
+ * - ::WidthInBytes and ::Height specify the width (in bytes) and height of
+ * the 2D copy being performed.
+ * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes +
+ * ::srcXInBytes, and ::dstPitch must be greater than or equal to
+ * ::WidthInBytes + ::dstXInBytes.
+ *
+ * \par
+ * ::cuMemcpy2DAsync() returns an error if any pitch is greater than the maximum
+ * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). ::cuMemAllocPitch() passes back
+ * pitches that always work with ::cuMemcpy2D(). On intra-device memory copies
+ * (device to device, CUDA array to device, CUDA array to CUDA array),
+ * ::cuMemcpy2DAsync() may fail for pitches not computed by ::cuMemAllocPitch().
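+ *
+ * \par
+ * As an illustrative sketch (not normative), a host-to-device copy of a
+ * height x rowBytes image into a pitched allocation from ::cuMemAllocPitch()
+ * could be set up as follows; h_img, d_img, devPitch, rowBytes, height and
+ * stream are assumed to exist:
+ * \code
+  CUDA_MEMCPY2D cpy;
+  memset(&cpy, 0, sizeof(cpy));            // offsets and unused members must be 0
+  cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
+  cpy.srcHost       = h_img;               // tightly packed host rows
+  cpy.srcPitch      = rowBytes;
+  cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+  cpy.dstDevice     = d_img;               // pitched allocation on the device
+  cpy.dstPitch      = devPitch;
+  cpy.WidthInBytes  = rowBytes;
+  cpy.Height        = height;
+  cuMemcpy2DAsync(&cpy, stream);
+ * \endcode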
+ * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync + */ +CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream); + +/** + * \brief Copies memory for 3D arrays + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. The ::CUDA_MEMCPY3D structure is defined as: + * + * \code + typedef struct CUDA_MEMCPY3D_st { + + unsigned int srcXInBytes, srcY, srcZ; + unsigned int srcLOD; + CUmemorytype srcMemoryType; + const void *srcHost; + CUdeviceptr srcDevice; + CUarray srcArray; + unsigned int srcPitch; // ignored when src is array + unsigned int srcHeight; // ignored when src is array; may be 0 if Depth==1 + + unsigned int dstXInBytes, dstY, dstZ; + unsigned int dstLOD; + CUmemorytype dstMemoryType; + void *dstHost; + CUdeviceptr dstDevice; + CUarray dstArray; + unsigned int dstPitch; // ignored when dst is array + unsigned int dstHeight; // ignored when dst is array; may be 0 if Depth==1 + + unsigned int WidthInBytes; + unsigned int Height; + unsigned int Depth; + } CUDA_MEMCPY3D; + * \endcode + * where: + * - ::srcMemoryType and ::dstMemoryType specify the type of memory of the + * source and destination, respectively; ::CUmemorytype_enum is defined as: + * + * \code + typedef enum CUmemorytype_enum { + CU_MEMORYTYPE_HOST = 0x01, + CU_MEMORYTYPE_DEVICE = 0x02, + CU_MEMORYTYPE_ARRAY = 0x03, + CU_MEMORYTYPE_UNIFIED = 0x04 + } CUmemorytype; + * \endcode + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::srcDevice and ::srcPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::srcArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_HOST, ::srcHost, ::srcPitch and + * ::srcHeight specify the (host) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. + * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_DEVICE, ::srcDevice, ::srcPitch and + * ::srcHeight specify the (device) base address of the source data, the bytes + * per row, and the height of each 2D slice of the 3D array. ::srcArray is + * ignored. 
+ * + * \par + * If ::srcMemoryType is ::CU_MEMORYTYPE_ARRAY, ::srcArray specifies the + * handle of the source data. ::srcHost, ::srcDevice, ::srcPitch and + * ::srcHeight are ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_UNIFIED, ::dstDevice and ::dstPitch + * specify the (unified virtual address space) base address of the source data + * and the bytes per row to apply. ::dstArray is ignored. + * This value may be used only if unified addressing is supported in the calling + * context. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_HOST, ::dstHost and ::dstPitch + * specify the (host) base address of the destination data, the bytes per row, + * and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_DEVICE, ::dstDevice and ::dstPitch + * specify the (device) base address of the destination data, the bytes per + * row, and the height of each 2D slice of the 3D array. ::dstArray is ignored. + * + * \par + * If ::dstMemoryType is ::CU_MEMORYTYPE_ARRAY, ::dstArray specifies the + * handle of the destination data. ::dstHost, ::dstDevice, ::dstPitch and + * ::dstHeight are ignored. + * + * - ::srcXInBytes, ::srcY and ::srcZ specify the base address of the source + * data for the copy. + * + * \par + * For host pointers, the starting address is + * \code + void* Start = (void*)((char*)srcHost+(srcZ*srcHeight+srcY)*srcPitch + srcXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr Start = srcDevice+(srcZ*srcHeight+srcY)*srcPitch+srcXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::srcXInBytes must be evenly divisible by the array + * element size. + * + * - dstXInBytes, ::dstY and ::dstZ specify the base address of the + * destination data for the copy. + * + * \par + * For host pointers, the base address is + * \code + void* dstStart = (void*)((char*)dstHost+(dstZ*dstHeight+dstY)*dstPitch + dstXInBytes); + * \endcode + * + * \par + * For device pointers, the starting address is + * \code + CUdeviceptr dstStart = dstDevice+(dstZ*dstHeight+dstY)*dstPitch+dstXInBytes; + * \endcode + * + * \par + * For CUDA arrays, ::dstXInBytes must be evenly divisible by the array + * element size. + * + * - ::WidthInBytes, ::Height and ::Depth specify the width (in bytes), height + * and depth of the 3D copy being performed. + * - If specified, ::srcPitch must be greater than or equal to ::WidthInBytes + + * ::srcXInBytes, and ::dstPitch must be greater than or equal to + * ::WidthInBytes + dstXInBytes. + * - If specified, ::srcHeight must be greater than or equal to ::Height + + * ::srcY, and ::dstHeight must be greater than or equal to ::Height + ::dstY. + * + * \par + * ::cuMemcpy3DAsync() returns an error if any pitch is greater than the maximum + * allowed (::CU_DEVICE_ATTRIBUTE_MAX_PITCH). + * + * The ::srcLOD and ::dstLOD members of the ::CUDA_MEMCPY3D structure must be + * set to 0. 
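+ *
+ * \par
+ * A minimal sketch of a host-to-device volume copy (illustrative only;
+ * h_vol, d_vol, rowBytes, height, depth and stream are assumed to exist):
+ * \code
+  CUDA_MEMCPY3D cpy;
+  memset(&cpy, 0, sizeof(cpy));            // also zeroes srcLOD/dstLOD as required
+  cpy.srcMemoryType = CU_MEMORYTYPE_HOST;
+  cpy.srcHost       = h_vol;
+  cpy.srcPitch      = rowBytes;            // bytes per row
+  cpy.srcHeight     = height;              // rows per 2D slice
+  cpy.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+  cpy.dstDevice     = d_vol;
+  cpy.dstPitch      = rowBytes;
+  cpy.dstHeight     = height;
+  cpy.WidthInBytes  = rowBytes;
+  cpy.Height        = height;
+  cpy.Depth         = depth;
+  cuMemcpy3DAsync(&cpy, stream);
+ * \endcode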
+ * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemcpy3DAsync + */ +CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream); + +/** + * \brief Copies memory between contexts asynchronously. + * + * Perform a 3D memory copy according to the parameters specified in + * \p pCopy. See the definition of the ::CUDA_MEMCPY3D_PEER structure + * for documentation of its parameters. + * + * \param pCopy - Parameters for the memory copy + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpyDtoD, ::cuMemcpyPeer, ::cuMemcpyDtoDAsync, ::cuMemcpyPeerAsync, + * ::cuMemcpy3DPeerAsync, + * ::cudaMemcpy3DPeerAsync + */ +CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 8-bit values to the specified value + * \p uc. 
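+ *
+ * \par
+ * For example (illustrative sketch, error checking omitted):
+ * \code
+  CUdeviceptr d_buf;
+  cuMemAlloc(&d_buf, 1024);
+  cuMemsetD8(d_buf, 0xff, 1024);   // fill all 1024 bytes with 0xff
+ * \endcode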
+ * + * \param dstDevice - Destination device pointer + * \param uc - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 16-bit values to the specified value + * \p us. The \p dstDevice pointer must be two byte aligned. + * + * \param dstDevice - Destination device pointer + * \param us - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the memory range of \p N 32-bit values to the specified value + * \p ui. The \p dstDevice pointer must be four byte aligned. 
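+ *
+ * \par
+ * One common use is filling a float buffer with a constant by passing its bit
+ * pattern; a minimal sketch (illustrative only):
+ * \code
+  CUdeviceptr d_vals;
+  cuMemAlloc(&d_vals, 256 * sizeof(float));
+  // 0x3f800000 is the IEEE-754 bit pattern of 1.0f; N counts 32-bit elements
+  cuMemsetD32(d_vals, 0x3f800000, 256);
+ * \endcode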
+ * + * \param dstDevice - Destination device pointer + * \param ui - Value to set + * \param N - Number of elements + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32Async, + * ::cudaMemset + */ +CUresult CUDAAPI cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 8-bit values to the specified value + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) + * \param uc - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 16-bit values to the specified value + * \p us. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be two byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). 
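+ *
+ * \par
+ * A brief sketch that clears a pitched width x height surface of 16-bit
+ * texels (width and height are assumed; note that \p Width counts 16-bit
+ * elements while \p dstPitch is in bytes):
+ * \code
+  CUdeviceptr d_img;
+  size_t pitch;
+  // the last argument is the element size used for alignment; it must be 4, 8 or 16
+  cuMemAllocPitch(&d_img, &pitch, width * sizeof(unsigned short), height, 4);
+  cuMemsetD2D16(d_img, pitch, 0, width, height);
+ * \endcode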
+ * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) + * \param us - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); + +/** + * \brief Initializes device memory + * + * Sets the 2D memory range of \p Width 32-bit values to the specified value + * \p ui. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be four byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) + * \param ui - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2D + */ +CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 8-bit values to the specified value + * \p uc. 
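+ *
+ * \par
+ * A minimal sketch, assuming d_buf, numBytes and stream already exist:
+ * \code
+  // clear the buffer in stream order; work queued on the same stream
+  // afterwards observes the cleared contents
+  cuMemsetD8Async(d_buf, 0, numBytes, stream);
+ * \endcode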
+ * + * \param dstDevice - Destination device pointer + * \param uc - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 16-bit values to the specified value + * \p us. The \p dstDevice pointer must be two byte aligned. + * + * \param dstDevice - Destination device pointer + * \param us - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the memory range of \p N 32-bit values to the specified value + * \p ui. The \p dstDevice pointer must be four byte aligned. 
+ * + * \param dstDevice - Destination device pointer + * \param ui - Value to set + * \param N - Number of elements + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, ::cuMemsetD32, + * ::cudaMemsetAsync + */ +CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 8-bit values to the specified value + * \p uc. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) + * \param uc - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 16-bit values to the specified value + * \p us. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be two byte aligned. 
This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). + * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) + * \param us - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D32, ::cuMemsetD2D32Async, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Sets device memory + * + * Sets the 2D memory range of \p Width 32-bit values to the specified value + * \p ui. \p Height specifies the number of rows to set, and \p dstPitch + * specifies the number of bytes between each row. The \p dstDevice pointer + * and \p dstPitch offset must be four byte aligned. This function performs + * fastest when the pitch is one that has been passed back by + * ::cuMemAllocPitch(). 
+ * + * \param dstDevice - Destination device pointer + * \param dstPitch - Pitch of destination device pointer(Unused if \p Height is 1) + * \param ui - Value to set + * \param Width - Width of row + * \param Height - Number of rows + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * \note_memset + * \note_null_stream + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D8Async, + * ::cuMemsetD2D16, ::cuMemsetD2D16Async, ::cuMemsetD2D32, + * ::cuMemsetD8, ::cuMemsetD8Async, ::cuMemsetD16, ::cuMemsetD16Async, + * ::cuMemsetD32, ::cuMemsetD32Async, + * ::cudaMemset2DAsync + */ +CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); + +/** + * \brief Creates a 1D or 2D CUDA array + * + * Creates a CUDA array according to the ::CUDA_ARRAY_DESCRIPTOR structure + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. + * The ::CUDA_ARRAY_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + CUarray_format Format; + unsigned int NumChannels; + } CUDA_ARRAY_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, and \p Height are the width, and height of the CUDA array (in + * elements); the CUDA array is one-dimensional if height is 0, two-dimensional + * otherwise; + * - ::Format specifies the format of the elements; ::CUarray_format is + * defined as: + * \code + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + CU_AD_FORMAT_SIGNED_INT8 = 0x08, + CU_AD_FORMAT_SIGNED_INT16 = 0x09, + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + CU_AD_FORMAT_HALF = 0x10, + CU_AD_FORMAT_FLOAT = 0x20 + } CUarray_format; + * \endcode + * - \p NumChannels specifies the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * + * Here are examples of CUDA array descriptions: + * + * Description for a CUDA array of 2048 floats: + * \code + CUDA_ARRAY_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 2048; + desc.Height = 1; + * \endcode + * + * Description for a 64 x 64 CUDA array of floats: + * \code + CUDA_ARRAY_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 64; + desc.Height = 64; + * \endcode + * + * Description for a \p width x \p height CUDA array of 64-bit, 4x16-bit + * float16's: + * \code + CUDA_ARRAY_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_HALF; + desc.NumChannels = 4; + desc.Width = width; + desc.Height = height; + * \endcode + * + * Description for a \p width x \p height CUDA array of 16-bit elements, each + * of which is two 8-bit unsigned chars: + * \code + 
CUDA_ARRAY_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
+    desc.NumChannels = 2;
+    desc.Width = width;
+    desc.Height = height;
+ * \endcode
+ *
+ * \param pHandle - Returned array
+ * \param pAllocateArray - Array descriptor
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_OUT_OF_MEMORY,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor,
+ * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaMallocArray
+ */
+CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray);
+
+/**
+ * \brief Get a 1D or 2D CUDA array descriptor
+ *
+ * Returns in \p *pArrayDescriptor a descriptor containing information on the
+ * format and dimensions of the CUDA array \p hArray. It is useful for
+ * subroutines that have been passed a CUDA array, but need to know the CUDA
+ * array parameters for validation or other purposes.
+ *
+ * \param pArrayDescriptor - Returned array descriptor
+ * \param hArray - Array to get descriptor of
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_HANDLE
+ * \notefnerr
+ *
+ * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate,
+ * ::cuArrayDestroy, ::cuMemAlloc, ::cuMemAllocHost,
+ * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned,
+ * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD,
+ * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync,
+ * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync,
+ * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost,
+ * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc,
+ * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16,
+ * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32,
+ * ::cudaArrayGetInfo
+ */
+CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray);
+
+/**
+ * \brief Returns the layout properties of a sparse CUDA array
+ *
+ * Returns the layout properties of a sparse CUDA array in \p sparseProperties.
+ * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_SPARSE,
+ * ::CUDA_ERROR_INVALID_VALUE will be returned.
+ *
+ * If the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL,
+ * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize represents the total size of the array. Otherwise, it will be zero.
+ * Also, the returned value in ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is always zero.
+ * Note that the \p array must have been allocated using ::cuArrayCreate or ::cuArray3DCreate.
For CUDA arrays obtained + * using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. Instead, ::cuMipmappedArrayGetSparseProperties + * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs to. + * + * \return + * ::CUDA_SUCCESS + * ::CUDA_ERROR_INVALID_VALUE + * + * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES + * \param[in] array - CUDA array to get the sparse properties of + * \sa ::cuMipmappedArrayGetSparseProperties, ::cuMemMapArrayAsync + */ +CUresult CUDAAPI cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array); + +/** + * \brief Returns the layout properties of a sparse CUDA mipmapped array + * + * Returns the sparse array layout properties in \p sparseProperties + * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_SPARSE + * ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * For non-layered CUDA mipmapped arrays, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize returns the + * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth + * is less than that of the tile. + * For layered CUDA mipmapped arrays, if ::CUDA_ARRAY_SPARSE_PROPERTIES::flags contains ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL, + * then ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies the size of the mip tail of all layers combined. + * Otherwise, ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize specifies mip tail size per layer. + * The returned value of ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailFirstLevel is valid only if ::CUDA_ARRAY_SPARSE_PROPERTIES::miptailSize is non-zero. + * + * \return + * ::CUDA_SUCCESS + * ::CUDA_ERROR_INVALID_VALUE + * + * \param[out] sparseProperties - Pointer to ::CUDA_ARRAY_SPARSE_PROPERTIES + * \param[in] mipmap - CUDA mipmapped array to get the sparse properties of + * \sa ::cuArrayGetSparseProperties, ::cuMemMapArrayAsync + */ +CUresult CUDAAPI cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap); + +/** + * \brief Returns the memory requirements of a CUDA array + * + * Returns the memory requirements of a CUDA array in \p memoryRequirements + * If the CUDA array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING + * ::CUDA_ERROR_INVALID_VALUE will be returned. + * + * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size + * represents the total size of the CUDA array. + * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment + * represents the alignment necessary for mapping the CUDA array. + * + * \return + * ::CUDA_SUCCESS + * ::CUDA_ERROR_INVALID_VALUE + * + * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS + * \param[in] array - CUDA array to get the memory requirements of + * \param[in] device - Device to get the memory requirements for + * \sa ::cuMipmappedArrayGetMemoryRequirements, ::cuMemMapArrayAsync + */ +CUresult CUDAAPI cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUarray array, CUdevice device); + +/** + * \brief Returns the memory requirements of a CUDA mipmapped array + * + * Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements + * If the CUDA mipmapped array is not allocated with flag ::CUDA_ARRAY3D_DEFERRED_MAPPING + * ::CUDA_ERROR_INVALID_VALUE will be returned. 
+ * + * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::size + * represents the total size of the CUDA mipmapped array. + * The returned value in ::CUDA_ARRAY_MEMORY_REQUIREMENTS::alignment + * represents the alignment necessary for mapping the CUDA mipmapped + * array. + * + * \return + * ::CUDA_SUCCESS + * ::CUDA_ERROR_INVALID_VALUE + * + * \param[out] memoryRequirements - Pointer to ::CUDA_ARRAY_MEMORY_REQUIREMENTS + * \param[in] mipmap - CUDA mipmapped array to get the memory requirements of + * \param[in] device - Device to get the memory requirements for + * \sa ::cuArrayGetMemoryRequirements, ::cuMemMapArrayAsync + */ +CUresult CUDAAPI cuMipmappedArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUmipmappedArray mipmap, CUdevice device); + +/** + * \brief Gets a CUDA array plane from a CUDA array + * + * Returns in \p pPlaneArray a CUDA array that represents a single format plane + * of the CUDA array \p hArray. + * + * If \p planeIdx is greater than the maximum number of planes in this array or if the array does + * not have a multi-planar format e.g: ::CU_AD_FORMAT_NV12, then ::CUDA_ERROR_INVALID_VALUE is returned. + * + * Note that if the \p hArray has format ::CU_AD_FORMAT_NV12, then passing in 0 for \p planeIdx returns + * a CUDA array of the same size as \p hArray but with one channel and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. + * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width + * of \p hArray with two channels and ::CU_AD_FORMAT_UNSIGNED_INT8 as its format. + * + * \param pPlaneArray - Returned CUDA array referenced by the \p planeIdx + * \param hArray - Multiplanar CUDA array + * \param planeIdx - Plane index + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::cuArrayCreate, + * ::cudaGetArrayPlane + */ +CUresult CUDAAPI cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx); + +/** + * \brief Destroys a CUDA array + * + * Destroys the CUDA array \p hArray. + * + * \param hArray - Array to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ARRAY_IS_MAPPED, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaFreeArray + */ +CUresult CUDAAPI cuArrayDestroy(CUarray hArray); + +/** + * \brief Creates a 3D CUDA array + * + * Creates a CUDA array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure + * \p pAllocateArray and returns a handle to the new CUDA array in \p *pHandle. 
+ * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + unsigned int Depth; + CUarray_format Format; + unsigned int NumChannels; + unsigned int Flags; + } CUDA_ARRAY3D_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the + * CUDA array (in elements); the following types of CUDA arrays can be allocated: + * - A 1D array is allocated if \p Height and \p Depth extents are both zero. + * - A 2D array is allocated if only \p Depth extent is zero. + * - A 3D array is allocated if all three extents are non-zero. + * - A 1D layered CUDA array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number + * of layers is determined by the depth extent. + * - A 2D layered CUDA array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number + * of layers is determined by the depth extent. + * - A cubemap CUDA array is allocated if all three extents are non-zero and the + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, + * where the six layers represent the six faces of a cube. The order of the six + * layers in memory is the same as that listed in ::CUarray_cubemap_face. + * - A cubemap layered CUDA array is allocated if all three extents are non-zero, + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. + * A cubemap layered CUDA array is a special type of 2D layered CUDA array that + * consists of a collection of cubemaps. The first six layers represent the first + * cubemap, the next six layers form the second cubemap, and so on. + * + * - ::Format specifies the format of the elements; ::CUarray_format is + * defined as: + * \code + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + CU_AD_FORMAT_SIGNED_INT8 = 0x08, + CU_AD_FORMAT_SIGNED_INT16 = 0x09, + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + CU_AD_FORMAT_HALF = 0x10, + CU_AD_FORMAT_FLOAT = 0x20 + } CUarray_format; + * \endcode + * + * - \p NumChannels specifies the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * + * - ::Flags may be set to + * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA arrays. If this flag is set, + * \p Depth specifies the number of layers, not the depth of a 3D array. + * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to the CUDA array. + * If this flag is not set, ::cuSurfRefSetArray will fail when attempting to bind the CUDA array + * to a surface reference. + * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of cubemaps. If this flag is set, \p Width must be + * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, + * then \p Depth must be a multiple of six. + * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA array will be used for texture gather. + * Texture gather can only be performed on 2D CUDA arrays. + * + * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. + * All values are specified in elements. 
Note that for brevity's sake, the full name of the device attribute + * is not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH. + * + * Note that 2D CUDA arrays have different size requirements if the ::CUDA_ARRAY3D_TEXTURE_GATHER flag + * is set. \p Width and \p Height must not be greater than ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH + * and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT respectively, in that case. + * + * <table> + * <tr><td><b>CUDA array type</b></td> + * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range), + * (depth range)}</b></td> + * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br> + * {(width range in elements), (height range), (depth range)}</b></td></tr> + * <tr><td>1D</td> + * <td><small>{ (1,TEXTURE1D_WIDTH), 0, 0 }</small></td> + * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr> + * <tr><td>2D</td> + * <td><small>{ (1,TEXTURE2D_WIDTH), (1,TEXTURE2D_HEIGHT), 0 }</small></td> + * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr> + * <tr><td>3D</td> + * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) } + * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), + * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td> + * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), + * (1,SURFACE3D_DEPTH) }</small></td></tr> + * <tr><td>1D Layered</td> + * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0, + * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td> + * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0, + * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr> + * <tr><td>2D Layered</td> + * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), + * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td> + * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), + * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr> + * <tr><td>Cubemap</td> + * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td> + * <td><small>{ (1,SURFACECUBEMAP_WIDTH), + * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr> + * <tr><td>Cubemap Layered</td> + * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), + * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td> + * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), + * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr> + * </table> + * + * Here are examples of CUDA array descriptions: + * + * Description for a CUDA array of 2048 floats: + * \code + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 2048; + desc.Height = 0; + desc.Depth = 0; + * \endcode + * + * Description for a 64 x 64 CUDA array of floats: + * \code + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_FLOAT; + desc.NumChannels = 1; + desc.Width = 64; + desc.Height = 64; + desc.Depth = 0; + * \endcode + * + * Description for a \p width x \p height x \p depth CUDA array of 64-bit, + * 4x16-bit float16's: + * \code + CUDA_ARRAY3D_DESCRIPTOR desc; + desc.Format = CU_AD_FORMAT_HALF; + desc.NumChannels = 4; + desc.Width = width; + desc.Height = height; + desc.Depth = depth; + * \endcode + * + * \param pHandle - Returned array + * \param pAllocateArray - 3D array descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * 
::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuArray3DGetDescriptor, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaMalloc3DArray + */ +CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray); + +/** + * \brief Get a 3D CUDA array descriptor + * + * Returns in \p *pArrayDescriptor a descriptor containing information on the + * format and dimensions of the CUDA array \p hArray. It is useful for + * subroutines that have been passed a CUDA array, but need to know the CUDA + * array parameters for validation or other purposes. + * + * This function may be called on 1D and 2D arrays, in which case the \p Height + * and/or \p Depth members of the descriptor struct will be set to 0. + * + * \param pArrayDescriptor - Returned 3D array descriptor + * \param hArray - 3D array to get descriptor of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * \notefnerr + * + * \sa ::cuArray3DCreate, ::cuArrayCreate, + * ::cuArrayDestroy, ::cuArrayGetDescriptor, ::cuMemAlloc, ::cuMemAllocHost, + * ::cuMemAllocPitch, ::cuMemcpy2D, ::cuMemcpy2DAsync, ::cuMemcpy2DUnaligned, + * ::cuMemcpy3D, ::cuMemcpy3DAsync, ::cuMemcpyAtoA, ::cuMemcpyAtoD, + * ::cuMemcpyAtoH, ::cuMemcpyAtoHAsync, ::cuMemcpyDtoA, ::cuMemcpyDtoD, ::cuMemcpyDtoDAsync, + * ::cuMemcpyDtoH, ::cuMemcpyDtoHAsync, ::cuMemcpyHtoA, ::cuMemcpyHtoAAsync, + * ::cuMemcpyHtoD, ::cuMemcpyHtoDAsync, ::cuMemFree, ::cuMemFreeHost, + * ::cuMemGetAddressRange, ::cuMemGetInfo, ::cuMemHostAlloc, + * ::cuMemHostGetDevicePointer, ::cuMemsetD2D8, ::cuMemsetD2D16, + * ::cuMemsetD2D32, ::cuMemsetD8, ::cuMemsetD16, ::cuMemsetD32, + * ::cudaArrayGetInfo + */ +CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray); + +/** + * \brief Creates a CUDA mipmapped array + * + * Creates a CUDA mipmapped array according to the ::CUDA_ARRAY3D_DESCRIPTOR structure + * \p pMipmappedArrayDesc and returns a handle to the new CUDA mipmapped array in \p *pHandle. + * \p numMipmapLevels specifies the number of mipmap levels to be allocated. This value is + * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. 
+ * + * The ::CUDA_ARRAY3D_DESCRIPTOR is defined as: + * + * \code + typedef struct { + unsigned int Width; + unsigned int Height; + unsigned int Depth; + CUarray_format Format; + unsigned int NumChannels; + unsigned int Flags; + } CUDA_ARRAY3D_DESCRIPTOR; + * \endcode + * where: + * + * - \p Width, \p Height, and \p Depth are the width, height, and depth of the + * CUDA array (in elements); the following types of CUDA arrays can be allocated: + * - A 1D mipmapped array is allocated if \p Height and \p Depth extents are both zero. + * - A 2D mipmapped array is allocated if only \p Depth extent is zero. + * - A 3D mipmapped array is allocated if all three extents are non-zero. + * - A 1D layered CUDA mipmapped array is allocated if only \p Height is zero and the + * ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 1D array. The number + * of layers is determined by the depth extent. + * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and + * the ::CUDA_ARRAY3D_LAYERED flag is set. Each layer is a 2D array. The number + * of layers is determined by the depth extent. + * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the + * ::CUDA_ARRAY3D_CUBEMAP flag is set. \p Width must be equal to \p Height, and + * \p Depth must be six. A cubemap is a special type of 2D layered CUDA array, + * where the six layers represent the six faces of a cube. The order of the six + * layers in memory is the same as that listed in ::CUarray_cubemap_face. + * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, + * and both, ::CUDA_ARRAY3D_CUBEMAP and ::CUDA_ARRAY3D_LAYERED flags are set. + * \p Width must be equal to \p Height, and \p Depth must be a multiple of six. + * A cubemap layered CUDA array is a special type of 2D layered CUDA array that + * consists of a collection of cubemaps. The first six layers represent the first + * cubemap, the next six layers form the second cubemap, and so on. + * + * - ::Format specifies the format of the elements; ::CUarray_format is + * defined as: + * \code + typedef enum CUarray_format_enum { + CU_AD_FORMAT_UNSIGNED_INT8 = 0x01, + CU_AD_FORMAT_UNSIGNED_INT16 = 0x02, + CU_AD_FORMAT_UNSIGNED_INT32 = 0x03, + CU_AD_FORMAT_SIGNED_INT8 = 0x08, + CU_AD_FORMAT_SIGNED_INT16 = 0x09, + CU_AD_FORMAT_SIGNED_INT32 = 0x0a, + CU_AD_FORMAT_HALF = 0x10, + CU_AD_FORMAT_FLOAT = 0x20 + } CUarray_format; + * \endcode + * + * - \p NumChannels specifies the number of packed components per CUDA array + * element; it may be 1, 2, or 4; + * + * - ::Flags may be set to + * - ::CUDA_ARRAY3D_LAYERED to enable creation of layered CUDA mipmapped arrays. If this flag is set, + * \p Depth specifies the number of layers, not the depth of a 3D array. + * - ::CUDA_ARRAY3D_SURFACE_LDST to enable surface references to be bound to individual mipmap levels of + * the CUDA mipmapped array. If this flag is not set, ::cuSurfRefSetArray will fail when attempting to + * bind a mipmap level of the CUDA mipmapped array to a surface reference. + * - ::CUDA_ARRAY3D_CUBEMAP to enable creation of mipmapped cubemaps. If this flag is set, \p Width must be + * equal to \p Height, and \p Depth must be six. If the ::CUDA_ARRAY3D_LAYERED flag is also set, + * then \p Depth must be a multiple of six. + * - ::CUDA_ARRAY3D_TEXTURE_GATHER to indicate that the CUDA mipmapped array will be used for texture gather. + * Texture gather can only be performed on 2D CUDA mipmapped arrays. 
+ * + * \p Width, \p Height and \p Depth must meet certain size requirements as listed in the following table. + * All values are specified in elements. Note that for brevity's sake, the full name of the device attribute + * is not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device attribute + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH. + * + * <table> + * <tr><td><b>CUDA array type</b></td> + * <td><b>Valid extents that must always be met<br>{(width range in elements), (height range), + * (depth range)}</b></td> + * <td><b>Valid extents with CUDA_ARRAY3D_SURFACE_LDST set<br> + * {(width range in elements), (height range), (depth range)}</b></td></tr> + * <tr><td>1D</td> + * <td><small>{ (1,TEXTURE1D_MIPMAPPED_WIDTH), 0, 0 }</small></td> + * <td><small>{ (1,SURFACE1D_WIDTH), 0, 0 }</small></td></tr> + * <tr><td>2D</td> + * <td><small>{ (1,TEXTURE2D_MIPMAPPED_WIDTH), (1,TEXTURE2D_MIPMAPPED_HEIGHT), 0 }</small></td> + * <td><small>{ (1,SURFACE2D_WIDTH), (1,SURFACE2D_HEIGHT), 0 }</small></td></tr> + * <tr><td>3D</td> + * <td><small>{ (1,TEXTURE3D_WIDTH), (1,TEXTURE3D_HEIGHT), (1,TEXTURE3D_DEPTH) } + * <br>OR<br>{ (1,TEXTURE3D_WIDTH_ALTERNATE), (1,TEXTURE3D_HEIGHT_ALTERNATE), + * (1,TEXTURE3D_DEPTH_ALTERNATE) }</small></td> + * <td><small>{ (1,SURFACE3D_WIDTH), (1,SURFACE3D_HEIGHT), + * (1,SURFACE3D_DEPTH) }</small></td></tr> + * <tr><td>1D Layered</td> + * <td><small>{ (1,TEXTURE1D_LAYERED_WIDTH), 0, + * (1,TEXTURE1D_LAYERED_LAYERS) }</small></td> + * <td><small>{ (1,SURFACE1D_LAYERED_WIDTH), 0, + * (1,SURFACE1D_LAYERED_LAYERS) }</small></td></tr> + * <tr><td>2D Layered</td> + * <td><small>{ (1,TEXTURE2D_LAYERED_WIDTH), (1,TEXTURE2D_LAYERED_HEIGHT), + * (1,TEXTURE2D_LAYERED_LAYERS) }</small></td> + * <td><small>{ (1,SURFACE2D_LAYERED_WIDTH), (1,SURFACE2D_LAYERED_HEIGHT), + * (1,SURFACE2D_LAYERED_LAYERS) }</small></td></tr> + * <tr><td>Cubemap</td> + * <td><small>{ (1,TEXTURECUBEMAP_WIDTH), (1,TEXTURECUBEMAP_WIDTH), 6 }</small></td> + * <td><small>{ (1,SURFACECUBEMAP_WIDTH), + * (1,SURFACECUBEMAP_WIDTH), 6 }</small></td></tr> + * <tr><td>Cubemap Layered</td> + * <td><small>{ (1,TEXTURECUBEMAP_LAYERED_WIDTH), (1,TEXTURECUBEMAP_LAYERED_WIDTH), + * (1,TEXTURECUBEMAP_LAYERED_LAYERS) }</small></td> + * <td><small>{ (1,SURFACECUBEMAP_LAYERED_WIDTH), (1,SURFACECUBEMAP_LAYERED_WIDTH), + * (1,SURFACECUBEMAP_LAYERED_LAYERS) }</small></td></tr> + * </table> + * + * + * \param pHandle - Returned mipmapped array + * \param pMipmappedArrayDesc - mipmapped array descriptor + * \param numMipmapLevels - Number of mipmap levels + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuMipmappedArrayDestroy, + * ::cuMipmappedArrayGetLevel, + * ::cuArrayCreate, + * ::cudaMallocMipmappedArray + */ +CUresult CUDAAPI cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels); + +/** + * \brief Gets a mipmap level of a CUDA mipmapped array + * + * Returns in \p *pLevelArray a CUDA array that represents a single mipmap level + * of the CUDA mipmapped array \p hMipmappedArray. + * + * If \p level is greater than the maximum number of levels in this mipmapped array, + * ::CUDA_ERROR_INVALID_VALUE is returned. 
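+ *
+ * As a minimal illustrative sketch (error checking omitted), a 256 x 256
+ * mipmapped array with a full mip chain can be created and its base level
+ * retrieved like this:
+ * \code
+    CUDA_ARRAY3D_DESCRIPTOR desc;
+    desc.Format = CU_AD_FORMAT_FLOAT;
+    desc.NumChannels = 4;
+    desc.Width = 256;
+    desc.Height = 256;
+    desc.Depth = 0;
+    desc.Flags = 0;
+    CUmipmappedArray hMipmappedArray;
+    cuMipmappedArrayCreate(&hMipmappedArray, &desc, 9); /* 1 + log2(256) levels */
+    CUarray hLevel0;
+    cuMipmappedArrayGetLevel(&hLevel0, hMipmappedArray, 0);
+ * \endcode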
+ * + * \param pLevelArray - Returned mipmap level CUDA array + * \param hMipmappedArray - CUDA mipmapped array + * \param level - Mipmap level + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::cuMipmappedArrayCreate, + * ::cuMipmappedArrayDestroy, + * ::cuArrayCreate, + * ::cudaGetMipmappedArrayLevel + */ +CUresult CUDAAPI cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); + +/** + * \brief Destroys a CUDA mipmapped array + * + * Destroys the CUDA mipmapped array \p hMipmappedArray. + * + * \param hMipmappedArray - Mipmapped array to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ARRAY_IS_MAPPED, + * ::CUDA_ERROR_CONTEXT_IS_DESTROYED + * \notefnerr + * + * \sa + * ::cuMipmappedArrayCreate, + * ::cuMipmappedArrayGetLevel, + * ::cuArrayCreate, + * ::cudaFreeMipmappedArray + */ +CUresult CUDAAPI cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray); + +/** +* \brief Retrieve handle for an address range +* +* Get a handle of the specified type to an address range. The address range +* must have been obtained by a prior call to either ::cuMemAlloc or ::cuMemAddressReserve. +* If the address range was obtained via ::cuMemAddressReserve, it must also be fully mapped via ::cuMemMap. +* +* Users must ensure the \p dptr and \p size are aligned to the host page size. +* +* When requesting CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, +* users are expected to query for dma_buf support for the platform +* by using ::CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED device attribute before calling +* this API. The \p handle will be interpreted as a pointer to an integer to store the dma_buf file descriptor. +* Users must ensure the entire address range is backed and mapped when +* the address range is allocated by ::cuMemAddressReserve. All the physical +* allocations backing the address range must be resident on the same device and +* have identical allocation properties. Users are also expected to retrieve a +* new handle every time the underlying physical allocation(s) corresponding +* to a previously queried VA range are changed. +* +* \param[out] handle - Pointer to the location where the returned handle will be stored. +* \param[in] dptr - Pointer to a valid CUDA device allocation. Must be aligned to host page size. +* \param[in] size - Length of the address range. Must be aligned to host page size. +* \param[in] handleType - Type of handle requested (defines type and size of the \p handle output parameter) +* \param[in] flags - Reserved, must be zero +* +* \return +* CUDA_SUCCESS +* CUDA_ERROR_INVALID_VALUE +* CUDA_ERROR_NOT_SUPPORTED +*/ +CUresult CUDAAPI cuMemGetHandleForAddressRange(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags); + +/** @} */ /* END CUDA_MEM */ + +/** + * \defgroup CUDA_VA Virtual Memory Management + * + * ___MANBRIEF___ virtual memory management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the virtual memory management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** +* \brief Allocate an address range reservation. 
+*
+* Reserves a virtual address range based on the given parameters, giving
+* the starting address of the range in \p ptr. This API requires a system that
+* supports UVA. The size and address parameters must be a multiple of the
+* host page size and the alignment must be a power of two or zero for default
+* alignment.
+*
+* \param[out] ptr - Resulting pointer to start of virtual address range allocated
+* \param[in] size - Size of the reserved virtual address range requested
+* \param[in] alignment - Alignment of the reserved virtual address range requested
+* \param[in] addr - Fixed starting address range requested
+* \param[in] flags - Currently unused, must be zero
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemAddressFree
+*/
+CUresult CUDAAPI cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags);
+
+/**
+* \brief Free an address range reservation.
+*
+* Frees a virtual address range reserved by ::cuMemAddressReserve. The size
+* must match what was given to ::cuMemAddressReserve and the ptr given must
+* match what was returned from ::cuMemAddressReserve.
+*
+* \param[in] ptr - Starting address of the virtual address range to free
+* \param[in] size - Size of the virtual address region to free
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemAddressReserve
+*/
+CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size);
+
+/**
+* \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties
+*
+* This creates a memory allocation on the target device specified through the
+* \p prop structure. The created allocation will not have any device or host
+* mappings. The generic memory \p handle for the allocation can be
+* mapped to the address space of the calling process via ::cuMemMap. This handle
+* cannot be transmitted directly to other processes (see
+* ::cuMemExportToShareableHandle). On Windows, the caller must also pass
+* an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which
+* limits or allows access to this handle for a recipient process (see
+* ::CUmemAllocationProp::win32HandleMetaData for more). The \p size of this
+* allocation must be a multiple of the value given via
+* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM
+* flag.
+* If ::CUmemAllocationProp::allocFlags::usage contains ::CU_MEM_CREATE_USAGE_TILE_POOL flag then
+* the memory allocation is intended only to be used as backing tile pool for sparse CUDA arrays
+* and sparse CUDA mipmapped arrays.
+* (see ::cuMemMapArrayAsync).
+*
+* \param[out] handle - Value of handle returned. All operations on this allocation are to be performed using this handle.
+* \param[in] size - Size of the allocation requested
+* \param[in] prop - Properties of the allocation to create.
+* \param[in] flags - flags for future use, must be zero now.
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_OUT_OF_MEMORY,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemRelease, ::cuMemExportToShareableHandle, ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags);
+
+/**
+* \brief Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate.
+*
+* Frees the memory that was allocated on a device through cuMemCreate.
+*
+* The memory allocation will be freed when all outstanding mappings to the memory
+* are unmapped and when all outstanding references to the handle (including its
+* shareable counterparts) are also released. The generic memory handle can be
+* freed when there are still outstanding mappings made with this handle. Each
+* time a recipient process imports a shareable handle, it needs to pair it with
+* ::cuMemRelease for the handle to be freed. If \p handle is not a valid handle,
+* the behavior is undefined.
+*
+* \param[in] handle Value of handle which was returned previously by cuMemCreate.
+* \return
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+*
+* \sa ::cuMemCreate
+*/
+CUresult CUDAAPI cuMemRelease(CUmemGenericAllocationHandle handle);
+
+/**
+* \brief Maps an allocation handle to a reserved virtual address range.
+*
+* Maps bytes of memory represented by \p handle starting from byte \p offset to
+* \p size to address range [\p addr, \p addr + \p size]. This range must be an
+* address reservation previously reserved with ::cuMemAddressReserve, and
+* \p offset + \p size must be less than the size of the memory allocation.
+* \p ptr, \p size, and \p offset must each be a multiple of the value given via
+* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM flag.
+*
+* Please note calling ::cuMemMap does not make the address accessible;
+* the caller needs to update accessibility of a contiguous mapped VA
+* range by calling ::cuMemSetAccess.
+*
+* Once a recipient process obtains a shareable memory handle
+* from ::cuMemImportFromShareableHandle, the process must
+* use ::cuMemMap to map the memory into its address ranges before
+* setting accessibility with ::cuMemSetAccess.
+*
+* ::cuMemMap can only create mappings on VA range reservations
+* that are not currently mapped.
+*
+* \param[in] ptr - Address where memory will be mapped.
+* \param[in] size - Size of the memory mapping.
+* \param[in] offset - Offset into the memory represented by
+* - \p handle from which to start mapping
+* - Note: currently must be zero.
+* \param[in] handle - Handle to a shareable memory
+* \param[in] flags - flags for future use, must be zero now.
+* \return +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_INVALID_DEVICE, +* ::CUDA_ERROR_OUT_OF_MEMORY, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* \notefnerr +* +* \sa ::cuMemUnmap, ::cuMemSetAccess, ::cuMemCreate, ::cuMemAddressReserve, ::cuMemImportFromShareableHandle +*/ +CUresult CUDAAPI cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags); + +/** + * \brief Maps or unmaps subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays + * + * Performs map or unmap operations on subregions of sparse CUDA arrays and sparse CUDA mipmapped arrays. + * Each operation is specified by a ::CUarrayMapInfo entry in the \p mapInfoList array of size \p count. + * The structure ::CUarrayMapInfo is defined as follow: + \code + typedef struct CUarrayMapInfo_st { + CUresourcetype resourceType; + union { + CUmipmappedArray mipmap; + CUarray array; + } resource; + + CUarraySparseSubresourceType subresourceType; + union { + struct { + unsigned int level; + unsigned int layer; + unsigned int offsetX; + unsigned int offsetY; + unsigned int offsetZ; + unsigned int extentWidth; + unsigned int extentHeight; + unsigned int extentDepth; + } sparseLevel; + struct { + unsigned int layer; + unsigned long long offset; + unsigned long long size; + } miptail; + } subresource; + + CUmemOperationType memOperationType; + + CUmemHandleType memHandleType; + union { + CUmemGenericAllocationHandle memHandle; + } memHandle; + + unsigned long long offset; + unsigned int deviceBitMask; + unsigned int flags; + unsigned int reserved[2]; + } CUarrayMapInfo; + \endcode + * + * where ::CUarrayMapInfo::resourceType specifies the type of resource to be operated on. + * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_ARRAY then + * ::CUarrayMapInfo::resource::array must be set to a valid sparse CUDA array handle. + * The CUDA array must be either a 2D, 2D layered or 3D CUDA array and must have been allocated using + * ::cuArrayCreate or ::cuArray3DCreate with the flag ::CUDA_ARRAY3D_SPARSE + * or ::CUDA_ARRAY3D_DEFERRED_MAPPING. + * For CUDA arrays obtained using ::cuMipmappedArrayGetLevel, ::CUDA_ERROR_INVALID_VALUE will be returned. + * If ::CUarrayMapInfo::resourceType is set to ::CUresourcetype::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY + * then ::CUarrayMapInfo::resource::mipmap must be set to a valid sparse CUDA mipmapped array handle. + * The CUDA mipmapped array must be either a 2D, 2D layered or 3D CUDA mipmapped array and must have been + * allocated using ::cuMipmappedArrayCreate with the flag ::CUDA_ARRAY3D_SPARSE + * or ::CUDA_ARRAY3D_DEFERRED_MAPPING. + * + * ::CUarrayMapInfo::subresourceType specifies the type of subresource within the resource. + * ::CUarraySparseSubresourceType_enum is defined as: + \code + typedef enum CUarraySparseSubresourceType_enum { + CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL = 0, + CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL = 1 + } CUarraySparseSubresourceType; + \endcode + * + * where ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL indicates a + * sparse-miplevel which spans at least one tile in every dimension. The remaining miplevels which + * are too small to span at least one tile in any dimension constitute the mip tail region as indicated by + * ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL subresource type. 
+ * + * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL + * then ::CUarrayMapInfo::subresource::sparseLevel struct must contain valid array subregion offsets and extents. + * The ::CUarrayMapInfo::subresource::sparseLevel::offsetX, ::CUarrayMapInfo::subresource::sparseLevel::offsetY + * and ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must specify valid X, Y and Z offsets respectively. + * The ::CUarrayMapInfo::subresource::sparseLevel::extentWidth, ::CUarrayMapInfo::subresource::sparseLevel::extentHeight + * and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth must specify valid width, height and depth extents respectively. + * These offsets and extents must be aligned to the corresponding tile dimension. + * For CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::level must specify a valid mip level index. Otherwise, + * must be zero. + * For layered CUDA arrays and layered CUDA mipmapped arrays ::CUarrayMapInfo::subresource::sparseLevel::layer must specify a valid layer index. Otherwise, + * must be zero. + * ::CUarrayMapInfo::subresource::sparseLevel::offsetZ must be zero and ::CUarrayMapInfo::subresource::sparseLevel::extentDepth + * must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped arrays. + * Tile extents can be obtained by calling ::cuArrayGetSparseProperties and ::cuMipmappedArrayGetSparseProperties + * + * If ::CUarrayMapInfo::subresourceType is set to ::CUarraySparseSubresourceType::CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL + * then ::CUarrayMapInfo::subresource::miptail struct must contain valid mip tail offset in + * ::CUarrayMapInfo::subresource::miptail::offset and size in ::CUarrayMapInfo::subresource::miptail::size. + * Both, mip tail offset and mip tail size must be aligned to the tile size. + * For layered CUDA mipmapped arrays which don't have the flag ::CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL set in ::CUDA_ARRAY_SPARSE_PROPERTIES::flags + * as returned by ::cuMipmappedArrayGetSparseProperties, ::CUarrayMapInfo::subresource::miptail::layer must specify a valid layer index. + * Otherwise, must be zero. + * + * If ::CUarrayMapInfo::resource::array or ::CUarrayMapInfo::resource::mipmap was created with ::CUDA_ARRAY3D_DEFERRED_MAPPING + * flag set the ::CUarrayMapInfo::subresourceType and the contents of ::CUarrayMapInfo::subresource will be ignored. + * + * ::CUarrayMapInfo::memOperationType specifies the type of operation. ::CUmemOperationType is defined as: + \code + typedef enum CUmemOperationType_enum { + CU_MEM_OPERATION_TYPE_MAP = 1, + CU_MEM_OPERATION_TYPE_UNMAP = 2 + } CUmemOperationType; + \endcode + * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP then the subresource + * will be mapped onto the tile pool memory specified by ::CUarrayMapInfo::memHandle at offset ::CUarrayMapInfo::offset. + * The tile pool allocation has to be created by specifying the ::CU_MEM_CREATE_USAGE_TILE_POOL flag when calling ::cuMemCreate. Also, + * ::CUarrayMapInfo::memHandleType must be set to ::CUmemHandleType::CU_MEM_HANDLE_TYPE_GENERIC. + * + * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_UNMAP then an unmapping operation + * is performed. ::CUarrayMapInfo::memHandle must be NULL. + * + * ::CUarrayMapInfo::deviceBitMask specifies the list of devices that must map or unmap physical memory. 
+ * Currently, this mask must have exactly one bit set, and the corresponding device must match the device associated with the stream. + * If ::CUarrayMapInfo::memOperationType is set to ::CUmemOperationType::CU_MEM_OPERATION_TYPE_MAP, the device must also match + * the device associated with the tile pool memory allocation as specified by ::CUarrayMapInfo::memHandle. + * + * ::CUarrayMapInfo::flags and ::CUarrayMapInfo::reserved[] are unused and must be set to zero. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * + * \param[in] mapInfoList - List of ::CUarrayMapInfo + * \param[in] count - Count of ::CUarrayMapInfo in \p mapInfoList + * \param[in] hStream - Stream identifier for the stream to use for map or unmap operations + * + * \sa ::cuMipmappedArrayCreate, ::cuArrayCreate, ::cuArray3DCreate, ::cuMemCreate, ::cuArrayGetSparseProperties, ::cuMipmappedArrayGetSparseProperties + */ +CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); + +/** +* \brief Unmap the backing memory of a given address range. +* +* The range must be the entire contiguous address range that was mapped to. In +* other words, ::cuMemUnmap cannot unmap a sub-range of an address range mapped +* by ::cuMemCreate / ::cuMemMap. Any backing memory allocations will be freed +* if there are no existing mappings and there are no unreleased memory handles. +* +* When ::cuMemUnmap returns successfully the address range is converted to an +* address reservation and can be used for a future calls to ::cuMemMap. Any new +* mapping to this virtual address will need to have access granted through +* ::cuMemSetAccess, as all mappings start with no accessibility setup. +* +* \param[in] ptr - Starting address for the virtual address range to unmap +* \param[in] size - Size of the virtual address range to unmap +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* \notefnerr +* \note_sync +* +* \sa ::cuMemCreate, ::cuMemAddressReserve +*/ +CUresult CUDAAPI cuMemUnmap(CUdeviceptr ptr, size_t size); + +/** +* \brief Set the access flags for each location specified in \p desc for the given virtual address range +* +* Given the virtual address range via \p ptr and \p size, and the locations +* in the array given by \p desc and \p count, set the access flags for the +* target locations. The range must be a fully mapped address range +* containing all allocations created by ::cuMemMap / ::cuMemCreate. 
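+*
+* As a minimal illustrative sketch (error checking omitted; device ordinal 0 and
+* memset-based zero-initialization of the descriptors are assumptions of this
+* example), a typical sequence that reserves an address range, creates a physical
+* allocation, maps it and then enables read/write access looks like:
+* \code
+    size_t granularity = 0;
+    CUmemAllocationProp prop;
+    memset(&prop, 0, sizeof(prop));
+    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+    prop.location.id = 0;
+    cuMemGetAllocationGranularity(&granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
+
+    size_t size = granularity;   /* must be a multiple of the granularity */
+    CUdeviceptr ptr = 0;
+    CUmemGenericAllocationHandle handle;
+    cuMemAddressReserve(&ptr, size, 0, 0, 0);
+    cuMemCreate(&handle, size, &prop, 0);
+    cuMemMap(ptr, size, 0, handle, 0);
+
+    CUmemAccessDesc accessDesc;
+    memset(&accessDesc, 0, sizeof(accessDesc));
+    accessDesc.location = prop.location;
+    accessDesc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+    cuMemSetAccess(ptr, size, &accessDesc, 1);
+* \endcode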
+*
+* \param[in] ptr - Starting address for the virtual address range
+* \param[in] size - Length of the virtual address range
+* \param[in] desc - Array of ::CUmemAccessDesc that describe how to change the
+* - mapping for each location specified
+* \param[in] count - Number of ::CUmemAccessDesc in \p desc
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_SUPPORTED
+* \notefnerr
+* \note_sync
+*
+* \sa ::cuMemSetAccess, ::cuMemCreate, ::cuMemMap
+*/
+CUresult CUDAAPI cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count);
+
+/**
+* \brief Get the access \p flags set for the given \p location and \p ptr
+*
+* \param[out] flags - Flags set for this location
+* \param[in] location - Location for which to check the flags
+* \param[in] ptr - Address for which to check the access flags
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_INVALID_DEVICE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemSetAccess
+*/
+CUresult CUDAAPI cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr);
+
+/**
+* \brief Exports an allocation to a requested shareable handle type
+*
+* Given a CUDA memory handle, create a shareable memory
+* allocation handle that can be used to share the memory with other
+* processes. The recipient process can convert the shareable handle back into a
+* CUDA memory handle using ::cuMemImportFromShareableHandle and map
+* it with ::cuMemMap. The implementation of what this handle is and how it
+* can be transferred is defined by the requested handle type in \p handleType.
+*
+* Once all shareable handles are closed and the allocation is released, the allocated
+* memory referenced will be released back to the OS and uses of the CUDA handle afterward
+* will lead to undefined behavior.
+*
+* This API can also be used in conjunction with other APIs (e.g. Vulkan, OpenGL)
+* that support importing memory from the shareable type.
+*
+* \param[out] shareableHandle - Pointer to the location in which to store the requested handle type
+* \param[in] handle - CUDA handle for the memory allocation
+* \param[in] handleType - Type of shareable handle requested (defines type and size of the \p shareableHandle output parameter)
+* \param[in] flags - Reserved, must be zero
+* \returns
+* ::CUDA_SUCCESS,
+* ::CUDA_ERROR_INVALID_VALUE,
+* ::CUDA_ERROR_NOT_INITIALIZED,
+* ::CUDA_ERROR_DEINITIALIZED,
+* ::CUDA_ERROR_NOT_PERMITTED,
+* ::CUDA_ERROR_NOT_SUPPORTED
+*
+* \sa ::cuMemImportFromShareableHandle
+*/
+CUresult CUDAAPI cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags);
+
+/**
+* \brief Imports an allocation from a requested shareable handle type.
+*
+* If the current process cannot support the memory described by this shareable
+* handle, this API will error as CUDA_ERROR_NOT_SUPPORTED.
+*
+* \note Importing shareable handles exported from some graphics APIs (Vulkan, OpenGL, etc.)
+* created on devices under an SLI group may not be supported, and thus this API will
+* return CUDA_ERROR_NOT_SUPPORTED.
+* There is no guarantee that the contents of \p handle will be the same CUDA memory handle
+* for the same given OS shareable handle, or the same underlying allocation.
+*
+* \param[out] handle - CUDA Memory handle for the memory allocation.
+* \param[in] osHandle - Shareable Handle representing the memory allocation that is to be imported. +* \param[in] shHandleType - handle type of the exported handle ::CUmemAllocationHandleType. +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemExportToShareableHandle, ::cuMemMap, ::cuMemRelease +*/ +CUresult CUDAAPI cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType); + +/** +* \brief Calculates either the minimal or recommended granularity +* +* Calculates either the minimal or recommended granularity +* for a given allocation specification and returns it in granularity. This +* granularity can be used as a multiple for alignment, size, or address mapping. +* +* \param[out] granularity Returned granularity. +* \param[in] prop Property for which to determine the granularity for +* \param[in] option Determines which granularity to return +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemCreate, ::cuMemMap +*/ +CUresult CUDAAPI cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option); + +/** +* \brief Retrieve the contents of the property structure defining properties for this handle +* +* \param[out] prop - Pointer to a properties structure which will hold the information about this handle +* \param[in] handle - Handle which to perform the query on +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemCreate, ::cuMemImportFromShareableHandle +*/ +CUresult CUDAAPI cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle); + +/** +* \brief Given an address \p addr, returns the allocation handle of the backing memory allocation. +* +* The handle is guaranteed to be the same handle value used to map the memory. If the address +* requested is not mapped, the function will fail. The returned handle must be released with +* corresponding number of calls to ::cuMemRelease. +* +* \note The address \p addr, can be any address in a range previously mapped +* by ::cuMemMap, and not necessarily the start address. +* +* \param[out] handle CUDA Memory handle for the backing memory allocation. +* \param[in] addr Memory address to query, that has been mapped previously. +* \returns +* ::CUDA_SUCCESS, +* ::CUDA_ERROR_INVALID_VALUE, +* ::CUDA_ERROR_NOT_INITIALIZED, +* ::CUDA_ERROR_DEINITIALIZED, +* ::CUDA_ERROR_NOT_PERMITTED, +* ::CUDA_ERROR_NOT_SUPPORTED +* +* \sa ::cuMemCreate, ::cuMemRelease, ::cuMemMap +*/ +CUresult CUDAAPI cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr); + +/** @} */ /* END CUDA_VA */ + +/** + * \defgroup CUDA_MALLOC_ASYNC Stream Ordered Memory Allocator + * + * ___MANBRIEF___ Functions for performing allocation and free operations in stream order. + * Functions for controlling the behavior of the underlying allocator. + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream ordered memory allocator exposed by the + * low-level CUDA driver application programming interface. 
+ *
+ * @{
+ *
+ * \section CUDA_MALLOC_ASYNC_overview overview
+ *
+ * The asynchronous allocator allows the user to allocate and free in stream order.
+ * All asynchronous accesses of the allocation must happen between
+ * the stream executions of the allocation and the free. If the memory is accessed
+ * outside of the promised stream order, a use before allocation / use after free error
+ * will cause undefined behavior.
+ *
+ * The allocator is free to reallocate the memory as long as it can guarantee
+ * that compliant memory accesses will not overlap temporally.
+ * The allocator may refer to internal stream ordering as well as inter-stream dependencies
+ * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee.
+ * The allocator may also insert inter-stream dependencies to establish the temporal guarantee.
+ *
+ * \section CUDA_MALLOC_ASYNC_support Supported Platforms
+ *
+ * Whether or not a device supports the integrated stream ordered memory allocator
+ * may be queried by calling ::cuDeviceGetAttribute() with the device attribute
+ * ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED
+ */
+
+/**
+ * \brief Frees memory with stream ordered semantics
+ *
+ * Inserts a free operation into \p hStream.
+ * The allocation must not be accessed after stream execution reaches the free.
+ * After this API returns, accessing the memory from any subsequent work launched on the GPU
+ * or querying its pointer attributes results in undefined behavior.
+ *
+ * \note During stream capture, this function results in the creation of a free node and
+ * must therefore be passed the address of a graph allocation.
+ *
+ * \param dptr - memory to free
+ * \param hStream - The stream establishing the stream ordering contract.
+ * \returns
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context),
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ */
+CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream);
+
+/**
+ * \brief Allocates memory with stream ordered semantics
+ *
+ * Inserts an allocation operation into \p hStream.
+ * A pointer to the allocated memory is returned immediately in *dptr.
+ * The allocation must not be accessed until the allocation operation completes.
+ * The allocation comes from the memory pool current to the stream's device.
+ *
+ * \note The default memory pool of a device contains device memory from that device.
+ * \note Basic stream ordering allows future work submitted into the same stream to use the allocation.
+ * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
+ * operation completes before work submitted in a separate stream runs.
+ * \note During stream capture, this function results in the creation of an allocation node. In this case,
+ * the allocation is owned by the graph instead of the memory pool. The memory pool's properties
+ * are used to set the node's creation parameters.
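+ *
+ * As a minimal illustrative sketch (assuming \p bytesize and \p hStream already
+ * exist, and with error checking omitted), an allocation is typically paired with
+ * a stream-ordered free in the same stream, followed by a synchronize before the
+ * host relies on the free having completed:
+ * \code
+    CUdeviceptr dptr;
+    cuMemAllocAsync(&dptr, bytesize, hStream);
+    /* ... launch work into hStream that uses dptr ... */
+    cuMemFreeAsync(dptr, hStream);
+    cuStreamSynchronize(hStream);
+ * \endcode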
+ * + * \param[out] dptr - Returned device pointer + * \param[in] bytesize - Number of bytes to allocate + * \param[in] hStream - The stream establishing the stream ordering contract and the memory pool to allocate from + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemAllocFromPoolAsync, ::cuMemFreeAsync, ::cuDeviceSetMemPool, + * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, + * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute + */ +CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); + +/** + * \brief Tries to release memory back to the OS + * + * Releases memory back to the OS until the pool contains fewer than minBytesToKeep + * reserved bytes, or there is no more memory that the allocator can safely release. + * The allocator cannot release OS allocations that back outstanding asynchronous allocations. + * The OS allocations may happen at different granularity from the user allocations. + * + * \note: Allocations that have not been freed count as outstanding. + * \note: Allocations that have been asynchronously freed but whose completion has + * not been observed on the host (eg. by a synchronize) can count as outstanding. + * + * \param[in] pool - The memory pool to trim + * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved, + * the TrimTo operation is a no-op. Otherwise the pool will be guaranteed to have + * at least minBytesToKeep bytes reserved after the operation. + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep); + +/** + * \brief Sets attributes of a memory pool + * + * Supported attributes are: + * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + * Amount of reserved memory in bytes to hold onto before trying + * to release memory back to the OS. When more than the release + * threshold bytes of memory are held by the memory pool, the + * allocator will try to release memory back to the OS on the + * next call to stream, event or context synchronize. (default 0) + * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + * Allow ::cuMemAllocAsync to use memory asynchronously freed + * in another stream as long as a stream ordering dependency + * of the allocating stream on the free action exists. + * Cuda events and null stream interactions can create the required + * stream ordered dependencies. (default enabled) + * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + * Allow reuse of already completed frees when there is no dependency + * between the free and allocation. (default enabled) + * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + * Allow ::cuMemAllocAsync to insert new stream dependencies + * in order to establish the stream ordering required to reuse + * a piece of memory released by ::cuMemFreeAsync (default enabled). + * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) + * Reset the high watermark that tracks the amount of backing memory that was + * allocated for the memory pool. 
It is illegal to set this attribute to a non-zero value. + * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) + * Reset the high watermark that tracks the amount of used memory that was + * allocated for the memory pool. + * + * \param[in] pool - The memory pool to modify + * \param[in] attr - The attribute to modify + * \param[in] value - Pointer to the value to assign + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); + +/** + * \brief Gets attributes of a memory pool + * + * Supported attributes are: + * - ::CU_MEMPOOL_ATTR_RELEASE_THRESHOLD: (value type = cuuint64_t) + * Amount of reserved memory in bytes to hold onto before trying + * to release memory back to the OS. When more than the release + * threshold bytes of memory are held by the memory pool, the + * allocator will try to release memory back to the OS on the + * next call to stream, event or context synchronize. (default 0) + * - ::CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES: (value type = int) + * Allow ::cuMemAllocAsync to use memory asynchronously freed + * in another stream as long as a stream ordering dependency + * of the allocating stream on the free action exists. + * Cuda events and null stream interactions can create the required + * stream ordered dependencies. (default enabled) + * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC: (value type = int) + * Allow reuse of already completed frees when there is no dependency + * between the free and allocation. (default enabled) + * - ::CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES: (value type = int) + * Allow ::cuMemAllocAsync to insert new stream dependencies + * in order to establish the stream ordering required to reuse + * a piece of memory released by ::cuMemFreeAsync (default enabled). + * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT: (value type = cuuint64_t) + * Amount of backing memory currently allocated for the mempool + * - ::CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH: (value type = cuuint64_t) + * High watermark of backing memory allocated for the mempool since the + * last time it was reset. + * - ::CU_MEMPOOL_ATTR_USED_MEM_CURRENT: (value type = cuuint64_t) + * Amount of memory from the pool that is currently in use by the application. + * - ::CU_MEMPOOL_ATTR_USED_MEM_HIGH: (value type = cuuint64_t) + * High watermark of the amount of memory from the pool that was in use by the application. + * + * \param[in] pool - The memory pool to get attributes of + * \param[in] attr - The attribute to get + * \param[out] value - Retrieved value + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value); + +/** + * \brief Controls visibility of pools between devices + * + * \param[in] pool - The pool being modified + * \param[in] map - Array of access descriptors. Each descriptor instructs the access to enable for a single gpu. + * \param[in] count - Number of descriptors in the map array. 
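+ *
+ * \par
+ * Editor's sketch (not part of the original NVIDIA header): granting a second
+ * device read-write access to a pool. The ::CUmemAccessDesc field and flag
+ * names are taken from their declarations elsewhere in this header;
+ * \p peerDeviceOrdinal is a placeholder and error checking is omitted. \code
+ CUmemAccessDesc desc;
+ desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE;        // access is granted to a device...
+ desc.location.id   = peerDeviceOrdinal;                  // ...identified by its ordinal
+ desc.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; // allow reads and writes
+ cuMemPoolSetAccess(pool, &desc, 1);                      // one descriptor in the array
+ * \endcode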
+ * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count); + +/** + * \brief Returns the accessibility of a pool from a device + * + * Returns the accessibility of the pool's memory from the specified location. + * + * \param[out] flags - the accessibility of the pool from the specified location + * \param[in] memPool - the pool being queried + * \param[in] location - the location accessing the pool + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location); + +/** + * \brief Creates a memory pool + * + * Creates a CUDA memory pool and returns the handle in \p pool. The \p poolProps determines + * the properties of the pool such as the backing device and IPC capabilities. + * + * By default, the pool's memory will be accessible from the device it is allocated on. + * + * \note Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_NOT_SUPPORTED + * + * \sa ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, ::cuDeviceGetDefaultMemPool, + * ::cuMemAllocFromPoolAsync, ::cuMemPoolExportToShareableHandle + */ +CUresult CUDAAPI cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps); + +/** + * \brief Destroys the specified memory pool + * + * If any pointers obtained from this pool haven't been freed or + * the pool has free operations that haven't completed + * when ::cuMemPoolDestroy is invoked, the function will return immediately and the + * resources associated with the pool will be released automatically + * once there are no more outstanding allocations. + * + * Destroying the current mempool of a device sets the default mempool of + * that device as the current mempool for that device. + * + * \note A device's default memory pool cannot be destroyed. + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuMemFreeAsync, ::cuDeviceSetMemPool, ::cuDeviceGetMemPool, + * ::cuDeviceGetDefaultMemPool, ::cuMemPoolCreate + */ +CUresult CUDAAPI cuMemPoolDestroy(CUmemoryPool pool); + +/** + * \brief Allocates memory from a specified pool with stream ordered semantics. + * + * Inserts an allocation operation into \p hStream. + * A pointer to the allocated memory is returned immediately in *dptr. + * The allocation must not be accessed until the the allocation operation completes. + * The allocation comes from the specified memory pool. + * + * \note + * - The specified memory pool may be from a device different than that of the specified \p hStream. + * + * - Basic stream ordering allows future work submitted into the same stream to use the allocation. + * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation + * operation completes before work submitted in a separate stream runs. + * + * \note During stream capture, this function results in the creation of an allocation node. In this case, + * the allocation is owned by the graph instead of the memory pool. 
The memory pool's properties + * are used to set the node's creation parameters. + * + * \param[out] dptr - Returned device pointer + * \param[in] bytesize - Number of bytes to allocate + * \param[in] pool - The pool to allocate from + * \param[in] hStream - The stream establishing the stream ordering semantic + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT (default stream specified with no current context), + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemAllocAsync, ::cuMemFreeAsync, ::cuDeviceGetDefaultMemPool, + * ::cuDeviceGetMemPool, ::cuMemPoolCreate, ::cuMemPoolSetAccess, + * ::cuMemPoolSetAttribute + */ +CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); + +/** + * \brief Exports a memory pool to the requested handle type. + * + * Given an IPC capable mempool, create an OS handle to share the pool with another process. + * A recipient process can convert the shareable handle into a mempool with ::cuMemPoolImportFromShareableHandle. + * Individual pointers can then be shared with the ::cuMemPoolExportPointer and ::cuMemPoolImportPointer APIs. + * The implementation of what the shareable handle is and how it can be transferred is defined by the requested + * handle type. + * + * \note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE. + * + * \param[out] handle_out - Returned OS handle + * \param[in] pool - pool to export + * \param[in] handleType - the type of handle to create + * \param[in] flags - must be 0 + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer, + * ::cuMemPoolImportPointer, ::cuMemAllocAsync, ::cuMemFreeAsync, + * ::cuDeviceGetDefaultMemPool, ::cuDeviceGetMemPool, ::cuMemPoolCreate, + * ::cuMemPoolSetAccess, ::cuMemPoolSetAttribute + */ +CUresult CUDAAPI cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags); + +/** + * \brief imports a memory pool from a shared handle. + * + * Specific allocations can be imported from the imported pool with cuMemPoolImportPointer. + * + * \note Imported memory pools do not support creating new allocations. + * As such imported memory pools may not be used in cuDeviceSetMemPool + * or ::cuMemAllocFromPoolAsync calls. + * + * \param[out] pool_out - Returned memory pool + * \param[in] handle - OS handle of the pool to open + * \param[in] handleType - The type of handle being imported + * \param[in] flags - must be 0 + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolExportPointer, ::cuMemPoolImportPointer + */ +CUresult CUDAAPI cuMemPoolImportFromShareableHandle( + CUmemoryPool *pool_out, + void *handle, + CUmemAllocationHandleType handleType, + unsigned long long flags); + +/** + * \brief Export data to share a memory pool allocation between processes. + * + * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool. + * The recipient process can import the allocation with the ::cuMemPoolImportPointer api. 
+ * The data is not a handle and may be shared through any IPC mechanism. + * + * \param[out] shareData_out - Returned export data + * \param[in] ptr - pointer to memory being exported + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolImportPointer + */ +CUresult CUDAAPI cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr); + +/** + * \brief Import a memory pool allocation from another process. + * + * Returns in \p ptr_out a pointer to the imported memory. + * The imported memory must not be accessed before the allocation operation completes + * in the exporting process. The imported memory must be freed from all importing processes before + * being freed in the exporting process. The pointer may be freed with cuMemFree + * or cuMemFreeAsync. If cuMemFreeAsync is used, the free must be completed + * on the importing process before the free operation on the exporting process. + * + * \note The cuMemFreeAsync api may be used in the exporting process before + * the cuMemFreeAsync operation completes in its stream as long as the + * cuMemFreeAsync in the exporting process specifies a stream with + * a stream dependency on the importing process's cuMemFreeAsync. + * + * \param[out] ptr_out - pointer to imported memory + * \param[in] pool - pool from which to import + * \param[in] shareData - data specifying the memory to import + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemPoolExportToShareableHandle, ::cuMemPoolImportFromShareableHandle, ::cuMemPoolExportPointer + */ +CUresult CUDAAPI cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData); + +/** @} */ /* END CUDA_MALLOC_ASYNC */ + +/** + * \defgroup CUDA_UNIFIED Unified Addressing + * + * ___MANBRIEF___ unified addressing functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the unified addressing functions of the + * low-level CUDA driver application programming interface. + * + * @{ + * + * \section CUDA_UNIFIED_overview Overview + * + * CUDA devices can share a unified address space with the host. + * For these devices there is no distinction between a device + * pointer and a host pointer -- the same pointer value may be + * used to access memory from the host program and from a kernel + * running on the device (with exceptions enumerated below). + * + * \section CUDA_UNIFIED_support Supported Platforms + * + * Whether or not a device supports unified addressing may be + * queried by calling ::cuDeviceGetAttribute() with the device + * attribute ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING. + * + * Unified addressing is automatically enabled in 64-bit processes + * + * \section CUDA_UNIFIED_lookup Looking Up Information from Pointer Values + * + * It is possible to look up information about the memory which backs a + * pointer value. For instance, one may want to know if a pointer points + * to host or device memory. As another example, in the case of device + * memory, one may want to know on which CUDA device the memory + * resides. 
These properties may be queried using the function + * ::cuPointerGetAttribute() + * + * Since pointers are unique, it is not necessary to specify information + * about the pointers specified to the various copy functions in the + * CUDA API. The function ::cuMemcpy() may be used to perform a copy + * between two pointers, ignoring whether they point to host or device + * memory (making ::cuMemcpyHtoD(), ::cuMemcpyDtoD(), and ::cuMemcpyDtoH() + * unnecessary for devices supporting unified addressing). For + * multidimensional copies, the memory type ::CU_MEMORYTYPE_UNIFIED may be + * used to specify that the CUDA driver should infer the location of the + * pointer from its value. + * + * \section CUDA_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory + * + * All host memory allocated in all contexts using ::cuMemAllocHost() and + * ::cuMemHostAlloc() is always directly accessible from all contexts on + * all devices that support unified addressing. This is the case regardless + * of whether or not the flags ::CU_MEMHOSTALLOC_PORTABLE and + * ::CU_MEMHOSTALLOC_DEVICEMAP are specified. + * + * The pointer value through which allocated host memory may be accessed + * in kernels on all devices that support unified addressing is the same + * as the pointer value through which that memory is accessed on the host, + * so it is not necessary to call ::cuMemHostGetDevicePointer() to get the device + * pointer for these allocations. + * + * Note that this is not the case for memory allocated using the flag + * ::CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below. + * + * \section CUDA_UNIFIED_autopeerregister Automatic Registration of Peer Memory + * + * Upon enabling direct access from a context that supports unified addressing + * to another peer context that supports unified addressing using + * ::cuCtxEnablePeerAccess() all memory allocated in the peer context using + * ::cuMemAlloc() and ::cuMemAllocPitch() will immediately be accessible + * by the current context. The device pointer value through + * which any peer memory may be accessed in the current context + * is the same pointer value through which that memory may be + * accessed in the peer context. + * + * \section CUDA_UNIFIED_exceptions Exceptions, Disjoint Addressing + * + * Not all memory may be accessed on devices through the same pointer + * value through which they are accessed on the host. These exceptions + * are host memory registered using ::cuMemHostRegister() and host memory + * allocated using the flag ::CU_MEMHOSTALLOC_WRITECOMBINED. For these + * exceptions, there exists a distinct host and device address for the + * memory. The device address is guaranteed to not overlap any valid host + * pointer range and is guaranteed to have the same value across all + * contexts that support unified addressing. + * + * This device address may be queried using ::cuMemHostGetDevicePointer() + * when a context using unified addressing is current. Either the host + * or the unified device pointer value may be used to refer to this memory + * through ::cuMemcpy() and similar functions using the + * ::CU_MEMORYTYPE_UNIFIED memory type. + * + */ + +/** + * \brief Returns information about a pointer + * + * The supported attributes are: + * + * - ::CU_POINTER_ATTRIBUTE_CONTEXT: + * + * Returns in \p *data the ::CUcontext in which \p ptr was allocated or + * registered. + * The type of \p data must be ::CUcontext *. 
+ *
+ * If \p ptr was not allocated by, mapped by, or registered with
+ * a ::CUcontext which uses unified virtual addressing then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE:
+ *
+ * Returns in \p *data the physical memory type of the memory that
+ * \p ptr addresses as a ::CUmemorytype enumerated value.
+ * The type of \p data must be unsigned int.
+ *
+ * If \p ptr addresses device memory then \p *data is set to
+ * ::CU_MEMORYTYPE_DEVICE. The particular ::CUdevice on which the
+ * memory resides is the ::CUdevice of the ::CUcontext returned by the
+ * ::CU_POINTER_ATTRIBUTE_CONTEXT attribute of \p ptr.
+ *
+ * If \p ptr addresses host memory then \p *data is set to
+ * ::CU_MEMORYTYPE_HOST.
+ *
+ * If \p ptr was not allocated by, mapped by, or registered with
+ * a ::CUcontext which uses unified virtual addressing then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * If the current ::CUcontext does not support unified virtual
+ * addressing then ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER:
+ *
+ * Returns in \p *data the device pointer value through which
+ * \p ptr may be accessed by kernels running in the current
+ * ::CUcontext.
+ * The type of \p data must be CUdeviceptr *.
+ *
+ * If there exists no device pointer value through which
+ * kernels running in the current ::CUcontext may access
+ * \p ptr then ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * If there is no current ::CUcontext then
+ * ::CUDA_ERROR_INVALID_CONTEXT is returned.
+ *
+ * Except in the exceptional disjoint addressing cases discussed
+ * below, the value returned in \p *data will equal the input
+ * value \p ptr.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER:
+ *
+ * Returns in \p *data the host pointer value through which
+ * \p ptr may be accessed by the host program.
+ * The type of \p data must be void **.
+ * If there exists no host pointer value through which
+ * the host program may directly access \p ptr then
+ * ::CUDA_ERROR_INVALID_VALUE is returned.
+ *
+ * Except in the exceptional disjoint addressing cases discussed
+ * below, the value returned in \p *data will equal the input
+ * value \p ptr.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_P2P_TOKENS:
+ *
+ * Returns in \p *data two tokens for use with the nv-p2p.h Linux
+ * kernel interface. \p data must be a struct of type
+ * CUDA_POINTER_ATTRIBUTE_P2P_TOKENS.
+ *
+ * \p ptr must be a pointer to memory obtained from ::cuMemAlloc().
+ * Note that p2pToken and vaSpaceToken are only valid for the
+ * lifetime of the source allocation. A subsequent allocation at
+ * the same address may return completely different tokens.
+ * Querying this attribute has a side effect of setting the attribute
+ * ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS for the region of memory that
+ * \p ptr points to.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS:
+ *
+ * A boolean attribute which, when set, ensures that synchronous memory operations
+ * initiated on the region of memory that \p ptr points to will always synchronize.
+ * See further documentation in the section titled "API synchronization behavior"
+ * to learn more about cases when synchronous memory operations can
+ * exhibit asynchronous behavior.
+ *
+ * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID:
+ *
+ * Returns in \p *data a buffer ID which is guaranteed to be unique within the process.
+ * \p data must point to an unsigned long long.
+ *
+ * \p ptr must be a pointer to memory obtained from a CUDA memory allocation API.
+ * Every memory allocation from any of the CUDA memory allocation APIs will + * have a unique ID over a process lifetime. Subsequent allocations do not reuse IDs + * from previous freed allocations. IDs are only unique within a single process. + * + * + * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED: + * + * Returns in \p *data a boolean that indicates whether the pointer points to + * managed memory or not. + * + * If \p ptr is not a valid CUDA pointer then ::CUDA_ERROR_INVALID_VALUE is returned. + * + * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL: + * + * Returns in \p *data an integer representing a device ordinal of a device against + * which the memory was allocated or registered. + * + * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE: + * + * Returns in \p *data a boolean that indicates if this pointer maps to + * an allocation that is suitable for ::cudaIpcGetMemHandle. + * + * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR: + * + * Returns in \p *data the starting address for the allocation referenced + * by the device pointer \p ptr. Note that this is not necessarily the + * address of the mapped region, but the address of the mappable address + * range \p ptr references (e.g. from ::cuMemAddressReserve). + * + * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE: + * + * Returns in \p *data the size for the allocation referenced by the device + * pointer \p ptr. Note that this is not necessarily the size of the mapped + * region, but the size of the mappable address range \p ptr references + * (e.g. from ::cuMemAddressReserve). To retrieve the size of the mapped + * region, see ::cuMemGetAddressRange + * + * - ::CU_POINTER_ATTRIBUTE_MAPPED: + * + * Returns in \p *data a boolean that indicates if this pointer is in a + * valid address range that is mapped to a backing allocation. + * + * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES: + * + * Returns a bitmask of the allowed handle types for an allocation that may + * be passed to ::cuMemExportToShareableHandle. + * + * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE: + * + * Returns in \p *data the handle to the mempool that the allocation was obtained from. + * + * \par + * + * Note that for most allocations in the unified virtual address space + * the host and device pointer for accessing the allocation will be the + * same. The exceptions to this are + * - user memory registered using ::cuMemHostRegister + * - host memory allocated using ::cuMemHostAlloc with the + * ::CU_MEMHOSTALLOC_WRITECOMBINED flag + * For these types of allocation there will exist separate, disjoint host + * and device addresses for accessing the allocation. In particular + * - The host address will correspond to an invalid unmapped device address + * (which will result in an exception if accessed from the device) + * - The device address will correspond to an invalid unmapped host address + * (which will result in an exception if accessed from the host). + * For these types of allocations, querying ::CU_POINTER_ATTRIBUTE_HOST_POINTER + * and ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER may be used to retrieve the host + * and device addresses from either address. 
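+ *
+ * \par
+ * Editor's sketch (not part of the original NVIDIA header): querying whether a
+ * pointer refers to host or device memory. \p devPtr is assumed to be a valid
+ * pointer in a unified-addressing context and error checking is omitted. \code
+ unsigned int memType = 0;
+ cuPointerGetAttribute(&memType, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, devPtr);
+ if (memType == CU_MEMORYTYPE_DEVICE) {
+     // devPtr addresses device memory
+ } else if (memType == CU_MEMORYTYPE_HOST) {
+     // devPtr addresses host memory
+ }
+ * \endcode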
+ * + * \param data - Returned pointer attribute value + * \param attribute - Pointer attribute to query + * \param ptr - Pointer + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuPointerSetAttribute, + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuMemAllocHost, + * ::cuMemFreeHost, + * ::cuMemHostAlloc, + * ::cuMemHostRegister, + * ::cuMemHostUnregister, + * ::cudaPointerGetAttributes + */ +CUresult CUDAAPI cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr); + +/** + * \brief Prefetches memory to the specified destination device + * + * Prefetches memory to the specified destination device. \p devPtr is the + * base device pointer of the memory to be prefetched and \p dstDevice is the + * destination device. \p count specifies the number of bytes to copy. \p hStream + * is the stream in which the operation is enqueued. The memory range must refer + * to managed memory allocated via ::cuMemAllocManaged or declared via __managed__ variables. + * + * Passing in CU_DEVICE_CPU for \p dstDevice will prefetch the data to host memory. If + * \p dstDevice is a GPU, then the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS + * must be non-zero. Additionally, \p hStream must be associated with a device that has a + * non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * + * The start address and end address of the memory range will be rounded down and rounded up + * respectively to be aligned to CPU page size before the prefetch operation is enqueued + * in the stream. + * + * If no physical memory has been allocated for this region, then this memory region + * will be populated and mapped on the destination device. If there's insufficient + * memory to prefetch the desired region, the Unified Memory driver may evict pages from other + * ::cuMemAllocManaged allocations to host memory in order to make room. Device memory + * allocated using ::cuMemAlloc or ::cuArrayCreate will not be evicted. + * + * By default, any mappings to the previous location of the migrated pages are removed and + * mappings for the new location are only setup on \p dstDevice. The exact behavior however + * also depends on the settings applied to this memory range via ::cuMemAdvise as described + * below: + * + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY was set on any subset of this memory range, + * then that subset will create a read-only copy of the pages on \p dstDevice. + * + * If ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION was called on any subset of this memory + * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the + * preferred location of any pages in the memory range. + * + * If ::CU_MEM_ADVISE_SET_ACCESSED_BY was called on any subset of this memory range, + * then mappings to those pages from all the appropriate processors are updated to + * refer to the new location if establishing such a mapping is possible. Otherwise, + * those mappings are cleared. + * + * Note that this API is not required for functionality and only serves to improve performance + * by allowing the application to migrate data to a suitable location before it is accessed. + * Memory accesses to this range are always coherent and are allowed even when the data is + * actively being migrated. 
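+ *
+ * \par
+ * Editor's sketch (not part of the original NVIDIA header): prefetching a
+ * managed allocation to a GPU before work runs and back to the CPU afterwards.
+ * \p dev, \p stream and \p bytes are assumed to exist; ::CU_MEM_ATTACH_GLOBAL
+ * is the attach flag declared elsewhere in this header; error checking is
+ * omitted. \code
+ CUdeviceptr managed;
+ cuMemAllocManaged(&managed, bytes, CU_MEM_ATTACH_GLOBAL);
+ cuMemPrefetchAsync(managed, bytes, dev, stream);           // migrate pages to the GPU
+ // ... enqueue work in stream that reads or writes the managed range ...
+ cuMemPrefetchAsync(managed, bytes, CU_DEVICE_CPU, stream); // migrate pages back to host memory
+ * \endcode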
+ * + * Note that this function is asynchronous with respect to the host and all work + * on other devices. + * + * \param devPtr - Pointer to be prefetched + * \param count - Size in bytes + * \param dstDevice - Destination device to prefetch to + * \param hStream - Stream to enqueue prefetch operation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, + * ::cuMemcpy3DPeerAsync, ::cuMemAdvise, + * ::cudaMemPrefetchAsync + */ +CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); + +/** + * \brief Advise about the usage of a given memory range + * + * Advise the Unified Memory subsystem about the usage pattern for the memory range + * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + * range will be rounded down and rounded up respectively to be aligned to CPU page size before the + * advice is applied. The memory range must refer to managed memory allocated via ::cuMemAllocManaged + * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + * memory provided it represents a valid, host-accessible region of memory and all additional constraints + * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + * memory range results in an error being returned. + * + * The \p advice parameter can take the following values: + * - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read + * from and only occasionally written to. Any read accesses from any processor to this region will create a + * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cuMemPrefetchAsync + * is called on this region, it will create a read-only copy of the data on the destination processor. + * If any processor writes to this region, all copies of the corresponding page will be invalidated + * except for the one where the write occurred. The \p device argument is ignored for this advice. + * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU + * that has a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * Also, if a context is created on a device that does not have the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS set, then read-duplication will not occur until + * all such contexts are destroyed. + * If the memory region refers to valid system-allocated pageable memory, then the accessing device must + * have a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS for a read-only + * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, then setting this advice + * will not create a read-only copy when that device accesses this memory region. + * + * - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY and also prevents the + * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated + * copies of the data will be collapsed into a single copy. 
The location for the collapsed + * copy will be the preferred location if the page has a preferred location and one of the read-duplicated + * copies was resident at that location. Otherwise, the location chosen is arbitrary. + * + * - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the + * data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the + * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. Setting the preferred location + * does not cause data to migrate to that location immediately. Instead, it guides the migration policy + * when a fault occurs on that memory region. If the data is already in its preferred location and the + * faulting processor can establish a mapping without requiring the data to be migrated, then + * data migration will be avoided. On the other hand, if the data is not in its preferred location + * or if a direct mapping cannot be established, then it will be migrated to the processor accessing + * it. It is important to note that setting the preferred location does not prevent data prefetching + * done using ::cuMemPrefetchAsync. + * Having a preferred location can override the page thrash detection and resolution logic in the Unified + * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device + * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But + * if the preferred location is set as device memory, then the page will continue to thrash indefinitely. + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + * policies associated with that advice will override the policies of this advice, unless read accesses from + * \p device will not result in a read-only copy being created on that device as outlined in description for + * the advice ::CU_MEM_ADVISE_SET_READ_MOSTLY. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. Note however that this behavior may change in the future. + * + * - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION + * and changes the preferred location to none. + * + * - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device. + * Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU. If \p device is a GPU, then + * the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS must be non-zero. + * This advice does not cause data migration and has no impact on the location of the data per se. Instead, + * it causes the data to always be mapped in the specified processor's page tables, as long as the + * location of the data permits a mapping to be established. If the data gets migrated for any reason, + * the mappings are updated accordingly. + * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. 
+ * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + * over to the other GPUs is not as important because the accesses are infrequent and the overhead of + * migration may be too high. But preventing faults can still help improve performance, and so having + * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + * ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the + * page in host memory. + * If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, then the + * policies associated with that advice will override the policies of this advice. Additionally, if the + * preferred location of this memory region or any subset of it is also \p device, then the policies + * associated with ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION will override the policies of this advice. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. + * + * - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of ::CU_MEM_ADVISE_SET_ACCESSED_BY. Any mappings to + * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. Additionally, if \p device has + * a non-zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, + * then this call has no effect. + * + * \param devPtr - Pointer to memory to set the advice for + * \param count - Size in bytes of the memory range + * \param advice - Advice to be applied for the specified memory range + * \param device - Device to apply the advice for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * \note_async + * \note_null_stream + * + * \sa ::cuMemcpy, ::cuMemcpyPeer, ::cuMemcpyAsync, + * ::cuMemcpy3DPeerAsync, ::cuMemPrefetchAsync, + * ::cudaMemAdvise + */ +CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device); + +/** + * \brief Query an attribute of a given memory range + * + * Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The + * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via + * __managed__ variables. + * + * The \p attribute parameter can take the following values: + * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY: If this attribute is specified, \p data will be interpreted + * as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given + * memory range have read-duplication enabled, or 0 otherwise. + * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION: If this attribute is specified, \p data will be + * interpreted as a 32-bit integer, and \p dataSize must be 4. 
The result returned will be a GPU device
+ * id if all pages in the memory range have that GPU as their preferred location, or it will be CU_DEVICE_CPU
+ * if all pages in the memory range have the CPU as their preferred location, or it will be CU_DEVICE_INVALID
+ * if either all the pages don't have the same preferred location or some of the pages don't have a
+ * preferred location at all. Note that the actual location of the pages in the memory range at the time of
+ * the query may be different from the preferred location.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY: If this attribute is specified, \p data will be interpreted
+ * as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned
+ * will be a list of device ids that had ::CU_MEM_ADVISE_SET_ACCESSED_BY set for that entire memory range.
+ * If any device does not have that advice set for the entire memory range, that device will not be included.
+ * If \p data is larger than the number of devices that have that advice set for that memory range,
+ * CU_DEVICE_INVALID will be returned in all the extra space provided. For example, if \p dataSize is 12
+ * (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be
+ * { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If \p data is smaller than the number of devices that have
+ * that advice set, then only as many devices will be returned as can fit in the array. There is no
+ * guarantee on which specific devices will be returned, however.
+ * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION: If this attribute is specified, \p data will be
+ * interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location
+ * to which all pages in the memory range were prefetched explicitly via ::cuMemPrefetchAsync. This will either be
+ * a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU
+ * respectively. If any page in the memory range was never explicitly prefetched or if all pages were not
+ * prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the
+ * last location that the application requested to prefetch the memory range to. It gives no indication as to
+ * whether the prefetch operation to that location has completed or even begun.
+ *
+ * \param data - A pointer to a memory location where the result
+ * of each attribute query will be written to.
+ * \param dataSize - The size of \p data in bytes
+ * \param attribute - The attribute to query
+ * \param devPtr - Start of the range to query
+ * \param count - Size of the range to query
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_INVALID_DEVICE
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ *
+ * \sa ::cuMemRangeGetAttributes, ::cuMemPrefetchAsync,
+ * ::cuMemAdvise,
+ * ::cudaMemRangeGetAttribute
+ */
+CUresult CUDAAPI cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count);
+
+/**
+ * \brief Query attributes of a given memory range.
+ *
+ * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cuMemAllocManaged or declared via
+ * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes
+ * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries.
+ * The results of the query will be stored in \p data. + * + * The list of supported attributes are given below. Please refer to ::cuMemRangeGetAttribute for + * attribute descriptions and restrictions. + * + * - ::CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY + * - ::CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION + * - ::CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY + * - ::CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION + * + * \param data - A two-dimensional array containing pointers to memory + * locations where the result of each attribute query will be written to. + * \param dataSizes - Array containing the sizes of each result + * \param attributes - An array of attributes to query + * (numAttributes and the number of attributes in this array should match) + * \param numAttributes - Number of attributes to query + * \param devPtr - Start of the range to query + * \param count - Size of the range to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuMemRangeGetAttribute, ::cuMemAdvise, + * ::cuMemPrefetchAsync, + * ::cudaMemRangeGetAttributes + */ +CUresult CUDAAPI cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count); + +/** + * \brief Set attributes on a previously allocated memory region + * + * The supported attributes are: + * + * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS: + * + * A boolean attribute that can either be set (1) or unset (0). When set, + * the region of memory that \p ptr points to is guaranteed to always synchronize + * memory operations that are synchronous. If there are some previously initiated + * synchronous memory operations that are pending when this attribute is set, the + * function does not return until those memory operations are complete. + * See further documentation in the section titled "API synchronization behavior" + * to learn more about cases when synchronous memory operations can + * exhibit asynchronous behavior. + * \p value will be considered as a pointer to an unsigned integer to which this attribute is to be set. + * + * \param value - Pointer to memory containing the value to be set + * \param attribute - Pointer attribute to set + * \param ptr - Pointer to a memory region allocated using CUDA memory allocation APIs + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa ::cuPointerGetAttribute, + * ::cuPointerGetAttributes, + * ::cuMemAlloc, + * ::cuMemFree, + * ::cuMemAllocHost, + * ::cuMemFreeHost, + * ::cuMemHostAlloc, + * ::cuMemHostRegister, + * ::cuMemHostUnregister + */ +CUresult CUDAAPI cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr); + +/** + * \brief Returns information about a pointer. 
+ * + * The supported attributes are (refer to ::cuPointerGetAttribute for attribute descriptions and restrictions): + * + * - ::CU_POINTER_ATTRIBUTE_CONTEXT + * - ::CU_POINTER_ATTRIBUTE_MEMORY_TYPE + * - ::CU_POINTER_ATTRIBUTE_DEVICE_POINTER + * - ::CU_POINTER_ATTRIBUTE_HOST_POINTER + * - ::CU_POINTER_ATTRIBUTE_SYNC_MEMOPS + * - ::CU_POINTER_ATTRIBUTE_BUFFER_ID + * - ::CU_POINTER_ATTRIBUTE_IS_MANAGED + * - ::CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL + * - ::CU_POINTER_ATTRIBUTE_RANGE_START_ADDR + * - ::CU_POINTER_ATTRIBUTE_RANGE_SIZE + * - ::CU_POINTER_ATTRIBUTE_MAPPED + * - ::CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE + * - ::CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES + * - ::CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE + * + * \param numAttributes - Number of attributes to query + * \param attributes - An array of attributes to query + * (numAttributes and the number of attributes in this array should match) + * \param data - A two-dimensional array containing pointers to memory + * locations where the result of each attribute query will be written to. + * \param ptr - Pointer to query + * + * Unlike ::cuPointerGetAttribute, this function will not return an error when the \p ptr + * encountered is not a valid CUDA pointer. Instead, the attributes are assigned default NULL values + * and CUDA_SUCCESS is returned. + * + * If \p ptr was not allocated by, mapped by, or registered with a ::CUcontext which uses UVA + * (Unified Virtual Addressing), ::CUDA_ERROR_INVALID_CONTEXT is returned. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuPointerGetAttribute, + * ::cuPointerSetAttribute, + * ::cudaPointerGetAttributes + */ +CUresult CUDAAPI cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr); + +/** @} */ /* END CUDA_UNIFIED */ + +/** + * \defgroup CUDA_STREAM Stream Management + * + * ___MANBRIEF___ stream management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Create a stream + * + * Creates a stream and returns a handle in \p phStream. The \p Flags argument + * determines behaviors of the stream. + * + * Valid values for \p Flags are: + * - ::CU_STREAM_DEFAULT: Default stream creation flag. + * - ::CU_STREAM_NON_BLOCKING: Specifies that work running in the created + * stream may run concurrently with work in stream 0 (the NULL stream), and that + * the created stream should perform no implicit synchronization with stream 0. 
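+ *
+ * \par
+ * Editor's sketch (not part of the original NVIDIA header): creating a
+ * non-blocking stream, waiting for its work, and destroying it. Error checking
+ * is omitted. \code
+ CUstream stream;
+ cuStreamCreate(&stream, CU_STREAM_NON_BLOCKING); // no implicit synchronization with the NULL stream
+ // ... enqueue asynchronous work into stream ...
+ cuStreamSynchronize(stream);                     // wait for the enqueued work to finish
+ cuStreamDestroy(stream);
+ * \endcode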
+ * + * \param phStream - Returned newly created stream + * \param Flags - Parameters for stream creation + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreate, + * ::cudaStreamCreateWithFlags + */ +CUresult CUDAAPI cuStreamCreate(CUstream *phStream, unsigned int Flags); + +/** + * \brief Create a stream with the given priority + * + * Creates a stream with the specified priority and returns a handle in \p phStream. + * This API alters the scheduler priority of work in the stream. Work in a higher + * priority stream may preempt work already executing in a low priority stream. + * + * \p priority follows a convention where lower numbers represent higher priorities. + * '0' represents default priority. The range of meaningful numerical priorities can + * be queried using ::cuCtxGetStreamPriorityRange. If the specified priority is + * outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + * it will automatically be clamped to the lowest or the highest number in the range. + * + * \param phStream - Returned newly created stream + * \param flags - Flags for stream creation. See ::cuStreamCreate for a list of + * valid flags + * \param priority - Stream priority. Lower numbers represent higher priorities. + * See ::cuCtxGetStreamPriorityRange for more information about + * meaningful stream priorities that can be passed. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \note Stream priorities are supported only on GPUs + * with compute capability 3.5 or higher. + * + * \note In the current implementation, only compute kernels launched in + * priority streams are affected by the stream's priority. Stream priorities have + * no effect on host-to-device and device-to-host memory operations. + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamGetPriority, + * ::cuCtxGetStreamPriorityRange, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreateWithPriority + */ +CUresult CUDAAPI cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority); + + +/** + * \brief Query the priority of a given stream + * + * Query the priority of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + * and return the priority in \p priority. Note that if the stream was created with a + * priority outside the numerical range returned by ::cuCtxGetStreamPriorityRange, + * this function returns the clamped priority. + * See ::cuStreamCreateWithPriority for details about priority clamping. 
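+ *
+ * \par
+ * Editor's sketch (not part of the original NVIDIA header): creating a stream
+ * at the highest allowed priority and reading the (possibly clamped) value
+ * back. Error checking is omitted. \code
+ int leastPriority, greatestPriority;
+ cuCtxGetStreamPriorityRange(&leastPriority, &greatestPriority); // lower number == higher priority
+ CUstream stream;
+ cuStreamCreateWithPriority(&stream, CU_STREAM_NON_BLOCKING, greatestPriority);
+ int actualPriority;
+ cuStreamGetPriority(stream, &actualPriority);                   // returns the clamped priority
+ * \endcode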
+ * + * \param hStream - Handle to the stream to be queried + * \param priority - Pointer to a signed integer in which the stream's priority is returned + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamCreateWithPriority, + * ::cuCtxGetStreamPriorityRange, + * ::cuStreamGetFlags, + * ::cudaStreamGetPriority + */ +CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); + +/** + * \brief Query the flags of a given stream + * + * Query the flags of a stream created using ::cuStreamCreate or ::cuStreamCreateWithPriority + * and return the flags in \p flags. + * + * \param hStream - Handle to the stream to be queried + * \param flags - Pointer to an unsigned integer in which the stream's flags are returned + * The value returned in \p flags is a logical 'OR' of all flags that + * were used while creating this stream. See ::cuStreamCreate for the list + * of valid flags + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreate, + * ::cuStreamGetPriority, + * ::cudaStreamGetFlags + */ +CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); + +/** + * \brief Query the context associated with a stream + * + * Returns the CUDA context that the stream is associated with. + * + * The stream handle \p hStream can refer to any of the following: + * <ul> + * <li>a stream created via any of the CUDA driver APIs such as ::cuStreamCreate + * and ::cuStreamCreateWithPriority, or their runtime API equivalents such as + * ::cudaStreamCreate, ::cudaStreamCreateWithFlags and ::cudaStreamCreateWithPriority. + * The returned context is the context that was active in the calling thread when the + * stream was created. Passing an invalid handle will result in undefined behavior.</li> + * <li>any of the special streams such as the NULL stream, ::CU_STREAM_LEGACY and + * ::CU_STREAM_PER_THREAD. The runtime API equivalents of these are also accepted, + * which are NULL, ::cudaStreamLegacy and ::cudaStreamPerThread respectively. + * Specifying any of the special handles will return the context current to the + * calling thread. If no context is current to the calling thread, + * ::CUDA_ERROR_INVALID_CONTEXT is returned.</li> + * </ul> + * + * \param hStream - Handle to the stream to be queried + * \param pctx - Returned context associated with the stream + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * \notefnerr + * + * \sa ::cuStreamDestroy, + * ::cuStreamCreateWithPriority, + * ::cuStreamGetPriority, + * ::cuStreamGetFlags, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamCreate, + * ::cudaStreamCreateWithFlags + */ +CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); + +/** + * \brief Make a compute stream wait on an event + * + * Makes all future work submitted to \p hStream wait for all work captured in + * \p hEvent. See ::cuEventRecord() for details on what is captured by an event. 
+ * The synchronization will be performed efficiently on the device when applicable. + * \p hEvent may be from a different context or device than \p hStream. + * + * flags include: + * - ::CU_EVENT_WAIT_DEFAULT: Default event creation flag. + * - ::CU_EVENT_WAIT_EXTERNAL: Event is captured in the graph as an external + * event node when performing stream capture. This flag is invalid outside + * of stream capture. + * + * \param hStream - Stream to wait + * \param hEvent - Event to wait on (may not be NULL) + * \param Flags - See ::CUevent_capture_flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuEventRecord, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cuStreamDestroy, + * ::cudaStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); + +/** + * \brief Add a callback to a compute stream + * + * \note This function is slated for eventual deprecation and removal. If + * you do not require the callback to execute in case of a device error, + * consider using ::cuLaunchHostFunc. Additionally, this function is not + * supported with ::cuStreamBeginCapture and ::cuStreamEndCapture, unlike + * ::cuLaunchHostFunc. + * + * Adds a callback to be called on the host after all currently enqueued + * items in the stream have completed. For each + * cuStreamAddCallback call, the callback will be executed exactly once. + * The callback will block later work in the stream until it is finished. + * + * The callback may be passed ::CUDA_SUCCESS or an error code. In the event + * of a device error, all subsequently executed callbacks will receive an + * appropriate ::CUresult. + * + * Callbacks must not make any CUDA API calls. Attempting to use a CUDA API + * will result in ::CUDA_ERROR_NOT_PERMITTED. Callbacks must not perform any + * synchronization that may depend on outstanding device work or other callbacks + * that are not mandated to run earlier. Callbacks without a mandated order + * (in independent streams) execute in undefined order and may be serialized. + * + * For the purposes of Unified Memory, callback execution makes a number of + * guarantees: + * <ul> + * <li>The callback stream is considered idle for the duration of the + * callback. Thus, for example, a callback may always use memory attached + * to the callback stream.</li> + * <li>The start of execution of a callback has the same effect as + * synchronizing an event recorded in the same stream immediately prior to + * the callback. It thus synchronizes streams which have been "joined" + * prior to the callback.</li> + * <li>Adding device work to any stream does not have the effect of making + * the stream active until all preceding host functions and stream callbacks + * have executed. Thus, for + * example, a callback might use global attached memory even if work has + * been added to another stream, if the work has been ordered behind the + * callback with an event.</li> + * <li>Completion of a callback does not cause a stream to become + * active except as described above. The callback stream will remain idle + * if no device work follows the callback, and will remain idle across + * consecutive callbacks without device work in between. 
Thus, for example, + * stream synchronization can be done by signaling from a callback at the + * end of the stream.</li> + * </ul> + * + * \param hStream - Stream to add callback to + * \param callback - The function to call once preceding stream operations are complete + * \param userData - User specified data to be passed to the callback function + * \param flags - Reserved for future use, must be 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cuStreamAttachMemAsync, + * ::cuStreamLaunchHostFunc, + * ::cudaStreamAddCallback + */ +CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); + +/** + * \brief Begins graph capture on a stream + * + * Begin graph capture on \p hStream. When a stream is in capture mode, all operations + * pushed into the stream will not be executed, but will instead be captured into + * a graph, which will be returned via ::cuStreamEndCapture. Capture may not be initiated + * if \p stream is CU_STREAM_LEGACY. Capture must be ended on the same stream in which + * it was initiated, and it may only be initiated if the stream is not already in capture + * mode. The capture mode may be queried via ::cuStreamIsCapturing. A unique id + * representing the capture sequence may be queried via ::cuStreamGetCaptureInfo. + * + * If \p mode is not ::CU_STREAM_CAPTURE_MODE_RELAXED, ::cuStreamEndCapture must be + * called on this stream from the same thread. + * + * \param hStream - Stream in which to initiate capture + * \param mode - Controls the interaction of this capture sequence with other API + * calls that are potentially unsafe. For more details see + * ::cuThreadExchangeStreamCaptureMode. + * + * \note Kernels captured using this API must not use texture and surface references. + * Reading or writing through any texture or surface reference is undefined + * behavior. This restriction does not apply to texture and surface objects. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamIsCapturing, + * ::cuStreamEndCapture, + * ::cuThreadExchangeStreamCaptureMode + */ +CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode); + +/** + * \brief Swaps the stream capture interaction mode for a thread + * + * Sets the calling thread's stream capture interaction mode to the value contained + * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To + * facilitate deterministic behavior across function or module boundaries, callers + * are encouraged to use this API in a push-pop fashion: \code + CUstreamCaptureMode mode = desiredMode; + cuThreadExchangeStreamCaptureMode(&mode); + ... + cuThreadExchangeStreamCaptureMode(&mode); // restore previous mode + * \endcode + * + * During stream capture (see ::cuStreamBeginCapture), some actions, such as a call + * to ::cudaMalloc, may be unsafe. In the case of ::cudaMalloc, the operation is + * not enqueued asynchronously to a stream, and is not observed by stream capture. 
+ * Therefore, if the sequence of operations captured via ::cuStreamBeginCapture + * depended on the allocation being replayed whenever the graph is launched, the + * captured graph would be invalid. + * + * Therefore, stream capture places restrictions on API calls that can be made within + * or concurrently to a ::cuStreamBeginCapture-::cuStreamEndCapture sequence. This + * behavior can be controlled via this API and flags to ::cuStreamBeginCapture. + * + * A thread's mode is one of the following: + * - \p CU_STREAM_CAPTURE_MODE_GLOBAL: This is the default mode. If the local thread has + * an ongoing capture sequence that was not initiated with + * \p CU_STREAM_CAPTURE_MODE_RELAXED at \p cuStreamBeginCapture, or if any other thread + * has a concurrent capture sequence initiated with \p CU_STREAM_CAPTURE_MODE_GLOBAL, + * this thread is prohibited from potentially unsafe API calls. + * - \p CU_STREAM_CAPTURE_MODE_THREAD_LOCAL: If the local thread has an ongoing capture + * sequence not initiated with \p CU_STREAM_CAPTURE_MODE_RELAXED, it is prohibited + * from potentially unsafe API calls. Concurrent capture sequences in other threads + * are ignored. + * - \p CU_STREAM_CAPTURE_MODE_RELAXED: The local thread is not prohibited from potentially + * unsafe API calls. Note that the thread is still prohibited from API calls which + * necessarily conflict with stream capture, for example, attempting ::cuEventQuery + * on an event that was last recorded inside a capture sequence. + * + * \param mode - Pointer to mode value to swap with the current mode + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuStreamBeginCapture + */ +CUresult CUDAAPI cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode); + +/** + * \brief Ends capture on a stream, returning the captured graph + * + * End capture on \p hStream, returning the captured graph via \p phGraph. + * Capture must have been initiated on \p hStream via a call to ::cuStreamBeginCapture. + * If capture was invalidated, due to a violation of the rules of stream capture, then + * a NULL graph will be returned. + * + * If the \p mode argument to ::cuStreamBeginCapture was not + * ::CU_STREAM_CAPTURE_MODE_RELAXED, this call must be from the same thread as + * ::cuStreamBeginCapture. + * + * \param hStream - Stream to query + * \param phGraph - The captured graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD + * \notefnerr + * + * \sa + * ::cuStreamCreate, + * ::cuStreamBeginCapture, + * ::cuStreamIsCapturing + */ +CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); + +/** + * \brief Returns a stream's capture status + * + * Return the capture status of \p hStream via \p captureStatus. After a successful + * call, \p *captureStatus will contain one of the following: + * - ::CU_STREAM_CAPTURE_STATUS_NONE: The stream is not capturing. + * - ::CU_STREAM_CAPTURE_STATUS_ACTIVE: The stream is capturing. + * - ::CU_STREAM_CAPTURE_STATUS_INVALIDATED: The stream was capturing but an error + * has invalidated the capture sequence. The capture sequence must be terminated + * with ::cuStreamEndCapture on the stream where it was initiated in order to + * continue using \p hStream. 
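+ *
+ * For orientation, a condensed sketch of a capture sequence using the calls in
+ * this group; it assumes a current context and a stream \p hStream created
+ * with ::cuStreamCreate, and omits error checking:
+ * \code
+    cuStreamBeginCapture(hStream, CU_STREAM_CAPTURE_MODE_GLOBAL);
+
+    CUstreamCaptureStatus status;
+    cuStreamIsCapturing(hStream, &status);   // CU_STREAM_CAPTURE_STATUS_ACTIVE here
+
+    // Work submitted to hStream at this point (kernel launches, async copies,
+    // event records, ...) is recorded into a graph instead of being executed.
+
+    CUgraph graph = NULL;
+    cuStreamEndCapture(hStream, &graph);     // NULL if the capture was invalidated
+ * \endcode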
+ *
+ * Note that, if this is called on ::CU_STREAM_LEGACY (the "null stream") while
+ * a blocking stream in the same context is capturing, it will return
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT and \p *captureStatus is unspecified
+ * after the call. The blocking stream capture is not invalidated.
+ *
+ * When a blocking stream is capturing, the legacy stream is in an
+ * unusable state until the blocking stream capture is terminated. The legacy
+ * stream is not supported for stream capture, but attempted use would have an
+ * implicit dependency on the capturing stream(s).
+ *
+ * \param hStream - Stream to query
+ * \param captureStatus - Returns the stream's capture status
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamCreate,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamEndCapture
+ */
+CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus);
+
+/**
+ * \brief Query capture status of a stream
+ *
+ * Note there is a later version of this API, ::cuStreamGetCaptureInfo_v2. It will
+ * supplant this version in 12.0; this version is retained for minor version compatibility.
+ *
+ * Query the capture status of a stream and get an id for
+ * the capture sequence, which is unique over the lifetime of the process.
+ *
+ * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created
+ * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
+ *
+ * A valid id is returned only if both of the following are true:
+ * - the call returns CUDA_SUCCESS
+ * - captureStatus is set to ::CU_STREAM_CAPTURE_STATUS_ACTIVE
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
+ * \notefnerr
+ *
+ * \sa
+ * ::cuStreamGetCaptureInfo_v2,
+ * ::cuStreamBeginCapture,
+ * ::cuStreamIsCapturing
+ */
+CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out);
+
+/**
+ * \brief Query a stream's capture state (11.3+)
+ *
+ * Query stream state related to stream capture.
+ *
+ * If called on ::CU_STREAM_LEGACY (the "null stream") while a stream not created
+ * with ::CU_STREAM_NON_BLOCKING is capturing, returns ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT.
+ *
+ * Valid data (other than capture status) is returned only if both of the following are true:
+ * - the call returns CUDA_SUCCESS
+ * - the returned capture status is ::CU_STREAM_CAPTURE_STATUS_ACTIVE
+ *
+ * This version of cuStreamGetCaptureInfo is introduced in CUDA 11.3 and will supplant the
+ * previous version in 12.0. Developers requiring compatibility across minor versions to
+ * CUDA 11.0 (driver version 445) should use ::cuStreamGetCaptureInfo or include a fallback
+ * path.
+ *
+ * \param hStream - The stream to query
+ * \param captureStatus_out - Location to return the capture status of the stream; required
+ * \param id_out - Optional location to return an id for the capture sequence, which is
+ * unique over the lifetime of the process
+ * \param graph_out - Optional location to return the graph being captured into. All
+ * operations other than destroy and node removal are permitted on the graph
+ * while the capture sequence is in progress. This API does not transfer
+ * ownership of the graph, which is transferred or destroyed at
+ * ::cuStreamEndCapture.
Note that the graph handle may be invalidated before + * end of capture for certain errors. Nodes that are or become + * unreachable from the original stream at ::cuStreamEndCapture due to direct + * actions on the graph do not trigger ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED. + * \param dependencies_out - Optional location to store a pointer to an array of nodes. + * The next node to be captured in the stream will depend on this set of nodes, + * absent operations such as event wait which modify this set. The array pointer + * is valid until the next API call which operates on the stream or until end of + * capture. The node handles may be copied out and are valid until they or the + * graph is destroyed. The driver-owned array may also be passed directly to + * APIs that operate on the graph (not the stream) without copying. + * \param numDependencies_out - Optional location to store the size of the array + * returned in dependencies_out. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_STREAM_CAPTURE_IMPLICIT + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuStreamGetCaptureInfo, + * ::cuStreamBeginCapture, + * ::cuStreamIsCapturing, + * ::cuStreamUpdateCaptureDependencies + */ +CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, + cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); + +/** + * \brief Update the set of dependencies in a capturing stream (11.3+) + * + * Modifies the dependency set of a capturing stream. The dependency set is the set + * of nodes that the next captured node in the stream will depend on. + * + * Valid flags are ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES and + * ::CU_STREAM_SET_CAPTURE_DEPENDENCIES. These control whether the set passed to + * the API is added to the existing set or replaces it. A flags value of 0 defaults + * to ::CU_STREAM_ADD_CAPTURE_DEPENDENCIES. + * + * Nodes that are removed from the dependency set via this API do not result in + * ::CUDA_ERROR_STREAM_CAPTURE_UNJOINED if they are unreachable from the stream at + * ::cuStreamEndCapture. + * + * Returns ::CUDA_ERROR_ILLEGAL_STATE if the stream is not capturing. + * + * This API is new in CUDA 11.3. Developers requiring compatibility across minor + * versions to CUDA 11.0 should not use this API or provide a fallback. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_ILLEGAL_STATE + * + * \sa + * ::cuStreamBeginCapture, + * ::cuStreamGetCaptureInfo, + * ::cuStreamGetCaptureInfo_v2 + */ +CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); + +/** + * \brief Attach memory to a stream asynchronously + * + * Enqueues an operation in \p hStream to specify stream association of + * \p length bytes of memory starting from \p dptr. This function is a + * stream-ordered operation, meaning that it is dependent on, and will + * only take effect when, previous work in stream has completed. Any + * previous association is automatically replaced. + * + * \p dptr must point to one of the following types of memories: + * - managed memory declared using the __managed__ keyword or allocated with + * ::cuMemAllocManaged. + * - a valid host-accessible region of system-allocated pageable memory. 
This + * type of memory may only be specified if the device associated with the + * stream reports a non-zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS. + * + * For managed allocations, \p length must be either zero or the entire + * allocation's size. Both indicate that the entire allocation's stream + * association is being changed. Currently, it is not possible to change stream + * association for a portion of a managed allocation. + * + * For pageable host allocations, \p length must be non-zero. + * + * The stream association is specified using \p flags which must be + * one of ::CUmemAttach_flags. + * If the ::CU_MEM_ATTACH_GLOBAL flag is specified, the memory can be accessed + * by any stream on any device. + * If the ::CU_MEM_ATTACH_HOST flag is specified, the program makes a guarantee + * that it won't access the memory on the device from any stream on a device that + * has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. + * If the ::CU_MEM_ATTACH_SINGLE flag is specified and \p hStream is associated with + * a device that has a zero value for the device attribute ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, + * the program makes a guarantee that it will only access the memory on the device + * from \p hStream. It is illegal to attach singly to the NULL stream, because the + * NULL stream is a virtual global stream and not a specific stream. An error will + * be returned in this case. + * + * When memory is associated with a single stream, the Unified Memory system will + * allow CPU access to this memory region so long as all operations in \p hStream + * have completed, regardless of whether other streams are active. In effect, + * this constrains exclusive ownership of the managed memory region by + * an active GPU to per-stream activity instead of whole-GPU activity. + * + * Accessing memory on the device from streams that are not associated with + * it will produce undefined results. No error checking is performed by the + * Unified Memory system to ensure that kernels launched into other streams + * do not access this region. + * + * It is a program's responsibility to order calls to ::cuStreamAttachMemAsync + * via events, synchronization or other means to ensure legal access to memory + * at all times. Data visibility and coherency will be changed appropriately + * for all kernels which follow a stream-association change. + * + * If \p hStream is destroyed while data is associated with it, the association is + * removed and the association reverts to the default visibility of the allocation + * as specified at ::cuMemAllocManaged. For __managed__ variables, the default + * association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a stream is an + * asynchronous operation, and as a result, the change to default association won't + * happen until all work in the stream has completed. 
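+ *
+ * A short sketch of the association change described above; it assumes a
+ * current context and an existing stream \p hStream, and omits error checking:
+ * \code
+    CUdeviceptr managed;
+    cuMemAllocManaged(&managed, 4096, CU_MEM_ATTACH_GLOBAL);
+
+    // Associate the whole allocation with hStream (length 0 = entire allocation).
+    cuStreamAttachMemAsync(hStream, managed, 0, CU_MEM_ATTACH_SINGLE);
+
+    // ... launch work in hStream that accesses 'managed' ...
+
+    cuStreamSynchronize(hStream);
+    // Once hStream has drained, the CPU may access the allocation (through a
+    // host pointer cast from 'managed'), even while other streams stay busy.
+
+    cuMemFree(managed);
+ * \endcode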
+ * + * \param hStream - Stream in which to enqueue the attach operation + * \param dptr - Pointer to memory (must be a pointer to managed memory or + * to a valid host-accessible region of system-allocated + * pageable memory) + * \param length - Length of memory + * \param flags - Must be one of ::CUmemAttach_flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cudaStreamAttachMemAsync + */ +CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); + +/** + * \brief Determine status of a compute stream + * + * Returns ::CUDA_SUCCESS if all operations in the stream specified by + * \p hStream have completed, or ::CUDA_ERROR_NOT_READY if not. + * + * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS + * is equivalent to having called ::cuStreamSynchronize(). + * + * \param hStream - Stream to query status of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_READY + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamQuery + */ +CUresult CUDAAPI cuStreamQuery(CUstream hStream); + +/** + * \brief Wait until a stream's tasks are completed + * + * Waits until the device has completed all operations in the stream specified + * by \p hStream. If the context was created with the + * ::CU_CTX_SCHED_BLOCKING_SYNC flag, the CPU thread will block until the + * stream is finished with all of its tasks. + * + * \param hStream - Stream to wait for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamDestroy, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamAddCallback, + * ::cudaStreamSynchronize + */ +CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); + +/** + * \brief Destroys a stream + * + * Destroys the stream specified by \p hStream. + * + * In case the device is still doing work in the stream \p hStream + * when ::cuStreamDestroy() is called, the function will return immediately + * and the resources associated with \p hStream will be released automatically + * once the device has completed all work in \p hStream. + * + * \param hStream - Stream to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamWaitEvent, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamAddCallback, + * ::cudaStreamDestroy + */ +CUresult CUDAAPI cuStreamDestroy(CUstream hStream); + +/** + * \brief Copies attributes from source stream to destination stream. + * + * Copies attributes from source stream \p src to destination stream \p dst. + * Both streams must have the same context. 
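+ *
+ * A brief sketch combining the attribute calls in this group with the
+ * query/synchronize/destroy calls above; \p srcStream and \p dstStream are
+ * assumed to be existing streams in the same context, and error checking is
+ * omitted:
+ * \code
+    // Give dstStream the same attributes as srcStream; only work submitted
+    // afterwards observes them.
+    cuStreamCopyAttributes(dstStream, srcStream);
+
+    CUstreamAttrValue value;
+    cuStreamGetAttribute(dstStream, CU_STREAM_ATTRIBUTE_ACCESS_POLICY_WINDOW, &value);
+
+    // Poll first, block if necessary, then release the stream.
+    if (cuStreamQuery(dstStream) == CUDA_ERROR_NOT_READY) {
+        cuStreamSynchronize(dstStream);
+    }
+    cuStreamDestroy(dstStream);
+ * \endcode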
+ * + * \param[out] dst Destination stream + * \param[in] src Source stream + * For list of attributes see ::CUstreamAttrID + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuStreamCopyAttributes(CUstream dst, CUstream src); + +/** + * \brief Queries stream attribute. + * + * Queries attribute \p attr from \p hStream and stores it in corresponding + * member of \p value_out. + * + * \param[in] hStream + * \param[in] attr + * \param[out] value_out + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, + CUstreamAttrValue *value_out); + +/** + * \brief Sets stream attribute. + * + * Sets attribute \p attr on \p hStream from corresponding attribute of + * \p value. The updated attribute will be applied to subsequent work + * submitted to the stream. It will not affect previously submitted work. + * + * \param[out] hStream + * \param[in] attr + * \param[in] value + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, + const CUstreamAttrValue *value); + +/** @} */ /* END CUDA_STREAM */ + + +/** + * \defgroup CUDA_EVENT Event Management + * + * ___MANBRIEF___ event management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the event management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Creates an event + * + * Creates an event *phEvent for the current context with the flags specified via + * \p Flags. Valid flags include: + * - ::CU_EVENT_DEFAULT: Default event creation flag. + * - ::CU_EVENT_BLOCKING_SYNC: Specifies that the created event should use blocking + * synchronization. A CPU thread that uses ::cuEventSynchronize() to wait on + * an event created with this flag will block until the event has actually + * been recorded. + * - ::CU_EVENT_DISABLE_TIMING: Specifies that the created event does not need + * to record timing data. Events created with this flag specified and + * the ::CU_EVENT_BLOCKING_SYNC flag not specified will provide the best + * performance when used with ::cuStreamWaitEvent() and ::cuEventQuery(). + * - ::CU_EVENT_INTERPROCESS: Specifies that the created event may be used as an + * interprocess event by ::cuIpcGetEventHandle(). ::CU_EVENT_INTERPROCESS must + * be specified along with ::CU_EVENT_DISABLE_TIMING. + * + * \param phEvent - Returns newly created event + * \param Flags - Event creation flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventCreate, + * ::cudaEventCreateWithFlags + */ +CUresult CUDAAPI cuEventCreate(CUevent *phEvent, unsigned int Flags); + +/** + * \brief Records an event + * + * Captures in \p hEvent the contents of \p hStream at the time of this call. + * \p hEvent and \p hStream must be from the same context. 
+ * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then + * examine or wait for completion of the work that was captured. Uses of + * \p hStream after this call do not modify \p hEvent. See note on default + * stream behavior for what is captured in the default case. + * + * ::cuEventRecord() can be called multiple times on the same event and + * will overwrite the previously captured state. Other APIs such as + * ::cuStreamWaitEvent() use the most recently captured state at the time + * of the API call, and are not affected by later calls to + * ::cuEventRecord(). Before the first call to ::cuEventRecord(), an + * event represents an empty set of work, so for example ::cuEventQuery() + * would return ::CUDA_SUCCESS. + * + * \param hEvent - Event to record + * \param hStream - Stream to record event for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \note_null_stream + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuStreamWaitEvent, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventRecord, + * ::cuEventRecordWithFlags + */ +CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); + +/** + * \brief Records an event + * + * Captures in \p hEvent the contents of \p hStream at the time of this call. + * \p hEvent and \p hStream must be from the same context. + * Calls such as ::cuEventQuery() or ::cuStreamWaitEvent() will then + * examine or wait for completion of the work that was captured. Uses of + * \p hStream after this call do not modify \p hEvent. See note on default + * stream behavior for what is captured in the default case. + * + * ::cuEventRecordWithFlags() can be called multiple times on the same event and + * will overwrite the previously captured state. Other APIs such as + * ::cuStreamWaitEvent() use the most recently captured state at the time + * of the API call, and are not affected by later calls to + * ::cuEventRecordWithFlags(). Before the first call to ::cuEventRecordWithFlags(), an + * event represents an empty set of work, so for example ::cuEventQuery() + * would return ::CUDA_SUCCESS. + * + * flags include: + * - ::CU_EVENT_RECORD_DEFAULT: Default event creation flag. + * - ::CU_EVENT_RECORD_EXTERNAL: Event is captured in the graph as an external + * event node when performing stream capture. This flag is invalid outside + * of stream capture. + * + * \param hEvent - Event to record + * \param hStream - Stream to record event for + * \param flags - See ::CUevent_capture_flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \note_null_stream + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuStreamWaitEvent, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cuEventRecord, + * ::cudaEventRecord + */ +CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); + +/** + * \brief Queries an event's status + * + * Queries the status of all work currently captured by \p hEvent. See + * ::cuEventRecord() for details on what is captured by an event. + * + * Returns ::CUDA_SUCCESS if all captured work has been completed, or + * ::CUDA_ERROR_NOT_READY if any captured work is incomplete. 
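+ *
+ * A compact sketch of the record/query pattern; it assumes a current context
+ * and two existing streams \p hStream and \p hOtherStream, and omits error
+ * checking:
+ * \code
+    CUevent ev;
+    cuEventCreate(&ev, CU_EVENT_DEFAULT);
+
+    // ... enqueue work into hStream ...
+    cuEventRecord(ev, hStream);                 // capture the work submitted so far
+
+    cuStreamWaitEvent(hOtherStream, ev, 0);     // device-side dependency, host not blocked
+
+    while (cuEventQuery(ev) == CUDA_ERROR_NOT_READY) {
+        // do useful CPU work instead of blocking
+    }
+    cuEventDestroy(ev);
+ * \endcode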
+ * + * For the purposes of Unified Memory, a return value of ::CUDA_SUCCESS + * is equivalent to having called ::cuEventSynchronize(). + * + * \param hEvent - Event to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_READY + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventQuery + */ +CUresult CUDAAPI cuEventQuery(CUevent hEvent); + +/** + * \brief Waits for an event to complete + * + * Waits until the completion of all work currently captured in \p hEvent. + * See ::cuEventRecord() for details on what is captured by an event. + * + * Waiting for an event that was created with the ::CU_EVENT_BLOCKING_SYNC + * flag will cause the calling CPU thread to block until the event has + * been completed by the device. If the ::CU_EVENT_BLOCKING_SYNC flag has + * not been set, then the CPU thread will busy-wait until the event has + * been completed by the device. + * + * \param hEvent - Event to wait for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventDestroy, + * ::cuEventElapsedTime, + * ::cudaEventSynchronize + */ +CUresult CUDAAPI cuEventSynchronize(CUevent hEvent); + +/** + * \brief Destroys an event + * + * Destroys the event specified by \p hEvent. + * + * An event may be destroyed before it is complete (i.e., while + * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY). In this case, the + * call does not block on completion of the event, and any associated + * resources will automatically be released asynchronously at completion. + * + * \param hEvent - Event to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventElapsedTime, + * ::cudaEventDestroy + */ +CUresult CUDAAPI cuEventDestroy(CUevent hEvent); + +/** + * \brief Computes the elapsed time between two events + * + * Computes the elapsed time between two events (in milliseconds with a + * resolution of around 0.5 microseconds). + * + * If either event was last recorded in a non-NULL stream, the resulting time + * may be greater than expected (even if both used the same stream handle). This + * happens because the ::cuEventRecord() operation takes place asynchronously + * and there is no guarantee that the measured latency is actually just between + * the two events. Any number of other different stream operations could execute + * in between the two measured events, thus altering the timing in a significant + * way. + * + * If ::cuEventRecord() has not been called on either event then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If ::cuEventRecord() has been called + * on both events but one or both of them has not yet been completed (that is, + * ::cuEventQuery() would return ::CUDA_ERROR_NOT_READY on at least one of the + * events), ::CUDA_ERROR_NOT_READY is returned. If either event was created with + * the ::CU_EVENT_DISABLE_TIMING flag, then this function will return + * ::CUDA_ERROR_INVALID_HANDLE. 
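+ *
+ * The usual timing idiom, sketched under the assumption of a current context
+ * and an existing stream \p hStream (both events are created without
+ * ::CU_EVENT_DISABLE_TIMING; error checking omitted):
+ * \code
+    CUevent start, stop;
+    cuEventCreate(&start, CU_EVENT_DEFAULT);
+    cuEventCreate(&stop, CU_EVENT_DEFAULT);
+
+    cuEventRecord(start, hStream);
+    // ... work to be timed, submitted to hStream ...
+    cuEventRecord(stop, hStream);
+
+    cuEventSynchronize(stop);                   // wait until 'stop' has completed
+    float ms = 0.0f;
+    cuEventElapsedTime(&ms, start, stop);
+
+    cuEventDestroy(start);
+    cuEventDestroy(stop);
+ * \endcode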
+ * + * \param pMilliseconds - Time between \p hStart and \p hEnd in ms + * \param hStart - Starting event + * \param hEnd - Ending event + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_READY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuEventCreate, + * ::cuEventRecord, + * ::cuEventQuery, + * ::cuEventSynchronize, + * ::cuEventDestroy, + * ::cudaEventElapsedTime + */ +CUresult CUDAAPI cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd); + +/** @} */ /* END CUDA_EVENT */ + +/** + * \defgroup CUDA_EXTRES_INTEROP External Resource Interoperability + * + * ___MANBRIEF___ External resource interoperability functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the external resource interoperability functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + + /** + * \brief Imports an external memory object + * + * Imports an externally allocated memory object and returns + * a handle to that in \p extMem_out. + * + * The properties of the handle being imported must be described in + * \p memHandleDesc. The ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC structure + * is defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st { + CUexternalMemoryHandleType type; + union { + int fd; + struct { + void *handle; + const void *name; + } win32; + const void *nvSciBufObject; + } handle; + unsigned long long size; + unsigned int flags; + } CUDA_EXTERNAL_MEMORY_HANDLE_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type specifies the type + * of handle being imported. ::CUexternalMemoryHandleType is + * defined as: + * + * \code + typedef enum CUexternalMemoryHandleType_enum { + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP = 4, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE = 5, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE = 6, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT = 7, + CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF = 8 + } CUexternalMemoryHandleType; + * \endcode + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::fd must be a valid + * file descriptor referencing a memory object. Ownership of + * the file descriptor is transferred to the CUDA driver when the + * handle is imported successfully. Performing any operations on the + * file descriptor after it is imported results in undefined behavior. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a memory object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. 
If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a memory object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must + * be non-NULL and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * must be NULL. The handle specified must be a globally shared KMT + * handle. This handle does not hold a reference to the underlying + * object, and thus will be invalid when all references to the + * memory object are destroyed. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3D12Device::CreateSharedHandle when referring to a + * ID3D12Heap object. This handle holds a reference to the underlying + * object. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D12Heap object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE, then exactly one + * of ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name must not be + * NULL. If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3D12Device::CreateSharedHandle when referring to a + * ID3D12Resource object. This handle holds a reference to the + * underlying object. If + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D12Resource object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must + * represent a valid shared NT handle that is returned by + * IDXGIResource1::CreateSharedHandle when referring to a + * ID3D11Resource object. If + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D11Resource object. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::handle must + * represent a valid shared KMT handle that is returned by + * IDXGIResource::GetSharedHandle when referring to a + * ID3D11Resource object and + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::win32::name + * must be NULL. + * + * If ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type is + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::handle::nvSciBufObject must be non-NULL + * and reference a valid NvSciBuf object. 
+ * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the + * application must use ::cuWaitExternalSemaphoresAsync or ::cuSignalExternalSemaphoresAsync + * as appropriate barriers to maintain coherence between CUDA and the other drivers. + * See ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC and ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC + * for memory synchronization. + * + * + * The size of the memory object must be specified in + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::size. + * + * Specifying the flag ::CUDA_EXTERNAL_MEMORY_DEDICATED in + * ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::flags indicates that the + * resource is a dedicated resource. The definition of what a + * dedicated resource is outside the scope of this extension. + * This flag must be set if ::CUDA_EXTERNAL_MEMORY_HANDLE_DESC::type + * is one of the following: + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT + * + * \param extMem_out - Returned handle to an external memory object + * \param memHandleDesc - Memory import handle descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the + * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges + * as well as appropriate Vulkan pipeline barriers to maintain coherence between + * CPU and GPU. For more information on these APIs, please refer to "Synchronization + * and Cache Control" chapter from Vulkan specification. + * + * \sa ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedBuffer, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc); + +/** + * \brief Maps a buffer onto an imported memory object + * + * Maps a buffer onto an imported memory object and returns a device + * pointer in \p devPtr. + * + * The properties of the buffer being mapped must be described in + * \p bufferDesc. The ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC structure is + * defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_BUFFER_DESC_st { + unsigned long long offset; + unsigned long long size; + unsigned int flags; + } CUDA_EXTERNAL_MEMORY_BUFFER_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::offset is the offset in + * the memory object where the buffer's base address is. + * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::size is the size of the buffer. + * ::CUDA_EXTERNAL_MEMORY_BUFFER_DESC::flags must be zero. + * + * The offset and size have to be suitably aligned to match the + * requirements of the external API. Mapping two buffers whose ranges + * overlap may or may not result in the same virtual address being + * returned for the overlapped portion. In such cases, the application + * must ensure that all accesses to that region from the GPU are + * volatile. Otherwise writes made via one address are not guaranteed + * to be visible via the other address, even if they're issued by the + * same thread. It is recommended that applications map the combined + * range instead of mapping separate buffers and then apply the + * appropriate offsets to the returned pointer to derive the + * individual buffers. + * + * The returned pointer \p devPtr must be freed using ::cuMemFree. 
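+ *
+ * A sketch of the import-then-map flow for the
+ * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD case; the descriptor values
+ * \c fd and \c sizeInBytes are assumed to come from the exporting API
+ * (e.g. Vulkan), and error checking is omitted:
+ * \code
+    CUDA_EXTERNAL_MEMORY_HANDLE_DESC memDesc;
+    memset(&memDesc, 0, sizeof(memDesc));
+    memDesc.type      = CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD;
+    memDesc.handle.fd = fd;                // ownership passes to the CUDA driver
+    memDesc.size      = sizeInBytes;
+
+    CUexternalMemory extMem;
+    cuImportExternalMemory(&extMem, &memDesc);
+
+    CUDA_EXTERNAL_MEMORY_BUFFER_DESC bufDesc;
+    memset(&bufDesc, 0, sizeof(bufDesc));
+    bufDesc.offset = 0;
+    bufDesc.size   = sizeInBytes;          // flags must remain zero
+
+    CUdeviceptr devPtr;
+    cuExternalMemoryGetMappedBuffer(&devPtr, extMem, &bufDesc);
+ * \endcode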
+ * + * \param devPtr - Returned device pointer to buffer + * \param extMem - Handle to external memory object + * \param bufferDesc - Buffer descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory, + * ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc); + +/** + * \brief Maps a CUDA mipmapped array onto an external memory object + * + * Maps a CUDA mipmapped array onto an external object and returns a + * handle to it in \p mipmap. + * + * The properties of the CUDA mipmapped array being mapped must be + * described in \p mipmapDesc. The structure + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC is defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st { + unsigned long long offset; + CUDA_ARRAY3D_DESCRIPTOR arrayDesc; + unsigned int numLevels; + } CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::offset is the + * offset in the memory object where the base level of the mipmap + * chain is. + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc describes + * the format, dimensions and type of the base level of the mipmap + * chain. For further details on these parameters, please refer to the + * documentation for ::cuMipmappedArrayCreate. Note that if the mipmapped + * array is bound as a color target in the graphics API, then the flag + * ::CUDA_ARRAY3D_COLOR_ATTACHMENT must be specified in + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::arrayDesc::Flags. + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels specifies + * the total number of levels in the mipmap chain. + * + * If \p extMem was imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF, then + * ::CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC::numLevels must be equal to 1. + * + * The returned CUDA mipmapped array must be freed using ::cuMipmappedArrayDestroy. + * + * \param mipmap - Returned CUDA mipmapped array + * \param extMem - Handle to external memory object + * \param mipmapDesc - CUDA array descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \note On Tegra devices, this API will always attempt to do a compressed mapping when the \p extMem is + * imported from a handle of type ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD + * + * \sa ::cuImportExternalMemory, + * ::cuDestroyExternalMemory, + * ::cuExternalMemoryGetMappedBuffer + */ +CUresult CUDAAPI cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc); + +/** + * \brief Destroys an external memory object. + * + * Destroys the specified external memory object. Any existing buffers + * and CUDA mipmapped arrays mapped onto this object must no longer be + * used and must be explicitly freed using ::cuMemFree and + * ::cuMipmappedArrayDestroy respectively. 
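+ *
+ * The teardown order for the mappings discussed above, as a sketch using the
+ * names from the previous examples (error checking omitted):
+ * \code
+    cuMemFree(devPtr);                 // buffer from cuExternalMemoryGetMappedBuffer
+    cuMipmappedArrayDestroy(mipmap);   // array from cuExternalMemoryGetMappedMipmappedArray
+    cuDestroyExternalMemory(extMem);   // finally release the imported object itself
+ * \endcode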
+ * + * \param extMem - External memory object to be destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalMemory, + * ::cuExternalMemoryGetMappedBuffer, + * ::cuExternalMemoryGetMappedMipmappedArray + */ +CUresult CUDAAPI cuDestroyExternalMemory(CUexternalMemory extMem); + +/** + * \brief Imports an external semaphore + * + * Imports an externally allocated synchronization object and returns + * a handle to that in \p extSem_out. + * + * The properties of the handle being imported must be described in + * \p semHandleDesc. The ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC is + * defined as follows: + * + * \code + typedef struct CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st { + CUexternalSemaphoreHandleType type; + union { + int fd; + struct { + void *handle; + const void *name; + } win32; + const void* NvSciSyncObj; + } handle; + unsigned int flags; + } CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC; + * \endcode + * + * where ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type specifies the type of + * handle being imported. ::CUexternalSemaphoreHandleType is defined + * as: + * + * \code + typedef enum CUexternalSemaphoreHandleType_enum { + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD = 1, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32 = 2, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT = 3, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE = 4, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE = 5, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC = 6, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX = 7, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT = 8, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD = 9, + CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 = 10 + } CUexternalSemaphoreHandleType; + * \endcode + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid + * file descriptor referencing a synchronization object. Ownership of + * the file descriptor is transferred to the CUDA driver when the + * handle is imported successfully. Performing any operations on the + * file descriptor after it is imported results in undefined behavior. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, then exactly one + * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be + * NULL. If + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a synchronization object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object. + * + * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT, then + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle must + * be non-NULL and + * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * must be NULL. The handle specified must be a globally shared KMT + * handle. 
This handle does not hold a reference to the underlying
+ * object, and thus will be invalid when all references to the
+ * synchronization object are destroyed.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, then exactly one
+ * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * is returned by ID3D12Device::CreateSharedHandle when referring to a
+ * ID3D12Fence object. This handle holds a reference to the underlying
+ * object. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D12Fence object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared NT handle that is returned by
+ * ID3D11Fence::CreateSharedHandle. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid ID3D11Fence object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::nvSciSyncObj
+ * represents a valid NvSciSyncObj.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared NT handle that
+ * is returned by IDXGIResource1::CreateSharedHandle when referring to
+ * a IDXGIKeyedMutex object. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name
+ * is not NULL, then it must name a valid synchronization object that
+ * refers to a valid IDXGIKeyedMutex object.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * represents a valid shared KMT handle that
+ * is returned by IDXGIResource::GetSharedHandle when referring to
+ * a IDXGIKeyedMutex object and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must be NULL.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, then
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::fd must be a valid
+ * file descriptor referencing a synchronization object. Ownership of
+ * the file descriptor is transferred to the CUDA driver when the
+ * handle is imported successfully. Performing any operations on the
+ * file descriptor after it is imported results in undefined behavior.
+ *
+ * If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::type is
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32, then exactly one
+ * of ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle and
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name must not be
+ * NULL. If
+ * ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::handle
+ * is not NULL, then it must represent a valid shared NT handle that
+ * references a synchronization object.
Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC::handle::win32::name + * is not NULL, then it must name a valid synchronization object. + * + * \param extSem_out - Returned handle to an external semaphore + * \param semHandleDesc - Semaphore import handle descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuDestroyExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc); + +/** + * \brief Signals a set of external semaphore objects + * + * Enqueues a signal operation on a set of externally allocated + * semaphore object in the specified stream. The operations will be + * executed when all prior operations in the stream complete. + * + * The exact semantics of signaling a semaphore depends on the type of + * the object. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT + * then signaling the semaphore will set it to the signaled state. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32 + * then the semaphore will be set to the value specified in + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::fence::value. + * + * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC + * this API sets ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence + * to a value that can be used by subsequent waiters of the same NvSciSync object + * to order operations with those currently submitted in \p stream. Such an update + * will overwrite previous contents of + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence. By default, + * signaling such an external semaphore object causes appropriate memory synchronization + * operations to be performed over all external memory objects that are imported as + * ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that any subsequent accesses + * made by other importers of the same set of NvSciBuf memory object(s) are coherent. + * These operations can be skipped by specifying the flag + * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC, which can be used as a + * performance optimization when data coherency is not required. But specifying this + * flag in scenarios where data coherency is required results in undefined behavior. + * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, + * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in + * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return + * CUDA_ERROR_NOT_SUPPORTED. 
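+ *
+ * For the fence-style types listed earlier, the call reduces to the following
+ * sketch; \p extSem is assumed to come from ::cuImportExternalSemaphore,
+ * \p hStream to be an existing stream, and error checking is omitted:
+ * \code
+    CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sigParams;
+    memset(&sigParams, 0, sizeof(sigParams));
+    sigParams.params.fence.value = 2;      // value the fence is advanced to
+
+    cuSignalExternalSemaphoresAsync(&extSem, &sigParams, 1, hStream);
+ * \endcode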
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT
+ * then the keyed mutex will be released with the key specified in
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::keyedmutex::key.
+ *
+ * \param extSemArray - Set of external semaphores to be signaled
+ * \param paramsArray - Array of semaphore parameters
+ * \param numExtSems - Number of semaphores to signal
+ * \param stream - Stream to enqueue the signal operations in
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_HANDLE,
+ * ::CUDA_ERROR_NOT_SUPPORTED
+ * \notefnerr
+ *
+ * \sa ::cuImportExternalSemaphore,
+ * ::cuDestroyExternalSemaphore,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream);
+
+/**
+ * \brief Waits on a set of external semaphore objects
+ *
+ * Enqueues a wait operation on a set of externally allocated
+ * semaphore objects in the specified stream. The operations will be
+ * executed when all prior operations in the stream complete.
+ *
+ * The exact semantics of waiting on a semaphore depend on the type
+ * of the object.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT
+ * then waiting on the semaphore will wait until the semaphore reaches
+ * the signaled state. The semaphore will then be reset to the
+ * unsignaled state. Therefore, for every signal operation, there can
+ * only be one wait operation.
+ *
+ * If the semaphore object is any one of the following types:
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD,
+ * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32
+ * then waiting on the semaphore will wait until the value of the
+ * semaphore is greater than or equal to
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::fence::value.
+ *
+ * If the semaphore object is of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC
+ * then waiting on the semaphore will wait until the
+ * ::CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS::params::nvSciSync::fence is signaled by the
+ * signaler of the NvSciSyncObj that was associated with this semaphore object.
+ * By default, waiting on such an external semaphore object causes appropriate
+ * memory synchronization operations to be performed over all external memory objects
+ * that are imported as ::CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF. This ensures that
+ * any subsequent accesses made by other importers of the same set of NvSciBuf memory
+ * object(s) are coherent. These operations can be skipped by specifying the flag
+ * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC, which can be used as a
+ * performance optimization when data coherency is not required. But specifying this
+ * flag in scenarios where data coherency is required results in undefined behavior.
+ * Also, for semaphore object of the type ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, + * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in + * ::cuDeviceGetNvSciSyncAttributes to CUDA_NVSCISYNC_ATTR_WAIT, this API will return + * CUDA_ERROR_NOT_SUPPORTED. + * + * If the semaphore object is any one of the following types: + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX, + * ::CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT + * then the keyed mutex will be acquired when it is released with the key + * specified in ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::key + * or until the timeout specified by + * ::CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS::params::keyedmutex::timeoutMs + * has lapsed. The timeout interval can either be a finite value + * specified in milliseconds or an infinite value. In case an infinite + * value is specified the timeout never elapses. The windows INFINITE + * macro must be used to specify infinite timeout. + * + * \param extSemArray - External semaphores to be waited on + * \param paramsArray - Array of semaphore parameters + * \param numExtSems - Number of semaphores to wait on + * \param stream - Stream to enqueue the wait operations in + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_TIMEOUT + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuDestroyExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync + */ +CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + +/** + * \brief Destroys an external semaphore + * + * Destroys an external semaphore object and releases any references + * to the underlying resource. Any outstanding signals or waits must + * have completed before the semaphore is destroyed. + * + * \param extSem - External semaphore to be destroyed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuDestroyExternalSemaphore(CUexternalSemaphore extSem); + +/** @} */ /* END CUDA_EXTRES_INTEROP */ + +/** + * \defgroup CUDA_MEMOP Stream Memory Operations + * + * ___MANBRIEF___ Stream memory operations of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream memory operations of the low-level CUDA + * driver application programming interface. + * + * There are two versions of these APIs, a legacy version and a newer V2 version. + * + * V1: + * + * The V1 API is disabled by default. Users are required + * to explicitly enable it, e.g. on Linux by passing the kernel module + * parameter shown below: + * modprobe nvidia NVreg_EnableStreamMemOPs=1 + * There is currently no way to enable these operations on other operating + * systems. + * + * Users can programmatically query whether the device supports these + * operations with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. + * + * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. 
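+ *
+ * For example, such a capability query (for a device handle \p dev assumed to
+ * have been obtained via ::cuDeviceGet) might look like:
+ * \code
+   int memOpsSupported = 0;
+   cuDeviceGetAttribute(&memOpsSupported,
+                        CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS, dev);
+   // memOpsSupported != 0 indicates the V1 stream memory operations are usable
+ * \endcode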
+ * + * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64() + * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and + * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. + * + * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and + * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform + * hardware features and can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. + * + * V2: + * + * The V2 APIs are available by default on all platforms. + * + * Support for the ::CU_STREAM_WAIT_VALUE_NOR flag can be queried with + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2. + * + * Support for the ::cuStreamWriteValue64() and ::cuStreamWaitValue64() + * functions, as well as for the ::CU_STREAM_MEM_OP_WAIT_VALUE_64 and + * ::CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2. + * + * Support for both ::CU_STREAM_WAIT_VALUE_FLUSH and + * ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform + * hardware features and can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES. + * + * V1 & V2: + * + * Note that all memory pointers passed as parameters to these operations + * are device pointers. Where necessary a device pointer should be + * obtained, for example with ::cuMemHostGetDevicePointer(). + * + * None of the operations accepts pointers to managed memory buffers + * (::cuMemAllocManaged). + * + * \note + * Warning: + * Improper use of these APIs may deadlock the application. Synchronization + * ordering established through these APIs is not visible to CUDA. CUDA tasks + * that are (even indirectly) ordered by these APIs should also have that order + * expressed with CUDA-visible dependencies such as events. This ensures that + * the scheduler does not serialize them in an improper order. + * + * @{ + */ + +/** + * \brief Wait on a memory location + * + * Enqueues a synchronization of the stream on the given memory location. Work + * ordered after the operation will block until the given condition on the + * memory is satisfied. By default, the condition is to wait for + * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. + * Other condition types can be specified via \p flags. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot + * be used with managed memory (::cuMemAllocManaged). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. + * + * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR. + * + * \note + * Warning: + * Improper use of this API may deadlock the application. Synchronization + * ordering established through this API is not visible to CUDA. CUDA tasks + * that are (even indirectly) ordered by this API should also have that order + * expressed with CUDA-visible dependencies such as events. This ensures that + * the scheduler does not serialize them in an improper order. For more + * information, see the Stream Memory Operations section in the programming + * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). + * + * \param stream The stream to synchronize on the memory location. 
+ * \param addr The memory location to wait on. + * \param value The value to compare with the memory location. + * \param flags See ::CUstreamWaitValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue64, + * ::cuStreamWriteValue32, + * ::cuStreamWriteValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + +/** + * \brief Wait on a memory location + * + * Enqueues a synchronization of the stream on the given memory location. Work + * ordered after the operation will block until the given condition on the + * memory is satisfied. By default, the condition is to wait for + * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal. + * Other condition types can be specified via \p flags. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. + * + * \note + * Warning: + * Improper use of this API may deadlock the application. Synchronization + * ordering established through this API is not visible to CUDA. CUDA tasks + * that are (even indirectly) ordered by this API should also have that order + * expressed with CUDA-visible dependencies such as events. This ensures that + * the scheduler does not serialize them in an improper order. For more + * information, see the Stream Memory Operations section in the programming + * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). + * + * \param stream The stream to synchronize on the memory location. + * \param addr The memory location to wait on. + * \param value The value to compare with the memory location. + * \param flags See ::CUstreamWaitValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue32, + * ::cuStreamWriteValue32, + * ::cuStreamWriteValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + +/** + * \brief Write a value to memory + * + * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER + * flag is passed, the write is preceded by a system-wide memory fence, + * equivalent to a __threadfence_system() but scoped to the stream + * rather than a CUDA thread. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot + * be used with managed memory (::cuMemAllocManaged). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. + * + * \param stream The stream to do the write in. + * \param addr The device address to write to. + * \param value The value to write. + * \param flags See ::CUstreamWriteValue_flags. 
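+ *
+ * A minimal producer/consumer sketch, assuming \p devPtr is a device pointer
+ * obtained with ::cuMemHostGetDevicePointer() for a host flag registered via
+ * ::cuMemHostRegister(), and \p producerStream / \p consumerStream are
+ * existing streams:
+ * \code
+   // Producer stream publishes a flag value once its preceding work is done.
+   cuStreamWriteValue32(producerStream, devPtr, 1, CU_STREAM_WRITE_VALUE_DEFAULT);
+   // Consumer stream blocks until the flag reaches (at least) that value.
+   cuStreamWaitValue32(consumerStream, devPtr, 1, CU_STREAM_WAIT_VALUE_GEQ);
+ * \endcode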
+ * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWriteValue64, + * ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuEventRecord + */ +CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + +/** + * \brief Write a value to memory + * + * Write a value to memory. Unless the ::CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER + * flag is passed, the write is preceded by a system-wide memory fence, + * equivalent to a __threadfence_system() but scoped to the stream + * rather than a CUDA thread. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS. + * + * \param stream The stream to do the write in. + * \param addr The device address to write to. + * \param value The value to write. + * \param flags See ::CUstreamWriteValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWriteValue32, + * ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamBatchMemOp, + * ::cuMemHostRegister, + * ::cuEventRecord + */ +CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + +/** + * \brief Batch operations to synchronize the stream via memory operations + * + * This is a batch version of ::cuStreamWaitValue32() and ::cuStreamWriteValue32(). + * Batching operations may avoid some performance overhead in both the API call + * and the device execution versus adding them to the stream in separate API + * calls. The operations are enqueued in the order they appear in the array. + * + * See ::CUstreamBatchMemOpType for the full set of supported operations, and + * ::cuStreamWaitValue32(), ::cuStreamWaitValue64(), ::cuStreamWriteValue32(), + * and ::cuStreamWriteValue64() for details of specific operations. + * + * Basic support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_MEM_OPS. See related APIs for details + * on querying support for specific operations. + * + * \note + * Warning: + * Improper use of this API may deadlock the application. Synchronization + * ordering established through this API is not visible to CUDA. CUDA tasks + * that are (even indirectly) ordered by this API should also have that order + * expressed with CUDA-visible dependencies such as events. This ensures that + * the scheduler does not serialize them in an improper order. For more + * information, see the Stream Memory Operations section in the programming + * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). + * + * \param stream The stream to enqueue the operations in. + * \param count The number of operations in the array. Must be less than 256. + * \param paramArray The types and parameters of the individual operations. + * \param flags Reserved for future expansion; must be 0. 
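+ *
+ * A possible sketch of a two-operation batch (a wait followed by a write),
+ * where \p waitAddr and \p writeAddr are assumed to be valid device pointers
+ * and \p stream an existing stream:
+ * \code
+   CUstreamBatchMemOpParams ops[2];
+   memset(ops, 0, sizeof(ops));
+   ops[0].operation       = CU_STREAM_MEM_OP_WAIT_VALUE_32;
+   ops[0].waitValue.address = waitAddr;
+   ops[0].waitValue.value   = 1;
+   ops[0].waitValue.flags   = CU_STREAM_WAIT_VALUE_GEQ;
+   ops[1].operation       = CU_STREAM_MEM_OP_WRITE_VALUE_32;
+   ops[1].writeValue.address = writeAddr;
+   ops[1].writeValue.value   = 2;
+   ops[1].writeValue.flags   = CU_STREAM_WRITE_VALUE_DEFAULT;
+   cuStreamBatchMemOp(stream, 2, ops, 0);
+ * \endcode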
+ * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue32, + * ::cuStreamWaitValue64, + * ::cuStreamWriteValue32, + * ::cuStreamWriteValue64, + * ::cuMemHostRegister + */ +CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); + +/** + * \brief Wait on a memory location + * + * Enqueues a synchronization of the stream on the given memory location. Work + * ordered after the operation will block until the given condition on the + * memory is satisfied. By default, the condition is to wait for + * (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. + * Other condition types can be specified via \p flags. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot + * be used with managed memory (::cuMemAllocManaged). + * + * Support for CU_STREAM_WAIT_VALUE_NOR can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2. + * + * \note + * Warning: + * Improper use of this API may deadlock the application. Synchronization + * ordering established through this API is not visible to CUDA. CUDA tasks + * that are (even indirectly) ordered by this API should also have that order + * expressed with CUDA-visible dependencies such as events. This ensures that + * the scheduler does not serialize them in an improper order. For more + * information, see the Stream Memory Operations section in the programming + * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). + * + * \param stream The stream to synchronize on the memory location. + * \param addr The memory location to wait on. + * \param value The value to compare with the memory location. + * \param flags See ::CUstreamWaitValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue64_v2, + * ::cuStreamWriteValue32_v2, + * ::cuStreamWriteValue64_v2, + * ::cuStreamBatchMemOp_v2, + * ::cuMemHostRegister, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + +/** + * \brief Wait on a memory location + * + * Enqueues a synchronization of the stream on the given memory location. Work + * ordered after the operation will block until the given condition on the + * memory is satisfied. By default, the condition is to wait for + * (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal. + * Other condition types can be specified via \p flags. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2. + * + * \note + * Warning: + * Improper use of this API may deadlock the application. Synchronization + * ordering established through this API is not visible to CUDA. CUDA tasks + * that are (even indirectly) ordered by this API should also have that order + * expressed with CUDA-visible dependencies such as events. This ensures that + * the scheduler does not serialize them in an improper order. 
For more + * information, see the Stream Memory Operations section in the programming + * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). + * + * \param stream The stream to synchronize on the memory location. + * \param addr The memory location to wait on. + * \param value The value to compare with the memory location. + * \param flags See ::CUstreamWaitValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue32_v2, + * ::cuStreamWriteValue32_v2, + * ::cuStreamWriteValue64_v2, + * ::cuStreamBatchMemOp_v2, + * ::cuMemHostRegister, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + +/** + * \brief Write a value to memory + * + * Write a value to memory. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). This function cannot + * be used with managed memory (::cuMemAllocManaged). + * + * \param stream The stream to do the write in. + * \param addr The device address to write to. + * \param value The value to write. + * \param flags See ::CUstreamWriteValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWriteValue64_v2, + * ::cuStreamWaitValue32_v2, + * ::cuStreamWaitValue64_v2, + * ::cuStreamBatchMemOp_v2, + * ::cuMemHostRegister, + * ::cuEventRecord + */ +CUresult CUDAAPI cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + +/** + * \brief Write a value to memory + * + * Write a value to memory. + * + * If the memory was registered via ::cuMemHostRegister(), the device pointer + * should be obtained with ::cuMemHostGetDevicePointer(). + * + * Support for this can be queried with ::cuDeviceGetAttribute() and + * ::CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS_V2. + * + * \param stream The stream to do the write in. + * \param addr The device address to write to. + * \param value The value to write. + * \param flags See ::CUstreamWriteValue_flags. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWriteValue32_v2, + * ::cuStreamWaitValue32_v2, + * ::cuStreamWaitValue64_v2, + * ::cuStreamBatchMemOp_v2, + * ::cuMemHostRegister, + * ::cuEventRecord + */ +CUresult CUDAAPI cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + +/** + * \brief Batch operations to synchronize the stream via memory operations + * + * This is a batch version of ::cuStreamWaitValue32_v2() and ::cuStreamWriteValue32_v2(). + * Batching operations may avoid some performance overhead in both the API call + * and the device execution versus adding them to the stream in separate API + * calls. The operations are enqueued in the order they appear in the array. + * + * See ::CUstreamBatchMemOpType for the full set of supported operations, and + * ::cuStreamWaitValue32_v2(), ::cuStreamWaitValue64_v2(), ::cuStreamWriteValue32_v2(), + * and ::cuStreamWriteValue64_v2() for details of specific operations. + * + * See related APIs for details on querying support for specific operations. + * + * \note + * Warning: + * Improper use of this API may deadlock the application. Synchronization + * ordering established through this API is not visible to CUDA. 
CUDA tasks + * that are (even indirectly) ordered by this API should also have that order + * expressed with CUDA-visible dependencies such as events. This ensures that + * the scheduler does not serialize them in an improper order. For more + * information, see the Stream Memory Operations section in the programming + * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). + * + * \param stream The stream to enqueue the operations in. + * \param count The number of operations in the array. Must be less than 256. + * \param paramArray The types and parameters of the individual operations. + * \param flags Reserved for future expansion; must be 0. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \notefnerr + * + * \sa ::cuStreamWaitValue32_v2, + * ::cuStreamWaitValue64_v2, + * ::cuStreamWriteValue32_v2, + * ::cuStreamWriteValue64_v2, + * ::cuMemHostRegister + */ +CUresult CUDAAPI cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); + +/** @} */ /* END CUDA_MEMOP */ + +/** + * \defgroup CUDA_EXEC Execution Control + * + * ___MANBRIEF___ execution control functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the execution control functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns information about a function + * + * Returns in \p *pi the integer value of the attribute \p attrib on the kernel + * given by \p hfunc. The supported attributes are: + * - ::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: The maximum number of threads + * per block, beyond which a launch of the function would fail. This number + * depends on both the function and the device on which the function is + * currently loaded. + * - ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: The size in bytes of + * statically-allocated shared memory per block required by this function. + * This does not include dynamically-allocated shared memory requested by + * the user at runtime. + * - ::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: The size in bytes of user-allocated + * constant memory required by this function. + * - ::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: The size in bytes of local memory + * used by each thread of this function. + * - ::CU_FUNC_ATTRIBUTE_NUM_REGS: The number of registers used by each thread + * of this function. + * - ::CU_FUNC_ATTRIBUTE_PTX_VERSION: The PTX virtual architecture version for + * which the function was compiled. This value is the major PTX version * 10 + * + the minor PTX version, so a PTX version 1.3 function would return the + * value 13. Note that this may return the undefined value of 0 for cubins + * compiled prior to CUDA 3.0. + * - ::CU_FUNC_ATTRIBUTE_BINARY_VERSION: The binary architecture version for + * which the function was compiled. This value is the major binary + * version * 10 + the minor binary version, so a binary version 1.3 function + * would return the value 13. Note that this will return a value of 10 for + * legacy cubins that do not have a properly-encoded binary architecture + * version. + * - ::CU_FUNC_CACHE_MODE_CA: The attribute to indicate whether the function has + * been compiled with user specified option "-Xptxas --dlcm=ca" set . + * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: The maximum size in bytes of + * dynamically-allocated shared memory. 
+ * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: Preferred shared memory-L1 + * cache split ratio in percent of total shared memory. + * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET: If this attribute is set, the + * kernel must launch with a valid cluster size specified. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in + * blocks. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in + * blocks. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in + * blocks. + * - ::CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED: Indicates whether + * the function can be launched with non-portable cluster size. 1 is allowed, + * 0 is disallowed. A non-portable cluster size may only function on the + * specific SKUs the program is tested on. The launch might fail if the + * program is run on a different hardware platform. CUDA API provides + * cudaOccupancyMaxActiveClusters to assist with checking whether the desired + * size can be launched on the current device. A portable cluster size is + * guaranteed to be functional on all compute capabilities higher than the + * target compute capability. The portable cluster size for sm_90 is 8 blocks + * per cluster. This value may increase for future compute capabilities. The + * specific hardware unit may support higher cluster sizes that’s not + * guaranteed to be portable. + * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block + * scheduling policy of a function. The value type is CUclusterSchedulingPolicy. + * + * \param pi - Returned attribute value + * \param attrib - Attribute requested + * \param hfunc - Function to query attribute of + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuLaunchKernel, + * ::cudaFuncGetAttributes, + * ::cudaFuncSetAttribute + */ +CUresult CUDAAPI cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc); + +/** + * \brief Sets information about a function + * + * This call sets the value of a specified attribute \p attrib on the kernel given + * by \p hfunc to an integer value specified by \p val + * This function returns CUDA_SUCCESS if the new value of the attribute could be + * successfully set. If the set fails, this call will return an error. + * Not all attributes can have values set. Attempting to set a value on a read-only + * attribute will result in an error (CUDA_ERROR_INVALID_VALUE) + * + * Supported attributes for the cuFuncSetAttribute call are: + * - ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: This maximum size in bytes of + * dynamically-allocated shared memory. The value should contain the requested + * maximum size of dynamically-allocated shared memory. The sum of this value and + * the function attribute ::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES cannot exceed the + * device attribute ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN. + * The maximal size of requestable dynamic shared memory may differ by GPU + * architecture. + * - ::CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: On devices where the L1 + * cache and shared memory use the same hardware resources, this sets the shared memory + * carveout preference, in percent of the total shared memory. 
+ * See ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH: The required cluster width in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT: The required cluster height in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + * - ::CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH: The required cluster depth in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. + * - ::CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE: The block + * scheduling policy of a function. The value type is CUclusterSchedulingPolicy. + * + * \param hfunc - Function to query attribute of + * \param attrib - Attribute requested + * \param value - The value to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuLaunchKernel, + * ::cudaFuncGetAttributes, + * ::cudaFuncSetAttribute + */ +CUresult CUDAAPI cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value); + +/** + * \brief Sets the preferred cache configuration for a device function + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p config the preferred cache configuration for + * the device function \p hfunc. This is only a preference. The driver will use + * the requested configuration if possible, but it is free to choose a different + * configuration if required to execute \p hfunc. Any context-wide preference + * set via ::cuCtxSetCacheConfig() will be overridden by this per-function + * setting unless the per-function setting is ::CU_FUNC_CACHE_PREFER_NONE. In + * that case, the current context-wide setting will be used. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. 
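+ *
+ * For instance, to express a preference for a larger shared-memory carveout
+ * (using one of the configurations listed below), one might write the
+ * following sketch; the module handle \p hModule and the kernel name
+ * "myKernel" are placeholders:
+ * \code
+   CUfunction hfunc;
+   cuModuleGetFunction(&hfunc, hModule, "myKernel");
+   cuFuncSetCacheConfig(hfunc, CU_FUNC_CACHE_PREFER_SHARED);
+ * \endcode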
+ * + * + * The supported cache configurations are: + * - ::CU_FUNC_CACHE_PREFER_NONE: no preference for shared memory or L1 (default) + * - ::CU_FUNC_CACHE_PREFER_SHARED: prefer larger shared memory and smaller L1 cache + * - ::CU_FUNC_CACHE_PREFER_L1: prefer larger L1 cache and smaller shared memory + * - ::CU_FUNC_CACHE_PREFER_EQUAL: prefer equal sized L1 cache and shared memory + * + * \param hfunc - Kernel to configure cache for + * \param config - Requested cache configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchKernel, + * ::cudaFuncSetCacheConfig + */ +CUresult CUDAAPI cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config); + +/** + * \brief Sets the shared memory configuration for a device function. + * + * On devices with configurable shared memory banks, this function will + * force all subsequent launches of the specified device function to have + * the given shared memory bank size configuration. On any given launch of the + * function, the shared memory configuration of the device will be temporarily + * changed if needed to suit the function's preferred configuration. Changes in + * shared memory configuration between subsequent launches of functions, + * may introduce a device side synchronization point. + * + * Any per-function setting of shared memory bank size set via + * ::cuFuncSetSharedMemConfig will override the context wide setting set with + * ::cuCtxSetSharedMemConfig. + * + * Changing the shared memory bank size will not increase shared memory usage + * or affect occupancy of kernels, but may have major effects on performance. + * Larger bank sizes will allow for greater potential bandwidth to shared memory, + * but will change what kinds of accesses to shared memory will result in bank + * conflicts. + * + * This function will do nothing on devices with fixed shared memory bank size. + * + * The supported bank configurations are: + * - ::CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE: use the context's shared memory + * configuration when launching this function. + * - ::CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE: set shared memory bank width to + * be natively four bytes when launching this function. + * - ::CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE: set shared memory bank width to + * be natively eight bytes when launching this function. + * + * \param hfunc - kernel to be given a shared memory config + * \param config - requested shared memory configuration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuCtxGetSharedMemConfig, + * ::cuCtxSetSharedMemConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchKernel, + * ::cudaFuncSetSharedMemConfig + */ +CUresult CUDAAPI cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config); + +/** + * \brief Returns a module handle + * + * Returns in \p *hmod the handle of the module that function \p hfunc + * is located in. The lifetime of the module corresponds to the lifetime of + * the context it was loaded in or until the module is explicitly unloaded. + * + * The CUDA runtime manages its own modules loaded into the primary context. 
+ * If the handle returned by this API refers to a module loaded by the CUDA runtime, + * calling ::cuModuleUnload() on that module will result in undefined behavior. + * + * \param hmod - Returned module handle + * \param hfunc - Function to retrieve module for + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND + * \notefnerr + * + */ +CUresult CUDAAPI cuFuncGetModule(CUmodule *hmod, CUfunction hfunc); + +/** + * \brief Launches a CUDA function + * + * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ + * grid of blocks. Each block contains \p blockDimX x \p blockDimY x + * \p blockDimZ threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * Kernel parameters to \p f can be specified in one of two ways: + * + * 1) Kernel parameters can be specified via \p kernelParams. If \p f + * has N parameters, then \p kernelParams needs to be an array of N + * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] + * must point to a region of memory from which the actual kernel + * parameter will be copied. The number of kernel parameters and their + * offsets and sizes do not need to be specified as that information is + * retrieved directly from the kernel's image. + * + * 2) Kernel parameters can also be packaged by the application into + * a single buffer that is passed in via the \p extra parameter. + * This places the burden on the application of knowing each kernel + * parameter's size and alignment/padding within the buffer. Here is + * an example of using the \p extra parameter in this manner: + * \code + size_t argBufferSize; + char argBuffer[256]; + + // populate argBuffer and argBufferSize + + void *config[] = { + CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer, + CU_LAUNCH_PARAM_BUFFER_SIZE, &argBufferSize, + CU_LAUNCH_PARAM_END + }; + status = cuLaunchKernel(f, gx, gy, gz, bx, by, bz, sh, s, NULL, config); + * \endcode + * + * The \p extra parameter exists to allow ::cuLaunchKernel to take + * additional less commonly used arguments. \p extra specifies a list of + * names of extra settings and their corresponding values. Each extra + * setting name is immediately followed by the corresponding value. The + * list must be terminated with either NULL or ::CU_LAUNCH_PARAM_END. + * + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra + * array; + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next + * value in \p extra will be a pointer to a buffer containing all + * the kernel parameters for launching kernel \p f; + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next + * value in \p extra will be a pointer to a size_t containing the + * size of the buffer specified with ::CU_LAUNCH_PARAM_BUFFER_POINTER; + * + * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel + * parameters are specified with both \p kernelParams and \p extra + * (i.e. both \p kernelParams and \p extra are non-NULL). + * + * Calling ::cuLaunchKernel() invalidates the persistent function state + * set through the following deprecated APIs: + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), + * ::cuParamSetv(). 
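+ *
+ * Complementing the \p extra example above, a sketch of approach (1), passing
+ * parameters via \p kernelParams, might look like the following; it assumes
+ * \p f expects a device pointer and an element count:
+ * \code
+   int n = 1024;
+   CUdeviceptr d_data;                         // assumed allocated with cuMemAlloc
+   void *kernelParams[] = { &d_data, &n };
+   CUresult status = cuLaunchKernel(f,
+                                    (n + 255) / 256, 1, 1,   // grid dimensions
+                                    256, 1, 1,               // block dimensions
+                                    0, hStream,              // dynamic shared mem, stream
+                                    kernelParams, NULL);
+ * \endcode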
+ * + * Note that to use ::cuLaunchKernel(), the kernel \p f must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchKernel() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * \param f - Kernel to launch + * \param gridDimX - Width of grid in blocks + * \param gridDimY - Height of grid in blocks + * \param gridDimZ - Depth of grid in blocks + * \param blockDimX - X dimension of each thread block + * \param blockDimY - Y dimension of each thread block + * \param blockDimZ - Z dimension of each thread block + * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes + * \param hStream - Stream identifier + * \param kernelParams - Array of pointers to kernel parameters + * \param extra - Extra options + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cudaLaunchKernel + */ +CUresult CUDAAPI cuLaunchKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams, + void **extra); + +/** + * \brief Launches a CUDA function with launch-time configuration + * + * Invokes the kernel \p f with the specified launch-time configuration + * \p config. + * + * The ::CUlaunchConfig structure is defined as: + * \code + typedef struct CUlaunchConfig_st { + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + CUstream hStream; + CUlaunchAttribute *attrs; + unsigned int numAttrs; + } CUlaunchConfig; + * \endcode + * where: + * - ::CUlaunchConfig::gridDimX is the width of the grid in blocks. + * - ::CUlaunchConfig::gridDimY is the height of the grid in blocks. + * - ::CUlaunchConfig::gridDimZ is the depth of the grid in blocks. + * - ::CUlaunchConfig::blockDimX is the X dimension of each thread block. + * - ::CUlaunchConfig::blockDimX is the Y dimension of each thread block. + * - ::CUlaunchConfig::blockDimZ is the Z dimension of each thread block. + * - ::CUlaunchConfig::sharedMemBytes is the dynamic shared-memory size per + * thread block in bytes. + * - ::CUlaunchConfig::hStream is the handle to the stream to perform the launch + * in. The CUDA context associated with this stream must match that associated + * with function f. + * - ::CUlaunchConfig::attrs is an array of ::CUlaunchConfig::numAttrs + * continguous ::CUlaunchAttribute elements. The value of this pointer is not + * considered if ::CUlaunchConfig::numAttrs is zero. However, in that case, it + * is recommended to set the pointer to NULL. + * - ::CUlaunchConfig::numAttrs is the numbers of attributes populating the + * first ::CUlaunchConfig::numAttrs positions of the ::CUlaunchConfig::attrs + * array. 
+ * + * Launch-time configuration is specified by adding entries to + * ::CUlaunchConfig::attrs. Each entry is an attribute ID and a corresponding + * attribute value. + * + * The ::CUlaunchAttribute structure is defined as: + * \code + typedef struct CUlaunchAttribute_st { + CUlaunchAttributeID id; + CUlaunchAttributeValue value; + } CUlaunchAttribute; + * \endcode + * where: + * - ::CUlaunchAttribute::id is a unique enum identifying the attribute. + * - ::CUlaunchAttribute::value is a union that hold the attribute value. + * + * An example of using the \p config parameter: + * \code + CUlaunchAttribute coopAttr = {.id = CU_LAUNCH_ATTRIBUTE_COOPERATIVE, + .value = 1}; + CUlaunchConfig config = {... // set block and grid dimensions + .attrs = &coopAttr, + .numAttrs = 1}; + + cuLaunchKernelEx(&config, kernel, NULL, NULL); + * \endcode + * + * The ::CUlaunchAttributeID enum is defined as: + * \code + typedef enum CUlaunchAttributeID_enum { + CU_LAUNCH_ATTRIBUTE_IGNORE = 0, + CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW = 1, + CU_LAUNCH_ATTRIBUTE_COOPERATIVE = 2, + CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY = 3, + CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION = 4, + CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = 5, + CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION = 6, + CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT = 7, + } CUlaunchAttributeID; + * \endcode + * + * and the corresponding ::CUlaunchAttributeValue union as : + * \code + typedef union CUlaunchAttributeValue_union { + cuuint64_t pad[8]; + CUaccessPolicyWindow accessPolicyWindow; + int cooperative; + CUsynchronizationPolicy syncPolicy; + struct { + unsigned int x; + unsigned int y; + unsigned int z; + } clusterDim; + CUclusterSchedulingPolicy clusterSchedulingPolicyPreference; + int programmaticStreamSerializationAllowed; + struct { + CUevent event; + int flags; + int triggerAtBlockStart; + } programmaticEvent; + } CUlaunchAttributeValue; + * \endcode + * + * Setting ::CU_LAUNCH_ATTRIBUTE_COOPERATIVE to a non-zero value causes the + * kernel launch to be a cooperative launch, with exactly the same usage and + * semantics of ::cuLaunchCooperativeKernel. + * + * Setting ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION to a non-zero + * values causes the kernel to use programmatic means to resolve its stream + * dependency -- enabling the CUDA runtime to opportunistically allow the grid's + * execution to overlap with the previous kernel in the stream, if that kernel + * requests the overlap. + * + * ::CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT records an event along with the + * kernel launch. Event recorded through this launch attribute is guaranteed to + * only trigger after all block in the associated kernel trigger the event. A + * block can trigger the event through PTX launchdep.release or CUDA builtin + * function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be + * inserted at the beginning of each block's execution if triggerAtBlockStart is + * set to non-0. Note that dependents (including the CPU thread calling + * cuEventSynchronize()) are not guaranteed to observe the release precisely + * when it is released. For example, cuEventSynchronize() may only observe the + * event trigger long after the associated kernel has completed. This recording + * type is primarily meant for establishing programmatic dependency between + * device tasks. The event supplied must not be an interprocess or interop + * event. The event must disable timing (i.e. created with + * ::CU_EVENT_DISABLE_TIMING flag set). 
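+ *
+ * As another illustrative sketch, on a device that supports thread block
+ * clusters, a launch with an explicit cluster shape could be configured as
+ * follows; the kernel handle \p kernel, parameter array \p args and stream
+ * \p stream are assumed to exist:
+ * \code
+   CUlaunchAttribute clusterAttr;
+   clusterAttr.id = CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION;
+   clusterAttr.value.clusterDim.x = 2;   // 2 blocks per cluster in X
+   clusterAttr.value.clusterDim.y = 1;
+   clusterAttr.value.clusterDim.z = 1;
+
+   CUlaunchConfig config;
+   memset(&config, 0, sizeof(config));
+   config.gridDimX  = 64;                // must be a multiple of clusterDim.x
+   config.gridDimY  = 1;
+   config.gridDimZ  = 1;
+   config.blockDimX = 128;
+   config.blockDimY = 1;
+   config.blockDimZ = 1;
+   config.hStream   = stream;
+   config.attrs     = &clusterAttr;
+   config.numAttrs  = 1;
+
+   cuLaunchKernelEx(&config, kernel, args, NULL);
+ * \endcode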
+ * + * The effect of other attributes is consistent with their effect when set via + * persistent APIs. + * + * See ::cuStreamSetAttribute for + * - ::CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW + * - ::CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY + * + * See ::cuFunctionSetAttribute for + * - ::CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION + * - ::CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE + * + * Kernel parameters to \p f can be specified in the same ways that they can be + * using ::cuLaunchKernel. + * + * \param config - Kernel to launch + * \param f - Kernel to launch + * \param kernelParams - Array of pointers to kernel parameters + * \param extra - Extra options + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cudaLaunchKernel, + * ::cudaLaunchKernelEx + */ +CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config, + CUfunction f, + void **kernelParams, + void **extra); + +/** + * \brief Launches a CUDA function where thread blocks can cooperate and synchronize as they execute + * + * Invokes the kernel \p f on a \p gridDimX x \p gridDimY x \p gridDimZ + * grid of blocks. Each block contains \p blockDimX x \p blockDimY x + * \p blockDimZ threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * The device on which this kernel is invoked must have a non-zero value for + * the device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH. + * + * The total number of blocks launched cannot exceed the maximum number of blocks per + * multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. + * + * The kernel cannot make use of CUDA dynamic parallelism. + * + * Kernel parameters must be specified via \p kernelParams. If \p f + * has N parameters, then \p kernelParams needs to be an array of N + * pointers. Each of \p kernelParams[0] through \p kernelParams[N-1] + * must point to a region of memory from which the actual kernel + * parameter will be copied. The number of kernel parameters and their + * offsets and sizes do not need to be specified as that information is + * retrieved directly from the kernel's image. + * + * Calling ::cuLaunchCooperativeKernel() sets persistent function state that is + * the same as function state set through ::cuLaunchKernel API + * + * When the kernel \p f is launched via ::cuLaunchCooperativeKernel(), the previous + * block shape, shared size and parameter info associated with \p f + * is overwritten. + * + * Note that to use ::cuLaunchCooperativeKernel(), the kernel \p f must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. 
+ * If either of these conditions is not met, then ::cuLaunchCooperativeKernel() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * \param f - Kernel to launch + * \param gridDimX - Width of grid in blocks + * \param gridDimY - Height of grid in blocks + * \param gridDimZ - Depth of grid in blocks + * \param blockDimX - X dimension of each thread block + * \param blockDimY - Y dimension of each thread block + * \param blockDimZ - Z dimension of each thread block + * \param sharedMemBytes - Dynamic shared-memory size per thread block in bytes + * \param hStream - Stream identifier + * \param kernelParams - Array of pointers to kernel parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchCooperativeKernelMultiDevice, + * ::cudaLaunchCooperativeKernel + */ +CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams); + +/** + * \brief Launches CUDA functions on multiple devices where thread blocks can cooperate and synchronize as they execute + * + * \deprecated This function is deprecated as of CUDA 11.3. + * + * Invokes kernels as specified in the \p launchParamsList array where each element + * of the array specifies all the parameters required to perform a single kernel launch. + * These kernels can cooperate and synchronize as they execute. The size of the array is + * specified by \p numDevices. + * + * No two kernels can be launched on the same device. All the devices targeted by this + * multi-device launch must be identical. All devices must have a non-zero value for the + * device attribute ::CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH. + * + * All kernels launched must be identical with respect to the compiled code. Note that + * any __device__, __constant__ or __managed__ variables present in the module that owns + * the kernel launched on each device, are independently instantiated on every device. + * It is the application's responsiblity to ensure these variables are initialized and + * used appropriately. + * + * The size of the grids as specified in blocks, the size of the blocks themselves + * and the amount of shared memory used by each thread block must also match across + * all launched kernels. + * + * The streams used to launch these kernels must have been created via either ::cuStreamCreate + * or ::cuStreamCreateWithPriority. The NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD + * cannot be used. 
+ * + * The total number of blocks launched per kernel cannot exceed the maximum number of blocks + * per multiprocessor as returned by ::cuOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT. Since the + * total number of blocks launched per device has to match across all devices, the maximum + * number of blocks that can be launched per device will be limited by the device with the + * least number of multiprocessors. + * + * The kernels cannot make use of CUDA dynamic parallelism. + * + * The ::CUDA_LAUNCH_PARAMS structure is defined as: + * \code + typedef struct CUDA_LAUNCH_PARAMS_st + { + CUfunction function; + unsigned int gridDimX; + unsigned int gridDimY; + unsigned int gridDimZ; + unsigned int blockDimX; + unsigned int blockDimY; + unsigned int blockDimZ; + unsigned int sharedMemBytes; + CUstream hStream; + void **kernelParams; + } CUDA_LAUNCH_PARAMS; + * \endcode + * where: + * - ::CUDA_LAUNCH_PARAMS::function specifies the kernel to be launched. All functions must + * be identical with respect to the compiled code. + * - ::CUDA_LAUNCH_PARAMS::gridDimX is the width of the grid in blocks. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::gridDimY is the height of the grid in blocks. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::gridDimZ is the depth of the grid in blocks. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::blockDimX is the X dimension of each thread block. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::blockDimX is the Y dimension of each thread block. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::blockDimZ is the Z dimension of each thread block. This must match across + * all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::sharedMemBytes is the dynamic shared-memory size per thread block in bytes. + * This must match across all kernels launched. + * - ::CUDA_LAUNCH_PARAMS::hStream is the handle to the stream to perform the launch in. This cannot + * be the NULL stream or ::CU_STREAM_LEGACY or ::CU_STREAM_PER_THREAD. The CUDA context associated + * with this stream must match that associated with ::CUDA_LAUNCH_PARAMS::function. + * - ::CUDA_LAUNCH_PARAMS::kernelParams is an array of pointers to kernel parameters. If + * ::CUDA_LAUNCH_PARAMS::function has N parameters, then ::CUDA_LAUNCH_PARAMS::kernelParams + * needs to be an array of N pointers. Each of ::CUDA_LAUNCH_PARAMS::kernelParams[0] through + * ::CUDA_LAUNCH_PARAMS::kernelParams[N-1] must point to a region of memory from which the actual + * kernel parameter will be copied. The number of kernel parameters and their offsets and sizes + * do not need to be specified as that information is retrieved directly from the kernel's image. + * + * By default, the kernel won't begin execution on any GPU until all prior work in all the specified + * streams has completed. This behavior can be overridden by specifying the flag + * ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_PRE_LAUNCH_SYNC. When this flag is specified, each kernel + * will only wait for prior work in the stream corresponding to that GPU to complete before it begins + * execution. + * + * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin + * execution until the kernels on all GPUs have completed. 
This behavior can be overridden by specifying + * the flag ::CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC. When this flag is specified, + * any subsequent work pushed in any of the specified streams will only wait for the kernel launched + * on the GPU corresponding to that stream to complete before it begins execution. + * + * Calling ::cuLaunchCooperativeKernelMultiDevice() sets persistent function state that is + * the same as function state set through ::cuLaunchKernel API when called individually for each + * element in \p launchParamsList. + * + * When kernels are launched via ::cuLaunchCooperativeKernelMultiDevice(), the previous + * block shape, shared size and parameter info associated with each ::CUDA_LAUNCH_PARAMS::function + * in \p launchParamsList is overwritten. + * + * Note that to use ::cuLaunchCooperativeKernelMultiDevice(), the kernels must either have + * been compiled with toolchain version 3.2 or later so that it will + * contain kernel parameter information, or have no kernel parameters. + * If either of these conditions is not met, then ::cuLaunchCooperativeKernelMultiDevice() will + * return ::CUDA_ERROR_INVALID_IMAGE. + * + * \param launchParamsList - List of launch parameters, one per device + * \param numDevices - Size of the \p launchParamsList array + * \param flags - Flags to control launch behavior + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_IMAGE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \note_null_stream + * \notefnerr + * + * \sa ::cuCtxGetCacheConfig, + * ::cuCtxSetCacheConfig, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuLaunchCooperativeKernel, + * ::cudaLaunchCooperativeKernelMultiDevice + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS *launchParamsList, unsigned int numDevices, unsigned int flags); + +/** + * \brief Enqueues a host function call in a stream + * + * Enqueues a host function to run in a stream. The function will be called + * after currently enqueued work and will block work added after it. + * + * The host function must not make any CUDA API calls. Attempting to use a + * CUDA API may result in ::CUDA_ERROR_NOT_PERMITTED, but this is not required. + * The host function must not perform any synchronization that may depend on + * outstanding CUDA work not mandated to run earlier. Host functions without a + * mandated order (such as in independent streams) execute in undefined order + * and may be serialized. + * + * For the purposes of Unified Memory, execution makes a number of guarantees: + * <ul> + * <li>The stream is considered idle for the duration of the function's + * execution. Thus, for example, the function may always use memory attached + * to the stream it was enqueued in.</li> + * <li>The start of execution of the function has the same effect as + * synchronizing an event recorded in the same stream immediately prior to + * the function. 
It thus synchronizes streams which have been "joined" + * prior to the function.</li> + * <li>Adding device work to any stream does not have the effect of making + * the stream active until all preceding host functions and stream callbacks + * have executed. Thus, for + * example, a function might use global attached memory even if work has + * been added to another stream, if the work has been ordered behind the + * function call with an event.</li> + * <li>Completion of the function does not cause a stream to become + * active except as described above. The stream will remain idle + * if no device work follows the function, and will remain idle across + * consecutive host functions or stream callbacks without device work in + * between. Thus, for example, + * stream synchronization can be done by signaling from a host function at the + * end of the stream.</li> + * </ul> + * + * Note that, in contrast to ::cuStreamAddCallback, the function will not be + * called in the event of an error in the CUDA context. + * + * \param hStream - Stream to enqueue function call in + * \param fn - The function to call once preceding stream operations are complete + * \param userData - User-specified data to be passed to the function + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_SUPPORTED + * \note_null_stream + * \notefnerr + * + * \sa ::cuStreamCreate, + * ::cuStreamQuery, + * ::cuStreamSynchronize, + * ::cuStreamWaitEvent, + * ::cuStreamDestroy, + * ::cuMemAllocManaged, + * ::cuStreamAttachMemAsync, + * ::cuStreamAddCallback + */ +CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); + +/** @} */ /* END CUDA_EXEC */ + +/** + * \defgroup CUDA_EXEC_DEPRECATED Execution Control [DEPRECATED] + * + * ___MANBRIEF___ deprecated execution control functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated execution control functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Sets the block-dimensions for the function + * + * \deprecated + * + * Specifies the \p x, \p y, and \p z dimensions of the thread blocks that are + * created when the kernel given by \p hfunc is launched. + * + * \param hfunc - Kernel to specify dimensions of + * \param x - X dimension + * \param y - Y dimension + * \param z - Z dimension + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetSharedSize, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z); + +/** + * \brief Sets the dynamic shared-memory size for the function + * + * \deprecated + * + * Sets through \p bytes the amount of dynamic shared memory that will be + * available to each thread block when the kernel given by \p hfunc is launched. 
+ * + * \param hfunc - Kernel to specify dynamic shared-memory size for + * \param bytes - Dynamic shared-memory size per thread in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetCacheConfig, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes); + +/** + * \brief Sets the parameter size for the function + * + * \deprecated + * + * Sets through \p numbytes the total size in bytes needed by the function + * parameters of the kernel corresponding to \p hfunc. + * + * \param hfunc - Kernel to set parameter size for + * \param numbytes - Size of parameter list in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetSize(CUfunction hfunc, unsigned int numbytes); + +/** + * \brief Adds an integer parameter to the function's argument list + * + * \deprecated + * + * Sets an integer parameter that will be specified the next time the + * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. + * + * \param hfunc - Kernel to add parameter to + * \param offset - Offset to add parameter to argument list + * \param value - Value of parameter + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSeti(CUfunction hfunc, int offset, unsigned int value); + +/** + * \brief Adds a floating-point parameter to the function's argument list + * + * \deprecated + * + * Sets a floating-point parameter that will be specified the next time the + * kernel corresponding to \p hfunc will be invoked. \p offset is a byte offset. 
+ * + * \param hfunc - Kernel to add parameter to + * \param offset - Offset to add parameter to argument list + * \param value - Value of parameter + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetf(CUfunction hfunc, int offset, float value); + +/** + * \brief Adds arbitrary data to the function's argument list + * + * \deprecated + * + * Copies an arbitrary amount of data (specified in \p numbytes) from \p ptr + * into the parameter space of the kernel corresponding to \p hfunc. \p offset + * is a byte offset. + * + * \param hfunc - Kernel to add data to + * \param offset - Offset to add data to argument list + * \param ptr - Pointer to arbitrary data + * \param numbytes - Size of data to copy in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a 1 x 1 x 1 grid of blocks. The block + * contains the number of threads specified by a previous call to + * ::cuFuncSetBlockShape(). + * + * The block shape, dynamic shared memory size, and parameter information + * must be set using + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), and + * ::cuParamSetv() + * prior to calling this function. + * + * Launching a function via ::cuLaunchKernel() invalidates the function's + * block shape, dynamic shared memory size, and parameter information. After + * launching via cuLaunchKernel, this state must be re-initialized prior to + * calling this function. Failure to do so results in undefined behavior. + * + * \param f - Kernel to launch + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunchGrid, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunch(CUfunction f); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of + * blocks. Each block contains the number of threads specified by a previous + * call to ::cuFuncSetBlockShape(). 
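+ *
+ * A minimal sketch of the deprecated launch sequence (illustrative only; error
+ * checking is omitted and \c hKernel, \c dData and \c n are placeholders for
+ * values obtained elsewhere, e.g. via ::cuModuleGetFunction and ::cuMemAlloc):
+ * \code
+ *   CUfunction hKernel;   // kernel handle from cuModuleGetFunction()
+ *   CUdeviceptr dData;    // device buffer the kernel operates on
+ *   int n = 1024;         // element count passed to the kernel
+ *
+ *   cuFuncSetBlockShape(hKernel, 256, 1, 1);           // 256 threads per block
+ *   cuFuncSetSharedSize(hKernel, 0);                   // no dynamic shared memory
+ *   cuParamSetv(hKernel, 0, &dData, sizeof(dData));    // parameter at byte offset 0
+ *   cuParamSeti(hKernel, sizeof(dData), n);            // next parameter
+ *   cuParamSetSize(hKernel, sizeof(dData) + sizeof(n));
+ *   cuLaunchGrid(hKernel, 16, 1);                      // 16 x 1 grid of blocks
+ * \endcode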
+ * + * The block shape, dynamic shared memory size, and parameter information + * must be set using + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), and + * ::cuParamSetv() + * prior to calling this function. + * + * Launching a function via ::cuLaunchKernel() invalidates the function's + * block shape, dynamic shared memory size, and parameter information. After + * launching via cuLaunchKernel, this state must be re-initialized prior to + * calling this function. Failure to do so results in undefined behavior. + * + * \param f - Kernel to launch + * \param grid_width - Width of grid in blocks + * \param grid_height - Height of grid in blocks + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGridAsync, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGrid(CUfunction f, int grid_width, int grid_height); + +/** + * \brief Launches a CUDA function + * + * \deprecated + * + * Invokes the kernel \p f on a \p grid_width x \p grid_height grid of + * blocks. Each block contains the number of threads specified by a previous + * call to ::cuFuncSetBlockShape(). + * + * The block shape, dynamic shared memory size, and parameter information + * must be set using + * ::cuFuncSetBlockShape(), + * ::cuFuncSetSharedSize(), + * ::cuParamSetSize(), + * ::cuParamSeti(), + * ::cuParamSetf(), and + * ::cuParamSetv() + * prior to calling this function. + * + * Launching a function via ::cuLaunchKernel() invalidates the function's + * block shape, dynamic shared memory size, and parameter information. After + * launching via cuLaunchKernel, this state must be re-initialized prior to + * calling this function. Failure to do so results in undefined behavior. + * + * \param f - Kernel to launch + * \param grid_width - Width of grid in blocks + * \param grid_height - Height of grid in blocks + * \param hStream - Stream identifier + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_LAUNCH_FAILED, + * ::CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, + * ::CUDA_ERROR_LAUNCH_TIMEOUT, + * ::CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, + * ::CUDA_ERROR_SHARED_OBJECT_INIT_FAILED + * + * \note In certain cases where cubins are created with no ABI (i.e., using \p ptxas \p --abi-compile \p no), + * this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by + * growing the per-thread stack as needed per launch and not shrinking it afterwards. 
+ * + * \note_null_stream + * \notefnerr + * + * \sa ::cuFuncSetBlockShape, + * ::cuFuncSetSharedSize, + * ::cuFuncGetAttribute, + * ::cuParamSetSize, + * ::cuParamSetf, + * ::cuParamSeti, + * ::cuParamSetv, + * ::cuLaunch, + * ::cuLaunchGrid, + * ::cuLaunchKernel + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream); + + +/** + * \brief Adds a texture-reference to the function's argument list + * + * \deprecated + * + * Makes the CUDA array or linear memory bound to the texture reference + * \p hTexRef available to a device program as a texture. In this version of + * CUDA, the texture-reference must be obtained via ::cuModuleGetTexRef() and + * the \p texunit parameter must be set to ::CU_PARAM_TR_DEFAULT. + * + * \param hfunc - Kernel to add texture-reference to + * \param texunit - Texture unit (must be ::CU_PARAM_TR_DEFAULT) + * \param hTexRef - Texture-reference to add to argument list + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef); +/** @} */ /* END CUDA_EXEC_DEPRECATED */ + +/** + * \defgroup CUDA_GRAPH Graph Management + * + * ___MANBRIEF___ graph management functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the graph management functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Creates a graph + * + * Creates an empty graph, which is returned via \p phGraph. + * + * \param phGraph - Returns newly created graph + * \param flags - Graph creation flags, must be 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + * ::cuGraphInstantiate, + * ::cuGraphDestroy, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphCreate(CUgraph *phGraph, unsigned int flags); + +/** + * \brief Creates a kernel execution node and adds it to a graph + * + * Creates a new kernel execution node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The CUDA_KERNEL_NODE_PARAMS structure is defined as: + * + * \code + * typedef struct CUDA_KERNEL_NODE_PARAMS_st { + * CUfunction func; + * unsigned int gridDimX; + * unsigned int gridDimY; + * unsigned int gridDimZ; + * unsigned int blockDimX; + * unsigned int blockDimY; + * unsigned int blockDimZ; + * unsigned int sharedMemBytes; + * void **kernelParams; + * void **extra; + * } CUDA_KERNEL_NODE_PARAMS; + * \endcode + * + * When the graph is launched, the node will invoke kernel \p func on a (\p gridDimX x + * \p gridDimY x \p gridDimZ) grid of blocks. 
Each block contains + * (\p blockDimX x \p blockDimY x \p blockDimZ) threads. + * + * \p sharedMemBytes sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * Kernel parameters to \p func can be specified in one of two ways: + * + * 1) Kernel parameters can be specified via \p kernelParams. If the kernel has N + * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer, + * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual + * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need + * to be specified as that information is retrieved directly from the kernel's image. + * + * 2) Kernel parameters for non-cooperative kernels can also be packaged by the application into a single + * buffer that is passed in via \p extra. This places the burden on the application of knowing each + * kernel parameter's size and alignment/padding within the buffer. The \p extra parameter exists + * to allow this function to take additional less commonly used arguments. \p extra specifies + * a list of names of extra settings and their corresponding values. Each extra setting name is + * immediately followed by the corresponding value. The list must be terminated with either NULL or + * CU_LAUNCH_PARAM_END. + * + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra + * array; + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next + * value in \p extra will be a pointer to a buffer + * containing all the kernel parameters for launching kernel + * \p func; + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next + * value in \p extra will be a pointer to a size_t + * containing the size of the buffer specified with + * ::CU_LAUNCH_PARAM_BUFFER_POINTER; + * + * The error ::CUDA_ERROR_INVALID_VALUE will be returned if kernel parameters are specified with both + * \p kernelParams and \p extra (i.e. both \p kernelParams and \p extra are non-NULL). + * ::CUDA_ERROR_INVALID_VALUE will be returned if \p extra is used for a cooperative kernel. + * + * The \p kernelParams or \p extra array, as well as the argument values it points to, + * are copied during this call. + * + * \note Kernels launched using graphs must not use texture and surface references. Reading or + * writing through any texture or surface reference is undefined behavior. + * This restriction does not apply to texture and surface objects. 
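+ *
+ * A minimal sketch of adding a kernel node (illustrative only; error checking
+ * is omitted and \c hGraph, \c hKernel, \c dPtr and \c n are placeholders):
+ * \code
+ *   CUDA_KERNEL_NODE_PARAMS kp = { 0 };
+ *   void *args[] = { &dPtr, &n };        // pointer to each kernel argument
+ *   kp.func = hKernel;
+ *   kp.gridDimX = 16;   kp.gridDimY = 1;  kp.gridDimZ = 1;
+ *   kp.blockDimX = 256; kp.blockDimY = 1; kp.blockDimZ = 1;
+ *   kp.sharedMemBytes = 0;
+ *   kp.kernelParams = args;
+ *   kp.extra = NULL;                     // kernelParams and extra are mutually exclusive
+ *
+ *   CUgraphNode kernelNode;
+ *   cuGraphAddKernelNode(&kernelNode, hGraph, NULL, 0, &kp);   // no dependencies
+ * \endcode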
+ * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the GPU execution node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuLaunchCooperativeKernel, + * ::cuGraphKernelNodeGetParams, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a kernel node's parameters + * + * Returns the parameters of kernel node \p hNode in \p nodeParams. + * The \p kernelParams or \p extra array returned in \p nodeParams, + * as well as the argument values it points to, are owned by the node. + * This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cuGraphKernelNodeSetParams to update the + * parameters of this node. + * + * The params will contain either \p kernelParams or \p extra, + * according to which of these was most recently set on the node. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeSetParams + */ +CUresult CUDAAPI cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a kernel node's parameters + * + * Sets the parameters of kernel node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeGetParams + */ +CUresult CUDAAPI cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a memcpy node and adds it to a graph + * + * Creates a new memcpy node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * When the graph is launched, the node will perform the memcpy described by \p copyParams. + * See ::cuMemcpy3D() for a description of the structure and its restrictions. + * + * Memcpy nodes have some additional restrictions with regards to managed memory, if the + * system contains at least one device which has a zero value for the device attribute + * ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS. 
If one or more of the operands refer + * to managed memory, then using the memory type ::CU_MEMORYTYPE_UNIFIED is disallowed + * for those operand(s). The managed memory will be treated as residing on either the + * host or the device, depending on which memory type is specified. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param copyParams - Parameters for the memory copy + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphMemcpyNodeGetParams, + * ::cuGraphMemcpyNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); + +/** + * \brief Returns a memcpy node's parameters + * + * Returns the parameters of memcpy node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphAddMemcpyNode, + * ::cuGraphMemcpyNodeSetParams + */ +CUresult CUDAAPI cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams); + +/** + * \brief Sets a memcpy node's parameters + * + * Sets the parameters of memcpy node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemcpy3D, + * ::cuGraphAddMemcpyNode, + * ::cuGraphMemcpyNodeGetParams + */ +CUresult CUDAAPI cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams); + +/** + * \brief Creates a memset node and adds it to a graph + * + * Creates a new memset node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The element size must be 1, 2, or 4 bytes. + * When the graph is launched, the node will perform the memset described by \p memsetParams. 
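+ *
+ * A minimal sketch of adding a memset node that clears a linear buffer of
+ * \c n 32-bit elements (illustrative only; \c hGraph, \c dPtr, \c n and \c ctx
+ * are placeholders and error checking is omitted):
+ * \code
+ *   CUDA_MEMSET_NODE_PARAMS mp = { 0 };
+ *   mp.dst = dPtr;          // device pointer to set
+ *   mp.value = 0;           // value written to every element
+ *   mp.elementSize = 4;     // element size must be 1, 2, or 4 bytes
+ *   mp.width = n;           // elements per row
+ *   mp.height = 1;          // single row, so the pitch field is not used
+ *
+ *   CUgraphNode memsetNode;
+ *   cuGraphAddMemsetNode(&memsetNode, hGraph, NULL, 0, &mp, ctx);
+ * \endcode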
+ * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param memsetParams - Parameters for the memory set + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_CONTEXT + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphMemsetNodeGetParams, + * ::cuGraphMemsetNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode + */ +CUresult CUDAAPI cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); + +/** + * \brief Returns a memset node's parameters + * + * Returns the parameters of memset node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphAddMemsetNode, + * ::cuGraphMemsetNodeSetParams + */ +CUresult CUDAAPI cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a memset node's parameters + * + * Sets the parameters of memset node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuMemsetD2D32, + * ::cuGraphAddMemsetNode, + * ::cuGraphMemsetNodeGetParams + */ +CUresult CUDAAPI cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a host execution node and adds it to a graph + * + * Creates a new CPU execution node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * When the graph is launched, the node will invoke the specified CPU function. + * Host nodes are not supported under MPS with pre-Volta GPUs. 
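+ *
+ * A minimal sketch of adding a host node (illustrative only; \c myHostFn,
+ * \c myData and \c hGraph are placeholders and error checking is omitted):
+ * \code
+ *   void CUDA_CB myHostFn(void *userData) {
+ *       // Runs on the CPU when the node executes; must not call the CUDA API.
+ *   }
+ *
+ *   CUDA_HOST_NODE_PARAMS hp;
+ *   hp.fn = myHostFn;
+ *   hp.userData = &myData;
+ *
+ *   CUgraphNode hostNode;
+ *   cuGraphAddHostNode(&hostNode, hGraph, NULL, 0, &hp);
+ * \endcode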
+ * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the host node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphHostNodeGetParams, + * ::cuGraphHostNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a host node's parameters + * + * Returns the parameters of host node \p hNode in \p nodeParams. + * + * \param hNode - Node to get the parameters for + * \param nodeParams - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphAddHostNode, + * ::cuGraphHostNodeSetParams + */ +CUresult CUDAAPI cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Sets a host node's parameters + * + * Sets the parameters of host node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchHostFunc, + * ::cuGraphAddHostNode, + * ::cuGraphHostNodeGetParams + */ +CUresult CUDAAPI cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a child graph node and adds it to a graph + * + * Creates a new node which executes an embedded graph, and adds it to \p hGraph with + * \p numDependencies dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * If \p hGraph contains allocation or free nodes, this call will return an error. + * + * The node executes an embedded child graph. The child graph is cloned in this call. 
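+ *
+ * A minimal sketch of embedding one graph in another (illustrative only;
+ * error checking is omitted):
+ * \code
+ *   CUgraph parentGraph, childGraph;
+ *   cuGraphCreate(&parentGraph, 0);
+ *   cuGraphCreate(&childGraph, 0);
+ *   // ... populate childGraph with nodes ...
+ *
+ *   CUgraphNode childNode;
+ *   cuGraphAddChildGraphNode(&childNode, parentGraph, NULL, 0, childGraph);
+ *   // childGraph was cloned into the node; it can later be modified or
+ *   // destroyed (::cuGraphDestroy) without affecting parentGraph.
+ * \endcode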
+ * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param childGraph - The graph to clone into this node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphChildGraphNodeGetGraph, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); + +/** + * \brief Gets a handle to the embedded graph of a child graph node + * + * Gets a handle to the embedded graph in a child graph node. This call + * does not clone the graph. Changes to the graph will be reflected in + * the node, and the node retains ownership of the graph. + * + * Allocation and free nodes cannot be added to the returned graph. + * Attempting to do so will return an error. + * + * \param hNode - Node to get the embedded graph for + * \param phGraph - Location to store a handle to the graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphNodeFindInClone + */ +CUresult CUDAAPI cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph); + +/** + * \brief Creates an empty node and adds it to a graph + * + * Creates a new node which performs no operation, and adds it to \p hGraph with + * \p numDependencies dependencies specified via \p dependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * An empty node performs no operation during execution, but can be used for + * transitive ordering. For example, a phased execution graph with 2 groups of n + * nodes with a barrier between them can be represented using an empty node and + * 2*n dependency edges, rather than no empty node and n^2 dependency edges. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); + +/** + * \brief Creates an event record node and adds it to a graph + * + * Creates a new event record node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and event specified in \p event. 
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * Each launch of the graph will record \p event to capture execution of the + * node's dependencies. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param event - Event for the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventWaitNode, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + */ +CUresult CUDAAPI cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); + +/** + * \brief Returns the event associated with an event record node + * + * Returns the event of event record node \p hNode in \p event_out. + * + * \param hNode - Node to get the event for + * \param event_out - Pointer to return the event + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventRecordNode, + * ::cuGraphEventRecordNodeSetEvent, + * ::cuGraphEventWaitNodeGetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out); + +/** + * \brief Sets an event record node's event + * + * Sets the event of event record node \p hNode to \p event. + * + * \param hNode - Node to set the event for + * \param event - Event to use + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventRecordNode, + * ::cuGraphEventRecordNodeGetEvent, + * ::cuGraphEventWaitNodeSetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event); + +/** + * \brief Creates an event wait node and adds it to a graph + * + * Creates a new event wait node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and event specified in \p event. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The graph node will wait for all work captured in \p event. See ::cuEventRecord() + * for details on what is captured by an event. \p event may be from a different context + * or device than the launch stream. 
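+ *
+ * A minimal sketch of adding an event wait node (illustrative only; \c hGraph
+ * is a placeholder and error checking is omitted):
+ * \code
+ *   CUevent event;
+ *   cuEventCreate(&event, CU_EVENT_DEFAULT);
+ *
+ *   // The event is typically recorded elsewhere (another stream or graph);
+ *   // nodes that depend on waitNode will not execute until the work captured
+ *   // by 'event' has completed.
+ *   CUgraphNode waitNode;
+ *   cuGraphAddEventWaitNode(&waitNode, hGraph, NULL, 0, event);
+ * \endcode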
+ * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param event - Event for the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventRecordNode, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + */ +CUresult CUDAAPI cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); + +/** + * \brief Returns the event associated with an event wait node + * + * Returns the event of event wait node \p hNode in \p event_out. + * + * \param hNode - Node to get the event for + * \param event_out - Pointer to return the event + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventWaitNode, + * ::cuGraphEventWaitNodeSetEvent, + * ::cuGraphEventRecordNodeGetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out); + +/** + * \brief Sets an event wait node's event + * + * Sets the event of event wait node \p hNode to \p event. + * + * \param hNode - Node to set the event for + * \param event - Event to use + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventWaitNode, + * ::cuGraphEventWaitNodeGetEvent, + * ::cuGraphEventRecordNodeSetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent + */ +CUresult CUDAAPI cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event); + +/** + * \brief Creates an external semaphore signal node and adds it to a graph + * + * Creates a new external semaphore signal node and adds it to \p hGraph with \p + * numDependencies dependencies specified via \p dependencies and arguments specified + * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the + * node will be placed at the root of the graph. \p dependencies may not have any + * duplicate entries. A handle to the new node will be returned in \p phGraphNode. + * + * Performs a signal operation on a set of externally allocated semaphore objects + * when the node is launched. The operation(s) will occur after all of the node's + * dependencies have completed. 
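+ *
+ * A minimal sketch of adding a signal node for one previously imported
+ * semaphore (illustrative only; \c extSem and \c hGraph are placeholders and
+ * error checking is omitted):
+ * \code
+ *   CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS sigParams = { 0 };
+ *
+ *   CUDA_EXT_SEM_SIGNAL_NODE_PARAMS np;
+ *   np.extSemArray = &extSem;      // CUexternalSemaphore from cuImportExternalSemaphore()
+ *   np.paramsArray = &sigParams;
+ *   np.numExtSems  = 1;
+ *
+ *   CUgraphNode signalNode;
+ *   cuGraphAddExternalSemaphoresSignalNode(&signalNode, hGraph, NULL, 0, &np);
+ * \endcode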
+ * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphExternalSemaphoresSignalNodeGetParams, + * ::cuGraphExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddEventRecordNode, + * ::cuGraphAddEventWaitNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + */ +CUresult CUDAAPI cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); + +/** + * \brief Returns an external semaphore signal node's parameters + * + * Returns the parameters of an external semaphore signal node \p hNode in \p params_out. + * The \p extSemArray and \p paramsArray returned in \p params_out, + * are owned by the node. This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cuGraphExternalSemaphoresSignalNodeSetParams to update the + * parameters of this node. + * + * \param hNode - Node to get the parameters for + * \param params_out - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuLaunchKernel, + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuGraphExternalSemaphoresSignalNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out); + +/** + * \brief Sets an external semaphore signal node's parameters + * + * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuGraphExternalSemaphoresSignalNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); + +/** + * \brief Creates an external semaphore wait node and adds it to a graph + * + * Creates a new external semaphore wait node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. 
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p dependencies may not have any duplicate entries. A handle
+ * to the new node will be returned in \p phGraphNode.
+ *
+ * Performs a wait operation on a set of externally allocated semaphore objects
+ * when the node is launched. The node's dependencies will not be launched until
+ * the wait operation has completed.
+ *
+ * \param phGraphNode - Returns newly created node
+ * \param hGraph - Graph to which to add the node
+ * \param dependencies - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams - Parameters for the node
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_NOT_SUPPORTED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cuGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cuGraphAddExternalSemaphoresSignalNode,
+ * ::cuImportExternalSemaphore,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync,
+ * ::cuGraphCreate,
+ * ::cuGraphDestroyNode,
+ * ::cuGraphAddEventRecordNode,
+ * ::cuGraphAddEventWaitNode,
+ * ::cuGraphAddChildGraphNode,
+ * ::cuGraphAddEmptyNode,
+ * ::cuGraphAddKernelNode,
+ * ::cuGraphAddMemcpyNode,
+ * ::cuGraphAddMemsetNode,
+ */
+CUresult CUDAAPI cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams);
+
+/**
+ * \brief Returns an external semaphore wait node's parameters
+ *
+ * Returns the parameters of an external semaphore wait node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out
+ * are owned by the node. This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cuGraphExternalSemaphoresWaitNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuLaunchKernel,
+ * ::cuGraphAddExternalSemaphoresWaitNode,
+ * ::cuGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cuSignalExternalSemaphoresAsync,
+ * ::cuWaitExternalSemaphoresAsync
+ */
+CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out);
+
+/**
+ * \brief Sets an external semaphore wait node's parameters
+ *
+ * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams.
+ * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuGraphExternalSemaphoresWaitNodeSetParams, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync + */ +CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); + +/** + * \brief Creates a batch memory operation node and adds it to a graph + * + * Creates a new batch memory operation node and adds it to \p hGraph with \p + * numDependencies dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * When the node is added, the paramArray inside \p nodeParams is copied and therefore it can be + * freed after the call returns. + * + * \note + * Warning: + * Improper use of this API may deadlock the application. Synchronization + * ordering established through this API is not visible to CUDA. CUDA tasks + * that are (even indirectly) ordered by this API should also have that order + * expressed with CUDA-visible dependencies such as events. This ensures that + * the scheduler does not serialize them in an improper order. For more + * information, see the Stream Memory Operations section in the programming + * guide(https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html). + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuStreamBatchMemOp_v2, + * ::cuStreamWaitValue32_v2, + * ::cuStreamWriteValue32_v2, + * ::cuStreamWaitValue64_v2, + * ::cuStreamWriteValue64_v2, + * ::cuGraphBatchMemOpNodeGetParams, + * ::cuGraphBatchMemOpNodeSetParams, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode, + */ +CUresult CUDAAPI cuGraphAddBatchMemOpNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a batch mem op node's parameters + * + * Returns the parameters of batch mem op node \p hNode in \p nodeParams_out. + * The \p paramArray returned in \p nodeParams_out is owned by the node. + * This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cuGraphBatchMemOpNodeSetParams to update the + * parameters of this node. 
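+ *
+ * A minimal sketch of reading the node's parameters back (illustrative only;
+ * \c hNode is a placeholder and error checking is omitted):
+ * \code
+ *   CUDA_BATCH_MEM_OP_NODE_PARAMS bp;
+ *   cuGraphBatchMemOpNodeGetParams(hNode, &bp);
+ *   // bp.paramArray has bp.count entries and is owned by the node; copy it
+ *   // before making changes and apply them with cuGraphBatchMemOpNodeSetParams().
+ * \endcode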
+ * + * \param hNode - Node to get the parameters for + * \param nodeParams_out - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuStreamBatchMemOp_v2, + * ::cuGraphAddBatchMemOpNode, + * ::cuGraphBatchMemOpNodeSetParams + */ +CUresult CUDAAPI cuGraphBatchMemOpNodeGetParams(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out); + +/** + * \brief Sets a batch mem op node's parameters + * + * Sets the parameters of batch mem op node \p hNode to \p nodeParams. + * + * The paramArray inside \p nodeParams is copied and therefore it can be + * freed after the call returns. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuStreamBatchMemOp_v2, + * ::cuGraphAddBatchMemOpNode, + * ::cuGraphBatchMemOpNodeGetParams + */ +CUresult CUDAAPI cuGraphBatchMemOpNodeSetParams(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); + +/** + * \brief Sets the parameters for a batch mem op node in the given graphExec + * + * Sets the parameters of a batch mem op node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * The following fields on operations may be modified on an executable graph: + * + * op.waitValue.address + * op.waitValue.value[64] + * op.waitValue.flags bits corresponding to wait type (i.e. CU_STREAM_WAIT_VALUE_FLUSH bit cannot be modified) + * op.writeValue.address + * op.writeValue.value[64] + * + * Other fields, such as the context, count or type of operations, and other types of operations such as membars, + * may not be modified. + * + * \p hNode must not have been removed from the original graph. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * The paramArray inside \p nodeParams is copied and therefore it can be + * freed after the call returns. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Batch mem op node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuStreamBatchMemOp_v2, + * ::cuGraphAddBatchMemOpNode, + * ::cuGraphBatchMemOpNodeGetParams, + * ::cuGraphBatchMemOpNodeSetParams, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecBatchMemOpNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); + +/** + * \brief Creates an allocation node and adds it to a graph + * + * Creates a new allocation node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. 
A handle + * to the new node will be returned in \p phGraphNode. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the node + * + * When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in + * \p nodeParams.dptr. The allocation's address remains fixed across instantiations and launches. + * + * If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode, + * the allocation can be accessed by nodes ordered after the allocation node but before the free node. + * These allocations cannot be freed outside the owning graph, and they can only be freed once in the + * owning graph. + * + * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the + * graph which are ordered after the allocation node, but also by stream operations ordered after the + * graph's execution but before the allocation is freed. + * + * Allocations which are not freed in the same graph can be freed by: + * - passing the allocation to ::cuMemFreeAsync or ::cuMemFree; + * - launching a graph with a free node for that allocation; or + * - specifying ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH during instantiation, which makes + * each launch behave as though it called ::cuMemFreeAsync for every unfreed allocation. + * + * It is not possible to free an allocation in both the owning graph and another graph. If the allocation + * is freed in the same graph, a free node cannot be added to another graph. If the allocation is freed + * in another graph, a free node can no longer be added to the owning graph. + * + * The following restrictions apply to graphs which contain allocation and/or memory free nodes: + * - Nodes and edges of the graph cannot be deleted. + * - The graph cannot be used in a child node. + * - Only one instantiation of the graph may exist at any point in time. + * - The graph cannot be cloned. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemFreeNode, + * ::cuGraphMemAllocNodeGetParams, + * ::cuDeviceGraphMemTrim, + * ::cuDeviceGetGraphMemAttribute, + * ::cuDeviceSetGraphMemAttribute, + * ::cuMemAllocAsync, + * ::cuMemFreeAsync, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddEventRecordNode, + * ::cuGraphAddEventWaitNode, + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams); + +/** + * \brief Returns a memory alloc node's parameters + * + * Returns the parameters of a memory alloc node \p hNode in \p params_out. + * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the + * node. This memory remains valid until the node is destroyed. The returned + * parameters must not be modified. 
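+ *
+ * A minimal sketch of the allocate/query/free pattern (error checks omitted;
+ * \p device and \p hGraph are illustrative, and the \p bytesize and pool-property
+ * field names are assumptions for illustration):
+ *
+ * \code
+ * CUDA_MEM_ALLOC_NODE_PARAMS allocParams = {0};
+ * allocParams.poolProps.allocType     = CU_MEM_ALLOCATION_TYPE_PINNED;  // assumed field names
+ * allocParams.poolProps.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+ * allocParams.poolProps.location.id   = device;                         // device ordinal (illustrative)
+ * allocParams.bytesize                = 1 << 20;                        // 1 MiB (assumed field)
+ *
+ * CUgraphNode allocNode, freeNode;
+ * cuGraphAddMemAllocNode(&allocNode, hGraph, NULL, 0, &allocParams);
+ * // allocParams.dptr now holds the graph-owned device address
+ *
+ * CUDA_MEM_ALLOC_NODE_PARAMS queried;
+ * cuGraphMemAllocNodeGetParams(allocNode, &queried);                    // read-only view
+ *
+ * cuGraphAddMemFreeNode(&freeNode, hGraph, &allocNode, 1, allocParams.dptr);
+ * \endcode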
+ * + * \param hNode - Node to get the parameters for + * \param params_out - Pointer to return the parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemAllocNode, + * ::cuGraphMemFreeNodeGetParams + */ +CUresult CUDAAPI cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out); + +/** + * \brief Creates a memory free node and adds it to a graph + * + * Creates a new memory free node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. A handle + * to the new node will be returned in \p phGraphNode. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param dptr - Address of memory to free + * + * ::cuGraphAddMemFreeNode will return ::CUDA_ERROR_INVALID_VALUE if the user attempts to free: + * - an allocation twice in the same graph. + * - an address that was not returned by an allocation node. + * - an invalid address. + * + * The following restrictions apply to graphs which contain allocation and/or memory free nodes: + * - Nodes and edges of the graph cannot be deleted. + * - The graph cannot be used in a child node. + * - Only one instantiation of the graph may exist at any point in time. + * - The graph cannot be cloned. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemAllocNode, + * ::cuGraphMemFreeNodeGetParams, + * ::cuDeviceGraphMemTrim, + * ::cuDeviceGetGraphMemAttribute, + * ::cuDeviceSetGraphMemAttribute, + * ::cuMemAllocAsync, + * ::cuMemFreeAsync, + * ::cuGraphCreate, + * ::cuGraphDestroyNode, + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddEventRecordNode, + * ::cuGraphAddEventWaitNode, + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr); + +/** + * \brief Returns a memory free node's parameters + * + * Returns the address of a memory free node \p hNode in \p dptr_out. + * + * \param hNode - Node to get the parameters for + * \param dptr_out - Pointer to return the device address + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemFreeNode, + * ::cuGraphMemAllocNodeGetParams + */ +CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out); + +/** + * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS. + * + * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are + * freed back to the operating system. 
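+ *
+ * For example (a sketch, assuming \p dev is a valid ::CUdevice and no graph that
+ * uses the cached memory is still executing), unused backing memory can be
+ * returned to the OS like this:
+ *
+ * \code
+ * cuuint64_t reservedBytes = 0;
+ * cuDeviceGetGraphMemAttribute(dev, CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT, &reservedBytes);
+ *
+ * cuDeviceGraphMemTrim(dev);   // release blocks not needed by executing graphs
+ * \endcode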
+ * + * \param device - The device for which cached memory should be freed. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_DEVICE + * + * \sa + * ::cuGraphAddMemAllocNode, + * ::cuGraphAddMemFreeNode, + * ::cuDeviceSetGraphMemAttribute, + * ::cuDeviceGetGraphMemAttribute + */ +CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device); + +/** + * \brief Query asynchronous allocation attributes related to graphs + * + * Valid attributes are: + * + * - ::CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT: Amount of memory, in bytes, currently associated with graphs + * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the + * last time it was reset. High watermark can only be reset to zero. + * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT: Amount of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + * + * \param device - Specifies the scope of the query + * \param attr - attribute to get + * \param value - retrieved value + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_DEVICE + * + * \sa + * ::cuDeviceSetGraphMemAttribute, + * ::cuGraphAddMemAllocNode, + * ::cuGraphAddMemFreeNode + */ +CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); + +/** + * \brief Set asynchronous allocation attributes related to graphs + * + * Valid attributes are: + * + * - ::CU_GRAPH_MEM_ATTR_USED_MEM_HIGH: High watermark of memory, in bytes, associated with graphs since the + * last time it was reset. High watermark can only be reset to zero. + * - ::CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH: High watermark of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + * + * \param device - Specifies the scope of the query + * \param attr - attribute to get + * \param value - pointer to value to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_DEVICE + * + * \sa + * ::cuDeviceGetGraphMemAttribute, + * ::cuGraphAddMemAllocNode, + * ::cuGraphAddMemFreeNode + */ +CUresult CUDAAPI cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value); + +/** + * \brief Clones a graph + * + * This function creates a copy of \p originalGraph and returns it in \p phGraphClone. + * All parameters are copied into the cloned graph. The original graph may be modified + * after this call without affecting the clone. + * + * Child graph nodes in the original graph are recursively copied into the clone. + * + * \param phGraphClone - Returns newly created cloned graph + * \param originalGraph - Graph to clone + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphNodeFindInClone + */ +CUresult CUDAAPI cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph); + +/** + * \brief Finds a cloned version of a node + * + * This function returns the node in \p hClonedGraph corresponding to \p hOriginalNode + * in the original graph. + * + * \p hClonedGraph must have been cloned from \p hOriginalGraph via ::cuGraphClone. + * \p hOriginalNode must have been in \p hOriginalGraph at the time of the call to + * ::cuGraphClone, and the corresponding cloned node in \p hClonedGraph must not have + * been removed. 
The cloned node is then returned via \p phClonedNode. + * + * \param phNode - Returns handle to the cloned node + * \param hOriginalNode - Handle to the original node + * \param hClonedGraph - Cloned graph to query + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphClone + */ +CUresult CUDAAPI cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph); + +/** + * \brief Returns a node's type + * + * Returns the node type of \p hNode in \p type. + * + * \param hNode - Node to query + * \param type - Pointer to return the node type + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphChildGraphNodeGetGraph, + * ::cuGraphKernelNodeGetParams, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphHostNodeGetParams, + * ::cuGraphHostNodeSetParams, + * ::cuGraphMemcpyNodeGetParams, + * ::cuGraphMemcpyNodeSetParams, + * ::cuGraphMemsetNodeGetParams, + * ::cuGraphMemsetNodeSetParams + */ +CUresult CUDAAPI cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type); + +/** + * \brief Returns a graph's nodes + * + * Returns a list of \p hGraph's nodes. \p nodes may be NULL, in which case this + * function will return the number of nodes in \p numNodes. Otherwise, + * \p numNodes entries will be filled in. If \p numNodes is higher than the actual + * number of nodes, the remaining entries in \p nodes will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numNodes. + * + * \param hGraph - Graph to query + * \param nodes - Pointer to return the nodes + * \param numNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetType, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); + +/** + * \brief Returns a graph's root nodes + * + * Returns a list of \p hGraph's root nodes. \p rootNodes may be NULL, in which case this + * function will return the number of root nodes in \p numRootNodes. Otherwise, + * \p numRootNodes entries will be filled in. If \p numRootNodes is higher than the actual + * number of root nodes, the remaining entries in \p rootNodes will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numRootNodes. + * + * \param hGraph - Graph to query + * \param rootNodes - Pointer to return the root nodes + * \param numRootNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate, + * ::cuGraphGetNodes, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetType, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes); + +/** + * \brief Returns a graph's dependency edges + * + * Returns a list of \p hGraph's dependency edges. 
Edges are returned via corresponding + * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the + * node in \p from[i]. \p from and \p to may both be NULL, in which + * case this function only returns the number of edges in \p numEdges. Otherwise, + * \p numEdges entries will be filled in. If \p numEdges is higher than the actual + * number of edges, the remaining entries in \p from and \p to will be set to NULL, and + * the number of edges actually returned will be written to \p numEdges. + * + * \param hGraph - Graph to get the edges from + * \param from - Location to return edge endpoints + * \param to - Location to return edge endpoints + * \param numEdges - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges); + +/** + * \brief Returns a node's dependencies + * + * Returns a list of \p node's dependencies. \p dependencies may be NULL, in which case this + * function will return the number of dependencies in \p numDependencies. Otherwise, + * \p numDependencies entries will be filled in. If \p numDependencies is higher than the actual + * number of dependencies, the remaining entries in \p dependencies will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numDependencies. + * + * \param hNode - Node to query + * \param dependencies - Pointer to return the dependencies + * \param numDependencies - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeGetDependentNodes, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies + */ +CUresult CUDAAPI cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies); + +/** + * \brief Returns a node's dependent nodes + * + * Returns a list of \p node's dependent nodes. \p dependentNodes may be NULL, in which + * case this function will return the number of dependent nodes in \p numDependentNodes. + * Otherwise, \p numDependentNodes entries will be filled in. If \p numDependentNodes is + * higher than the actual number of dependent nodes, the remaining entries in + * \p dependentNodes will be set to NULL, and the number of nodes actually obtained will + * be returned in \p numDependentNodes. 
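+ *
+ * The usual two-call pattern applies (sketch only, error checks omitted):
+ *
+ * \code
+ * size_t count = 0;
+ * cuGraphNodeGetDependentNodes(hNode, NULL, &count);          // first call: query the count
+ *
+ * CUgraphNode *deps = (CUgraphNode *)malloc(count * sizeof(CUgraphNode));
+ * cuGraphNodeGetDependentNodes(hNode, deps, &count);          // second call: fill the list
+ * // ... use deps[0..count) ...
+ * free(deps);
+ * \endcode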
+ * + * \param hNode - Node to query + * \param dependentNodes - Pointer to return the dependent nodes + * \param numDependentNodes - See description + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeGetDependencies, + * ::cuGraphGetNodes, + * ::cuGraphGetRootNodes, + * ::cuGraphGetEdges, + * ::cuGraphAddDependencies, + * ::cuGraphRemoveDependencies + */ +CUresult CUDAAPI cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes); + +/** + * \brief Adds dependency edges to a graph + * + * The number of dependencies to be added is defined by \p numDependencies + * Elements in \p from and \p to at corresponding indices define a dependency. + * Each node in \p from and \p to must belong to \p hGraph. + * + * If \p numDependencies is 0, elements in \p from and \p to will be ignored. + * Specifying an existing dependency will return an error. + * + * \param hGraph - Graph to which dependencies are added + * \param from - Array of nodes that provide the dependencies + * \param to - Array of dependent nodes + * \param numDependencies - Number of dependencies to be added + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphRemoveDependencies, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); + +/** + * \brief Removes dependency edges from a graph + * + * The number of \p dependencies to be removed is defined by \p numDependencies. + * Elements in \p from and \p to at corresponding indices define a dependency. + * Each node in \p from and \p to must belong to \p hGraph. + * + * If \p numDependencies is 0, elements in \p from and \p to will be ignored. + * Specifying a non-existing dependency will return an error. + * + * Dependencies cannot be removed from graphs which contain allocation or free nodes. + * Any attempt to do so will return an error. + * + * \param hGraph - Graph from which to remove dependencies + * \param from - Array of nodes that provide the dependencies + * \param to - Array of dependent nodes + * \param numDependencies - Number of dependencies to be removed + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddDependencies, + * ::cuGraphGetEdges, + * ::cuGraphNodeGetDependencies, + * ::cuGraphNodeGetDependentNodes + */ +CUresult CUDAAPI cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); + +/** + * \brief Remove a node from the graph + * + * Removes \p hNode from its graph. This operation also severs any dependencies of other nodes + * on \p hNode and vice versa. + * + * Nodes which belong to a graph which contains allocation or free nodes cannot be destroyed. + * Any attempt to do so will return an error. 
+ * + * \param hNode - Node to remove + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphAddEmptyNode, + * ::cuGraphAddKernelNode, + * ::cuGraphAddHostNode, + * ::cuGraphAddMemcpyNode, + * ::cuGraphAddMemsetNode + */ +CUresult CUDAAPI cuGraphDestroyNode(CUgraphNode hNode); + +/** + * \brief Creates an executable graph from a graph + * + * Instantiates \p hGraph as an executable graph. The graph is validated for any + * structural constraints or intra-node constraints which were not previously + * validated. If instantiation is successful, a handle to the instantiated graph + * is returned in \p phGraphExec. + * + * If there are any errors, diagnostic information may be returned in \p errorNode and + * \p logBuffer. This is the primary way to inspect instantiation errors. The output + * will be null terminated unless the diagnostics overflow + * the buffer. In this case, they will be truncated, and the last byte can be + * inspected to determine if truncation occurred. + * + * \param phGraphExec - Returns instantiated graph + * \param hGraph - Graph to instantiate + * \param phErrorNode - In case of an instantiation error, this may be modified to + * indicate a node contributing to the error + * \param logBuffer - A character buffer to store diagnostic messages + * \param bufferSize - Size of the log buffer in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiateWithFlags, + * ::cuGraphCreate, + * ::cuGraphUpload, + * ::cuGraphLaunch, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); + +/** + * \brief Creates an executable graph from a graph + * + * Instantiates \p hGraph as an executable graph. The graph is validated for any + * structural constraints or intra-node constraints which were not previously + * validated. If instantiation is successful, a handle to the instantiated graph + * is returned in \p phGraphExec. + * + * The \p flags parameter controls the behavior of instantiation and subsequent + * graph launches. Valid flags are: + * + * - ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, which configures a + * graph containing memory allocation nodes to automatically free any + * unfreed memory allocations before the graph is relaunched. + * + * - ::CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, which causes the graph + * to use the priorities from the per-node attributes rather than the priority + * of the launch stream during execution. Note that priorities are only available + * on kernel nodes, and are copied from stream priority during stream capture. + * + * If \p hGraph contains any allocation or free nodes, there can be at most one + * executable graph in existence for that graph at a time. + * + * An attempt to instantiate a second executable graph before destroying the first + * with ::cuGraphExecDestroy will result in an error. + * + * \param phGraphExec - Returns instantiated graph + * \param hGraph - Graph to instantiate + * \param flags - Flags to control instantiation. See ::CUgraphInstantiate_flags. 
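+ *
+ * For example, a graph containing allocation nodes might be instantiated with
+ * automatic freeing on relaunch (a sketch; \p hGraph and \p hStream are
+ * illustrative and error checks are omitted):
+ *
+ * \code
+ * CUgraphExec graphExec;
+ * cuGraphInstantiateWithFlags(&graphExec, hGraph,
+ *                             CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH);
+ * cuGraphLaunch(graphExec, hStream);
+ * \endcode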
+ * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphCreate, + * ::cuGraphUpload, + * ::cuGraphLaunch, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphInstantiateWithFlags(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags); + +/** + * \brief Sets the parameters for a kernel node in the given graphExec + * + * Sets the parameters of a kernel node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. All \p nodeParams + * fields may change, but the following restrictions apply to \p func updates: + * + * - The owning context of the function cannot change. + * - A node whose function originally did not use CUDA dynamic parallelism cannot be updated + * to a function which uses CDP + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - kernel node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddKernelNode, + * ::cuGraphKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams); + +/** + * \brief Sets the parameters for a memcpy node in the given graphExec. + * + * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + * contained \p copyParams at instantiation. hNode must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + * + * The source and destination memory in \p copyParams must be allocated from the same + * contexts as the original source and destination memory. Both the instantiation-time + * memory operands and the memory operands in \p copyParams must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. hNode is also + * not modified by this call. + * + * Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings changed or + * either the original or new memory operands are multidimensional. 
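+ *
+ * A sketch of updating a 1-D device-to-device copy to use new buffers
+ * (\p newSrc, \p newDst and \p numBytes are illustrative; error checks omitted):
+ *
+ * \code
+ * CUDA_MEMCPY3D copyParams = {0};
+ * copyParams.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+ * copyParams.srcDevice     = newSrc;                 // illustrative CUdeviceptr
+ * copyParams.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+ * copyParams.dstDevice     = newDst;                 // illustrative CUdeviceptr
+ * copyParams.WidthInBytes  = numBytes;               // 1-D: Height and Depth stay 1
+ * copyParams.Height        = 1;
+ * copyParams.Depth         = 1;
+ *
+ * CUcontext ctx;
+ * cuCtxGetCurrent(&ctx);                             // must match the original operands' context
+ * cuGraphExecMemcpyNodeSetParams(hGraphExec, hNode, &copyParams, ctx);
+ * \endcode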
+ * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Memcpy node from the graph which was used to instantiate graphExec + * \param copyParams - The updated parameters to set + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemcpyNode, + * ::cuGraphMemcpyNodeSetParams, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx); + +/** + * \brief Sets the parameters for a memset node in the given graphExec. + * + * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + * contained \p memsetParams at instantiation. hNode must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + * + * The destination memory in \p memsetParams must be allocated from the same + * contexts as the original destination memory. Both the instantiation-time + * memory operand and the memory operand in \p memsetParams must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. hNode is also + * not modified by this call. + * + * Returns CUDA_ERROR_INVALID_VALUE if the memory operand's mappings changed or + * either the original or new memory operand are multidimensional. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Memset node from the graph which was used to instantiate graphExec + * \param memsetParams - The updated parameters to set + * \param ctx - Context on which to run the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddMemsetNode, + * ::cuGraphMemsetNodeSetParams, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx); + +/** + * \brief Sets the parameters for a host node in the given graphExec. + * + * Updates the work represented by \p hNode in \p hGraphExec as though \p hNode had + * contained \p nodeParams at instantiation. hNode must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from hNode are ignored. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. hNode is also + * not modified by this call. 
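+ *
+ * For example, to point an existing host node at a different callback and user
+ * data (a sketch; \p myCallback and \p newData are illustrative, and the
+ * \p fn / \p userData field names are assumed):
+ *
+ * \code
+ * CUDA_HOST_NODE_PARAMS hostParams;
+ * hostParams.fn       = myCallback;   // illustrative CUhostFn: void CUDA_CB myCallback(void *userData)
+ * hostParams.userData = newData;      // illustrative pointer passed to the callback
+ * cuGraphExecHostNodeSetParams(hGraphExec, hNode, &hostParams);
+ * \endcode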
+ * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Host node from the graph which was used to instantiate graphExec + * \param nodeParams - The updated parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddHostNode, + * ::cuGraphHostNodeSetParams, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams); + +/** + * \brief Updates node parameters in the child graph node in the given graphExec. + * + * Updates the work represented by \p hNode in \p hGraphExec as though the nodes contained + * in \p hNode's graph had the parameters contained in \p childGraph's nodes at instantiation. + * \p hNode must remain in the graph which was used to instantiate \p hGraphExec. + * Changed edges to and from \p hNode are ignored. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p hNode is also + * not modified by this call. + * + * The topology of \p childGraph, as well as the node insertion order, must match that + * of the graph contained in \p hNode. See ::cuGraphExecUpdate() for a list of restrictions + * on what can be updated in an instantiated graph. The update is recursive, so child graph + * nodes contained within the top level child graph will also be updated. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Host node from the graph which was used to instantiate graphExec + * \param childGraph - The graph supplying the updated parameters + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddChildGraphNode, + * ::cuGraphChildGraphNodeGetGraph, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph); + +/** + * \brief Sets the event for an event record node in the given graphExec + * + * Sets the event of an event record node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. 
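+ *
+ * For example (a sketch, error checks omitted):
+ *
+ * \code
+ * CUevent newEvent;
+ * cuEventCreate(&newEvent, CU_EVENT_DEFAULT);
+ * cuGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, newEvent);
+ * // future launches of hGraphExec record newEvent instead of the original event
+ * \endcode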
+ * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - event record node from the graph from which graphExec was instantiated + * \param event - Updated event to use + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventRecordNode, + * ::cuGraphEventRecordNodeGetEvent, + * ::cuGraphEventWaitNodeSetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); + +/** + * \brief Sets the event for an event wait node in the given graphExec + * + * Sets the event of an event wait node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - event wait node from the graph from which graphExec was instantiated + * \param event - Updated event to use + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddEventWaitNode, + * ::cuGraphEventWaitNodeGetEvent, + * ::cuGraphEventRecordNodeSetEvent, + * ::cuEventRecordWithFlags, + * ::cuStreamWaitEvent, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); + +/** + * \brief Sets the parameters for an external semaphore signal node in the given graphExec + * + * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * Changing \p nodeParams->numExtSems is not supported. 
+ * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - semaphore signal node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddExternalSemaphoresSignalNode, + * ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams); + +/** + * \brief Sets the parameters for an external semaphore wait node in the given graphExec + * + * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * Changing \p nodeParams->numExtSems is not supported. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - semaphore wait node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphAddExternalSemaphoresWaitNode, + * ::cuImportExternalSemaphore, + * ::cuSignalExternalSemaphoresAsync, + * ::cuWaitExternalSemaphoresAsync, + * ::cuGraphExecKernelNodeSetParams, + * ::cuGraphExecMemcpyNodeSetParams, + * ::cuGraphExecMemsetNodeSetParams, + * ::cuGraphExecHostNodeSetParams, + * ::cuGraphExecChildGraphNodeSetParams, + * ::cuGraphExecEventRecordNodeSetEvent, + * ::cuGraphExecEventWaitNodeSetEvent, + * ::cuGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + */ +CUresult CUDAAPI cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams); + +/** + * \brief Enables or disables the specified node in the given graphExec + * + * Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent + * to empty nodes until they are reenabled. Existing node parameters are not affected by + * disabling/enabling the node. + * + * The node is identified by the corresponding node \p hNode in the non-executable + * graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. 
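+ *
+ * For example, a kernel node can be skipped for some launches and restored
+ * later without rebuilding the executable graph (a sketch; \p hKernelNode and
+ * \p hStream are illustrative, error checks omitted):
+ *
+ * \code
+ * cuGraphNodeSetEnabled(hGraphExec, hKernelNode, 0);   // behaves like an empty node
+ * cuGraphLaunch(hGraphExec, hStream);                  // runs without that kernel
+ *
+ * cuGraphNodeSetEnabled(hGraphExec, hKernelNode, 1);   // re-enable it
+ * cuGraphLaunch(hGraphExec, hStream);
+ * \endcode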
+ * + * \note Currently only kernel, memset and memcpy nodes are supported. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Node from the graph from which graphExec was instantiated + * \param isEnabled - Node is enabled if != 0, otherwise the node is disabled + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeGetEnabled, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + * ::cuGraphLaunch + */ +CUresult CUDAAPI cuGraphNodeSetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled); + +/** + * \brief Query whether a node in the given graphExec is enabled + * + * Sets isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled. + * + * The node is identified by the corresponding node \p hNode in the non-executable + * graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * \note Currently only kernel, memset and memcpy nodes are supported. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Node from the graph from which graphExec was instantiated + * \param isEnabled - Location to return the enabled status of the node + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphNodeSetEnabled, + * ::cuGraphExecUpdate, + * ::cuGraphInstantiate + * ::cuGraphLaunch + */ +CUresult CUDAAPI cuGraphNodeGetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled); + +/** + * \brief Uploads an executable graph in a stream + * + * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of + * the same \p hGraphExec will be serialized. Each upload is ordered behind both any + * previous work in \p hStream and any previous launches of \p hGraphExec. + * Uses memory cached by \p stream to back the allocations owned by \p hGraphExec. + * + * \param hGraphExec - Executable graph to upload + * \param hStream - Stream in which to upload the graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphLaunch, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream); + +/** + * \brief Launches an executable graph in a stream + * + * Executes \p hGraphExec in \p hStream. Only one instance of \p hGraphExec may be executing + * at a time. Each launch is ordered behind both any previous work in \p hStream + * and any previous launches of \p hGraphExec. To execute a graph concurrently, it must be + * instantiated multiple times into multiple executable graphs. + * + * If any allocations created by \p hGraphExec remain unfreed (from a previous launch) and + * \p hGraphExec was not instantiated with ::CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH, + * the launch will fail with ::CUDA_ERROR_INVALID_VALUE. 
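+ *
+ * A typical launch pattern looks like this (sketch only; \p hStream is any
+ * valid stream and error checks are omitted):
+ *
+ * \code
+ * cuGraphUpload(hGraphExec, hStream);    // optional: stage the graph ahead of time
+ * cuGraphLaunch(hGraphExec, hStream);
+ * cuStreamSynchronize(hStream);
+ * \endcode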
+ * + * \param hGraphExec - Executable graph to launch + * \param hStream - Stream in which to launch the graph + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphUpload, + * ::cuGraphExecDestroy + */ +CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream); + +/** + * \brief Destroys an executable graph + * + * Destroys the executable graph specified by \p hGraphExec, as well + * as all of its executable nodes. If the executable graph is + * in-flight, it will not be terminated, but rather freed + * asynchronously on completion. + * + * \param hGraphExec - Executable graph to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphInstantiate, + * ::cuGraphUpload, + * ::cuGraphLaunch + */ +CUresult CUDAAPI cuGraphExecDestroy(CUgraphExec hGraphExec); + +/** + * \brief Destroys a graph + * + * Destroys the graph specified by \p hGraph, as well as all of its nodes. + * + * \param hGraph - Graph to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cuGraphCreate + */ +CUresult CUDAAPI cuGraphDestroy(CUgraph hGraph); + +/** + * \brief Check whether an executable graph can be updated with a graph and perform the update if possible + * + * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the + * node parameters in a topologically identical graph specified by \p hGraph. + * + * Limitations: + * + * - Kernel nodes: + * - The owning context of the function cannot change. + * - A node whose function originally did not use CUDA dynamic parallelism cannot be updated + * to a function which uses CDP. + * - A cooperative node cannot be updated to a non-cooperative node, and vice-versa. + * - If the graph was instantiated with CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, the + * priority attribute cannot change. Equality is checked on the originally requested + * priority values, before they are clamped to the device's supported range. + * - Memset and memcpy nodes: + * - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change. + * - The source/destination memory must be allocated from the same contexts as the original + * source/destination memory. + * - Only 1D memsets can be changed. + * - Additional memcpy node restrictions: + * - Changing either the source or destination memory type(i.e. CU_MEMORYTYPE_DEVICE, + * CU_MEMORYTYPE_ARRAY, etc.) is not supported. + * - External semaphore wait nodes and record nodes: + * - Changing the number of semaphores is not supported. + * + * Note: The API may add further restrictions in future releases. The return code should always be checked. + * + * cuGraphExecUpdate sets \p updateResult_out to CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under + * the following conditions: + * + * - The count of nodes directly in \p hGraphExec and \p hGraph differ, in which case \p hErrorNode_out + * is NULL. + * - A node is deleted in \p hGraph but not not its pair from \p hGraphExec, in which case \p hErrorNode_out + * is NULL. 
+ * - A node is deleted in \p hGraphExec but not its pair from \p hGraph, in which case \p hErrorNode_out is
+ *   the pairless node from \p hGraph.
+ * - The dependent nodes of a pair differ, in which case \p hErrorNode_out is the node from \p hGraph.
+ *
+ * cuGraphExecUpdate sets \p updateResult_out to:
+ * - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology changed.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node changed, in which case
+ *   \p hErrorNode_out is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the function changed in an unsupported
+ *   way (see note above), in which case \p hErrorNode_out is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a node changed in a way
+ *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a node changed in a way
+ *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph.
+ * - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is unsupported, like
+ *   the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph.
+ *
+ * If \p updateResult_out isn't set in one of the situations described above, the update check passes
+ * and cuGraphExecUpdate updates \p hGraphExec to match the contents of \p hGraph. If an error happens
+ * during the update, \p updateResult_out will be set to CU_GRAPH_EXEC_UPDATE_ERROR; otherwise,
+ * \p updateResult_out is set to CU_GRAPH_EXEC_UPDATE_SUCCESS.
+ *
+ * cuGraphExecUpdate returns CUDA_SUCCESS when the update was performed successfully. It returns
+ * CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the graph update was not performed because it included
+ * changes which violated constraints specific to instantiated graph update.
+ *
+ * \param hGraphExec The instantiated graph to be updated
+ * \param hGraph The graph containing the updated parameters
+ * \param hErrorNode_out The node which caused the permissibility check to forbid the update, if any
+ * \param updateResult_out Whether the graph update was permitted. If it was forbidden, the reason why
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE,
+ * \note_graph_thread_safety
+ * \notefnerr
+ *
+ * \sa
+ * ::cuGraphInstantiate,
+ */
+CUresult CUDAAPI cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out);
+
+/**
+ * \brief Copies attributes from source node to destination node.
+ *
+ * Copies attributes from source node \p src to destination node \p dst.
+ * Both nodes must have the same context.
+ *
+ * \param[out] dst Destination node
+ * \param[in] src Source node
+ * For a list of attributes see ::CUkernelNodeAttrID
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_INVALID_VALUE
+ * \notefnerr
+ *
+ * \sa
+ * ::CUaccessPolicyWindow
+ */
+CUresult CUDAAPI cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src);
+
+/**
+ * \brief Queries node attribute.
+ *
+ * Queries attribute \p attr from node \p hNode and stores it in the corresponding
+ * member of \p value_out.
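+ *
+ * For example, the access policy window of a kernel node might be queried like
+ * this (a sketch; \p hKernelNode is illustrative and the \p accessPolicyWindow
+ * member name of ::CUkernelNodeAttrValue is assumed):
+ *
+ * \code
+ * CUkernelNodeAttrValue value;
+ * cuGraphKernelNodeGetAttribute(hKernelNode,
+ *                               CU_KERNEL_NODE_ATTRIBUTE_ACCESS_POLICY_WINDOW,
+ *                               &value);
+ * CUaccessPolicyWindow window = value.accessPolicyWindow;   // assumed union member
+ * \endcode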
+ * + * \param[in] hNode + * \param[in] attr + * \param[out] value_out + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, + CUkernelNodeAttrValue *value_out); + +/** + * \brief Sets node attribute. + * + * Sets attribute \p attr on node \p hNode from corresponding attribute of + * \p value. + * + * \param[out] hNode + * \param[in] attr + * \param[out] value + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa + * ::CUaccessPolicyWindow + */ +CUresult CUDAAPI cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, + const CUkernelNodeAttrValue *value); + +/** + * \brief Write a DOT file describing graph structure + * + * Using the provided \p hGraph, write to \p path a DOT formatted description of the graph. + * By default this includes the graph topology, node types, node id, kernel names and memcpy direction. + * \p flags can be specified to write more detailed information about each node type such as + * parameter values, kernel attributes, node and function handles. + * + * \param hGraph - The graph to create a DOT file from + * \param path - The path to write the DOT file to + * \param flags - Flags from CUgraphDebugDot_flags for specifying which additional node information to write + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OPERATING_SYSTEM + */ +CUresult CUDAAPI cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags); + +/** + * \brief Create a user object + * + * Create a user object with the specified destructor callback and initial reference count. The + * initial references are owned by the caller. + * + * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they + * are executed by a shared internal thread. Another thread may be signaled to perform such + * actions, if it does not block forward progress of tasks scheduled through CUDA. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param object_out - Location to return the user object handle + * \param ptr - The pointer to pass to the destroy function + * \param destroy - Callback to free the user object when it is no longer in use + * \param initialRefcount - The initial refcount to create the object with, typically 1. The + * initial references are owned by the calling thread. + * \param flags - Currently it is required to pass ::CU_USER_OBJECT_NO_DESTRUCTOR_SYNC, + * which is the only defined flag. This indicates that the destroy + * callback cannot be waited on by any CUDA API. Users requiring + * synchronization of the callback should signal its completion + * manually. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectRetain, + * ::cuUserObjectRelease, + * ::cuGraphRetainUserObject, + * ::cuGraphReleaseUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy, + unsigned int initialRefcount, unsigned int flags); + +/** + * \brief Retain a reference to a user object + * + * Retains new references to a user object. The new references are owned by the caller. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. 
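+ *
+ * A common pattern is to create a single reference and immediately hand it to a
+ * graph (a sketch; \p state, \p destroyState and \p hGraph are illustrative):
+ *
+ * \code
+ * CUuserObject obj;
+ * cuUserObjectCreate(&obj, state, destroyState, 1, CU_USER_OBJECT_NO_DESTRUCTOR_SYNC);
+ *
+ * // Move the single reference into the graph; the graph now keeps state alive.
+ * cuGraphRetainUserObject(hGraph, obj, 1, CU_GRAPH_USER_OBJECT_MOVE);
+ * \endcode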
+ * + * \param object - The object to retain + * \param count - The number of references to retain, typically 1. Must be nonzero + * and not larger than INT_MAX. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectCreate, + * ::cuUserObjectRelease, + * ::cuGraphRetainUserObject, + * ::cuGraphReleaseUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuUserObjectRetain(CUuserObject object, unsigned int count); + +/** + * \brief Release a reference to a user object + * + * Releases user object references owned by the caller. The object's destructor is invoked if + * the reference count reaches zero. + * + * It is undefined behavior to release references not owned by the caller, or to use a user + * object handle after all references are released. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param object - The object to release + * \param count - The number of references to release, typically 1. Must be nonzero + * and not larger than INT_MAX. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectCreate, + * ::cuUserObjectRetain, + * ::cuGraphRetainUserObject, + * ::cuGraphReleaseUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuUserObjectRelease(CUuserObject object, unsigned int count); + +/** + * \brief Retain a reference to a user object from a graph + * + * Creates or moves user object references that will be owned by a CUDA graph. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param graph - The graph to associate the reference with + * \param object - The user object to retain a reference for + * \param count - The number of references to add to the graph, typically 1. Must be + * nonzero and not larger than INT_MAX. + * \param flags - The optional flag ::CU_GRAPH_USER_OBJECT_MOVE transfers references + * from the calling thread, rather than create new references. Pass 0 + * to create new references. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectCreate, + * ::cuUserObjectRetain, + * ::cuUserObjectRelease, + * ::cuGraphReleaseUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags); + +/** + * \brief Release a user object reference from a graph + * + * Releases user object references owned by a graph. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param graph - The graph that will release the reference + * \param object - The user object to release a reference for + * \param count - The number of references to release, typically 1. Must be nonzero + * and not larger than INT_MAX. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuUserObjectCreate, + * ::cuUserObjectRetain, + * ::cuUserObjectRelease, + * ::cuGraphRetainUserObject, + * ::cuGraphCreate + */ +CUresult CUDAAPI cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count); + +/** @} */ /* END CUDA_GRAPH */ + +/** + * \defgroup CUDA_OCCUPANCY Occupancy + * + * ___MANBRIEF___ occupancy calculation functions of the low-level CUDA driver + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the occupancy calculation functions of the low-level CUDA + * driver application programming interface. 
+ * + * @{ + */ + +/** + * \brief Returns occupancy of a function + * + * Returns in \p *numBlocks the number of the maximum active blocks per + * streaming multiprocessor. + * + * \param numBlocks - Returned occupancy + * \param func - Kernel for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + */ +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize); + +/** + * \brief Returns occupancy of a function + * + * Returns in \p *numBlocks the number of the maximum active blocks per + * streaming multiprocessor. + * + * The \p Flags parameter controls how special cases are handled. The + * valid flags are: + * + * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as + * ::cuOccupancyMaxActiveBlocksPerMultiprocessor; + * + * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the + * default behavior on platform where global caching affects + * occupancy. On such platforms, if caching is enabled, but + * per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching + * is disabled. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE makes + * the occupancy calculator to return 0 in such cases. More information + * can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * \param numBlocks - Returned occupancy + * \param func - Kernel for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + */ +CUresult CUDAAPI cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); + +/** + * \brief Suggest a launch configuration with reasonable occupancy + * + * Returns in \p *blockSize a reasonable block size that can achieve + * the maximum occupancy (or, the maximum number of active warps with + * the fewest blocks per multiprocessor), and in \p *minGridSize the + * minimum grid size to achieve the maximum occupancy. + * + * If \p blockSizeLimit is 0, the configurator will use the maximum + * block size permitted by the device / function instead. + * + * If per-block dynamic shared memory allocation is not needed, the + * user should leave both \p blockSizeToDynamicSMemSize and \p + * dynamicSMemSize as 0. + * + * If per-block dynamic shared memory allocation is needed, then if + * the dynamic shared memory size is constant regardless of block + * size, the size should be passed through \p dynamicSMemSize, and \p + * blockSizeToDynamicSMemSize should be NULL. 
+ *
+ * Otherwise, if the per-block dynamic shared memory size varies with
+ * different block sizes, the user needs to provide a unary function
+ * through \p blockSizeToDynamicSMemSize that computes the dynamic
+ * shared memory needed by \p func for any given block size. \p
+ * dynamicSMemSize is ignored. An example signature is:
+ *
+ * \code
+ * // Take block size, returns dynamic shared memory needed
+ * size_t blockToSmem(int blockSize);
+ * \endcode
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy
+ * \param blockSize   - Returned maximum block size that can achieve the maximum occupancy
+ * \param func        - Kernel for which launch configuration is calculated
+ * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size
+ * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to handle
+ *
+ * \return
+ * ::CUDA_SUCCESS,
+ * ::CUDA_ERROR_DEINITIALIZED,
+ * ::CUDA_ERROR_NOT_INITIALIZED,
+ * ::CUDA_ERROR_INVALID_CONTEXT,
+ * ::CUDA_ERROR_INVALID_VALUE,
+ * ::CUDA_ERROR_UNKNOWN
+ * \notefnerr
+ *
+ * \sa
+ * ::cudaOccupancyMaxPotentialBlockSize
+ */
+CUresult CUDAAPI cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit);
+
+/**
+ * \brief Suggest a launch configuration with reasonable occupancy
+ *
+ * An extended version of ::cuOccupancyMaxPotentialBlockSize. In
+ * addition to arguments passed to ::cuOccupancyMaxPotentialBlockSize,
+ * ::cuOccupancyMaxPotentialBlockSizeWithFlags also takes a \p Flags
+ * parameter.
+ *
+ * The \p Flags parameter controls how special cases are handled. The
+ * valid flags are:
+ *
+ * - ::CU_OCCUPANCY_DEFAULT, which maintains the default behavior as
+ *   ::cuOccupancyMaxPotentialBlockSize;
+ *
+ * - ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE, which suppresses the
+ *   default behavior on platforms where global caching affects
+ *   occupancy. On such platforms, the launch configurations that
+ *   produce maximal occupancy might not support global
+ *   caching. Setting ::CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE
+ *   guarantees that the produced launch configuration is global
+ *   caching compatible at a potential cost of occupancy. More information
+ *   can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
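+ *
+ * For illustration, a minimal sketch, assuming hKernel is a valid
+ * ::CUfunction that needs no dynamic shared memory:
+ * \code
+ * int minGridSize = 0, blockSize = 0;
+ * cuOccupancyMaxPotentialBlockSizeWithFlags(&minGridSize, &blockSize, hKernel,
+ *                                           NULL, 0, 0, CU_OCCUPANCY_DEFAULT);
+ * // The launch grid can then be rounded up from the suggested block size:
+ * // gridSize = (elementCount + blockSize - 1) / blockSize;
+ * \endcode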
+ * + * \param minGridSize - Returned minimum grid size needed to achieve the maximum occupancy + * \param blockSize - Returned maximum block size that can achieve the maximum occupancy + * \param func - Kernel for which launch configuration is calculated + * \param blockSizeToDynamicSMemSize - A function that calculates how much per-block dynamic shared memory \p func uses based on the block size + * \param dynamicSMemSize - Dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to handle + * \param flags - Options + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaOccupancyMaxPotentialBlockSizeWithFlags + */ +CUresult CUDAAPI cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); + +/** + * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM + * + * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. + * + * \param dynamicSmemSize - Returned maximum dynamic shared memory + * \param func - Kernel function for which occupancy is calculated + * \param numBlocks - Number of blocks to fit on SM + * \param blockSize - Size of the blocks + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + */ +CUresult CUDAAPI cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize); + +/** + * \brief Given the kernel function (\p func) and launch configuration + * (\p config), return the maximum cluster size in \p *clusterSize. + * + * The cluster dimensions in \p config are ignored. If func has a required + * cluster size set (see ::cudaFuncGetAttributes / ::cuFuncGetAttribute),\p + * *clusterSize will reflect the required cluster size. + * + * By default this function will always return a value that's portable on + * future hardware. A higher value may be returned if the kernel function + * allows non-portable cluster sizes. + * + * This function will respect the compile time launch bounds. + * + * \param clusterSize - Returned maximum cluster size that can be launched + * for the given kernel function and launch configuration + * \param func - Kernel function for which maximum cluster + * size is calculated + * \param config - Launch configuration for the given kernel function + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaFuncGetAttributes + * ::cuFuncGetAttributes + */ +CUresult CUDAAPI cuOccupancyMaxPotentialClusterSize(int *clusterSize, CUfunction func, const CUlaunchConfig *config); + +/** + * \brief Given the kernel function (\p func) and launch configuration + * (\p config), return the maximum number of clusters that could co-exist + * on the target device in \p *numClusters. 
+ * + * If the function has required cluster size already set (see + * ::cudaFuncGetAttributes / ::cuFuncGetAttribute), the cluster size + * from config must either be unspecified or match the required size. + * Without required sizes, the cluster size must be specified in config, + * else the function will return an error. + * + * Note that various attributes of the kernel function may affect occupancy + * calculation. Runtime environment may affect how the hardware schedules + * the clusters, so the calculated occupancy is not guaranteed to be achievable. + * + * \param numClusters - Returned maximum number of clusters that + * could co-exist on the target device + * \param func - Kernel function for which maximum number + * of clusters are calculated + * \param config - Launch configuration for the given kernel function + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_CLUSTER_SIZE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cudaFuncGetAttributes + * ::cuFuncGetAttributes + */ +CUresult CUDAAPI cuOccupancyMaxActiveClusters(int *numClusters, CUfunction func, const CUlaunchConfig *config); +/** @} */ /* END CUDA_OCCUPANCY */ + +/** + * \defgroup CUDA_TEXREF_DEPRECATED Texture Reference Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated texture reference management functions of the + * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the deprecated texture reference management + * functions of the low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Binds an array as a texture reference + * + * \deprecated + * + * Binds the CUDA array \p hArray to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. \p Flags must be set to + * ::CU_TRSA_OVERRIDE_FORMAT. Any CUDA array previously bound to \p hTexRef is + * unbound. + * + * \param hTexRef - Texture reference to bind + * \param hArray - Array to bind + * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags); + +/** + * \brief Binds a mipmapped array to a texture reference + * + * \deprecated + * + * Binds the CUDA mipmapped array \p hMipmappedArray to the texture reference \p hTexRef. + * Any previous address or CUDA array state associated with the texture reference + * is superseded by this function. \p Flags must be set to ::CU_TRSA_OVERRIDE_FORMAT. + * Any CUDA array previously bound to \p hTexRef is unbound. 
+ * + * \param hTexRef - Texture reference to bind + * \param hMipmappedArray - Mipmapped array to bind + * \param Flags - Options (must be ::CU_TRSA_OVERRIDE_FORMAT) + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmappedArray(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); + +/** + * \brief Binds an address as a texture reference + * + * \deprecated + * + * Binds a linear address range to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. Any memory previously bound to \p hTexRef + * is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, ::cuTexRefSetAddress() passes back a byte offset in + * \p *ByteOffset that must be applied to texture fetches in order to read from + * the desired memory. This offset must be divided by the texel size and + * passed to kernels that read from the texture so they can be applied to the + * ::tex1Dfetch() function. + * + * If the device memory pointer was returned from ::cuMemAlloc(), the offset + * is guaranteed to be 0 and NULL may be passed as the \p ByteOffset parameter. + * + * The total number of elements (or texels) in the linear address range + * cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. + * The number of elements is computed as (\p bytes / bytesPerElement), + * where bytesPerElement is determined from the data format and number of + * components set using ::cuTexRefSetFormat(). + * + * \param ByteOffset - Returned byte offset + * \param hTexRef - Texture reference to bind + * \param dptr - Device pointer to bind + * \param bytes - Size of memory to bind in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes); + +/** + * \brief Binds an address as a 2D texture reference + * + * \deprecated + * + * Binds a linear address range to the texture reference \p hTexRef. Any + * previous address or CUDA array state associated with the texture reference + * is superseded by this function. Any memory previously bound to \p hTexRef + * is unbound. + * + * Using a ::tex2D() function inside a kernel requires a call to either + * ::cuTexRefSetArray() to bind the corresponding texture reference to an + * array, or ::cuTexRefSetAddress2D() to bind the texture reference to linear + * memory. 
+ * + * Function calls to ::cuTexRefSetFormat() cannot follow calls to + * ::cuTexRefSetAddress2D() for the same texture reference. + * + * It is required that \p dptr be aligned to the appropriate hardware-specific + * texture alignment. You can query this value using the device attribute + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. If an unaligned \p dptr is + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \p Pitch has to be aligned to the hardware-specific texture pitch alignment. + * This value can be queried using the device attribute + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. If an unaligned \p Pitch is + * supplied, ::CUDA_ERROR_INVALID_VALUE is returned. + * + * Width and Height, which are specified in elements (or texels), cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. + * \p Pitch, which is specified in bytes, cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. + * + * \param hTexRef - Texture reference to bind + * \param desc - Descriptor of CUDA array + * \param dptr - Device pointer to bind + * \param Pitch - Line pitch in bytes + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture2D + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); + +/** + * \brief Sets the format for a texture reference + * + * \deprecated + * + * Specifies the format of the data to be read by the texture reference + * \p hTexRef. \p fmt and \p NumPackedComponents are exactly analogous to the + * ::Format and ::NumChannels members of the ::CUDA_ARRAY_DESCRIPTOR structure: + * They specify the format of each component and the number of components per + * array element. + * + * \param hTexRef - Texture reference + * \param fmt - Format to set + * \param NumPackedComponents - Number of components per array element + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaCreateChannelDesc, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); + +/** + * \brief Sets the addressing mode for a texture reference + * + * \deprecated + * + * Specifies the addressing mode \p am for the given dimension \p dim of the + * texture reference \p hTexRef. If \p dim is zero, the addressing mode is + * applied to the first parameter of the functions used to fetch from the + * texture; if \p dim is 1, the second, and so on. 
::CUaddress_mode is defined + * as: + * \code + typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, + CU_TR_ADDRESS_MODE_CLAMP = 1, + CU_TR_ADDRESS_MODE_MIRROR = 2, + CU_TR_ADDRESS_MODE_BORDER = 3 + } CUaddress_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. + * Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES, is not set, the only + * supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. + * + * \param hTexRef - Texture reference + * \param dim - Dimension + * \param am - Addressing mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am); + +/** + * \brief Sets the filtering mode for a texture reference + * + * \deprecated + * + * Specifies the filtering mode \p fm to be used when reading memory through + * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: + * + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. + * + * \param hTexRef - Texture reference + * \param fm - Filtering mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm); + +/** + * \brief Sets the mipmap filtering mode for a texture reference + * + * \deprecated + * + * Specifies the mipmap filtering mode \p fm to be used when reading memory through + * the texture reference \p hTexRef. ::CUfilter_mode_enum is defined as: + * + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. 
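+ *
+ * For illustration, a minimal sketch of configuring mipmap filtering,
+ * assuming hModule is a loaded ::CUmodule that declares a texture named
+ * "tex" and hMipmappedArray is a valid CUDA mipmapped array:
+ * \code
+ * CUtexref texRef;
+ * cuModuleGetTexRef(&texRef, hModule, "tex");
+ * cuTexRefSetMipmappedArray(texRef, hMipmappedArray, CU_TRSA_OVERRIDE_FORMAT);
+ * cuTexRefSetMipmapFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR);
+ * \endcode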
+ * + * \param hTexRef - Texture reference + * \param fm - Filtering mode to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm); + +/** + * \brief Sets the mipmap level bias for a texture reference + * + * \deprecated + * + * Specifies the mipmap level bias \p bias to be added to the specified mipmap level when + * reading memory through the texture reference \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. + * + * \param hTexRef - Texture reference + * \param bias - Mipmap level bias + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias); + +/** + * \brief Sets the mipmap min/max mipmap level clamps for a texture reference + * + * \deprecated + * + * Specifies the min/max mipmap level clamps, \p minMipmapLevelClamp and \p maxMipmapLevelClamp + * respectively, to be used when reading memory through the texture reference + * \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is not bound to a mipmapped array. + * + * \param hTexRef - Texture reference + * \param minMipmapLevelClamp - Mipmap min level clamp + * \param maxMipmapLevelClamp - Mipmap max level clamp + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToMipmappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); + +/** + * \brief Sets the maximum anisotropy for a texture reference + * + * \deprecated + * + * Specifies the maximum anisotropy \p maxAniso to be used when reading memory through + * the texture reference \p hTexRef. + * + * Note that this call has no effect if \p hTexRef is bound to linear memory. 
+ * + * \param hTexRef - Texture reference + * \param maxAniso - Maximum anisotropy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso); + +/** + * \brief Sets the border color for a texture reference + * + * \deprecated + * + * Specifies the value of the RGBA color via the \p pBorderColor to the texture reference + * \p hTexRef. The color value supports only float type and holds color components in + * the following sequence: + * pBorderColor[0] holds 'R' component + * pBorderColor[1] holds 'G' component + * pBorderColor[2] holds 'B' component + * pBorderColor[3] holds 'A' component + * + * Note that the color values can be set only when the Address mode is set to + * CU_TR_ADDRESS_MODE_BORDER using ::cuTexRefSetAddressMode. + * Applications using integer border color values have to "reinterpret_cast" their values to float. + * + * \param hTexRef - Texture reference + * \param pBorderColor - RGBA color + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddressMode, + * ::cuTexRefGetAddressMode, ::cuTexRefGetBorderColor, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor); + +/** + * \brief Sets the flags for a texture reference + * + * \deprecated + * + * Specifies optional flags via \p Flags to specify the behavior of data + * returned through the texture reference \p hTexRef. The valid flags are: + * + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of + * having the texture promote integer data to floating point data in the + * range [0, 1]. Note that texture with 32-bit integer format + * would not be promoted, regardless of whether or not this + * flag is specified; + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the + * default behavior of having the texture coordinates range + * from [0, Dim) where Dim is the width or height of the CUDA + * array. Instead, the texture coordinates [0, 1.0) reference + * the entire breadth of the array dimension; + * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear + * filtering optimizations. Trilinear optimizations improve texture filtering + * performance by allowing bilinear filtering on textures in scenarios where + * it can closely approximate the expected results. 
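+ *
+ * For illustration, a minimal sketch, assuming texRef is a valid ::CUtexref
+ * already bound to a CUDA array:
+ * \code
+ * // Keep integer data unpromoted and use normalized texture coordinates
+ * cuTexRefSetFlags(texRef, CU_TRSF_READ_AS_INTEGER | CU_TRSF_NORMALIZED_COORDINATES);
+ * \endcode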
+ * + * \param hTexRef - Texture reference + * \param Flags - Optional flags to set + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat, + * ::cudaBindTexture, + * ::cudaBindTexture2D, + * ::cudaBindTextureToArray, + * ::cudaBindTextureToMipmappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags); + +/** + * \brief Gets the address associated with a texture reference + * + * \deprecated + * + * Returns in \p *pdptr the base address bound to the texture reference + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any device memory range. + * + * \param pdptr - Returned device address + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef); + +/** + * \brief Gets the array bound to a texture reference + * + * \deprecated + * + * Returns in \p *phArray the CUDA array bound to the texture reference + * \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any CUDA array. + * + * \param phArray - Returned array + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef); + +/** + * \brief Gets the mipmapped array bound to a texture reference + * + * \deprecated + * + * Returns in \p *phMipmappedArray the CUDA mipmapped array bound to the texture + * reference \p hTexRef, or returns ::CUDA_ERROR_INVALID_VALUE if the texture reference + * is not bound to any CUDA mipmapped array. 
+ * + * \param phMipmappedArray - Returned mipmapped array + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef); + +/** + * \brief Gets the addressing mode used by a texture reference + * + * \deprecated + * + * Returns in \p *pam the addressing mode corresponding to the + * dimension \p dim of the texture reference \p hTexRef. Currently, the only + * valid value for \p dim are 0 and 1. + * + * \param pam - Returned addressing mode + * \param hTexRef - Texture reference + * \param dim - Dimension + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim); + +/** + * \brief Gets the filter-mode used by a texture reference + * + * \deprecated + * + * Returns in \p *pfm the filtering mode of the texture reference + * \p hTexRef. + * + * \param pfm - Returned filtering mode + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); + +/** + * \brief Gets the format used by a texture reference + * + * \deprecated + * + * Returns in \p *pFormat and \p *pNumChannels the format and number + * of components of the CUDA array bound to the texture reference \p hTexRef. + * If \p pFormat or \p pNumChannels is NULL, it will be ignored. 
+ * + * \param pFormat - Returned format + * \param pNumChannels - Returned number of components + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); + +/** + * \brief Gets the mipmap filtering mode for a texture reference + * + * \deprecated + * + * Returns the mipmap filtering mode in \p pfm that's used when reading memory through + * the texture reference \p hTexRef. + * + * \param pfm - Returned mipmap filtering mode + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef); + +/** + * \brief Gets the mipmap level bias for a texture reference + * + * \deprecated + * + * Returns the mipmap level bias in \p pBias that's added to the specified mipmap + * level when reading memory through the texture reference \p hTexRef. + * + * \param pbias - Returned mipmap level bias + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef); + +/** + * \brief Gets the min/max mipmap level clamps for a texture reference + * + * \deprecated + * + * Returns the min/max mipmap level clamps in \p pminMipmapLevelClamp and \p pmaxMipmapLevelClamp + * that's used when reading memory through the texture reference \p hTexRef. 
+ * + * \param pminMipmapLevelClamp - Returned mipmap min level clamp + * \param pmaxMipmapLevelClamp - Returned mipmap max level clamp + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); + +/** + * \brief Gets the maximum anisotropy for a texture reference + * + * \deprecated + * + * Returns the maximum anisotropy in \p pmaxAniso that's used when reading memory through + * the texture reference \p hTexRef. + * + * \param pmaxAniso - Returned maximum anisotropy + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFlags, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef); + +/** + * \brief Gets the border color used by a texture reference + * + * \deprecated + * + * Returns in \p pBorderColor, values of the RGBA color used by + * the texture reference \p hTexRef. + * The color value is of type float and holds color components in + * the following sequence: + * pBorderColor[0] holds 'R' component + * pBorderColor[1] holds 'G' component + * pBorderColor[2] holds 'B' component + * pBorderColor[3] holds 'A' component + * + * \param hTexRef - Texture reference + * \param pBorderColor - Returned Type and Value of RGBA color + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddressMode, + * ::cuTexRefSetAddressMode, ::cuTexRefSetBorderColor + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef); + +/** + * \brief Gets the flags used by a texture reference + * + * \deprecated + * + * Returns in \p *pFlags the flags of the texture reference \p hTexRef. + * + * \param pFlags - Returned flags + * \param hTexRef - Texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefSetAddress, + * ::cuTexRefSetAddress2D, ::cuTexRefSetAddressMode, ::cuTexRefSetArray, + * ::cuTexRefSetFilterMode, ::cuTexRefSetFlags, ::cuTexRefSetFormat, + * ::cuTexRefGetAddress, ::cuTexRefGetAddressMode, ::cuTexRefGetArray, + * ::cuTexRefGetFilterMode, ::cuTexRefGetFormat + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef); + +/** + * \brief Creates a texture reference + * + * \deprecated + * + * Creates a texture reference and returns its handle in \p *pTexRef. 
Once + * created, the application must call ::cuTexRefSetArray() or + * ::cuTexRefSetAddress() to associate the reference with allocated memory. + * Other texture reference functions are used to specify the format and + * interpretation (addressing, filtering, etc.) to be used when the memory is + * read through this texture reference. + * + * \param pTexRef - Returned texture reference + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefDestroy + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefCreate(CUtexref *pTexRef); + +/** + * \brief Destroys a texture reference + * + * \deprecated + * + * Destroys the texture reference specified by \p hTexRef. + * + * \param hTexRef - Texture reference to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuTexRefCreate + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuTexRefDestroy(CUtexref hTexRef); + +/** @} */ /* END CUDA_TEXREF_DEPRECATED */ + + +/** + * \defgroup CUDA_SURFREF_DEPRECATED Surface Reference Management [DEPRECATED] + * + * ___MANBRIEF___ surface reference management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the surface reference management functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Sets the CUDA array for a surface reference. + * + * \deprecated + * + * Sets the CUDA array \p hArray to be read and written by the surface reference + * \p hSurfRef. Any previous CUDA array state associated with the surface + * reference is superseded by this function. \p Flags must be set to 0. + * The ::CUDA_ARRAY3D_SURFACE_LDST flag must have been set for the CUDA array. + * Any CUDA array previously bound to \p hSurfRef is unbound. + + * \param hSurfRef - Surface reference handle + * \param hArray - CUDA array handle + * \param Flags - set to 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuModuleGetSurfRef, + * ::cuSurfRefGetArray, + * ::cudaBindSurfaceToArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); + +/** + * \brief Passes back the CUDA array bound to a surface reference. + * + * \deprecated + * + * Returns in \p *phArray the CUDA array bound to the surface reference + * \p hSurfRef, or returns ::CUDA_ERROR_INVALID_VALUE if the surface reference + * is not bound to any CUDA array. 
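+ *
+ * For illustration, a minimal sketch, assuming hModule is a loaded ::CUmodule
+ * that declares a surface named "surf" and hArray was created with the
+ * ::CUDA_ARRAY3D_SURFACE_LDST flag:
+ * \code
+ * CUsurfref surfRef;
+ * CUarray   boundArray;
+ * cuModuleGetSurfRef(&surfRef, hModule, "surf");
+ * cuSurfRefSetArray(surfRef, hArray, 0);
+ * cuSurfRefGetArray(&boundArray, surfRef);  // boundArray now equals hArray
+ * \endcode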
+ + * \param phArray - Surface reference handle + * \param hSurfRef - Surface reference handle + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa ::cuModuleGetSurfRef, ::cuSurfRefSetArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef); + +/** @} */ /* END CUDA_SURFREF_DEPRECATED */ + +/** + * \defgroup CUDA_TEXOBJECT Texture Object Management + * + * ___MANBRIEF___ texture object management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the texture object management functions of the + * low-level CUDA driver application programming interface. The texture + * object API is only supported on devices of compute capability 3.0 or higher. + * + * @{ + */ + +/** + * \brief Creates a texture object + * + * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes + * the data to texture from. \p pTexDesc describes how the data should be sampled. + * \p pResViewDesc is an optional argument that specifies an alternate format for + * the data described by \p pResDesc, and also describes the subresource region + * to restrict access to when texturing. \p pResViewDesc can only be specified if + * the type of resource is a CUDA array or a CUDA mipmapped array. + * + * Texture objects are only supported on devices of compute capability 3.0 or higher. + * Additionally, a texture object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. + * + * The ::CUDA_RESOURCE_DESC structure is defined as: + * \code + typedef struct CUDA_RESOURCE_DESC_st + { + CUresourcetype resType; + + union { + struct { + CUarray hArray; + } array; + struct { + CUmipmappedArray hMipmappedArray; + } mipmap; + struct { + CUdeviceptr devPtr; + CUarray_format format; + unsigned int numChannels; + size_t sizeInBytes; + } linear; + struct { + CUdeviceptr devPtr; + CUarray_format format; + unsigned int numChannels; + size_t width; + size_t height; + size_t pitchInBytes; + } pitch2D; + } res; + + unsigned int flags; + } CUDA_RESOURCE_DESC; + + * \endcode + * where: + * - ::CUDA_RESOURCE_DESC::resType specifies the type of resource to texture from. + * CUresourceType is defined as: + * \code + typedef enum CUresourcetype_enum { + CU_RESOURCE_TYPE_ARRAY = 0x00, + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, + CU_RESOURCE_TYPE_LINEAR = 0x02, + CU_RESOURCE_TYPE_PITCH2D = 0x03 + } CUresourcetype; + * \endcode + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_ARRAY, ::CUDA_RESOURCE_DESC::res::array::hArray + * must be set to a valid CUDA array handle. + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_MIPMAPPED_ARRAY, ::CUDA_RESOURCE_DESC::res::mipmap::hMipmappedArray + * must be set to a valid CUDA mipmapped array handle. + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_LINEAR, ::CUDA_RESOURCE_DESC::res::linear::devPtr + * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. + * ::CUDA_RESOURCE_DESC::res::linear::format and ::CUDA_RESOURCE_DESC::res::linear::numChannels + * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::linear::sizeInBytes + * specifies the size of the array in bytes. 
The total number of elements in the linear address range cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH. The number of elements is computed as (sizeInBytes / (sizeof(format) * numChannels)). + * + * \par + * If ::CUDA_RESOURCE_DESC::resType is set to ::CU_RESOURCE_TYPE_PITCH2D, ::CUDA_RESOURCE_DESC::res::pitch2D::devPtr + * must be set to a valid device pointer, that is aligned to ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT. + * ::CUDA_RESOURCE_DESC::res::pitch2D::format and ::CUDA_RESOURCE_DESC::res::pitch2D::numChannels + * describe the format of each component and the number of components per array element. ::CUDA_RESOURCE_DESC::res::pitch2D::width + * and ::CUDA_RESOURCE_DESC::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed + * ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH and ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT respectively. + * ::CUDA_RESOURCE_DESC::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to + * ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT. Pitch cannot exceed ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH. + * + * - ::flags must be set to zero. + * + * + * The ::CUDA_TEXTURE_DESC struct is defined as + * \code + typedef struct CUDA_TEXTURE_DESC_st { + CUaddress_mode addressMode[3]; + CUfilter_mode filterMode; + unsigned int flags; + unsigned int maxAnisotropy; + CUfilter_mode mipmapFilterMode; + float mipmapLevelBias; + float minMipmapLevelClamp; + float maxMipmapLevelClamp; + } CUDA_TEXTURE_DESC; + * \endcode + * where + * - ::CUDA_TEXTURE_DESC::addressMode specifies the addressing mode for each dimension of the texture data. ::CUaddress_mode is defined as: + * \code + typedef enum CUaddress_mode_enum { + CU_TR_ADDRESS_MODE_WRAP = 0, + CU_TR_ADDRESS_MODE_CLAMP = 1, + CU_TR_ADDRESS_MODE_MIRROR = 2, + CU_TR_ADDRESS_MODE_BORDER = 3 + } CUaddress_mode; + * \endcode + * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. Also, if the flag, ::CU_TRSF_NORMALIZED_COORDINATES + * is not set, the only supported address mode is ::CU_TR_ADDRESS_MODE_CLAMP. + * + * - ::CUDA_TEXTURE_DESC::filterMode specifies the filtering mode to be used when fetching from the texture. CUfilter_mode is defined as: + * \code + typedef enum CUfilter_mode_enum { + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 + } CUfilter_mode; + * \endcode + * This is ignored if ::CUDA_RESOURCE_DESC::resType is ::CU_RESOURCE_TYPE_LINEAR. + * + * - ::CUDA_TEXTURE_DESC::flags can be any combination of the following: + * - ::CU_TRSF_READ_AS_INTEGER, which suppresses the default behavior of + * having the texture promote integer data to floating point data in the + * range [0, 1]. Note that texture with 32-bit integer format would not be + * promoted, regardless of whether or not this flag is specified. + * - ::CU_TRSF_NORMALIZED_COORDINATES, which suppresses the default behavior + * of having the texture coordinates range from [0, Dim) where Dim is the + * width or height of the CUDA array. Instead, the texture coordinates + * [0, 1.0) reference the entire breadth of the array dimension; Note that + * for CUDA mipmapped arrays, this flag has to be set. + * - ::CU_TRSF_DISABLE_TRILINEAR_OPTIMIZATION, which disables any trilinear + * filtering optimizations. Trilinear optimizations improve texture filtering + * performance by allowing bilinear filtering on textures in scenarios where + * it can closely approximate the expected results. 
+ * - ::CU_TRSF_SEAMLESS_CUBEMAP, which enables seamless cube map filtering. + * This flag can only be specified if the underlying resource is a CUDA array + * or a CUDA mipmapped array that was created with the flag ::CUDA_ARRAY3D_CUBEMAP. + * When seamless cube map filtering is enabled, texture address modes specified + * by ::CUDA_TEXTURE_DESC::addressMode are ignored. Instead, if the ::CUDA_TEXTURE_DESC::filterMode + * is set to ::CU_TR_FILTER_MODE_POINT the address mode ::CU_TR_ADDRESS_MODE_CLAMP + * will be applied for all dimensions. If the ::CUDA_TEXTURE_DESC::filterMode is + * set to ::CU_TR_FILTER_MODE_LINEAR seamless cube map filtering will be performed + * when sampling along the cube face borders. + * + * - ::CUDA_TEXTURE_DESC::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be + * clamped to the range [1,16]. + * + * - ::CUDA_TEXTURE_DESC::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. + * + * - ::CUDA_TEXTURE_DESC::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. + * + * - ::CUDA_TEXTURE_DESC::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. + * + * - ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. + * + * + * The ::CUDA_RESOURCE_VIEW_DESC struct is defined as + * \code + typedef struct CUDA_RESOURCE_VIEW_DESC_st + { + CUresourceViewFormat format; + size_t width; + size_t height; + size_t depth; + unsigned int firstMipmapLevel; + unsigned int lastMipmapLevel; + unsigned int firstLayer; + unsigned int lastLayer; + } CUDA_RESOURCE_VIEW_DESC; + * \endcode + * where: + * - ::CUDA_RESOURCE_VIEW_DESC::format specifies how the data contained in the CUDA array or CUDA mipmapped array should + * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block + * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a base of format ::CU_AD_FORMAT_UNSIGNED_INT32. + * with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have + * a format of ::CU_AD_FORMAT_UNSIGNED_INT32 with 2 channels. The other BC formats require the underlying resource to have the same base + * format but with 4 channels. + * + * - ::CUDA_RESOURCE_VIEW_DESC::width specifies the new width of the texture data. If the resource view format is a block + * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, + * this value has to be equal to that of the original resource. + * + * - ::CUDA_RESOURCE_VIEW_DESC::height specifies the new height of the texture data. If the resource view format is a block + * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, + * this value has to be equal to that of the original resource. + * + * - ::CUDA_RESOURCE_VIEW_DESC::depth specifies the new depth of the texture data. This value has to be equal to that of the + * original resource. + * + * - ::CUDA_RESOURCE_VIEW_DESC::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. 
+ * For non-mipmapped resources, this value has to be zero.::CUDA_TEXTURE_DESC::minMipmapLevelClamp and ::CUDA_TEXTURE_DESC::maxMipmapLevelClamp + * will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, + * then the actual minimum mipmap level clamp will be 3.2. + * + * - ::CUDA_RESOURCE_VIEW_DESC::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value + * has to be zero. + * + * - ::CUDA_RESOURCE_VIEW_DESC::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. + * For non-layered resources, this value has to be zero. + * + * - ::CUDA_RESOURCE_VIEW_DESC::lastLayer specifies the last layer index for layered textures. For non-layered resources, + * this value has to be zero. + * + * + * \param pTexObject - Texture object to create + * \param pResDesc - Resource descriptor + * \param pTexDesc - Texture descriptor + * \param pResViewDesc - Resource view descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectDestroy, + * ::cudaCreateTextureObject + */ +CUresult CUDAAPI cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc); + +/** + * \brief Destroys a texture object + * + * Destroys the texture object specified by \p texObject. + * + * \param texObject - Texture object to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaDestroyTextureObject + */ +CUresult CUDAAPI cuTexObjectDestroy(CUtexObject texObject); + +/** + * \brief Returns a texture object's resource descriptor + * + * Returns the resource descriptor for the texture object specified by \p texObject. + * + * \param pResDesc - Resource descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectResourceDesc, + */ +CUresult CUDAAPI cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject); + +/** + * \brief Returns a texture object's texture descriptor + * + * Returns the texture descriptor for the texture object specified by \p texObject. + * + * \param pTexDesc - Texture descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectTextureDesc + */ +CUresult CUDAAPI cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject); + +/** + * \brief Returns a texture object's resource view descriptor + * + * Returns the resource view descriptor for the texture object specified by \p texObject. + * If no resource view was set for \p texObject, the ::CUDA_ERROR_INVALID_VALUE is returned. 
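+ *
+ * For illustration, a minimal sketch of the query pattern, assuming texObject
+ * was created by ::cuTexObjectCreate with a non-NULL resource view descriptor:
+ * \code
+ * CUDA_RESOURCE_VIEW_DESC viewDesc;
+ * CUresult status = cuTexObjectGetResourceViewDesc(&viewDesc, texObject);
+ * if (status == CUDA_SUCCESS) {
+ *     // viewDesc.format, viewDesc.width, ... now describe the resource view
+ * }
+ * \endcode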
+ * + * \param pResViewDesc - Resource view descriptor + * \param texObject - Texture object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuTexObjectCreate, + * ::cudaGetTextureObjectResourceViewDesc + */ +CUresult CUDAAPI cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject); + +/** @} */ /* END CUDA_TEXOBJECT */ + +/** + * \defgroup CUDA_SURFOBJECT Surface Object Management + * + * ___MANBRIEF___ surface object management functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the surface object management functions of the + * low-level CUDA driver application programming interface. The surface + * object API is only supported on devices of compute capability 3.0 or higher. + * + * @{ + */ + +/** + * \brief Creates a surface object + * + * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes + * the data to perform surface load/stores on. ::CUDA_RESOURCE_DESC::resType must be + * ::CU_RESOURCE_TYPE_ARRAY and ::CUDA_RESOURCE_DESC::res::array::hArray + * must be set to a valid CUDA array handle. ::CUDA_RESOURCE_DESC::flags must be set to zero. + * + * Surface objects are only supported on devices of compute capability 3.0 or higher. + * Additionally, a surface object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. + * + * \param pSurfObject - Surface object to create + * \param pResDesc - Resource descriptor + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectDestroy, + * ::cudaCreateSurfaceObject + */ +CUresult CUDAAPI cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc); + +/** + * \brief Destroys a surface object + * + * Destroys the surface object specified by \p surfObject. + * + * \param surfObject - Surface object to destroy + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectCreate, + * ::cudaDestroySurfaceObject + */ +CUresult CUDAAPI cuSurfObjectDestroy(CUsurfObject surfObject); + +/** + * \brief Returns a surface object's resource descriptor + * + * Returns the resource descriptor for the surface object specified by \p surfObject. + * + * \param pResDesc - Resource descriptor + * \param surfObject - Surface object + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * + * \sa + * ::cuSurfObjectCreate, + * ::cudaGetSurfaceObjectResourceDesc + */ +CUresult CUDAAPI cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject); + +/** @} */ /* END CUDA_SURFOBJECT */ + +/** + * \defgroup CUDA_PEER_ACCESS Peer Context Memory Access + * + * ___MANBRIEF___ direct peer context memory access functions of the low-level + * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the direct peer context memory access functions + * of the low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Queries if a device may directly access a peer device's memory. 
+ * + * Returns in \p *canAccessPeer a value of 1 if contexts on \p dev are capable of + * directly accessing memory from contexts on \p peerDev and 0 otherwise. + * If direct access of \p peerDev from \p dev is possible, then access may be + * enabled on two specific contexts by calling ::cuCtxEnablePeerAccess(). + * + * \param canAccessPeer - Returned access capability + * \param dev - Device from which allocations on \p peerDev are to + * be directly accessed. + * \param peerDev - Device on which the allocations to be directly accessed + * by \p dev reside. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE + * \notefnerr + * + * \sa + * ::cuCtxEnablePeerAccess, + * ::cuCtxDisablePeerAccess, + * ::cudaDeviceCanAccessPeer + */ +CUresult CUDAAPI cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev); + +/** + * \brief Enables direct access to memory allocations in a peer context. + * + * If both the current context and \p peerContext are on devices which support unified + * addressing (as may be queried using ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING) and same + * major compute capability, then on success all allocations from \p peerContext will + * immediately be accessible by the current context. See \ref CUDA_UNIFIED for additional + * details. + * + * Note that access granted by this call is unidirectional and that in order to access + * memory from the current context in \p peerContext, a separate symmetric call + * to ::cuCtxEnablePeerAccess() is required. + * + * Note that there are both device-wide and system-wide limitations per system + * configuration, as noted in the CUDA Programming Guide under the section + * "Peer-to-Peer Memory Access". + * + * Returns ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED if ::cuDeviceCanAccessPeer() indicates + * that the ::CUdevice of the current context cannot directly access memory + * from the ::CUdevice of \p peerContext. + * + * Returns ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED if direct access of + * \p peerContext from the current context has already been enabled. + * + * Returns ::CUDA_ERROR_TOO_MANY_PEERS if direct peer access is not possible + * because hardware resources required for peer access have been exhausted. + * + * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, \p peerContext + * is not a valid context, or if the current context is \p peerContext. + * + * Returns ::CUDA_ERROR_INVALID_VALUE if \p Flags is not 0. + * + * \param peerContext - Peer context to enable direct access to from the current context + * \param Flags - Reserved for future use and must be set to 0 + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED, + * ::CUDA_ERROR_TOO_MANY_PEERS, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_PEER_ACCESS_UNSUPPORTED, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuDeviceCanAccessPeer, + * ::cuCtxDisablePeerAccess, + * ::cudaDeviceEnablePeerAccess + */ +CUresult CUDAAPI cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags); + +/** + * \brief Disables direct access to memory allocations in a peer context and + * unregisters any registered allocations. + * + Returns ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED if direct peer access has + * not yet been enabled from \p peerContext to the current context. 
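+ *
+ * A short sketch of the typical enable/use/disable sequence, assuming devices
+ * dev0 and dev1 with contexts ctx0 and ctx1 created on them respectively
+ * (error checking omitted):
+ * \code
+    int canAccess = 0;
+    cuDeviceCanAccessPeer(&canAccess, dev0, dev1);
+    if (canAccess) {
+        cuCtxSetCurrent(ctx0);             /* make ctx0 (on dev0) current */
+        cuCtxEnablePeerAccess(ctx1, 0);    /* ctx0 may now access ctx1 allocations */
+        /* ... launch work in ctx0 that reads memory allocated in ctx1 ... */
+        cuCtxDisablePeerAccess(ctx1);      /* revokes access, unregisters allocations */
+    }
+ * \endcode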
+ * + * Returns ::CUDA_ERROR_INVALID_CONTEXT if there is no current context, or if + * \p peerContext is not a valid context. + * + * \param peerContext - Peer context to disable direct access to + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_PEER_ACCESS_NOT_ENABLED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * \notefnerr + * + * \sa + * ::cuDeviceCanAccessPeer, + * ::cuCtxEnablePeerAccess, + * ::cudaDeviceDisablePeerAccess + */ +CUresult CUDAAPI cuCtxDisablePeerAccess(CUcontext peerContext); + +/** + * \brief Queries attributes of the link between two devices. + * + * Returns in \p *value the value of the requested attribute \p attrib of the + * link between \p srcDevice and \p dstDevice. The supported attributes are: + * - ::CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK: A relative value indicating the + * performance of the link between two devices. + * - ::CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED P2P: 1 if P2P Access is enable. + * - ::CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED: 1 if Atomic operations over + * the link are supported. + * - ::CU_DEVICE_P2P_ATTRIBUTE_CUDA_ARRAY_ACCESS_SUPPORTED: 1 if cudaArray can + * be accessed over the link. + * + * Returns ::CUDA_ERROR_INVALID_DEVICE if \p srcDevice or \p dstDevice are not valid + * or if they represent the same device. + * + * Returns ::CUDA_ERROR_INVALID_VALUE if \p attrib is not valid or if \p value is + * a null pointer. + * + * \param value - Returned value of the requested attribute + * \param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice. + * \param srcDevice - The source device of the target link. + * \param dstDevice - The destination device of the target link. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa + * ::cuCtxEnablePeerAccess, + * ::cuCtxDisablePeerAccess, + * ::cuDeviceCanAccessPeer, + * ::cudaDeviceGetP2PAttribute + */ +CUresult CUDAAPI cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice); + +/** @} */ /* END CUDA_PEER_ACCESS */ + +/** + * \defgroup CUDA_GRAPHICS Graphics Interoperability + * + * ___MANBRIEF___ graphics interoperability functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the graphics interoperability functions of the + * low-level CUDA driver application programming interface. + * + * @{ + */ + +/** + * \brief Unregisters a graphics resource for access by CUDA + * + * Unregisters the graphics resource \p resource so it is not accessible by + * CUDA unless registered again. + * + * If \p resource is invalid then ::CUDA_ERROR_INVALID_HANDLE is + * returned. + * + * \param resource - Resource to unregister + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuGraphicsD3D9RegisterResource, + * ::cuGraphicsD3D10RegisterResource, + * ::cuGraphicsD3D11RegisterResource, + * ::cuGraphicsGLRegisterBuffer, + * ::cuGraphicsGLRegisterImage, + * ::cudaGraphicsUnregisterResource + */ +CUresult CUDAAPI cuGraphicsUnregisterResource(CUgraphicsResource resource); + +/** + * \brief Get an array through which to access a subresource of a mapped graphics resource. 
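+ *
+ * A minimal usage sketch, assuming a graphics resource res that was registered
+ * from a texture and a valid stream (error checking omitted):
+ * \code
+    CUarray level0 = NULL;
+    cuGraphicsMapResources(1, &res, stream);
+    cuGraphicsSubResourceGetMappedArray(&level0, res, 0, 0);   /* array index 0, mip level 0 */
+    /* ... copy to or from level0 (e.g. with cuMemcpy2D) while mapped ... */
+    cuGraphicsUnmapResources(1, &res, stream);
+ * \endcode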
+ * + * Returns in \p *pArray an array through which the subresource of the mapped + * graphics resource \p resource which corresponds to array index \p arrayIndex + * and mipmap level \p mipLevel may be accessed. The value set in \p *pArray may + * change every time that \p resource is mapped. + * + * If \p resource is not a texture then it cannot be accessed via an array and + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. + * If \p arrayIndex is not a valid array index for \p resource then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * If \p mipLevel is not a valid mipmap level for \p resource then + * ::CUDA_ERROR_INVALID_VALUE is returned. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * \param pArray - Returned array through which a subresource of \p resource may be accessed + * \param resource - Mapped resource to access + * \param arrayIndex - Array index for array textures or cubemap face + * index as defined by ::CUarray_cubemap_face for + * cubemap textures for the subresource to access + * \param mipLevel - Mipmap level for the subresource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cudaGraphicsSubResourceGetMappedArray + */ +CUresult CUDAAPI cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); + +/** + * \brief Get a mipmapped array through which to access a mapped graphics resource. + * + * Returns in \p *pMipmappedArray a mipmapped array through which the mapped graphics + * resource \p resource. The value set in \p *pMipmappedArray may change every time + * that \p resource is mapped. + * + * If \p resource is not a texture then it cannot be accessed via a mipmapped array and + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY is returned. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * \param pMipmappedArray - Returned mipmapped array through which \p resource may be accessed + * \param resource - Mapped resource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_NOT_MAPPED_AS_ARRAY + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cudaGraphicsResourceGetMappedMipmappedArray + */ +CUresult CUDAAPI cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource); + +/** + * \brief Get a device pointer through which to access a mapped graphics resource. + * + * Returns in \p *pDevPtr a pointer through which the mapped graphics resource + * \p resource may be accessed. + * Returns in \p pSize the size of the memory in bytes which may be accessed from that pointer. + * The value set in \p pPointer may change every time that \p resource is mapped. + * + * If \p resource is not a buffer then it cannot be accessed via a pointer and + * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER is returned. + * If \p resource is not mapped then ::CUDA_ERROR_NOT_MAPPED is returned. 
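+ *
+ * A minimal usage sketch, assuming a graphics resource res that was registered
+ * from a buffer object and a valid stream (error checking omitted):
+ * \code
+    CUdeviceptr dptr = 0;
+    size_t size = 0;
+    cuGraphicsMapResources(1, &res, stream);
+    cuGraphicsResourceGetMappedPointer(&dptr, &size, res);
+    /* ... read or write up to size bytes at dptr from CUDA while mapped ... */
+    cuGraphicsUnmapResources(1, &res, stream);
+ * \endcode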
+ * * + * \param pDevPtr - Returned pointer through which \p resource may be accessed + * \param pSize - Returned size of the buffer accessible starting at \p *pPointer + * \param resource - Mapped resource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_NOT_MAPPED_AS_POINTER + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsResourceGetMappedPointer + */ +CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource); + +/** + * \brief Set usage flags for mapping a graphics resource + * + * Set \p flags for mapping the graphics resource \p resource. + * + * Changes to \p flags will take effect the next time \p resource is mapped. + * The \p flags argument may be any of the following: + + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this + * resource will be used. It is therefore assumed that this resource will be + * read from and written to by CUDA kernels. This is the default value. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which + * access this resource will not write to this resource. + * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels + * which access this resource will not read from this resource and will + * write over the entire contents of the resource, so none of the data + * previously stored in the resource will be preserved. + * + * If \p resource is presently mapped for access by CUDA then + * ::CUDA_ERROR_ALREADY_MAPPED is returned. + * If \p flags is not one of the above values then ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \param resource - Registered resource to set flags for + * \param flags - Parameters for resource mapping + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources, + * ::cudaGraphicsResourceSetMapFlags + */ +CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); + +/** + * \brief Map graphics resources for access by CUDA + * + * Maps the \p count graphics resources in \p resources for access by CUDA. + * + * The resources in \p resources may be accessed by CUDA until they + * are unmapped. The graphics API from which \p resources were registered + * should not access any resources while they are mapped by CUDA. If an + * application does so, the results are undefined. + * + * This function provides the synchronization guarantee that any graphics calls + * issued before ::cuGraphicsMapResources() will complete before any subsequent CUDA + * work issued in \p stream begins. + * + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. + * If any of \p resources are presently mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned. 
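+ *
+ * A minimal usage sketch, assuming two previously registered resources resA and
+ * resB and a valid stream (error checking omitted):
+ * \code
+    CUgraphicsResource resources[2] = { resA, resB };
+    cuGraphicsMapResources(2, resources, stream);
+    /* ... CUDA work issued in 'stream' may access both resources ... */
+    cuGraphicsUnmapResources(2, resources, stream);
+ * \endcode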
+ * + * \param count - Number of resources to map + * \param resources - Resources to map for CUDA usage + * \param hStream - Stream with which to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \note_null_stream + * \notefnerr + * + * \sa + * ::cuGraphicsResourceGetMappedPointer, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cuGraphicsUnmapResources, + * ::cudaGraphicsMapResources + */ +CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + +/** + * \brief Unmap graphics resources. + * + * Unmaps the \p count graphics resources in \p resources. + * + * Once unmapped, the resources in \p resources may not be accessed by CUDA + * until they are mapped again. + * + * This function provides the synchronization guarantee that any CUDA work issued + * in \p stream before ::cuGraphicsUnmapResources() will complete before any + * subsequently issued graphics work begins. + * + * + * If \p resources includes any duplicate entries then ::CUDA_ERROR_INVALID_HANDLE is returned. + * If any of \p resources are not presently mapped for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * \param count - Number of resources to unmap + * \param resources - Resources to unmap + * \param hStream - Stream with which to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \note_null_stream + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources, + * ::cudaGraphicsUnmapResources + */ +CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + +/** @} */ /* END CUDA_GRAPHICS */ + +/** + * \defgroup CUDA_DRIVER_ENTRY_POINT Driver Entry Point Access + * + * ___MANBRIEF___ driver entry point access functions of the low-level CUDA driver API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the driver entry point access functions of the low-level CUDA + * driver application programming interface. + * + * @{ + */ + +/** + * \brief Returns the requested driver API function pointer + * + * Returns in \p **pfn the address of the CUDA driver function for the requested + * CUDA version and flags. + * + * The CUDA version is specified as (1000 * major + 10 * minor), so CUDA 11.2 + * should be specified as 11020. For a requested driver symbol, if the specified + * CUDA version is greater than or equal to the CUDA version in which the driver symbol + * was introduced, this API will return the function pointer to the corresponding + * versioned function. + * + * The pointer returned by the API should be cast to a function pointer matching the + * requested driver function's definition in the API header file. The function pointer + * typedef can be picked up from the corresponding typedefs header file. For example, + * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h. + * + * The API will return ::CUDA_ERROR_NOT_FOUND if the requested driver function is not + * supported on the platform, no ABI compatible driver function exists for the specified + * \p cudaVersion or if the driver symbol is invalid. 
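+ *
+ * A minimal retrieval sketch (error checking omitted); the function pointer
+ * typedef below is declared locally for illustration rather than taken from
+ * cudaTypedefs.h:
+ * \code
+    typedef CUresult (CUDAAPI *PFN_myMemAlloc)(CUdeviceptr *dptr, size_t bytesize);
+    PFN_myMemAlloc pfnMemAlloc = NULL;
+    cuGetProcAddress("cuMemAlloc", (void **)&pfnMemAlloc, 11020, CU_GET_PROC_ADDRESS_DEFAULT);
+    if (pfnMemAlloc != NULL) {
+        CUdeviceptr dptr;
+        pfnMemAlloc(&dptr, 1024);   /* call through the retrieved entry point */
+    }
+ * \endcode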
+ * + * The requested flags can be: + * - ::CU_GET_PROC_ADDRESS_DEFAULT: This is the default mode. This is equivalent to + * ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM if the code is compiled with + * --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM + * is defined; ::CU_GET_PROC_ADDRESS_LEGACY_STREAM otherwise. + * - ::CU_GET_PROC_ADDRESS_LEGACY_STREAM: This will enable the search for all driver symbols + * that match the requested driver symbol name except the corresponding per-thread versions. + * - ::CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM: This will enable the search for all + * driver symbols that match the requested driver symbol name including the per-thread + * versions. If a per-thread version is not found, the API will return the legacy version + * of the driver function. + * + * \param symbol - The base name of the driver API function to look for. As an example, + * for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc and + * \p cudaVersion would be the ABI compatible CUDA version for the _v2 variant. + * \param pfn - Location to return the function pointer to the requested driver function + * \param cudaVersion - The CUDA version to look for the requested driver symbol + * \param flags - Flags to specify search options. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_SUPPORTED, + * ::CUDA_ERROR_NOT_FOUND + * \note_version_mixing + * + * \sa + * ::cudaGetDriverEntryPoint + */ +CUresult CUDAAPI cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags); + +/** @} */ /* END CUDA_DRIVER_ENTRY_POINT */ + +CUresult CUDAAPI cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId); + +/** + * CUDA API versioning support + */ +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef cuMemHostRegister + #undef cuGraphicsResourceSetMapFlags + #undef cuLinkCreate + #undef cuLinkAddData + #undef cuLinkAddFile + #undef cuDeviceTotalMem + #undef cuCtxCreate + #undef cuModuleGetGlobal + #undef cuMemGetInfo + #undef cuMemAlloc + #undef cuMemAllocPitch + #undef cuMemFree + #undef cuMemGetAddressRange + #undef cuMemAllocHost + #undef cuMemHostGetDevicePointer + #undef cuMemcpyHtoD + #undef cuMemcpyDtoH + #undef cuMemcpyDtoD + #undef cuMemcpyDtoA + #undef cuMemcpyAtoD + #undef cuMemcpyHtoA + #undef cuMemcpyAtoH + #undef cuMemcpyAtoA + #undef cuMemcpyHtoAAsync + #undef cuMemcpyAtoHAsync + #undef cuMemcpy2D + #undef cuMemcpy2DUnaligned + #undef cuMemcpy3D + #undef cuMemcpyHtoDAsync + #undef cuMemcpyDtoHAsync + #undef cuMemcpyDtoDAsync + #undef cuMemcpy2DAsync + #undef cuMemcpy3DAsync + #undef cuMemsetD8 + #undef cuMemsetD16 + #undef cuMemsetD32 + #undef cuMemsetD2D8 + #undef cuMemsetD2D16 + #undef cuMemsetD2D32 + #undef cuArrayCreate + #undef cuArrayGetDescriptor + #undef cuArray3DCreate + #undef cuArray3DGetDescriptor + #undef cuTexRefSetAddress + #undef cuTexRefSetAddress2D + #undef cuTexRefGetAddress + #undef cuGraphicsResourceGetMappedPointer + #undef cuCtxDestroy + #undef cuCtxPopCurrent + #undef cuCtxPushCurrent + #undef cuStreamDestroy + #undef cuEventDestroy + #undef cuMemcpy + #undef cuMemcpyAsync + #undef cuMemcpyPeer + #undef cuMemcpyPeerAsync + #undef cuMemcpy3DPeer + #undef cuMemcpy3DPeerAsync + #undef cuMemsetD8Async + #undef cuMemsetD16Async + #undef cuMemsetD32Async + #undef cuMemsetD2D8Async + #undef cuMemsetD2D16Async + #undef cuMemsetD2D32Async + #undef cuStreamGetPriority + #undef cuStreamGetFlags + #undef cuStreamGetCtx + #undef 
cuStreamWaitEvent + #undef cuStreamAddCallback + #undef cuStreamAttachMemAsync + #undef cuStreamQuery + #undef cuStreamSynchronize + #undef cuEventRecord + #undef cuEventRecordWithFlags + #undef cuLaunchKernel + #undef cuLaunchKernelEx + #undef cuLaunchHostFunc + #undef cuGraphicsMapResources + #undef cuGraphicsUnmapResources + #undef cuStreamWriteValue32 + #undef cuStreamWaitValue32 + #undef cuStreamWriteValue64 + #undef cuStreamWaitValue64 + #undef cuStreamBatchMemOp + #undef cuStreamWriteValue32_v2 + #undef cuStreamWaitValue32_v2 + #undef cuStreamWriteValue64_v2 + #undef cuStreamWaitValue64_v2 + #undef cuStreamBatchMemOp_v2 + #undef cuMemPrefetchAsync + #undef cuLaunchCooperativeKernel + #undef cuSignalExternalSemaphoresAsync + #undef cuWaitExternalSemaphoresAsync + #undef cuStreamBeginCapture + #undef cuStreamEndCapture + #undef cuStreamIsCapturing + #undef cuStreamGetCaptureInfo + #undef cuStreamGetCaptureInfo_v2 + #undef cuGraphUpload + #undef cuGraphLaunch + #undef cuDevicePrimaryCtxRelease + #undef cuDevicePrimaryCtxReset + #undef cuDevicePrimaryCtxSetFlags + #undef cuIpcOpenMemHandle + #undef cuStreamCopyAttributes + #undef cuStreamSetAttribute + #undef cuStreamGetAttribute + #undef cuGraphInstantiate + #undef cuMemMapArrayAsync + #undef cuMemFreeAsync + #undef cuMemAllocAsync + #undef cuMemAllocFromPoolAsync + #undef cuStreamUpdateCaptureDependencies + + CUresult CUDAAPI cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags); + CUresult CUDAAPI cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags); + CUresult CUDAAPI cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); + CUresult CUDAAPI cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, + unsigned int numOptions, CUjit_option *options, void **optionValues); + CUresult CUDAAPI cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, + unsigned int numOptions, CUjit_option *options, void **optionValues); + CUresult CUDAAPI cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch); + + typedef unsigned int CUdeviceptr_v1; + + typedef struct CUDA_MEMCPY2D_v1_st + { + unsigned int srcXInBytes; /**< Source X in bytes */ + unsigned int srcY; /**< Source Y */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void *srcHost; /**< Source host pointer */ + CUdeviceptr_v1 srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ + + unsigned int dstXInBytes; /**< Destination X in bytes */ + unsigned int dstY; /**< Destination Y */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + + unsigned int WidthInBytes; /**< Width of 2D memory copy in bytes */ + unsigned int Height; /**< Height of 2D memory copy */ + } CUDA_MEMCPY2D_v1; + + typedef struct CUDA_MEMCPY3D_v1_st + { + unsigned int srcXInBytes; /**< Source X in bytes */ + unsigned int srcY; /**< Source Y */ + unsigned int srcZ; /**< Source Z */ + unsigned int srcLOD; /**< Source LOD */ + CUmemorytype srcMemoryType; /**< Source memory type (host, device, array) */ + const void 
*srcHost; /**< Source host pointer */ + CUdeviceptr_v1 srcDevice; /**< Source device pointer */ + CUarray srcArray; /**< Source array reference */ + void *reserved0; /**< Must be NULL */ + unsigned int srcPitch; /**< Source pitch (ignored when src is array) */ + unsigned int srcHeight; /**< Source height (ignored when src is array; may be 0 if Depth==1) */ + + unsigned int dstXInBytes; /**< Destination X in bytes */ + unsigned int dstY; /**< Destination Y */ + unsigned int dstZ; /**< Destination Z */ + unsigned int dstLOD; /**< Destination LOD */ + CUmemorytype dstMemoryType; /**< Destination memory type (host, device, array) */ + void *dstHost; /**< Destination host pointer */ + CUdeviceptr_v1 dstDevice; /**< Destination device pointer */ + CUarray dstArray; /**< Destination array reference */ + void *reserved1; /**< Must be NULL */ + unsigned int dstPitch; /**< Destination pitch (ignored when dst is array) */ + unsigned int dstHeight; /**< Destination height (ignored when dst is array; may be 0 if Depth==1) */ + + unsigned int WidthInBytes; /**< Width of 3D memory copy in bytes */ + unsigned int Height; /**< Height of 3D memory copy */ + unsigned int Depth; /**< Depth of 3D memory copy */ + } CUDA_MEMCPY3D_v1; + + typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st + { + unsigned int Width; /**< Width of array */ + unsigned int Height; /**< Height of array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + } CUDA_ARRAY_DESCRIPTOR_v1; + + typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st + { + unsigned int Width; /**< Width of 3D array */ + unsigned int Height; /**< Height of 3D array */ + unsigned int Depth; /**< Depth of 3D array */ + + CUarray_format Format; /**< Array format */ + unsigned int NumChannels; /**< Channels per array element */ + unsigned int Flags; /**< Flags */ + } CUDA_ARRAY3D_DESCRIPTOR_v1; + + CUresult CUDAAPI cuDeviceTotalMem(unsigned int *bytes, CUdevice dev); + CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev); + CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name); + CUresult CUDAAPI cuMemGetInfo(unsigned int *free, unsigned int *total); + CUresult CUDAAPI cuMemAlloc(CUdeviceptr_v1 *dptr, unsigned int bytesize); + CUresult CUDAAPI cuMemAllocPitch(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); + CUresult CUDAAPI cuMemFree(CUdeviceptr_v1 dptr); + CUresult CUDAAPI cuMemGetAddressRange(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr); + CUresult CUDAAPI cuMemAllocHost(void **pp, unsigned int bytesize); + CUresult CUDAAPI cuMemHostGetDevicePointer(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags); + CUresult CUDAAPI cuMemcpyHtoD(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyDtoH(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyDtoD(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyDtoA(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyAtoD(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned 
int srcOffset, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); + CUresult CUDAAPI cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2D(const CUDA_MEMCPY2D_v1 *pCopy); + CUresult CUDAAPI cuMemcpy2DUnaligned(const CUDA_MEMCPY2D_v1 *pCopy); + CUresult CUDAAPI cuMemcpy3D(const CUDA_MEMCPY3D_v1 *pCopy); + CUresult CUDAAPI cuMemcpyHtoDAsync(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoDAsync(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2DAsync(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DAsync(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemsetD8(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N); + CUresult CUDAAPI cuMemsetD16(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N); + CUresult CUDAAPI cuMemsetD32(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N); + CUresult CUDAAPI cuMemsetD2D8(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); + CUresult CUDAAPI cuMemsetD2D16(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); + CUresult CUDAAPI cuMemsetD2D32(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); + CUresult CUDAAPI cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray); + CUresult CUDAAPI cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); + CUresult CUDAAPI cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray); + CUresult CUDAAPI cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); + CUresult CUDAAPI cuTexRefSetAddress(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes); + CUresult CUDAAPI cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch); + CUresult CUDAAPI cuTexRefGetAddress(CUdeviceptr_v1 *pdptr, CUtexref hTexRef); + CUresult CUDAAPI cuGraphicsResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); + + CUresult CUDAAPI cuCtxDestroy(CUcontext ctx); + CUresult CUDAAPI cuCtxPopCurrent(CUcontext *pctx); + CUresult CUDAAPI cuCtxPushCurrent(CUcontext ctx); + CUresult CUDAAPI cuStreamDestroy(CUstream hStream); + CUresult CUDAAPI cuEventDestroy(CUevent hEvent); + CUresult CUDAAPI cuDevicePrimaryCtxRelease(CUdevice dev); + CUresult CUDAAPI cuDevicePrimaryCtxReset(CUdevice dev); + CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); + + CUresult CUDAAPI cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyDtoD_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount); + CUresult 
CUDAAPI cuMemcpyDtoA_v2(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoD_v2(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyHtoA_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoH_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAtoA_v2(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); + CUresult CUDAAPI cuMemcpyHtoAAsync_v2(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyAtoHAsync_v2(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2D_v2(const CUDA_MEMCPY2D *pCopy); + CUresult CUDAAPI cuMemcpy2DUnaligned_v2(const CUDA_MEMCPY2D *pCopy); + CUresult CUDAAPI cuMemcpy3D_v2(const CUDA_MEMCPY3D *pCopy); + CUresult CUDAAPI cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyDtoDAsync_v2(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy2DAsync_v2(const CUDA_MEMCPY2D *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DAsync_v2(const CUDA_MEMCPY3D *pCopy, CUstream hStream); + CUresult CUDAAPI cuMemsetD8_v2(CUdeviceptr dstDevice, unsigned char uc, size_t N); + CUresult CUDAAPI cuMemsetD16_v2(CUdeviceptr dstDevice, unsigned short us, size_t N); + CUresult CUDAAPI cuMemsetD32_v2(CUdeviceptr dstDevice, unsigned int ui, size_t N); + CUresult CUDAAPI cuMemsetD2D8_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); + CUresult CUDAAPI cuMemsetD2D16_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); + CUresult CUDAAPI cuMemsetD2D32_v2(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); + CUresult CUDAAPI cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount); + CUresult CUDAAPI cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount); + CUresult CUDAAPI cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); + CUresult CUDAAPI cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy); + CUresult CUDAAPI cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream); + + CUresult CUDAAPI cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); + CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, 
CUstream hStream); + + CUresult CUDAAPI cuStreamGetPriority(CUstream hStream, int *priority); + CUresult CUDAAPI cuStreamGetFlags(CUstream hStream, unsigned int *flags); + CUresult CUDAAPI cuStreamGetCtx(CUstream hStream, CUcontext *pctx); + CUresult CUDAAPI cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags); + CUresult CUDAAPI cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); + CUresult CUDAAPI cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags); + CUresult CUDAAPI cuStreamQuery(CUstream hStream); + CUresult CUDAAPI cuStreamSynchronize(CUstream hStream); + CUresult CUDAAPI cuEventRecord(CUevent hEvent, CUstream hStream); + CUresult CUDAAPI cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags); + CUresult CUDAAPI cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); + CUresult CUDAAPI cuLaunchKernelEx(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra); + CUresult CUDAAPI cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData); + CUresult CUDAAPI cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + CUresult CUDAAPI cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream); + CUresult CUDAAPI cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); + CUresult CUDAAPI cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags); + CUresult CUDAAPI cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); + CUresult CUDAAPI cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream); + CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); + CUresult CUDAAPI cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + CUresult CUDAAPI cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream); + CUresult CUDAAPI cuStreamBeginCapture(CUstream hStream); + 
CUresult CUDAAPI cuStreamBeginCapture_ptsz(CUstream hStream); + CUresult CUDAAPI cuStreamBeginCapture_v2(CUstream hStream, CUstreamCaptureMode mode); + CUresult CUDAAPI cuStreamEndCapture(CUstream hStream, CUgraph *phGraph); + CUresult CUDAAPI cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus); + CUresult CUDAAPI cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); + CUresult CUDAAPI cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); + + CUresult CUDAAPI cuGraphUpload(CUgraphExec hGraph, CUstream hStream); + CUresult CUDAAPI cuGraphLaunch(CUgraphExec hGraph, CUstream hStream); + CUresult CUDAAPI cuStreamCopyAttributes(CUstream dstStream, CUstream srcStream); + CUresult CUDAAPI cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value); + CUresult CUDAAPI cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *param); + + CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); + CUresult CUDAAPI cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); + CUresult CUDAAPI cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream); + + CUresult CUDAAPI cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream); + CUresult CUDAAPI cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream); + CUresult CUDAAPI cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); + + CUresult CUDAAPI cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); +#elif defined(__CUDA_API_PER_THREAD_DEFAULT_STREAM) +static inline CUresult cuGetProcAddress_ptsz(const char *symbol, void **funcPtr, int driverVersion, cuuint64_t flags) { + const int procAddressMask = (CU_GET_PROC_ADDRESS_LEGACY_STREAM| + CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM); + if ((flags & procAddressMask) == 0) { + flags |= CU_GET_PROC_ADDRESS_PER_THREAD_DEFAULT_STREAM; + } + return cuGetProcAddress(symbol, funcPtr, driverVersion, flags); +} +#define cuGetProcAddress cuGetProcAddress_ptsz +#endif + +#ifdef __cplusplus +} +#endif + +#if defined(__GNUC__) + #if defined(__CUDA_API_PUSH_VISIBILITY_DEFAULT) + #pragma GCC visibility pop + #endif +#endif + +#undef __CUDA_DEPRECATED + +#endif /* __cuda_cuda_h__ */ diff --git a/ext/cudart/include/cudaD3D10.h b/ext/cudart/include/cudaD3D10.h new file mode 100644 index 0000000000000000000000000000000000000000..9342cd7832a167958d4591c5afa346f43fa65d2d --- /dev/null +++ b/ext/cudart/include/cudaD3D10.h @@ -0,0 +1,805 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef CUDAD3D10_H +#define CUDAD3D10_H + +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif + +#ifdef CUDA_FORCE_API_VERSION +#error "CUDA_FORCE_API_VERSION is no longer supported." +#endif + +#define cuD3D10CtxCreate cuD3D10CtxCreate_v2 +#define cuD3D10ResourceGetSurfaceDimensions cuD3D10ResourceGetSurfaceDimensions_v2 +#define cuD3D10ResourceGetMappedPointer cuD3D10ResourceGetMappedPointer_v2 +#define cuD3D10ResourceGetMappedSize cuD3D10ResourceGetMappedSize_v2 +#define cuD3D10ResourceGetMappedPitch cuD3D10ResourceGetMappedPitch_v2 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \defgroup CUDA_D3D10 Direct3D 10 Interoperability + * \ingroup CUDA_DRIVER + * + * ___MANBRIEF___ Direct3D 10 interoperability functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the Direct3D 10 interoperability functions of the + * low-level CUDA driver application programming interface. Note that mapping + * of Direct3D 10 resources is performed with the graphics API agnostic, resource + * mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability". 
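+ *
+ * A minimal device-lookup sketch, assuming an existing ID3D10Device pointer
+ * pD3D10Device (error checking omitted):
+ * \code
+    CUdevice cudaDevices[8];
+    unsigned int cudaDeviceCount = 0;
+    cuD3D10GetDevices(&cudaDeviceCount, cudaDevices, 8, pD3D10Device,
+                      CU_D3D10_DEVICE_LIST_ALL);
+ * \endcode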
+ * + * @{ + */ + +/** + * CUDA devices corresponding to a D3D10 device + */ +typedef enum CUd3d10DeviceList_enum { + CU_D3D10_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by a D3D10 device */ + CU_D3D10_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by a D3D10 device in its currently rendering frame */ + CU_D3D10_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by a D3D10 device in the next frame */ +} CUd3d10DeviceList; + +/** + * \brief Gets the CUDA device corresponding to a display adapter. + * + * Returns in \p *pCudaDevice the CUDA-compatible device corresponding to the + * adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters. + * + * If no device on \p pAdapter is CUDA-compatible then the call will fail. + * + * \param pCudaDevice - Returned CUDA device corresponding to \p pAdapter + * \param pAdapter - Adapter to query for CUDA device + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuD3D10GetDevices, + * ::cudaD3D10GetDevice + */ +CUresult CUDAAPI cuD3D10GetDevice(CUdevice *pCudaDevice, IDXGIAdapter *pAdapter); + +/** + * \brief Gets the CUDA devices corresponding to a Direct3D 10 device + * + * Returns in \p *pCudaDeviceCount the number of CUDA-compatible device corresponding + * to the Direct3D 10 device \p pD3D10Device. + * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices + * corresponding to the Direct3D 10 device \p pD3D10Device. + * + * If any of the GPUs being used to render \p pDevice are not CUDA capable then the + * call will return ::CUDA_ERROR_NO_DEVICE. + * + * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D10Device + * \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D10Device + * \param cudaDeviceCount - The size of the output device array \p pCudaDevices + * \param pD3D10Device - Direct3D 10 device to query for CUDA devices + * \param deviceList - The set of devices to return. This set may be + * ::CU_D3D10_DEVICE_LIST_ALL for all devices, + * ::CU_D3D10_DEVICE_LIST_CURRENT_FRAME for the devices used to + * render the current frame (in SLI), or + * ::CU_D3D10_DEVICE_LIST_NEXT_FRAME for the devices used to + * render the next frame (in SLI). + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NO_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuD3D10GetDevice, + * ::cudaD3D10GetDevices + */ +CUresult CUDAAPI cuD3D10GetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, ID3D10Device *pD3D10Device, CUd3d10DeviceList deviceList); + +/** + * \brief Register a Direct3D 10 resource for access by CUDA + * + * Registers the Direct3D 10 resource \p pD3DResource for access by CUDA and + * returns a CUDA handle to \p pD3Dresource in \p pCudaResource. + * The handle returned in \p pCudaResource may be used to map and unmap this + * resource until it is unregistered. + * On success this call will increase the internal reference count on + * \p pD3DResource. This reference count will be decremented when this + * resource is unregistered through ::cuGraphicsUnregisterResource(). 
+ * + * This call is potentially high-overhead and should not be called every frame + * in interactive applications. + * + * The type of \p pD3DResource must be one of the following. + * - ::ID3D10Buffer: may be accessed through a device pointer. + * - ::ID3D10Texture1D: individual subresources of the texture may be accessed via arrays + * - ::ID3D10Texture2D: individual subresources of the texture may be accessed via arrays + * - ::ID3D10Texture3D: individual subresources of the texture may be accessed via arrays + * + * The \p Flags argument may be used to specify additional parameters at register + * time. The valid values for this parameter are + * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this + * resource will be used. + * - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will + * bind this resource to a surface reference. + * - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform + * texture gather operations on this resource. + * + * Not all Direct3D resources of the above types may be used for + * interoperability with CUDA. The following are some limitations. + * - The primary rendertarget may not be registered with CUDA. + * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16, + * or 32-bit integer or floating-point data cannot be shared. + * - Surfaces of depth or stencil formats cannot be shared. + * + * A complete list of supported DXGI formats is as follows. For compactness the + * notation A_{B,C,D} represents A_B, A_C, and A_D. + * - DXGI_FORMAT_A8_UNORM + * - DXGI_FORMAT_B8G8R8A8_UNORM + * - DXGI_FORMAT_B8G8R8X8_UNORM + * - DXGI_FORMAT_R16_FLOAT + * - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R32_FLOAT + * - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT} + * - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT} + * - DXGI_FORMAT_R32_{SINT,UINT} + * - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB} + * - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM} + * + * If \p pD3DResource is of incorrect type or is already registered then + * ::CUDA_ERROR_INVALID_HANDLE is returned. + * If \p pD3DResource cannot be registered then ::CUDA_ERROR_UNKNOWN is returned. + * If \p Flags is not one of the above specified value then ::CUDA_ERROR_INVALID_VALUE + * is returned. 
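+ *
+ * A minimal usage sketch, assuming an ID3D10Texture2D pTexture created through
+ * the Direct3D 10 API (error checking omitted):
+ * \code
+    CUgraphicsResource cuResource = NULL;
+    CUarray cuArray = NULL;
+
+    cuGraphicsD3D10RegisterResource(&cuResource, (ID3D10Resource *)pTexture,
+                                    CU_GRAPHICS_REGISTER_FLAGS_NONE);
+    cuGraphicsMapResources(1, &cuResource, 0);
+    cuGraphicsSubResourceGetMappedArray(&cuArray, cuResource, 0, 0);
+    /* ... access cuArray from CUDA while mapped ... */
+    cuGraphicsUnmapResources(1, &cuResource, 0);
+    cuGraphicsUnregisterResource(cuResource);
+ * \endcode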
+ * + * \param pCudaResource - Returned graphics resource handle + * \param pD3DResource - Direct3D resource to register + * \param Flags - Parameters for resource registration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuGraphicsUnregisterResource, + * ::cuGraphicsMapResources, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cuGraphicsResourceGetMappedPointer, + * ::cudaGraphicsD3D10RegisterResource + */ +CUresult CUDAAPI cuGraphicsD3D10RegisterResource(CUgraphicsResource *pCudaResource, ID3D10Resource *pD3DResource, unsigned int Flags); + +/** + * \defgroup CUDA_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED] + * + * ___MANBRIEF___ deprecated Direct3D 10 interoperability functions of the + * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes deprecated Direct3D 10 interoperability functionality. + * @{ + */ + +/** Flags to register a resource */ +typedef enum CUD3D10register_flags_enum { + CU_D3D10_REGISTER_FLAGS_NONE = 0x00, + CU_D3D10_REGISTER_FLAGS_ARRAY = 0x01, +} CUD3D10register_flags; + +/** Flags to map or unmap a resource */ +typedef enum CUD3D10map_flags_enum { + CU_D3D10_MAPRESOURCE_FLAGS_NONE = 0x00, + CU_D3D10_MAPRESOURCE_FLAGS_READONLY = 0x01, + CU_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD = 0x02, +} CUD3D10map_flags; + + +/** + * \brief Create a CUDA context for interoperability with Direct3D 10 + * + * \deprecated This function is deprecated as of CUDA 5.0. + * + * This function is deprecated and should no longer be used. It is + * no longer necessary to associate a CUDA context with a D3D10 + * device in order to achieve maximum interoperability performance. + * + * \param pCtx - Returned newly created CUDA context + * \param pCudaDevice - Returned pointer to the device on which the context was created + * \param Flags - Context creation flags (see ::cuCtxCreate() for details) + * \param pD3DDevice - Direct3D device to create interoperability context with + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuD3D10GetDevice, + * ::cuGraphicsD3D10RegisterResource + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice); + +/** + * \brief Create a CUDA context for interoperability with Direct3D 10 + * + * \deprecated This function is deprecated as of CUDA 5.0. + * + * This function is deprecated and should no longer be used. It is + * no longer necessary to associate a CUDA context with a D3D10 + * device in order to achieve maximum interoperability performance. + * + * \param pCtx - Returned newly created CUDA context + * \param flags - Context creation flags (see ::cuCtxCreate() for details) + * \param pD3DDevice - Direct3D device to create interoperability context with + * \param cudaDevice - The CUDA device on which to create the context. This device + * must be among the devices returned when querying + * ::CU_D3D10_DEVICES_ALL from ::cuD3D10GetDevices. 
+ * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuD3D10GetDevices, + * ::cuGraphicsD3D10RegisterResource + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10CtxCreateOnDevice(CUcontext *pCtx, unsigned int flags, ID3D10Device *pD3DDevice, CUdevice cudaDevice); + +/** + * \brief Get the Direct3D 10 device against which the current CUDA context was + * created + * + * \deprecated This function is deprecated as of CUDA 5.0. + * + * This function is deprecated and should no longer be used. It is + * no longer necessary to associate a CUDA context with a D3D10 + * device in order to achieve maximum interoperability performance. + * + * \param ppD3DDevice - Returned Direct3D device corresponding to CUDA context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa + * ::cuD3D10GetDevice + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10GetDirect3DDevice(ID3D10Device **ppD3DDevice); + +/** + * \brief Register a Direct3D resource for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Registers the Direct3D resource \p pResource for access by CUDA. + * + * If this call is successful, then the application will be able to map and + * unmap this resource until it is unregistered through + * ::cuD3D10UnregisterResource(). Also on success, this call will increase the + * internal reference count on \p pResource. This reference count will be + * decremented when this resource is unregistered through + * ::cuD3D10UnregisterResource(). + * + * This call is potentially high-overhead and should not be called every frame + * in interactive applications. + * + * The type of \p pResource must be one of the following. + * + * - ::ID3D10Buffer: Cannot be used with \p Flags set to + * ::CU_D3D10_REGISTER_FLAGS_ARRAY. + * - ::ID3D10Texture1D: No restrictions. + * - ::ID3D10Texture2D: No restrictions. + * - ::ID3D10Texture3D: No restrictions. + * + * The \p Flags argument specifies the mechanism through which CUDA will + * access the Direct3D resource. The following values are allowed. + * + * - ::CU_D3D10_REGISTER_FLAGS_NONE: Specifies that CUDA will access this + * resource through a ::CUdeviceptr. The pointer, size, and (for textures), + * pitch for each subresource of this allocation may be queried through + * ::cuD3D10ResourceGetMappedPointer(), ::cuD3D10ResourceGetMappedSize(), + * and ::cuD3D10ResourceGetMappedPitch() respectively. This option is valid + * for all resource types. + * - ::CU_D3D10_REGISTER_FLAGS_ARRAY: Specifies that CUDA will access this + * resource through a ::CUarray queried on a sub-resource basis through + * ::cuD3D10ResourceGetMappedArray(). This option is only valid for + * resources of type ::ID3D10Texture1D, ::ID3D10Texture2D, and + * ::ID3D10Texture3D. + * + * Not all Direct3D resources of the above types may be used for + * interoperability with CUDA. The following are some limitations. + * + * - The primary rendertarget may not be registered with CUDA. + * - Resources allocated as shared may not be registered with CUDA. + * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16, + * or 32-bit integer or floating-point data cannot be shared. + * - Surfaces of depth or stencil formats cannot be shared. 
+ * + * If Direct3D interoperability is not initialized on this context then + * ::CUDA_ERROR_INVALID_CONTEXT is returned. If \p pResource is of incorrect + * type or is already registered, then ::CUDA_ERROR_INVALID_HANDLE is + * returned. If \p pResource cannot be registered, then ::CUDA_ERROR_UNKNOWN + * is returned. + * + * \param pResource - Resource to register + * \param Flags - Parameters for resource registration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuGraphicsD3D10RegisterResource + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10RegisterResource(ID3D10Resource *pResource, unsigned int Flags); + +/** + * \brief Unregister a Direct3D resource + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Unregisters the Direct3D resource \p pResource so it is not accessible by + * CUDA unless registered again. + * + * If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is + * returned. + * + * \param pResource - Resources to unregister + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuGraphicsUnregisterResource + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10UnregisterResource(ID3D10Resource *pResource); + +/** + * \brief Map Direct3D resources for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Maps the \p count Direct3D resources in \p ppResources for access by CUDA. + * + * The resources in \p ppResources may be accessed in CUDA kernels until they + * are unmapped. Direct3D should not access any resources while they are mapped + * by CUDA. If an application does so, the results are undefined. + * + * This function provides the synchronization guarantee that any Direct3D calls + * issued before ::cuD3D10MapResources() will complete before any CUDA kernels + * issued after ::cuD3D10MapResources() begin. + * + * If any of \p ppResources have not been registered for use with CUDA or if + * \p ppResources contains any duplicate entries, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResources are + * presently mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is + * returned. + * + * \param count - Number of resources to map for CUDA + * \param ppResources - Resources to map for CUDA + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuGraphicsMapResources + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10MapResources(unsigned int count, ID3D10Resource **ppResources); + +/** + * \brief Unmap Direct3D resources + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Unmaps the \p count Direct3D resources in \p ppResources. + * + * This function provides the synchronization guarantee that any CUDA kernels + * issued before ::cuD3D10UnmapResources() will complete before any Direct3D + * calls issued after ::cuD3D10UnmapResources() begin. 
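For contrast with the cuGraphics* path earlier in this header, a sketch of the deprecated pre-CUDA-3.0 flow documented over the preceding comments (register, map, query pointer and size, unmap, unregister), assuming buf is an existing ID3D10Buffer (hypothetical) and that the deprecation warnings are acceptable or CUDA_ENABLE_DEPRECATED is defined.

#include <d3d10_1.h>
#include <cuda.h>
#include <cudaD3D10.h>

// Deprecated D3D10 path: access an ID3D10Buffer through a raw CUdeviceptr.
void legacyBufferAccess(ID3D10Buffer* buf /* hypothetical */)
{
    if (cuD3D10RegisterResource(buf, CU_D3D10_REGISTER_FLAGS_NONE) != CUDA_SUCCESS)
        return;

    ID3D10Resource* resources[] = { buf };
    if (cuD3D10MapResources(1, resources) == CUDA_SUCCESS) {
        CUdeviceptr ptr = 0;
        size_t size = 0;
        cuD3D10ResourceGetMappedPointer(&ptr, buf, /*SubResource=*/0); // buffers use subresource 0
        cuD3D10ResourceGetMappedSize(&size, buf, /*SubResource=*/0);
        // ... launch kernels against 'ptr' / 'size' here ...
        cuD3D10UnmapResources(1, resources);
    }
    cuD3D10UnregisterResource(buf);
}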
+ * + * If any of \p ppResources have not been registered for use with CUDA or if + * \p ppResources contains any duplicate entries, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResources are not + * presently mapped for access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is + * returned. + * + * \param count - Number of resources to unmap for CUDA + * \param ppResources - Resources to unmap for CUDA + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuGraphicsUnmapResources + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10UnmapResources(unsigned int count, ID3D10Resource **ppResources); + +/** + * \brief Set usage flags for mapping a Direct3D resource + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Set flags for mapping the Direct3D resource \p pResource. + * + * Changes to flags will take effect the next time \p pResource is mapped. The + * \p Flags argument may be any of the following. + * + * - ::CU_D3D10_MAPRESOURCE_FLAGS_NONE: Specifies no hints about how this + * resource will be used. It is therefore assumed that this resource will be + * read from and written to by CUDA kernels. This is the default value. + * - ::CU_D3D10_MAPRESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which + * access this resource will not write to this resource. + * - ::CU_D3D10_MAPRESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels + * which access this resource will not read from this resource and will + * write over the entire contents of the resource, so none of the data + * previously stored in the resource will be preserved. + * + * If \p pResource has not been registered for use with CUDA, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is presently + * mapped for access by CUDA then ::CUDA_ERROR_ALREADY_MAPPED is returned. + * + * \param pResource - Registered resource to set flags for + * \param Flags - Parameters for resource mapping + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED + * \notefnerr + * + * \sa ::cuGraphicsResourceSetMapFlags + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceSetMapFlags(ID3D10Resource *pResource, unsigned int Flags); + +/** + * \brief Get an array through which to access a subresource of a Direct3D + * resource which has been mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pArray an array through which the subresource of the mapped + * Direct3D resource \p pResource, which corresponds to \p SubResource may be + * accessed. The value set in \p pArray may change every time that \p pResource + * is mapped. + * + * If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is + * returned. If \p pResource was not registered with usage flags + * ::CU_D3D10_REGISTER_FLAGS_ARRAY, then ::CUDA_ERROR_INVALID_HANDLE is + * returned. If \p pResource is not mapped, then ::CUDA_ERROR_NOT_MAPPED is + * returned. + * + * For usage requirements of the \p SubResource parameter, see + * ::cuD3D10ResourceGetMappedPointer(). 
+ * + * \param pArray - Returned array corresponding to subresource + * \param pResource - Mapped resource to access + * \param SubResource - Subresource of pResource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED + * \notefnerr + * + * \sa ::cuGraphicsSubResourceGetMappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedArray(CUarray *pArray, ID3D10Resource *pResource, unsigned int SubResource); + +/** + * \brief Get a pointer through which to access a subresource of a Direct3D + * resource which has been mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pDevPtr the base pointer of the subresource of the mapped + * Direct3D resource \p pResource, which corresponds to \p SubResource. The + * value set in \p pDevPtr may change every time that \p pResource is mapped. + * + * If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is + * returned. If \p pResource was not registered with usage flags + * ::CU_D3D10_REGISTER_FLAGS_NONE, then ::CUDA_ERROR_INVALID_HANDLE is + * returned. If \p pResource is not mapped, then ::CUDA_ERROR_NOT_MAPPED is + * returned. + * + * If \p pResource is of type ::ID3D10Buffer, then \p SubResource must be 0. + * If \p pResource is of any other type, then the value of \p SubResource must + * come from the subresource calculation in ::D3D10CalcSubResource(). + * + * \param pDevPtr - Returned pointer corresponding to subresource + * \param pResource - Mapped resource to access + * \param SubResource - Subresource of pResource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED + * \notefnerr + * + * \sa ::cuGraphicsResourceGetMappedPointer + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedPointer(CUdeviceptr *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource); + +/** + * \brief Get the size of a subresource of a Direct3D resource which has been + * mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pSize the size of the subresource of the mapped Direct3D + * resource \p pResource, which corresponds to \p SubResource. The value set + * in \p pSize may change every time that \p pResource is mapped. + * + * If \p pResource has not been registered for use with CUDA, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not registered + * with usage flags ::CU_D3D10_REGISTER_FLAGS_NONE, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped for + * access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * For usage requirements of the \p SubResource parameter, see + * ::cuD3D10ResourceGetMappedPointer(). 
+ * + * \param pSize - Returned size of subresource + * \param pResource - Mapped resource to access + * \param SubResource - Subresource of pResource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED + * \notefnerr + * + * \sa ::cuGraphicsResourceGetMappedPointer + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedSize(size_t *pSize, ID3D10Resource *pResource, unsigned int SubResource); + +/** + * \brief Get the pitch of a subresource of a Direct3D resource which has been + * mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of the + * subresource of the mapped Direct3D resource \p pResource, which corresponds + * to \p SubResource. The values set in \p pPitch and \p pPitchSlice may + * change every time that \p pResource is mapped. + * + * The pitch and Z-slice pitch values may be used to compute the location of a + * sample on a surface as follows. + * + * For a 2D surface, the byte offset of the sample at position \b x, \b y from + * the base pointer of the surface is: + * + * \b y * \b pitch + (<b>bytes per pixel</b>) * \b x + * + * For a 3D surface, the byte offset of the sample at position \b x, \b y, + * \b z from the base pointer of the surface is: + * + * \b z* \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x + * + * Both parameters \p pPitch and \p pPitchSlice are optional and may be set to + * NULL. + * + * If \p pResource is not of type ::IDirect3DBaseTexture10 or one of its + * sub-types or if \p pResource has not been registered for use with CUDA, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not registered + * with usage flags ::CU_D3D10_REGISTER_FLAGS_NONE, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped for + * access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * For usage requirements of the \p SubResource parameter, see + * ::cuD3D10ResourceGetMappedPointer(). + * + * \param pPitch - Returned pitch of subresource + * \param pPitchSlice - Returned Z-slice pitch of subresource + * \param pResource - Mapped resource to access + * \param SubResource - Subresource of pResource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED + * \notefnerr + * + * \sa ::cuGraphicsSubResourceGetMappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource); + +/** + * \brief Get the dimensions of a registered surface + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the + * subresource of the mapped Direct3D resource \p pResource, which corresponds + * to \p SubResource. + * + * Because anti-aliased surfaces may have multiple samples per pixel, it is + * possible that the dimensions of a resource will be an integer factor larger + * than the dimensions reported by the Direct3D runtime. + * + * The parameters \p pWidth, \p pHeight, and \p pDepth are optional. 
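The pitch arithmetic spelled out above maps directly onto the values returned by cuD3D10ResourceGetMappedPitch(); a small sketch, assuming the (deprecated-path) resource was registered with CU_D3D10_REGISTER_FLAGS_NONE and is currently mapped, with tex3d and the texel coordinates as hypothetical inputs.

#include <d3d10_1.h>
#include <cuda.h>
#include <cudaD3D10.h>

// Byte address of texel (x, y, z) in a mapped 3D texture, using the documented
// formula: z * slicePitch + y * pitch + bytesPerPixel * x.
CUdeviceptr texelAddress(ID3D10Texture3D* tex3d, unsigned int subResource,
                         size_t x, size_t y, size_t z, size_t bytesPerPixel)
{
    CUdeviceptr base = 0;
    size_t pitch = 0, slicePitch = 0;
    cuD3D10ResourceGetMappedPointer(&base, tex3d, subResource);
    cuD3D10ResourceGetMappedPitch(&pitch, &slicePitch, tex3d, subResource);
    return base + z * slicePitch + y * pitch + bytesPerPixel * x;
}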
For 2D + * surfaces, the value returned in \p *pDepth will be 0. + * + * If \p pResource is not of type ::IDirect3DBaseTexture10 or + * ::IDirect3DSurface10 or if \p pResource has not been registered for use + * with CUDA, then ::CUDA_ERROR_INVALID_HANDLE is returned. + * + * For usage requirements of the \p SubResource parameter, see + * ::cuD3D10ResourceGetMappedPointer(). + * + * \param pWidth - Returned width of surface + * \param pHeight - Returned height of surface + * \param pDepth - Returned depth of surface + * \param pResource - Registered resource to access + * \param SubResource - Subresource of pResource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuGraphicsSubResourceGetMappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D10ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, ID3D10Resource *pResource, unsigned int SubResource); + +/** @} */ /* END CUDA_D3D10_DEPRECATED */ +/** @} */ /* END CUDA_D3D10 */ + + +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef cuD3D10CtxCreate + #undef cuD3D10ResourceGetSurfaceDimensions + #undef cuD3D10ResourceGetMappedPointer + #undef cuD3D10ResourceGetMappedSize + #undef cuD3D10ResourceGetMappedPitch + + CUresult CUDAAPI cuD3D10CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice); + CUresult CUDAAPI cuD3D10ResourceGetMappedPitch(unsigned int *pPitch, unsigned int *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource); + CUresult CUDAAPI cuD3D10ResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource); + CUresult CUDAAPI cuD3D10ResourceGetMappedSize(unsigned int *pSize, ID3D10Resource *pResource, unsigned int SubResource); + CUresult CUDAAPI cuD3D10ResourceGetSurfaceDimensions(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, ID3D10Resource *pResource, unsigned int SubResource); +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#ifdef __cplusplus +}; +#endif + +#undef __CUDA_DEPRECATED + +#endif + diff --git a/ext/cudart/include/cudaD3D10Typedefs.h b/ext/cudart/include/cudaD3D10Typedefs.h new file mode 100644 index 0000000000000000000000000000000000000000..d84885dc51366b629166244a6ff398305ac3a47f --- /dev/null +++ b/ext/cudart/include/cudaD3D10Typedefs.h @@ -0,0 +1,119 @@ +/* + * Copyright 2020-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. 
IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef CUDAD3D10TYPEDEFS_H +#define CUDAD3D10TYPEDEFS_H + +// Dependent includes for cudaD3D10.h +#include <rpcsal.h> +#include <D3D10_1.h> + +#include <cudaD3D10.h> + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/* + * Macros for the latest version for each driver function in cudaD3D10.h + */ +#define PFN_cuD3D10GetDevice PFN_cuD3D10GetDevice_v2010 +#define PFN_cuD3D10GetDevices PFN_cuD3D10GetDevices_v3020 +#define PFN_cuGraphicsD3D10RegisterResource PFN_cuGraphicsD3D10RegisterResource_v3000 +#define PFN_cuD3D10CtxCreate PFN_cuD3D10CtxCreate_v3020 +#define PFN_cuD3D10CtxCreateOnDevice PFN_cuD3D10CtxCreateOnDevice_v3020 +#define PFN_cuD3D10GetDirect3DDevice PFN_cuD3D10GetDirect3DDevice_v3020 +#define PFN_cuD3D10RegisterResource PFN_cuD3D10RegisterResource_v2010 +#define PFN_cuD3D10UnregisterResource PFN_cuD3D10UnregisterResource_v2010 +#define PFN_cuD3D10MapResources PFN_cuD3D10MapResources_v2010 +#define PFN_cuD3D10UnmapResources PFN_cuD3D10UnmapResources_v2010 +#define PFN_cuD3D10ResourceSetMapFlags PFN_cuD3D10ResourceSetMapFlags_v2010 +#define PFN_cuD3D10ResourceGetMappedArray PFN_cuD3D10ResourceGetMappedArray_v2010 +#define PFN_cuD3D10ResourceGetMappedPointer PFN_cuD3D10ResourceGetMappedPointer_v3020 +#define PFN_cuD3D10ResourceGetMappedSize PFN_cuD3D10ResourceGetMappedSize_v3020 +#define PFN_cuD3D10ResourceGetMappedPitch PFN_cuD3D10ResourceGetMappedPitch_v3020 +#define PFN_cuD3D10ResourceGetSurfaceDimensions PFN_cuD3D10ResourceGetSurfaceDimensions_v3020 + + +/** + * Type definitions for functions defined in cudaD3D10.h + */ +typedef CUresult (CUDAAPI *PFN_cuD3D10GetDevice_v2010)(CUdevice_v1 *pCudaDevice, IDXGIAdapter *pAdapter); +typedef CUresult (CUDAAPI *PFN_cuD3D10GetDevices_v3020)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, ID3D10Device *pD3D10Device, CUd3d10DeviceList deviceList); +typedef CUresult (CUDAAPI *PFN_cuGraphicsD3D10RegisterResource_v3000)(CUgraphicsResource *pCudaResource, ID3D10Resource 
*pD3DResource, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuD3D10CtxCreate_v3020)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice); +typedef CUresult (CUDAAPI *PFN_cuD3D10CtxCreateOnDevice_v3020)(CUcontext *pCtx, unsigned int flags, ID3D10Device *pD3DDevice, CUdevice_v1 cudaDevice); +typedef CUresult (CUDAAPI *PFN_cuD3D10GetDirect3DDevice_v3020)(ID3D10Device **ppD3DDevice); +typedef CUresult (CUDAAPI *PFN_cuD3D10RegisterResource_v2010)(ID3D10Resource *pResource, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuD3D10UnregisterResource_v2010)(ID3D10Resource *pResource); +typedef CUresult (CUDAAPI *PFN_cuD3D10MapResources_v2010)(unsigned int count, ID3D10Resource **ppResources); +typedef CUresult (CUDAAPI *PFN_cuD3D10UnmapResources_v2010)(unsigned int count, ID3D10Resource **ppResources); +typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceSetMapFlags_v2010)(ID3D10Resource *pResource, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedArray_v2010)(CUarray *pArray, ID3D10Resource *pResource, unsigned int SubResource); +typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource); +typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedSize_v3020)(size_t *pSize, ID3D10Resource *pResource, unsigned int SubResource); +typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPitch_v3020)(size_t *pPitch, size_t *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource); +typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetSurfaceDimensions_v3020)(size_t *pWidth, size_t *pHeight, size_t *pDepth, ID3D10Resource *pResource, unsigned int SubResource); + +/* + * Type definitions for older versioned functions in cudaD3D10.h + */ +#if defined(__CUDA_API_VERSION_INTERNAL) + typedef CUresult (CUDAAPI *PFN_cuD3D10CtxCreate_v2010)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D10Device *pD3DDevice); + typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPitch_v2010)(unsigned int *pPitch, unsigned int *pPitchSlice, ID3D10Resource *pResource, unsigned int SubResource); + typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedPointer_v2010)(CUdeviceptr_v1 *pDevPtr, ID3D10Resource *pResource, unsigned int SubResource); + typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetMappedSize_v2010)(unsigned int *pSize, ID3D10Resource *pResource, unsigned int SubResource); + typedef CUresult (CUDAAPI *PFN_cuD3D10ResourceGetSurfaceDimensions_v2010)(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, ID3D10Resource *pResource, unsigned int SubResource); +#endif + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // file guard diff --git a/ext/cudart/include/cudaD3D11.h b/ext/cudart/include/cudaD3D11.h new file mode 100644 index 0000000000000000000000000000000000000000..302d297b0985540c64f09a575b982422170b8634 --- /dev/null +++ b/ext/cudart/include/cudaD3D11.h @@ -0,0 +1,357 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
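These PFN_* aliases let an application resolve the interop entry points at runtime instead of linking them directly; a sketch of that pattern, under the assumption that the installed driver exposes cuGetProcAddress (introduced in CUDA 11.3, the same release that added these typedef headers).

#include <cuda.h>
#include <cudaD3D10Typedefs.h>   // pulls in rpcsal.h, D3D10_1.h and cudaD3D10.h itself

// Resolve cuGraphicsD3D10RegisterResource dynamically through the driver.
PFN_cuGraphicsD3D10RegisterResource loadRegisterResourceFn()
{
    void* fn = nullptr;
    if (cuGetProcAddress("cuGraphicsD3D10RegisterResource", &fn,
                         /*cudaVersion=*/11030, CU_GET_PROC_ADDRESS_DEFAULT) != CUDA_SUCCESS)
        return nullptr;
    return reinterpret_cast<PFN_cuGraphicsD3D10RegisterResource>(fn);
}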
+ * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef CUDAD3D11_H +#define CUDAD3D11_H + +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif + +#ifdef CUDA_FORCE_API_VERSION +#error "CUDA_FORCE_API_VERSION is no longer supported." +#endif + +#define cuD3D11CtxCreate cuD3D11CtxCreate_v2 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \defgroup CUDA_D3D11 Direct3D 11 Interoperability + * \ingroup CUDA_DRIVER + * + * ___MANBRIEF___ Direct3D 11 interoperability functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the Direct3D 11 interoperability functions of the + * low-level CUDA driver application programming interface. Note that mapping + * of Direct3D 11 resources is performed with the graphics API agnostic, resource + * mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability". 
+ * + * @{ + */ + +/** + * CUDA devices corresponding to a D3D11 device + */ +typedef enum CUd3d11DeviceList_enum { + CU_D3D11_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by a D3D11 device */ + CU_D3D11_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by a D3D11 device in its currently rendering frame */ + CU_D3D11_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by a D3D11 device in the next frame */ +} CUd3d11DeviceList; + +/** + * \brief Gets the CUDA device corresponding to a display adapter. + * + * Returns in \p *pCudaDevice the CUDA-compatible device corresponding to the + * adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters. + * + * If no device on \p pAdapter is CUDA-compatible the call will return + * ::CUDA_ERROR_NO_DEVICE. + * + * \param pCudaDevice - Returned CUDA device corresponding to \p pAdapter + * \param pAdapter - Adapter to query for CUDA device + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NO_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuD3D11GetDevices, + * ::cudaD3D11GetDevice + */ +CUresult CUDAAPI cuD3D11GetDevice(CUdevice *pCudaDevice, IDXGIAdapter *pAdapter); + +/** + * \brief Gets the CUDA devices corresponding to a Direct3D 11 device + * + * Returns in \p *pCudaDeviceCount the number of CUDA-compatible device corresponding + * to the Direct3D 11 device \p pD3D11Device. + * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices + * corresponding to the Direct3D 11 device \p pD3D11Device. + * + * If any of the GPUs being used to render \p pDevice are not CUDA capable then the + * call will return ::CUDA_ERROR_NO_DEVICE. + * + * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D11Device + * \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D11Device + * \param cudaDeviceCount - The size of the output device array \p pCudaDevices + * \param pD3D11Device - Direct3D 11 device to query for CUDA devices + * \param deviceList - The set of devices to return. This set may be + * ::CU_D3D11_DEVICE_LIST_ALL for all devices, + * ::CU_D3D11_DEVICE_LIST_CURRENT_FRAME for the devices used to + * render the current frame (in SLI), or + * ::CU_D3D11_DEVICE_LIST_NEXT_FRAME for the devices used to + * render the next frame (in SLI). + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NO_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuD3D11GetDevice, + * ::cudaD3D11GetDevices + */ +CUresult CUDAAPI cuD3D11GetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, ID3D11Device *pD3D11Device, CUd3d11DeviceList deviceList); + +/** + * \brief Register a Direct3D 11 resource for access by CUDA + * + * Registers the Direct3D 11 resource \p pD3DResource for access by CUDA and + * returns a CUDA handle to \p pD3Dresource in \p pCudaResource. + * The handle returned in \p pCudaResource may be used to map and unmap this + * resource until it is unregistered. + * On success this call will increase the internal reference count on + * \p pD3DResource. This reference count will be decremented when this + * resource is unregistered through ::cuGraphicsUnregisterResource(). 
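A short sketch of the cuD3D11GetDevices enumeration described above, assuming cuInit(0) has already succeeded and device is the application's existing ID3D11Device (hypothetical name); the total CUDA device count is used as a safe upper bound for the output array.

#include <d3d11.h>
#include <cuda.h>
#include <cudaD3D11.h>
#include <vector>

// Return every CUDA device that backs the given D3D11 device (all SLI GPUs).
std::vector<CUdevice> cudaDevicesFor(ID3D11Device* device)
{
    int total = 0;
    cuDeviceGetCount(&total);                      // upper bound on the result size
    std::vector<CUdevice> devices(total > 0 ? total : 1);

    unsigned int matched = 0;
    if (cuD3D11GetDevices(&matched, devices.data(),
                          static_cast<unsigned int>(devices.size()),
                          device, CU_D3D11_DEVICE_LIST_ALL) != CUDA_SUCCESS)
        return {};

    devices.resize(matched);
    return devices;
}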
+ * + * This call is potentially high-overhead and should not be called every frame + * in interactive applications. + * + * The type of \p pD3DResource must be one of the following. + * - ::ID3D11Buffer: may be accessed through a device pointer. + * - ::ID3D11Texture1D: individual subresources of the texture may be accessed via arrays + * - ::ID3D11Texture2D: individual subresources of the texture may be accessed via arrays + * - ::ID3D11Texture3D: individual subresources of the texture may be accessed via arrays + * + * The \p Flags argument may be used to specify additional parameters at register + * time. The valid values for this parameter are + * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this + * resource will be used. + * - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will + * bind this resource to a surface reference. + * - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform + * texture gather operations on this resource. + * + * Not all Direct3D resources of the above types may be used for + * interoperability with CUDA. The following are some limitations. + * - The primary rendertarget may not be registered with CUDA. + * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16, + * or 32-bit integer or floating-point data cannot be shared. + * - Surfaces of depth or stencil formats cannot be shared. + * + * A complete list of supported DXGI formats is as follows. For compactness the + * notation A_{B,C,D} represents A_B, A_C, and A_D. + * - DXGI_FORMAT_A8_UNORM + * - DXGI_FORMAT_B8G8R8A8_UNORM + * - DXGI_FORMAT_B8G8R8X8_UNORM + * - DXGI_FORMAT_R16_FLOAT + * - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R32_FLOAT + * - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT} + * - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT} + * - DXGI_FORMAT_R32_{SINT,UINT} + * - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB} + * - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM} + * + * If \p pD3DResource is of incorrect type or is already registered then + * ::CUDA_ERROR_INVALID_HANDLE is returned. + * If \p pD3DResource cannot be registered then ::CUDA_ERROR_UNKNOWN is returned. + * If \p Flags is not one of the above specified value then ::CUDA_ERROR_INVALID_VALUE + * is returned. 
+ * + * \param pCudaResource - Returned graphics resource handle + * \param pD3DResource - Direct3D resource to register + * \param Flags - Parameters for resource registration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuGraphicsUnregisterResource, + * ::cuGraphicsMapResources, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cuGraphicsResourceGetMappedPointer, + * ::cudaGraphicsD3D11RegisterResource + */ +CUresult CUDAAPI cuGraphicsD3D11RegisterResource(CUgraphicsResource *pCudaResource, ID3D11Resource *pD3DResource, unsigned int Flags); + +/** + * \defgroup CUDA_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED] + * + * ___MANBRIEF___ deprecated Direct3D 11 interoperability functions of the + * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes deprecated Direct3D 11 interoperability functionality. + * @{ + */ + +/** + * \brief Create a CUDA context for interoperability with Direct3D 11 + * + * \deprecated This function is deprecated as of CUDA 5.0. + * + * This function is deprecated and should no longer be used. It is + * no longer necessary to associate a CUDA context with a D3D11 + * device in order to achieve maximum interoperability performance. + * + * \param pCtx - Returned newly created CUDA context + * \param pCudaDevice - Returned pointer to the device on which the context was created + * \param Flags - Context creation flags (see ::cuCtxCreate() for details) + * \param pD3DDevice - Direct3D device to create interoperability context with + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuD3D11GetDevice, + * ::cuGraphicsD3D11RegisterResource + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D11CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice); + +/** + * \brief Create a CUDA context for interoperability with Direct3D 11 + * + * \deprecated This function is deprecated as of CUDA 5.0. + * + * This function is deprecated and should no longer be used. It is + * no longer necessary to associate a CUDA context with a D3D11 + * device in order to achieve maximum interoperability performance. + * + * \param pCtx - Returned newly created CUDA context + * \param flags - Context creation flags (see ::cuCtxCreate() for details) + * \param pD3DDevice - Direct3D device to create interoperability context with + * \param cudaDevice - The CUDA device on which to create the context. This device + * must be among the devices returned when querying + * ::CU_D3D11_DEVICES_ALL from ::cuD3D11GetDevices. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuD3D11GetDevices, + * ::cuGraphicsD3D11RegisterResource + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D11CtxCreateOnDevice(CUcontext *pCtx, unsigned int flags, ID3D11Device *pD3DDevice, CUdevice cudaDevice); + +/** + * \brief Get the Direct3D 11 device against which the current CUDA context was + * created + * + * \deprecated This function is deprecated as of CUDA 5.0. 
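The cuGraphicsD3D11RegisterResource call above uses the same mapping flow shown for D3D10 earlier in this diff; a sketch that shares an ID3D11Buffer with CUDA and reads it back as a raw device pointer, with buffer as a hypothetical existing resource and a CUDA context assumed current.

#include <d3d11.h>
#include <cuda.h>
#include <cudaD3D11.h>

// Register a D3D11 buffer once, then map it to obtain a CUdeviceptr.
void shareBufferWithCuda(ID3D11Buffer* buffer /* hypothetical */)
{
    CUgraphicsResource res = nullptr;
    if (cuGraphicsD3D11RegisterResource(&res, buffer, CU_GRAPHICS_REGISTER_FLAGS_NONE) != CUDA_SUCCESS)
        return;

    if (cuGraphicsMapResources(1, &res, /*hStream=*/0) == CUDA_SUCCESS) {
        CUdeviceptr ptr = 0;
        size_t size = 0;
        cuGraphicsResourceGetMappedPointer(&ptr, &size, res);
        // ... 'ptr'/'size' are valid for CUDA access until the unmap below ...
        cuGraphicsUnmapResources(1, &res, /*hStream=*/0);
    }
    cuGraphicsUnregisterResource(res);
}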
+ * + * This function is deprecated and should no longer be used. It is + * no longer necessary to associate a CUDA context with a D3D11 + * device in order to achieve maximum interoperability performance. + * + * \param ppD3DDevice - Returned Direct3D device corresponding to CUDA context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * \notefnerr + * + * \sa + * ::cuD3D11GetDevice + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D11GetDirect3DDevice(ID3D11Device **ppD3DDevice); + +/** @} */ /* END CUDA_D3D11_DEPRECATED */ +/** @} */ /* END CUDA_D3D11 */ + + +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef cuD3D11CtxCreate + + CUresult CUDAAPI cuD3D11CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice); +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#ifdef __cplusplus +}; +#endif + +#undef __CUDA_DEPRECATED + +#endif + diff --git a/ext/cudart/include/cudaD3D11Typedefs.h b/ext/cudart/include/cudaD3D11Typedefs.h new file mode 100644 index 0000000000000000000000000000000000000000..f517c3d48391eee5e5c48d046876c6fc9fdd3a71 --- /dev/null +++ b/ext/cudart/include/cudaD3D11Typedefs.h @@ -0,0 +1,92 @@ +/* + * Copyright 2020-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. 
+ * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef CUDAD3D11TYPEDEFS_H +#define CUDAD3D11TYPEDEFS_H + +// Dependent includes for cudaD3D11.h +#include <rpcsal.h> +#include <D3D11_1.h> + +#include <cudaD3D11.h> + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/* + * Macros for the latest version for each driver function in cudaD3D11.h + */ +#define PFN_cuD3D11GetDevice PFN_cuD3D11GetDevice_v3000 +#define PFN_cuD3D11GetDevices PFN_cuD3D11GetDevices_v3020 +#define PFN_cuGraphicsD3D11RegisterResource PFN_cuGraphicsD3D11RegisterResource_v3000 +#define PFN_cuD3D11CtxCreate PFN_cuD3D11CtxCreate_v3020 +#define PFN_cuD3D11CtxCreateOnDevice PFN_cuD3D11CtxCreateOnDevice_v3020 +#define PFN_cuD3D11GetDirect3DDevice PFN_cuD3D11GetDirect3DDevice_v3020 + + +/** + * Type definitions for functions defined in cudaD3D11.h + */ +typedef CUresult (CUDAAPI *PFN_cuD3D11GetDevice_v3000)(CUdevice_v1 *pCudaDevice, IDXGIAdapter *pAdapter); +typedef CUresult (CUDAAPI *PFN_cuD3D11GetDevices_v3020)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, ID3D11Device *pD3D11Device, CUd3d11DeviceList deviceList); +typedef CUresult (CUDAAPI *PFN_cuGraphicsD3D11RegisterResource_v3000)(CUgraphicsResource *pCudaResource, ID3D11Resource *pD3DResource, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuD3D11CtxCreate_v3020)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice); +typedef CUresult (CUDAAPI *PFN_cuD3D11CtxCreateOnDevice_v3020)(CUcontext *pCtx, unsigned int flags, ID3D11Device *pD3DDevice, CUdevice_v1 cudaDevice); +typedef CUresult (CUDAAPI *PFN_cuD3D11GetDirect3DDevice_v3020)(ID3D11Device **ppD3DDevice); + +#if defined(__CUDA_API_VERSION_INTERNAL) + typedef CUresult (CUDAAPI *PFN_cuD3D11CtxCreate_v3000)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, ID3D11Device *pD3DDevice); +#endif + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // file guard diff --git a/ext/cudart/include/cudaD3D9.h b/ext/cudart/include/cudaD3D9.h new file mode 100644 index 0000000000000000000000000000000000000000..d615e35316740b83446462041df8645d3af22ca7 --- /dev/null +++ b/ext/cudart/include/cudaD3D9.h @@ -0,0 +1,886 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef CUDAD3D9_H +#define CUDAD3D9_H + +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif + +#ifdef CUDA_FORCE_API_VERSION +#error "CUDA_FORCE_API_VERSION is no longer supported." +#endif + +#define cuD3D9CtxCreate cuD3D9CtxCreate_v2 +#define cuD3D9ResourceGetSurfaceDimensions cuD3D9ResourceGetSurfaceDimensions_v2 +#define cuD3D9ResourceGetMappedPointer cuD3D9ResourceGetMappedPointer_v2 +#define cuD3D9ResourceGetMappedSize cuD3D9ResourceGetMappedSize_v2 +#define cuD3D9ResourceGetMappedPitch cuD3D9ResourceGetMappedPitch_v2 +#define cuD3D9MapVertexBuffer cuD3D9MapVertexBuffer_v2 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \file cudaD3D9.h + * \brief Header file for the Direct3D 9 interoperability functions of the + * low-level CUDA driver application programming interface. + */ + +/** + * \defgroup CUDA_D3D9 Direct3D 9 Interoperability + * \ingroup CUDA_DRIVER + * + * ___MANBRIEF___ Direct3D 9 interoperability functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the Direct3D 9 interoperability functions of the + * low-level CUDA driver application programming interface. Note that mapping + * of Direct3D 9 resources is performed with the graphics API agnostic, resource + * mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability". 
+ * + * @{ + */ + +/** + * CUDA devices corresponding to a D3D9 device + */ +typedef enum CUd3d9DeviceList_enum { + CU_D3D9_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by a D3D9 device */ + CU_D3D9_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by a D3D9 device in its currently rendering frame */ + CU_D3D9_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by a D3D9 device in the next frame */ +} CUd3d9DeviceList; + +/** + * \brief Gets the CUDA device corresponding to a display adapter. + * + * Returns in \p *pCudaDevice the CUDA-compatible device corresponding to the + * adapter name \p pszAdapterName obtained from ::EnumDisplayDevices() or + * ::IDirect3D9::GetAdapterIdentifier(). + * + * If no device on the adapter with name \p pszAdapterName is CUDA-compatible, + * then the call will fail. + * + * \param pCudaDevice - Returned CUDA device corresponding to pszAdapterName + * \param pszAdapterName - Adapter name to query for device + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuD3D9CtxCreate, + * ::cudaD3D9GetDevice + */ +CUresult CUDAAPI cuD3D9GetDevice(CUdevice *pCudaDevice, const char *pszAdapterName); + +/** + * \brief Gets the CUDA devices corresponding to a Direct3D 9 device + * + * Returns in \p *pCudaDeviceCount the number of CUDA-compatible device corresponding + * to the Direct3D 9 device \p pD3D9Device. + * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the CUDA-compatible devices + * corresponding to the Direct3D 9 device \p pD3D9Device. + * + * If any of the GPUs being used to render \p pDevice are not CUDA capable then the + * call will return ::CUDA_ERROR_NO_DEVICE. + * + * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D9Device + * \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D9Device + * \param cudaDeviceCount - The size of the output device array \p pCudaDevices + * \param pD3D9Device - Direct3D 9 device to query for CUDA devices + * \param deviceList - The set of devices to return. This set may be + * ::CU_D3D9_DEVICE_LIST_ALL for all devices, + * ::CU_D3D9_DEVICE_LIST_CURRENT_FRAME for the devices used to + * render the current frame (in SLI), or + * ::CU_D3D9_DEVICE_LIST_NEXT_FRAME for the devices used to + * render the next frame (in SLI). + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_NO_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_FOUND, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuD3D9CtxCreate, + * ::cudaD3D9GetDevices + */ +CUresult CUDAAPI cuD3D9GetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, IDirect3DDevice9 *pD3D9Device, CUd3d9DeviceList deviceList); + +/** + * \brief Create a CUDA context for interoperability with Direct3D 9 + * + * Creates a new CUDA context, enables interoperability for that context with + * the Direct3D device \p pD3DDevice, and associates the created CUDA context + * with the calling thread. + * The created ::CUcontext will be returned in \p *pCtx. + * Direct3D resources from this device may be registered and mapped through the + * lifetime of this CUDA context. 
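Unlike the DXGI-based D3D10/D3D11 variants, the cuD3D9GetDevice query above is keyed on an adapter name string; a sketch, assuming cuInit(0) has been called and d3d9 is an existing IDirect3D9 instance (hypothetical), using the default adapter.

#include <d3d9.h>
#include <cuda.h>
#include <cudaD3D9.h>

// Find the CUDA device that drives the default D3D9 adapter.
bool cudaDeviceForDefaultAdapter(IDirect3D9* d3d9, CUdevice* outDevice)
{
    D3DADAPTER_IDENTIFIER9 id = {};
    if (FAILED(d3d9->GetAdapterIdentifier(D3DADAPTER_DEFAULT, 0, &id)))
        return false;
    // id.DeviceName is the adapter name form expected by cuD3D9GetDevice.
    return cuD3D9GetDevice(outDevice, id.DeviceName) == CUDA_SUCCESS;
}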
+ * If \p pCudaDevice is non-NULL then the ::CUdevice on which this CUDA context was + * created will be returned in \p *pCudaDevice. + * + * On success, this call will increase the internal reference count on + * \p pD3DDevice. This reference count will be decremented upon destruction of + * this context through ::cuCtxDestroy(). + * This context will cease to function if \p pD3DDevice is destroyed or encounters + * an error. + * + * Note that this function is never required for correct functionality. Use of + * this function will result in accelerated interoperability only when the + * operating system is Windows Vista or Windows 7, and the device \p pD3DDdevice + * is not an IDirect3DDevice9Ex. In all other circumstances, this function is + * not necessary. + * + * \param pCtx - Returned newly created CUDA context + * \param pCudaDevice - Returned pointer to the device on which the context was created + * \param Flags - Context creation flags (see ::cuCtxCreate() for details) + * \param pD3DDevice - Direct3D device to create interoperability context with + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuD3D9GetDevice, + * ::cuGraphicsD3D9RegisterResource + */ +CUresult CUDAAPI cuD3D9CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice); + +/** + * \brief Create a CUDA context for interoperability with Direct3D 9 + * + * Creates a new CUDA context, enables interoperability for that context with + * the Direct3D device \p pD3DDevice, and associates the created CUDA context + * with the calling thread. + * The created ::CUcontext will be returned in \p *pCtx. + * Direct3D resources from this device may be registered and mapped through the + * lifetime of this CUDA context. + * + * On success, this call will increase the internal reference count on + * \p pD3DDevice. This reference count will be decremented upon destruction of + * this context through ::cuCtxDestroy(). + * This context will cease to function if \p pD3DDevice is destroyed or encounters + * an error. + * + * Note that this function is never required for correct functionality. Use of + * this function will result in accelerated interoperability only when the + * operating system is Windows Vista or Windows 7, and the device \p pD3DDdevice + * is not an IDirect3DDevice9Ex. In all other circumstances, this function is + * not necessary. + * + * \param pCtx - Returned newly created CUDA context + * \param flags - Context creation flags (see ::cuCtxCreate() for details) + * \param pD3DDevice - Direct3D device to create interoperability context with + * \param cudaDevice - The CUDA device on which to create the context. This device + * must be among the devices returned when querying + * ::CU_D3D9_DEVICES_ALL from ::cuD3D9GetDevices. 
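For the narrow case the note above describes (a non-Ex D3D9 device on an older OS), the CUDA context can be created directly from the Direct3D device; a sketch with device as a hypothetical existing IDirect3DDevice9.

#include <d3d9.h>
#include <cuda.h>
#include <cudaD3D9.h>

// Create a CUDA context bound to an existing (non-Ex) D3D9 device.
bool createInteropContext(IDirect3DDevice9* device, CUcontext* outCtx, CUdevice* outDevice)
{
    if (cuInit(0) != CUDA_SUCCESS)
        return false;
    // Flags = 0 selects the default (CU_CTX_SCHED_AUTO) scheduling policy.
    return cuD3D9CtxCreate(outCtx, outDevice, /*Flags=*/0, device) == CUDA_SUCCESS;
}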
+ * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuD3D9GetDevices, + * ::cuGraphicsD3D9RegisterResource + */ +CUresult CUDAAPI cuD3D9CtxCreateOnDevice(CUcontext *pCtx, unsigned int flags, IDirect3DDevice9 *pD3DDevice, CUdevice cudaDevice); + +/** + * \brief Get the Direct3D 9 device against which the current CUDA context was + * created + * + * Returns in \p *ppD3DDevice the Direct3D device against which this CUDA context + * was created in ::cuD3D9CtxCreate(). + * + * \param ppD3DDevice - Returned Direct3D device corresponding to CUDA context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT + * ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT + * \notefnerr + * + * \sa + * ::cuD3D9GetDevice, + * ::cudaD3D9GetDirect3DDevice + */ +CUresult CUDAAPI cuD3D9GetDirect3DDevice(IDirect3DDevice9 **ppD3DDevice); + +/** + * \brief Register a Direct3D 9 resource for access by CUDA + * + * Registers the Direct3D 9 resource \p pD3DResource for access by CUDA and + * returns a CUDA handle to \p pD3Dresource in \p pCudaResource. + * The handle returned in \p pCudaResource may be used to map and unmap this + * resource until it is unregistered. + * On success this call will increase the internal reference count on + * \p pD3DResource. This reference count will be decremented when this + * resource is unregistered through ::cuGraphicsUnregisterResource(). + * + * This call is potentially high-overhead and should not be called every frame + * in interactive applications. + * + * The type of \p pD3DResource must be one of the following. + * - ::IDirect3DVertexBuffer9: may be accessed through a device pointer + * - ::IDirect3DIndexBuffer9: may be accessed through a device pointer + * - ::IDirect3DSurface9: may be accessed through an array. + * Only stand-alone objects of type ::IDirect3DSurface9 + * may be explicitly shared. In particular, individual mipmap levels and faces + * of cube maps may not be registered directly. To access individual surfaces + * associated with a texture, one must register the base texture object. + * - ::IDirect3DBaseTexture9: individual surfaces on this texture may be accessed + * through an array. + * + * The \p Flags argument may be used to specify additional parameters at register + * time. The valid values for this parameter are + * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this + * resource will be used. + * - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will + * bind this resource to a surface reference. + * - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform + * texture gather operations on this resource. + * + * Not all Direct3D resources of the above types may be used for + * interoperability with CUDA. The following are some limitations. + * - The primary rendertarget may not be registered with CUDA. + * - Resources allocated as shared may not be registered with CUDA. + * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16, + * or 32-bit integer or floating-point data cannot be shared. + * - Surfaces of depth or stencil formats cannot be shared. 
+ * + * A complete list of supported formats is as follows: + * - D3DFMT_L8 + * - D3DFMT_L16 + * - D3DFMT_A8R8G8B8 + * - D3DFMT_X8R8G8B8 + * - D3DFMT_G16R16 + * - D3DFMT_A8B8G8R8 + * - D3DFMT_A8 + * - D3DFMT_A8L8 + * - D3DFMT_Q8W8V8U8 + * - D3DFMT_V16U16 + * - D3DFMT_A16B16G16R16F + * - D3DFMT_A16B16G16R16 + * - D3DFMT_R32F + * - D3DFMT_G16R16F + * - D3DFMT_A32B32G32R32F + * - D3DFMT_G32R32F + * - D3DFMT_R16F + * + * If Direct3D interoperability is not initialized for this context using + * ::cuD3D9CtxCreate then ::CUDA_ERROR_INVALID_CONTEXT is returned. + * If \p pD3DResource is of incorrect type or is already registered then + * ::CUDA_ERROR_INVALID_HANDLE is returned. + * If \p pD3DResource cannot be registered then ::CUDA_ERROR_UNKNOWN is returned. + * If \p Flags is not one of the above specified value then ::CUDA_ERROR_INVALID_VALUE + * is returned. + * + * \param pCudaResource - Returned graphics resource handle + * \param pD3DResource - Direct3D resource to register + * \param Flags - Parameters for resource registration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuD3D9CtxCreate, + * ::cuGraphicsUnregisterResource, + * ::cuGraphicsMapResources, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cuGraphicsResourceGetMappedPointer, + * ::cudaGraphicsD3D9RegisterResource + */ +CUresult CUDAAPI cuGraphicsD3D9RegisterResource(CUgraphicsResource *pCudaResource, IDirect3DResource9 *pD3DResource, unsigned int Flags); + +/** + * \defgroup CUDA_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED] + * + * ___MANBRIEF___ deprecated Direct3D 9 interoperability functions of the + * low-level CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes deprecated Direct3D 9 interoperability functionality. + * @{ + */ + +/** Flags to register a resource */ +typedef enum CUd3d9register_flags_enum { + CU_D3D9_REGISTER_FLAGS_NONE = 0x00, + CU_D3D9_REGISTER_FLAGS_ARRAY = 0x01, +} CUd3d9register_flags; + +/** Flags to map or unmap a resource */ +typedef enum CUd3d9map_flags_enum { + CU_D3D9_MAPRESOURCE_FLAGS_NONE = 0x00, + CU_D3D9_MAPRESOURCE_FLAGS_READONLY = 0x01, + CU_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD = 0x02, +} CUd3d9map_flags; + +/** + * \brief Register a Direct3D resource for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Registers the Direct3D resource \p pResource for access by CUDA. + * + * If this call is successful, then the application will be able to map and + * unmap this resource until it is unregistered through + * ::cuD3D9UnregisterResource(). Also on success, this call will increase the + * internal reference count on \p pResource. This reference count will be + * decremented when this resource is unregistered through + * ::cuD3D9UnregisterResource(). + * + * This call is potentially high-overhead and should not be called every frame + * in interactive applications. + * + * The type of \p pResource must be one of the following. + * + * - ::IDirect3DVertexBuffer9: Cannot be used with \p Flags set to + * ::CU_D3D9_REGISTER_FLAGS_ARRAY. + * - ::IDirect3DIndexBuffer9: Cannot be used with \p Flags set to + * ::CU_D3D9_REGISTER_FLAGS_ARRAY. + * - ::IDirect3DSurface9: Only stand-alone objects of type ::IDirect3DSurface9 + * may be explicitly shared. 
In particular, individual mipmap levels and + * faces of cube maps may not be registered directly. To access individual + * surfaces associated with a texture, one must register the base texture + * object. For restrictions on the \p Flags parameter, see type + * ::IDirect3DBaseTexture9. + * - ::IDirect3DBaseTexture9: When a texture is registered, all surfaces + * associated with the all mipmap levels of all faces of the texture will be + * accessible to CUDA. + * + * The \p Flags argument specifies the mechanism through which CUDA will access + * the Direct3D resource. The following values are allowed. + * + * - CU_D3D9_REGISTER_FLAGS_NONE: Specifies that CUDA will access this resource + * through a ::CUdeviceptr. The pointer, size, and (for textures), pitch for + * each subresource of this allocation may be queried through + * ::cuD3D9ResourceGetMappedPointer(), ::cuD3D9ResourceGetMappedSize(), and + * ::cuD3D9ResourceGetMappedPitch() respectively. This option is valid for + * all resource types. + * - ::CU_D3D9_REGISTER_FLAGS_ARRAY: Specifies that CUDA will access this + * resource through a ::CUarray queried on a sub-resource basis through + * ::cuD3D9ResourceGetMappedArray(). This option is only valid for resources + * of type ::IDirect3DSurface9 and subtypes of ::IDirect3DBaseTexture9. + * + * Not all Direct3D resources of the above types may be used for + * interoperability with CUDA. The following are some limitations. + * + * - The primary rendertarget may not be registered with CUDA. + * - Resources allocated as shared may not be registered with CUDA. + * - Any resources allocated in ::D3DPOOL_SYSTEMMEM or ::D3DPOOL_MANAGED may + * not be registered with CUDA. + * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16, + * or 32-bit integer or floating-point data cannot be shared. + * - Surfaces of depth or stencil formats cannot be shared. + * + * If Direct3D interoperability is not initialized on this context, then + * ::CUDA_ERROR_INVALID_CONTEXT is returned. If \p pResource is of incorrect + * type (e.g. is a non-stand-alone ::IDirect3DSurface9) or is already + * registered, then ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource + * cannot be registered then ::CUDA_ERROR_UNKNOWN is returned. + * + * \param pResource - Resource to register for CUDA access + * \param Flags - Flags for resource registration + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_OUT_OF_MEMORY, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuGraphicsD3D9RegisterResource + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9RegisterResource(IDirect3DResource9 *pResource, unsigned int Flags); + +/** + * \brief Unregister a Direct3D resource + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Unregisters the Direct3D resource \p pResource so it is not accessible by + * CUDA unless registered again. + * + * If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is + * returned. 
+ * + * \param pResource - Resource to unregister + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuGraphicsUnregisterResource + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnregisterResource(IDirect3DResource9 *pResource); + +/** + * \brief Map Direct3D resources for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Maps the \p count Direct3D resources in \p ppResource for access by CUDA. + * + * The resources in \p ppResource may be accessed in CUDA kernels until they + * are unmapped. Direct3D should not access any resources while they are mapped + * by CUDA. If an application does so the results are undefined. + * + * This function provides the synchronization guarantee that any Direct3D calls + * issued before ::cuD3D9MapResources() will complete before any CUDA kernels + * issued after ::cuD3D9MapResources() begin. + * + * If any of \p ppResource have not been registered for use with CUDA or if + * \p ppResource contains any duplicate entries, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResource are + * presently mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is + * returned. + * + * \param count - Number of resources in ppResource + * \param ppResource - Resources to map for CUDA usage + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuGraphicsMapResources + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9MapResources(unsigned int count, IDirect3DResource9 **ppResource); + +/** + * \brief Unmaps Direct3D resources + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Unmaps the \p count Direct3D resources in \p ppResource. + * + * This function provides the synchronization guarantee that any CUDA kernels + * issued before ::cuD3D9UnmapResources() will complete before any Direct3D + * calls issued after ::cuD3D9UnmapResources() begin. + * + * If any of \p ppResource have not been registered for use with CUDA or if + * \p ppResource contains any duplicate entries, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If any of \p ppResource are not + * presently mapped for access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is + * returned. + * + * \param count - Number of resources to unmap for CUDA + * \param ppResource - Resources to unmap for CUDA + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa + * ::cuGraphicsUnmapResources + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnmapResources(unsigned int count, IDirect3DResource9 **ppResource); + +/** + * \brief Set usage flags for mapping a Direct3D resource + * + * \deprecated This function is deprecated as of Cuda 3.0. + * + * Set \p Flags for mapping the Direct3D resource \p pResource. + * + * Changes to \p Flags will take effect the next time \p pResource is mapped. + * The \p Flags argument may be any of the following: + * - ::CU_D3D9_MAPRESOURCE_FLAGS_NONE: Specifies no hints about how this + * resource will be used. 
It is therefore assumed that this resource will be + * read from and written to by CUDA kernels. This is the default value. + * - ::CU_D3D9_MAPRESOURCE_FLAGS_READONLY: Specifies that CUDA kernels which + * access this resource will not write to this resource. + * - ::CU_D3D9_MAPRESOURCE_FLAGS_WRITEDISCARD: Specifies that CUDA kernels + * which access this resource will not read from this resource and will + * write over the entire contents of the resource, so none of the data + * previously stored in the resource will be preserved. + * + * If \p pResource has not been registered for use with CUDA, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is presently + * mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned. + * + * \param pResource - Registered resource to set flags for + * \param Flags - Parameters for resource mapping + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED + * \notefnerr + * + * \sa ::cuGraphicsResourceSetMapFlags + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceSetMapFlags(IDirect3DResource9 *pResource, unsigned int Flags); + +/** + * \brief Get the dimensions of a registered surface + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the + * subresource of the mapped Direct3D resource \p pResource, which corresponds + * to \p Face and \p Level. + * + * Because anti-aliased surfaces may have multiple samples per pixel, it is + * possible that the dimensions of a resource will be an integer factor larger + * than the dimensions reported by the Direct3D runtime. + * + * The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D + * surfaces, the value returned in \p *pDepth will be 0. + * + * If \p pResource is not of type ::IDirect3DBaseTexture9 or + * ::IDirect3DSurface9 or if \p pResource has not been registered for use with + * CUDA, then ::CUDA_ERROR_INVALID_HANDLE is returned. + * + * For usage requirements of \p Face and \p Level parameters, see + * ::cuD3D9ResourceGetMappedPointer(). + * + * \param pWidth - Returned width of surface + * \param pHeight - Returned height of surface + * \param pDepth - Returned depth of surface + * \param pResource - Registered resource to access + * \param Face - Face of resource to access + * \param Level - Level of resource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE + * \notefnerr + * + * \sa ::cuGraphicsSubResourceGetMappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); + +/** + * \brief Get an array through which to access a subresource of a Direct3D + * resource which has been mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pArray an array through which the subresource of the mapped + * Direct3D resource \p pResource which corresponds to \p Face and \p Level may + * be accessed. The value set in \p pArray may change every time that + * \p pResource is mapped. 
+ * + * If \p pResource is not registered then ::CUDA_ERROR_INVALID_HANDLE is + * returned. If \p pResource was not registered with usage flags + * ::CU_D3D9_REGISTER_FLAGS_ARRAY then ::CUDA_ERROR_INVALID_HANDLE is + * returned. If \p pResource is not mapped then ::CUDA_ERROR_NOT_MAPPED is + * returned. + * + * For usage requirements of \p Face and \p Level parameters, see + * ::cuD3D9ResourceGetMappedPointer(). + * + * \param pArray - Returned array corresponding to subresource + * \param pResource - Mapped resource to access + * \param Face - Face of resource to access + * \param Level - Level of resource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED + * \notefnerr + * + * \sa ::cuGraphicsSubResourceGetMappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedArray(CUarray *pArray, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); + +/** + * \brief Get the pointer through which to access a subresource of a Direct3D + * resource which has been mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pDevPtr the base pointer of the subresource of the mapped + * Direct3D resource \p pResource, which corresponds to \p Face and \p Level. + * The value set in \p pDevPtr may change every time that \p pResource is + * mapped. + * + * If \p pResource is not registered, then ::CUDA_ERROR_INVALID_HANDLE is + * returned. If \p pResource was not registered with usage flags + * ::CU_D3D9_REGISTER_FLAGS_NONE, then ::CUDA_ERROR_INVALID_HANDLE is returned. + * If \p pResource is not mapped, then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * If \p pResource is of type ::IDirect3DCubeTexture9, then \p Face must one + * of the values enumerated by type ::D3DCUBEMAP_FACES. For all other types + * \p Face must be 0. If \p Face is invalid, then ::CUDA_ERROR_INVALID_VALUE + * is returned. + * + * If \p pResource is of type ::IDirect3DBaseTexture9, then \p Level must + * correspond to a valid mipmap level. At present only mipmap level 0 is + * supported. For all other types \p Level must be 0. If \p Level is invalid, + * then ::CUDA_ERROR_INVALID_VALUE is returned. + * + * \param pDevPtr - Returned pointer corresponding to subresource + * \param pResource - Mapped resource to access + * \param Face - Face of resource to access + * \param Level - Level of resource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED + * \notefnerr + * + * \sa ::cuGraphicsResourceGetMappedPointer + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedPointer(CUdeviceptr *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); + +/** + * \brief Get the size of a subresource of a Direct3D resource which has been + * mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pSize the size of the subresource of the mapped Direct3D + * resource \p pResource, which corresponds to \p Face and \p Level. The value + * set in \p pSize may change every time that \p pResource is mapped. 
+ * + * If \p pResource has not been registered for use with CUDA, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource was not registered + * with usage flags ::CU_D3D9_REGISTER_FLAGS_NONE, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped for + * access by CUDA, then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * For usage requirements of \p Face and \p Level parameters, see + * ::cuD3D9ResourceGetMappedPointer. + * + * \param pSize - Returned size of subresource + * \param pResource - Mapped resource to access + * \param Face - Face of resource to access + * \param Level - Level of resource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED + * \notefnerr + * + * \sa ::cuGraphicsResourceGetMappedPointer + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedSize(size_t *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); + +/** + * \brief Get the pitch of a subresource of a Direct3D resource which has been + * mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of + * the subresource of the mapped Direct3D resource \p pResource, which + * corresponds to \p Face and \p Level. The values set in \p pPitch and + * \p pPitchSlice may change every time that \p pResource is mapped. + * + * The pitch and Z-slice pitch values may be used to compute the location of a + * sample on a surface as follows. + * + * For a 2D surface, the byte offset of the sample at position \b x, \b y from + * the base pointer of the surface is: + * + * \b y * \b pitch + (<b>bytes per pixel</b>) * \b x + * + * For a 3D surface, the byte offset of the sample at position \b x, \b y, + * \b z from the base pointer of the surface is: + * + * \b z* \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x + * + * Both parameters \p pPitch and \p pPitchSlice are optional and may be set to + * NULL. + * + * If \p pResource is not of type ::IDirect3DBaseTexture9 or one of its + * sub-types or if \p pResource has not been registered for use with CUDA, + * then ::cudaErrorInvalidResourceHandle is returned. If \p pResource was not + * registered with usage flags ::CU_D3D9_REGISTER_FLAGS_NONE, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p pResource is not mapped + * for access by CUDA then ::CUDA_ERROR_NOT_MAPPED is returned. + * + * For usage requirements of \p Face and \p Level parameters, see + * ::cuD3D9ResourceGetMappedPointer(). + * + * \param pPitch - Returned pitch of subresource + * \param pPitchSlice - Returned Z-slice pitch of subresource + * \param pResource - Mapped resource to access + * \param Face - Face of resource to access + * \param Level - Level of resource to access + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_NOT_MAPPED + * \notefnerr + * + * \sa ::cuGraphicsSubResourceGetMappedArray + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); + +/* CUDA 1.x compatibility API. 
These functions are deprecated, please use the ones above. */ +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9Begin(IDirect3DDevice9 *pDevice); +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9End(void); +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9RegisterVertexBuffer(IDirect3DVertexBuffer9 *pVB); +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9MapVertexBuffer(CUdeviceptr *pDevPtr, size_t *pSize, IDirect3DVertexBuffer9 *pVB); +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnmapVertexBuffer(IDirect3DVertexBuffer9 *pVB); +__CUDA_DEPRECATED CUresult CUDAAPI cuD3D9UnregisterVertexBuffer(IDirect3DVertexBuffer9 *pVB); + +/** @} */ /* END CUDA_D3D9_DEPRECATED */ +/** @} */ /* END CUDA_D3D9 */ + + +/** + * CUDA API versioning support + */ +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef cuD3D9CtxCreate + #undef cuD3D9ResourceGetSurfaceDimensions + #undef cuD3D9ResourceGetMappedPointer + #undef cuD3D9ResourceGetMappedSize + #undef cuD3D9ResourceGetMappedPitch + #undef cuD3D9MapVertexBuffer + + CUresult CUDAAPI cuD3D9CtxCreate(CUcontext *pCtx, CUdevice *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice); + CUresult CUDAAPI cuD3D9ResourceGetSurfaceDimensions(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); + CUresult CUDAAPI cuD3D9ResourceGetMappedPointer(CUdeviceptr_v1 *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); + CUresult CUDAAPI cuD3D9ResourceGetMappedSize(unsigned int *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); + CUresult CUDAAPI cuD3D9ResourceGetMappedPitch(unsigned int *pPitch, unsigned int *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); + CUresult CUDAAPI cuD3D9MapVertexBuffer(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, IDirect3DVertexBuffer9 *pVB); +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#ifdef __cplusplus +}; +#endif + +#undef __CUDA_DEPRECATED + +#endif + diff --git a/ext/cudart/include/cudaD3D9Typedefs.h b/ext/cudart/include/cudaD3D9Typedefs.h new file mode 100644 index 0000000000000000000000000000000000000000..40bcdc6014707f64864ca69cdaf24b3ec966fbfc --- /dev/null +++ b/ext/cudart/include/cudaD3D9Typedefs.h @@ -0,0 +1,131 @@ +/* + * Copyright 2020-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef CUDAD3D9TYPEDEFS_H +#define CUDAD3D9TYPEDEFS_H + +// Dependent includes for cudaD3D11.h +#include <d3d9.h> + +#include <cudaD3D9.h> + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/* + * Macros for the latest version for each driver function in cudaD3D9.h + */ +#define PFN_cuD3D9GetDevice PFN_cuD3D9GetDevice_v2000 +#define PFN_cuD3D9GetDevices PFN_cuD3D9GetDevices_v3020 +#define PFN_cuD3D9CtxCreate PFN_cuD3D9CtxCreate_v3020 +#define PFN_cuD3D9CtxCreateOnDevice PFN_cuD3D9CtxCreateOnDevice_v3020 +#define PFN_cuD3D9GetDirect3DDevice PFN_cuD3D9GetDirect3DDevice_v2000 +#define PFN_cuGraphicsD3D9RegisterResource PFN_cuGraphicsD3D9RegisterResource_v3000 +#define PFN_cuD3D9RegisterResource PFN_cuD3D9RegisterResource_v2000 +#define PFN_cuD3D9UnregisterResource PFN_cuD3D9UnregisterResource_v2000 +#define PFN_cuD3D9MapResources PFN_cuD3D9MapResources_v2000 +#define PFN_cuD3D9UnmapResources PFN_cuD3D9UnmapResources_v2000 +#define PFN_cuD3D9ResourceSetMapFlags PFN_cuD3D9ResourceSetMapFlags_v2000 +#define PFN_cuD3D9ResourceGetSurfaceDimensions PFN_cuD3D9ResourceGetSurfaceDimensions_v3020 +#define PFN_cuD3D9ResourceGetMappedArray PFN_cuD3D9ResourceGetMappedArray_v2010 +#define PFN_cuD3D9ResourceGetMappedPointer PFN_cuD3D9ResourceGetMappedPointer_v3020 +#define PFN_cuD3D9ResourceGetMappedSize PFN_cuD3D9ResourceGetMappedSize_v3020 +#define PFN_cuD3D9ResourceGetMappedPitch PFN_cuD3D9ResourceGetMappedPitch_v3020 +#define PFN_cuD3D9Begin PFN_cuD3D9Begin_v2000 +#define PFN_cuD3D9End PFN_cuD3D9End_v2000 +#define PFN_cuD3D9RegisterVertexBuffer PFN_cuD3D9RegisterVertexBuffer_v2000 +#define PFN_cuD3D9MapVertexBuffer PFN_cuD3D9MapVertexBuffer_v3020 +#define PFN_cuD3D9UnmapVertexBuffer PFN_cuD3D9UnmapVertexBuffer_v2000 +#define PFN_cuD3D9UnregisterVertexBuffer PFN_cuD3D9UnregisterVertexBuffer_v2000 + + +/** + * Type definitions for functions defined in cudaD3D9.h + */ +typedef CUresult (CUDAAPI *PFN_cuD3D9GetDevice_v2000)(CUdevice_v1 *pCudaDevice, const char *pszAdapterName); +typedef CUresult (CUDAAPI *PFN_cuD3D9GetDevices_v3020)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, IDirect3DDevice9 *pD3D9Device, CUd3d9DeviceList deviceList); +typedef CUresult (CUDAAPI *PFN_cuD3D9CtxCreate_v3020)(CUcontext *pCtx, 
CUdevice_v1 *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice); +typedef CUresult (CUDAAPI *PFN_cuD3D9CtxCreateOnDevice_v3020)(CUcontext *pCtx, unsigned int flags, IDirect3DDevice9 *pD3DDevice, CUdevice_v1 cudaDevice); +typedef CUresult (CUDAAPI *PFN_cuD3D9GetDirect3DDevice_v2000)(IDirect3DDevice9 **ppD3DDevice); +typedef CUresult (CUDAAPI *PFN_cuGraphicsD3D9RegisterResource_v3000)(CUgraphicsResource *pCudaResource, IDirect3DResource9 *pD3DResource, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuD3D9RegisterResource_v2000)(IDirect3DResource9 *pResource, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuD3D9UnregisterResource_v2000)(IDirect3DResource9 *pResource); +typedef CUresult (CUDAAPI *PFN_cuD3D9MapResources_v2000)(unsigned int count, IDirect3DResource9 **ppResource); +typedef CUresult (CUDAAPI *PFN_cuD3D9UnmapResources_v2000)(unsigned int count, IDirect3DResource9 **ppResource); +typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceSetMapFlags_v2000)(IDirect3DResource9 *pResource, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetSurfaceDimensions_v3020)(size_t *pWidth, size_t *pHeight, size_t *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); +typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedArray_v2010)(CUarray *pArray, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); +typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); +typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedSize_v3020)(size_t *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); +typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPitch_v3020)(size_t *pPitch, size_t *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); +typedef CUresult (CUDAAPI *PFN_cuD3D9Begin_v2000)(IDirect3DDevice9 *pDevice); +typedef CUresult (CUDAAPI *PFN_cuD3D9End_v2000)(void); +typedef CUresult (CUDAAPI *PFN_cuD3D9RegisterVertexBuffer_v2000)(IDirect3DVertexBuffer9 *pVB); +typedef CUresult (CUDAAPI *PFN_cuD3D9MapVertexBuffer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, IDirect3DVertexBuffer9 *pVB); +typedef CUresult (CUDAAPI *PFN_cuD3D9UnmapVertexBuffer_v2000)(IDirect3DVertexBuffer9 *pVB); +typedef CUresult (CUDAAPI *PFN_cuD3D9UnregisterVertexBuffer_v2000)(IDirect3DVertexBuffer9 *pVB); + +/* + * Type definitions for older versioned functions in cudaD3D9.h + */ +#if defined(__CUDA_API_VERSION_INTERNAL) + typedef CUresult (CUDAAPI *PFN_cuD3D9CtxCreate_v2000)(CUcontext *pCtx, CUdevice_v1 *pCudaDevice, unsigned int Flags, IDirect3DDevice9 *pD3DDevice); + typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetSurfaceDimensions_v2000)(unsigned int *pWidth, unsigned int *pHeight, unsigned int *pDepth, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); + typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPointer_v2000)(CUdeviceptr_v1 *pDevPtr, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); + typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedSize_v2000)(unsigned int *pSize, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); + typedef CUresult (CUDAAPI *PFN_cuD3D9ResourceGetMappedPitch_v2000)(unsigned int *pPitch, unsigned int *pPitchSlice, IDirect3DResource9 *pResource, unsigned int Face, unsigned int Level); + typedef CUresult (CUDAAPI *PFN_cuD3D9MapVertexBuffer_v2000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, 
IDirect3DVertexBuffer9 *pVB); +#endif + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // file guard diff --git a/ext/cudart/include/cudaGL.h b/ext/cudart/include/cudaGL.h new file mode 100644 index 0000000000000000000000000000000000000000..21e972ae22658c55d6387599c9a92f3c1d50f20c --- /dev/null +++ b/ext/cudart/include/cudaGL.h @@ -0,0 +1,605 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef CUDAGL_H +#define CUDAGL_H + +#include <cuda.h> +#include <GL/gl.h> + +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif + +#ifdef CUDA_FORCE_API_VERSION +#error "CUDA_FORCE_API_VERSION is no longer supported." 
+#endif + +#if defined(__CUDA_API_VERSION_INTERNAL) || defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define __CUDA_API_PER_THREAD_DEFAULT_STREAM + #define __CUDA_API_PTDS(api) api ## _ptds + #define __CUDA_API_PTSZ(api) api ## _ptsz +#else + #define __CUDA_API_PTDS(api) api + #define __CUDA_API_PTSZ(api) api +#endif + +#define cuGLCtxCreate cuGLCtxCreate_v2 +#define cuGLMapBufferObject __CUDA_API_PTDS(cuGLMapBufferObject_v2) +#define cuGLMapBufferObjectAsync __CUDA_API_PTSZ(cuGLMapBufferObjectAsync_v2) +#define cuGLGetDevices cuGLGetDevices_v2 + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \file cudaGL.h + * \brief Header file for the OpenGL interoperability functions of the + * low-level CUDA driver application programming interface. + */ + +/** + * \defgroup CUDA_GL OpenGL Interoperability + * \ingroup CUDA_DRIVER + * + * ___MANBRIEF___ OpenGL interoperability functions of the low-level CUDA + * driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the OpenGL interoperability functions of the + * low-level CUDA driver application programming interface. Note that mapping + * of OpenGL resources is performed with the graphics API agnostic, resource + * mapping interface described in \ref CUDA_GRAPHICS "Graphics Interoperability". + * + * @{ + */ + +#if defined(_WIN32) +#if !defined(WGL_NV_gpu_affinity) +typedef void* HGPUNV; +#endif +#endif /* _WIN32 */ + +/** + * \brief Registers an OpenGL buffer object + * + * Registers the buffer object specified by \p buffer for access by + * CUDA. A handle to the registered object is returned as \p + * pCudaResource. The register flags \p Flags specify the intended usage, + * as follows: + * + * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this + * resource will be used. It is therefore assumed that this resource will be + * read from and written to by CUDA. This is the default value. + * - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA + * will not write to this resource. + * - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that + * CUDA will not read from this resource and will write over the + * entire contents of the resource, so none of the data previously + * stored in the resource will be preserved. + * + * \param pCudaResource - Pointer to the returned object handle + * \param buffer - name of buffer object to be registered + * \param Flags - Register flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * \notefnerr + * + * \sa + * ::cuGraphicsUnregisterResource, + * ::cuGraphicsMapResources, + * ::cuGraphicsResourceGetMappedPointer, + * ::cudaGraphicsGLRegisterBuffer + */ +CUresult CUDAAPI cuGraphicsGLRegisterBuffer(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags); + +/** + * \brief Register an OpenGL texture or renderbuffer object + * + * Registers the texture or renderbuffer object specified by \p image for access by CUDA. + * A handle to the registered object is returned as \p pCudaResource. + * + * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D, + * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY, + * or ::GL_RENDERBUFFER. + * + * The register flags \p Flags specify the intended usage, as follows: + * + * - ::CU_GRAPHICS_REGISTER_FLAGS_NONE: Specifies no hints about how this + * resource will be used. 
It is therefore assumed that this resource will be + * read from and written to by CUDA. This is the default value. + * - ::CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY: Specifies that CUDA + * will not write to this resource. + * - ::CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD: Specifies that + * CUDA will not read from this resource and will write over the + * entire contents of the resource, so none of the data previously + * stored in the resource will be preserved. + * - ::CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST: Specifies that CUDA will + * bind this resource to a surface reference. + * - ::CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER: Specifies that CUDA will perform + * texture gather operations on this resource. + * + * The following image formats are supported. For brevity's sake, the list is abbreviated. + * For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats + * {GL_R8, GL_R16, GL_RG8, GL_RG16} : + * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY + * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I} + * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X + * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT} + * + * The following image classes are currently disallowed: + * - Textures with borders + * - Multisampled renderbuffers + * + * \param pCudaResource - Pointer to the returned object handle + * \param image - name of texture or renderbuffer object to be registered + * \param target - Identifies the type of object specified by \p image + * \param Flags - Register flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * \notefnerr + * + * \sa + * ::cuGraphicsUnregisterResource, + * ::cuGraphicsMapResources, + * ::cuGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsGLRegisterImage + */ +CUresult CUDAAPI cuGraphicsGLRegisterImage(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags); + +#ifdef _WIN32 +/** + * \brief Gets the CUDA device associated with hGpu + * + * Returns in \p *pDevice the CUDA device associated with a \p hGpu, if + * applicable. 
+ * + * \param pDevice - Device associated with hGpu + * \param hGpu - Handle to a GPU, as queried via ::WGL_NV_gpu_affinity() + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuGLMapBufferObject, + * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject, + * ::cuGLUnregisterBufferObject, ::cuGLUnmapBufferObjectAsync, + * ::cuGLSetBufferObjectMapFlags, + * ::cudaWGLGetDevice + */ +CUresult CUDAAPI cuWGLGetDevice(CUdevice *pDevice, HGPUNV hGpu); +#endif /* _WIN32 */ + +/** + * CUDA devices corresponding to an OpenGL device + */ +typedef enum CUGLDeviceList_enum { + CU_GL_DEVICE_LIST_ALL = 0x01, /**< The CUDA devices for all GPUs used by the current OpenGL context */ + CU_GL_DEVICE_LIST_CURRENT_FRAME = 0x02, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */ + CU_GL_DEVICE_LIST_NEXT_FRAME = 0x03, /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */ +} CUGLDeviceList; + +/** + * \brief Gets the CUDA devices associated with the current OpenGL context + * + * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices + * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices + * at most cudaDeviceCount of the CUDA-compatible devices corresponding to + * the current OpenGL context. If any of the GPUs being used by the current OpenGL + * context are not CUDA capable then the call will return CUDA_ERROR_NO_DEVICE. + * + * The \p deviceList argument may be any of the following: + * - ::CU_GL_DEVICE_LIST_ALL: Query all devices used by the current OpenGL context. + * - ::CU_GL_DEVICE_LIST_CURRENT_FRAME: Query the devices used by the current OpenGL context to + * render the current frame (in SLI). + * - ::CU_GL_DEVICE_LIST_NEXT_FRAME: Query the devices used by the current OpenGL context to + * render the next frame (in SLI). Note that this is a prediction, it can't be guaranteed that + * this is correct in all cases. + * + * \param pCudaDeviceCount - Returned number of CUDA devices. + * \param pCudaDevices - Returned CUDA devices. + * \param cudaDeviceCount - The size of the output device array pCudaDevices. + * \param deviceList - The set of devices to return. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NO_DEVICE, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_GRAPHICS_CONTEXT + * + * \notefnerr + * + * \sa + * ::cuWGLGetDevice, + * ::cudaGLGetDevices + */ +CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList); + +/** + * \defgroup CUDA_GL_DEPRECATED OpenGL Interoperability [DEPRECATED] + * + * ___MANBRIEF___ deprecated OpenGL interoperability functions of the low-level + * CUDA driver API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes deprecated OpenGL interoperability functionality. + * + * @{ + */ + +/** Flags to map or unmap a resource */ +typedef enum CUGLmap_flags_enum { + CU_GL_MAP_RESOURCE_FLAGS_NONE = 0x00, + CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01, + CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02, +} CUGLmap_flags; + +/** + * \brief Create a CUDA context for interoperability with OpenGL + * + * \deprecated This function is deprecated as of Cuda 5.0. + * + * This function is deprecated and should no longer be used. 
It is + * no longer necessary to associate a CUDA context with an OpenGL + * context in order to achieve maximum interoperability performance. + * + * \param pCtx - Returned CUDA context + * \param Flags - Options for CUDA context creation + * \param device - Device on which to create the context + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_OUT_OF_MEMORY + * \notefnerr + * + * \sa ::cuCtxCreate, ::cuGLInit, ::cuGLMapBufferObject, + * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject, + * ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync, + * ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags, + * ::cuWGLGetDevice + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device ); + +/** + * \brief Initializes OpenGL interoperability + * + * \deprecated This function is deprecated as of Cuda 3.0. + * + * Initializes OpenGL interoperability. This function is deprecated + * and calling it is no longer required. It may fail if the needed + * OpenGL driver facilities are not available. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_UNKNOWN + * \notefnerr + * + * \sa ::cuGLMapBufferObject, + * ::cuGLRegisterBufferObject, ::cuGLUnmapBufferObject, + * ::cuGLUnregisterBufferObject, ::cuGLMapBufferObjectAsync, + * ::cuGLUnmapBufferObjectAsync, ::cuGLSetBufferObjectMapFlags, + * ::cuWGLGetDevice + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuGLInit(void); + +/** + * \brief Registers an OpenGL buffer object + * + * \deprecated This function is deprecated as of Cuda 3.0. + * + * Registers the buffer object specified by \p buffer for access by + * CUDA. This function must be called before CUDA can map the buffer + * object. There must be a valid OpenGL context bound to the current + * thread when this function is called, and the buffer name is + * resolved by that context. + * + * \param buffer - The name of the buffer object to register. + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_ALREADY_MAPPED + * \notefnerr + * + * \sa ::cuGraphicsGLRegisterBuffer + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuGLRegisterBufferObject(GLuint buffer); + +/** + * \brief Maps an OpenGL buffer object + * + * \deprecated This function is deprecated as of Cuda 3.0. + * + * Maps the buffer object specified by \p buffer into the address space of the + * current CUDA context and returns in \p *dptr and \p *size the base pointer + * and size of the resulting mapping. + * + * There must be a valid OpenGL context bound to the current thread + * when this function is called. This must be the same context, or a + * member of the same shareGroup, as the context that was bound when + * the buffer was registered. + * + * All streams in the current CUDA context are synchronized with the + * current GL context. 
+ * + * \param dptr - Returned mapped base pointer + * \param size - Returned size of mapping + * \param buffer - The name of the buffer object to map + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_MAP_FAILED + * \notefnerr + * + * \sa ::cuGraphicsMapResources + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr *dptr, size_t *size, GLuint buffer); + +/** + * \brief Unmaps an OpenGL buffer object + * + * \deprecated This function is deprecated as of Cuda 3.0. + * + * Unmaps the buffer object specified by \p buffer for access by CUDA. + * + * There must be a valid OpenGL context bound to the current thread + * when this function is called. This must be the same context, or a + * member of the same shareGroup, as the context that was bound when + * the buffer was registered. + * + * All streams in the current CUDA context are synchronized with the + * current GL context. + * + * \param buffer - Buffer object to unmap + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuGraphicsUnmapResources + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObject(GLuint buffer); + +/** + * \brief Unregister an OpenGL buffer object + * + * \deprecated This function is deprecated as of Cuda 3.0. + * + * Unregisters the buffer object specified by \p buffer. This + * releases any resources associated with the registered buffer. + * After this call, the buffer may no longer be mapped for access by + * CUDA. + * + * There must be a valid OpenGL context bound to the current thread + * when this function is called. This must be the same context, or a + * member of the same shareGroup, as the context that was bound when + * the buffer was registered. + * + * \param buffer - Name of the buffer object to unregister + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuGraphicsUnregisterResource + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnregisterBufferObject(GLuint buffer); + +/** + * \brief Set the map flags for an OpenGL buffer object + * + * \deprecated This function is deprecated as of Cuda 3.0. + * + * Sets the map flags for the buffer object specified by \p buffer. + * + * Changes to \p Flags will take effect the next time \p buffer is mapped. + * The \p Flags argument may be any of the following: + * - ::CU_GL_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this + * resource will be used. It is therefore assumed that this resource will be + * read from and written to by CUDA kernels. This is the default value. + * - ::CU_GL_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA kernels which + * access this resource will not write to this resource. + * - ::CU_GL_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that CUDA kernels + * which access this resource will not read from this resource and will + * write over the entire contents of the resource, so none of the data + * previously stored in the resource will be preserved. + * + * If \p buffer has not been registered for use with CUDA, then + * ::CUDA_ERROR_INVALID_HANDLE is returned. If \p buffer is presently + * mapped for access by CUDA, then ::CUDA_ERROR_ALREADY_MAPPED is returned. 
+ * + * There must be a valid OpenGL context bound to the current thread + * when this function is called. This must be the same context, or a + * member of the same shareGroup, as the context that was bound when + * the buffer was registered. + * + * \param buffer - Buffer object to unmap + * \param Flags - Map flags + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_HANDLE, + * ::CUDA_ERROR_ALREADY_MAPPED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * \notefnerr + * + * \sa ::cuGraphicsResourceSetMapFlags + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuGLSetBufferObjectMapFlags(GLuint buffer, unsigned int Flags); + +/** + * \brief Maps an OpenGL buffer object + * + * \deprecated This function is deprecated as of Cuda 3.0. + * + * Maps the buffer object specified by \p buffer into the address space of the + * current CUDA context and returns in \p *dptr and \p *size the base pointer + * and size of the resulting mapping. + * + * There must be a valid OpenGL context bound to the current thread + * when this function is called. This must be the same context, or a + * member of the same shareGroup, as the context that was bound when + * the buffer was registered. + * + * Stream \p hStream in the current CUDA context is synchronized with + * the current GL context. + * + * \param dptr - Returned mapped base pointer + * \param size - Returned size of mapping + * \param buffer - The name of the buffer object to map + * \param hStream - Stream to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_MAP_FAILED + * \notefnerr + * + * \sa ::cuGraphicsMapResources + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr *dptr, size_t *size, GLuint buffer, CUstream hStream); + +/** + * \brief Unmaps an OpenGL buffer object + * + * \deprecated This function is deprecated as of Cuda 3.0. + * + * Unmaps the buffer object specified by \p buffer for access by CUDA. + * + * There must be a valid OpenGL context bound to the current thread + * when this function is called. This must be the same context, or a + * member of the same shareGroup, as the context that was bound when + * the buffer was registered. + * + * Stream \p hStream in the current CUDA context is synchronized with + * the current GL context. 
+ * + * \param buffer - Name of the buffer object to unmap + * \param hStream - Stream to synchronize + * + * \return + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_DEINITIALIZED, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_INVALID_CONTEXT, + * ::CUDA_ERROR_INVALID_VALUE + * \notefnerr + * + * \sa ::cuGraphicsUnmapResources + */ +__CUDA_DEPRECATED CUresult CUDAAPI cuGLUnmapBufferObjectAsync(GLuint buffer, CUstream hStream); + +/** @} */ /* END CUDA_GL_DEPRECATED */ +/** @} */ /* END CUDA_GL */ + + +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef cuGLCtxCreate + #undef cuGLMapBufferObject + #undef cuGLMapBufferObjectAsync + #undef cuGLGetDevices + + CUresult CUDAAPI cuGLGetDevices(unsigned int *pCudaDeviceCount, CUdevice *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList); + CUresult CUDAAPI cuGLMapBufferObject_v2(CUdeviceptr *dptr, size_t *size, GLuint buffer); + CUresult CUDAAPI cuGLMapBufferObjectAsync_v2(CUdeviceptr *dptr, size_t *size, GLuint buffer, CUstream hStream); + CUresult CUDAAPI cuGLCtxCreate(CUcontext *pCtx, unsigned int Flags, CUdevice device ); + CUresult CUDAAPI cuGLMapBufferObject(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer); + CUresult CUDAAPI cuGLMapBufferObjectAsync(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream); +#endif /* __CUDA_API_VERSION_INTERNAL */ + +#ifdef __cplusplus +}; +#endif + +#undef __CUDA_DEPRECATED + +#endif diff --git a/ext/cudart/include/cudaGLTypedefs.h b/ext/cudart/include/cudaGLTypedefs.h new file mode 100644 index 0000000000000000000000000000000000000000..81f0d5349e435159647af9af379d1e8e8441221c --- /dev/null +++ b/ext/cudart/include/cudaGLTypedefs.h @@ -0,0 +1,123 @@ +/* + * Copyright 2020-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef CUDAGLTYPEDEFS_H +#define CUDAGLTYPEDEFS_H + +// Dependent includes for cudagl.h +#include <GL/gl.h> + +#include <cudaGL.h> + +#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds + #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz +#else + #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version + #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version +#endif + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/* + * Macros for the latest version for each driver function in cudaGL.h + */ +#define PFN_cuGraphicsGLRegisterBuffer PFN_cuGraphicsGLRegisterBuffer_v3000 +#define PFN_cuGraphicsGLRegisterImage PFN_cuGraphicsGLRegisterImage_v3000 +#define PFN_cuWGLGetDevice PFN_cuWGLGetDevice_v2020 +#define PFN_cuGLGetDevices PFN_cuGLGetDevices_v6050 +#define PFN_cuGLCtxCreate PFN_cuGLCtxCreate_v3020 +#define PFN_cuGLInit PFN_cuGLInit_v2000 +#define PFN_cuGLRegisterBufferObject PFN_cuGLRegisterBufferObject_v2000 +#define PFN_cuGLMapBufferObject __API_TYPEDEF_PTDS(PFN_cuGLMapBufferObject, 3020, 7000) +#define PFN_cuGLUnmapBufferObject PFN_cuGLUnmapBufferObject_v2000 +#define PFN_cuGLUnregisterBufferObject PFN_cuGLUnregisterBufferObject_v2000 +#define PFN_cuGLSetBufferObjectMapFlags PFN_cuGLSetBufferObjectMapFlags_v2030 +#define PFN_cuGLMapBufferObjectAsync __API_TYPEDEF_PTSZ(PFN_cuGLMapBufferObjectAsync, 3020, 7000) +#define PFN_cuGLUnmapBufferObjectAsync PFN_cuGLUnmapBufferObjectAsync_v2030 + + +/** + * Type definitions for functions defined in cudaGL.h + */ +typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterBuffer_v3000)(CUgraphicsResource *pCudaResource, GLuint buffer, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuGraphicsGLRegisterImage_v3000)(CUgraphicsResource *pCudaResource, GLuint image, GLenum target, unsigned int Flags); +#ifdef _WIN32 +typedef CUresult (CUDAAPI *PFN_cuWGLGetDevice_v2020)(CUdevice_v1 *pDevice, HGPUNV hGpu); +#endif +typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v6050)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList); +typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v3020)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device); +typedef CUresult (CUDAAPI *PFN_cuGLInit_v2000)(void); +typedef CUresult (CUDAAPI *PFN_cuGLRegisterBufferObject_v2000)(GLuint buffer); +typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v7000_ptds)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer); +typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObject_v2000)(GLuint buffer); +typedef CUresult (CUDAAPI *PFN_cuGLUnregisterBufferObject_v2000)(GLuint buffer); +typedef CUresult (CUDAAPI 
*PFN_cuGLSetBufferObjectMapFlags_v2030)(GLuint buffer, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v7000_ptsz)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuGLUnmapBufferObjectAsync_v2030)(GLuint buffer, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer); +typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v3020)(CUdeviceptr_v2 *dptr, size_t *size, GLuint buffer, CUstream hStream); + +/* + * Type definitions for older versioned functions in cuda.h + */ +#if defined(__CUDA_API_VERSION_INTERNAL) +typedef CUresult (CUDAAPI *PFN_cuGLGetDevices_v4010)(unsigned int *pCudaDeviceCount, CUdevice_v1 *pCudaDevices, unsigned int cudaDeviceCount, CUGLDeviceList deviceList); +typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObject_v2000)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer); +typedef CUresult (CUDAAPI *PFN_cuGLMapBufferObjectAsync_v2030)(CUdeviceptr_v1 *dptr, unsigned int *size, GLuint buffer, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuGLCtxCreate_v2000)(CUcontext *pCtx, unsigned int Flags, CUdevice_v1 device); +#endif + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // file guard diff --git a/ext/cudart/include/cudaProfilerTypedefs.h b/ext/cudart/include/cudaProfilerTypedefs.h new file mode 100644 index 0000000000000000000000000000000000000000..bea7df4573aff2fa5b0d0029ce9d40a7ebe2de46 --- /dev/null +++ b/ext/cudart/include/cudaProfilerTypedefs.h @@ -0,0 +1,78 @@ +/* + * Copyright 2020-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef CUDAPROFILERTYPEDEFS_H +#define CUDAPROFILERTYPEDEFS_H + +#include <cudaProfiler.h> + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/* + * Macros for the latest version for each driver function in cudaProfiler.h + */ +#define PFN_cuProfilerInitialize PFN_cuProfilerInitialize_v4000 +#define PFN_cuProfilerStart PFN_cuProfilerStart_v4000 +#define PFN_cuProfilerStop PFN_cuProfilerStop_v4000 + + +/** + * Type definitions for functions defined in cudaProfiler.h + */ +typedef CUresult (CUDAAPI *PFN_cuProfilerInitialize_v4000)(const char *configFile, const char *outputFile, CUoutput_mode outputMode); +typedef CUresult (CUDAAPI *PFN_cuProfilerStart_v4000)(void); +typedef CUresult (CUDAAPI *PFN_cuProfilerStop_v4000)(void); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // file guard diff --git a/ext/cudart/include/cudaTypedefs.h b/ext/cudart/include/cudaTypedefs.h new file mode 100644 index 0000000000000000000000000000000000000000..3ab767efea8d520fb8f95b4485428cafe70dbdbc --- /dev/null +++ b/ext/cudart/include/cudaTypedefs.h @@ -0,0 +1,959 @@ +/* + * Copyright 2020-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. 
These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef CUDATYPEDEFS_H +#define CUDATYPEDEFS_H + +#include <cuda.h> + +#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) + #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptds + #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## ptds_version ## _ptsz +#else + #define __API_TYPEDEF_PTDS(api, default_version, ptds_version) api ## _v ## default_version + #define __API_TYPEDEF_PTSZ(api, default_version, ptds_version) api ## _v ## default_version +#endif + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +/* + * Macros for the latest version for each driver function in cuda.h + */ +#define PFN_cuGetErrorString PFN_cuGetErrorString_v6000 +#define PFN_cuGetErrorName PFN_cuGetErrorName_v6000 +#define PFN_cuInit PFN_cuInit_v2000 +#define PFN_cuDriverGetVersion PFN_cuDriverGetVersion_v2020 +#define PFN_cuDeviceGet PFN_cuDeviceGet_v2000 +#define PFN_cuDeviceGetCount PFN_cuDeviceGetCount_v2000 +#define PFN_cuDeviceGetName PFN_cuDeviceGetName_v2000 +#define PFN_cuDeviceGetUuid PFN_cuDeviceGetUuid_v11040 +#define PFN_cuDeviceGetLuid PFN_cuDeviceGetLuid_v10000 +#define PFN_cuDeviceTotalMem PFN_cuDeviceTotalMem_v3020 +#define PFN_cuDeviceGetTexture1DLinearMaxWidth PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010 +#define PFN_cuDeviceGetAttribute PFN_cuDeviceGetAttribute_v2000 +#define PFN_cuDeviceGetNvSciSyncAttributes PFN_cuDeviceGetNvSciSyncAttributes_v10020 +#define PFN_cuDeviceSetMemPool PFN_cuDeviceSetMemPool_v11020 +#define PFN_cuDeviceGetMemPool PFN_cuDeviceGetMemPool_v11020 +#define PFN_cuDeviceGetDefaultMemPool PFN_cuDeviceGetDefaultMemPool_v11020 +#define PFN_cuDeviceGetProperties PFN_cuDeviceGetProperties_v2000 +#define PFN_cuDeviceComputeCapability PFN_cuDeviceComputeCapability_v2000 +#define PFN_cuDevicePrimaryCtxRetain PFN_cuDevicePrimaryCtxRetain_v7000 +#define PFN_cuDevicePrimaryCtxRelease PFN_cuDevicePrimaryCtxRelease_v11000 +#define PFN_cuDevicePrimaryCtxSetFlags PFN_cuDevicePrimaryCtxSetFlags_v11000 +#define PFN_cuDevicePrimaryCtxGetState PFN_cuDevicePrimaryCtxGetState_v7000 +#define PFN_cuDevicePrimaryCtxReset PFN_cuDevicePrimaryCtxReset_v11000 +#define PFN_cuDeviceGetExecAffinitySupport PFN_cuDeviceGetExecAffinitySupport_v11040 +#define PFN_cuCtxCreate PFN_cuCtxCreate_v11040 +#define PFN_cuCtxDestroy PFN_cuCtxDestroy_v4000 +#define PFN_cuCtxPushCurrent PFN_cuCtxPushCurrent_v4000 +#define PFN_cuCtxPopCurrent PFN_cuCtxPopCurrent_v4000 +#define PFN_cuCtxSetCurrent PFN_cuCtxSetCurrent_v4000 +#define PFN_cuCtxGetCurrent PFN_cuCtxGetCurrent_v4000 +#define PFN_cuCtxGetDevice PFN_cuCtxGetDevice_v2000 +#define PFN_cuCtxGetFlags PFN_cuCtxGetFlags_v7000 +#define PFN_cuCtxSynchronize PFN_cuCtxSynchronize_v2000 +#define PFN_cuCtxSetLimit 
PFN_cuCtxSetLimit_v3010 +#define PFN_cuCtxGetLimit PFN_cuCtxGetLimit_v3010 +#define PFN_cuCtxGetCacheConfig PFN_cuCtxGetCacheConfig_v3020 +#define PFN_cuCtxSetCacheConfig PFN_cuCtxSetCacheConfig_v3020 +#define PFN_cuCtxGetSharedMemConfig PFN_cuCtxGetSharedMemConfig_v4020 +#define PFN_cuCtxSetSharedMemConfig PFN_cuCtxSetSharedMemConfig_v4020 +#define PFN_cuCtxGetApiVersion PFN_cuCtxGetApiVersion_v3020 +#define PFN_cuCtxGetStreamPriorityRange PFN_cuCtxGetStreamPriorityRange_v5050 +#define PFN_cuCtxResetPersistingL2Cache PFN_cuCtxResetPersistingL2Cache_v11000 +#define PFN_cuCtxAttach PFN_cuCtxAttach_v2000 +#define PFN_cuCtxDetach PFN_cuCtxDetach_v2000 +#define PFN_cuCtxGetExecAffinity PFN_cuCtxGetExecAffinity_v11040 +#define PFN_cuModuleLoad PFN_cuModuleLoad_v2000 +#define PFN_cuModuleLoadData PFN_cuModuleLoadData_v2000 +#define PFN_cuModuleLoadDataEx PFN_cuModuleLoadDataEx_v2010 +#define PFN_cuModuleLoadFatBinary PFN_cuModuleLoadFatBinary_v2000 +#define PFN_cuModuleUnload PFN_cuModuleUnload_v2000 +#define PFN_cuModuleGetFunction PFN_cuModuleGetFunction_v2000 +#define PFN_cuModuleGetGlobal PFN_cuModuleGetGlobal_v3020 +#define PFN_cuModuleGetTexRef PFN_cuModuleGetTexRef_v2000 +#define PFN_cuModuleGetSurfRef PFN_cuModuleGetSurfRef_v3000 +#define PFN_cuLinkCreate PFN_cuLinkCreate_v6050 +#define PFN_cuLinkAddData PFN_cuLinkAddData_v6050 +#define PFN_cuLinkAddFile PFN_cuLinkAddFile_v6050 +#define PFN_cuLinkComplete PFN_cuLinkComplete_v5050 +#define PFN_cuLinkDestroy PFN_cuLinkDestroy_v5050 +#define PFN_cuMemGetInfo PFN_cuMemGetInfo_v3020 +#define PFN_cuMemAlloc PFN_cuMemAlloc_v3020 +#define PFN_cuMemAllocPitch PFN_cuMemAllocPitch_v3020 +#define PFN_cuMemFree PFN_cuMemFree_v3020 +#define PFN_cuMemGetAddressRange PFN_cuMemGetAddressRange_v3020 +#define PFN_cuMemAllocHost PFN_cuMemAllocHost_v3020 +#define PFN_cuMemFreeHost PFN_cuMemFreeHost_v2000 +#define PFN_cuMemHostAlloc PFN_cuMemHostAlloc_v2020 +#define PFN_cuMemHostGetDevicePointer PFN_cuMemHostGetDevicePointer_v3020 +#define PFN_cuMemHostGetFlags PFN_cuMemHostGetFlags_v2030 +#define PFN_cuMemAllocManaged PFN_cuMemAllocManaged_v6000 +#define PFN_cuDeviceGetByPCIBusId PFN_cuDeviceGetByPCIBusId_v4010 +#define PFN_cuDeviceGetPCIBusId PFN_cuDeviceGetPCIBusId_v4010 +#define PFN_cuIpcGetEventHandle PFN_cuIpcGetEventHandle_v4010 +#define PFN_cuIpcOpenEventHandle PFN_cuIpcOpenEventHandle_v4010 +#define PFN_cuIpcGetMemHandle PFN_cuIpcGetMemHandle_v4010 +#define PFN_cuIpcOpenMemHandle PFN_cuIpcOpenMemHandle_v11000 +#define PFN_cuIpcCloseMemHandle PFN_cuIpcCloseMemHandle_v4010 +#define PFN_cuMemHostRegister PFN_cuMemHostRegister_v6050 +#define PFN_cuMemHostUnregister PFN_cuMemHostUnregister_v4000 +#define PFN_cuMemcpy __API_TYPEDEF_PTDS(PFN_cuMemcpy, 4000, 7000) +#define PFN_cuMemcpyPeer __API_TYPEDEF_PTDS(PFN_cuMemcpyPeer, 4000, 7000) +#define PFN_cuMemcpyHtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoD, 3020, 7000) +#define PFN_cuMemcpyDtoH __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoH, 3020, 7000) +#define PFN_cuMemcpyDtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoD, 3020, 7000) +#define PFN_cuMemcpyDtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyDtoA, 3020, 7000) +#define PFN_cuMemcpyAtoD __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoD, 3020, 7000) +#define PFN_cuMemcpyHtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyHtoA, 3020, 7000) +#define PFN_cuMemcpyAtoH __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoH, 3020, 7000) +#define PFN_cuMemcpyAtoA __API_TYPEDEF_PTDS(PFN_cuMemcpyAtoA, 3020, 7000) +#define PFN_cuMemcpy2D __API_TYPEDEF_PTDS(PFN_cuMemcpy2D, 3020, 7000) +#define PFN_cuMemcpy2DUnaligned 
__API_TYPEDEF_PTDS(PFN_cuMemcpy2DUnaligned, 3020, 7000) +#define PFN_cuMemcpy3D __API_TYPEDEF_PTDS(PFN_cuMemcpy3D, 3020, 7000) +#define PFN_cuMemcpy3DPeer __API_TYPEDEF_PTDS(PFN_cuMemcpy3DPeer, 4000, 7000) +#define PFN_cuMemcpyAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyAsync, 4000, 7000) +#define PFN_cuMemcpyPeerAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyPeerAsync, 4000, 7000) +#define PFN_cuMemcpyHtoDAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoDAsync, 3020, 7000) +#define PFN_cuMemcpyDtoHAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoHAsync, 3020, 7000) +#define PFN_cuMemcpyDtoDAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyDtoDAsync, 3020, 7000) +#define PFN_cuMemcpyHtoAAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyHtoAAsync, 3020, 7000) +#define PFN_cuMemcpyAtoHAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpyAtoHAsync, 3020, 7000) +#define PFN_cuMemcpy2DAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy2DAsync, 3020, 7000) +#define PFN_cuMemcpy3DAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DAsync, 3020, 7000) +#define PFN_cuMemcpy3DPeerAsync __API_TYPEDEF_PTSZ(PFN_cuMemcpy3DPeerAsync, 4000, 7000) +#define PFN_cuMemsetD8 __API_TYPEDEF_PTDS(PFN_cuMemsetD8, 3020, 7000) +#define PFN_cuMemsetD16 __API_TYPEDEF_PTDS(PFN_cuMemsetD16, 3020, 7000) +#define PFN_cuMemsetD32 __API_TYPEDEF_PTDS(PFN_cuMemsetD32, 3020, 7000) +#define PFN_cuMemsetD2D8 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D8, 3020, 7000) +#define PFN_cuMemsetD2D16 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D16, 3020, 7000) +#define PFN_cuMemsetD2D32 __API_TYPEDEF_PTDS(PFN_cuMemsetD2D32, 3020, 7000) +#define PFN_cuMemsetD8Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD8Async, 3020, 7000) +#define PFN_cuMemsetD16Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD16Async, 3020, 7000) +#define PFN_cuMemsetD32Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD32Async, 3020, 7000) +#define PFN_cuMemsetD2D8Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D8Async, 3020, 7000) +#define PFN_cuMemsetD2D16Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D16Async, 3020, 7000) +#define PFN_cuMemsetD2D32Async __API_TYPEDEF_PTSZ(PFN_cuMemsetD2D32Async, 3020, 7000) +#define PFN_cuArrayCreate PFN_cuArrayCreate_v3020 +#define PFN_cuArrayGetDescriptor PFN_cuArrayGetDescriptor_v3020 +#define PFN_cuArrayGetSparseProperties PFN_cuArrayGetSparseProperties_v11010 +#define PFN_cuMipmappedArrayGetSparseProperties PFN_cuMipmappedArrayGetSparseProperties_v11010 +#define PFN_cuArrayGetMemoryRequirements PFN_cuArrayGetMemoryRequirements_v11060 +#define PFN_cuMipmappedArrayGetMemoryRequirements PFN_cuMipmappedArrayGetMemoryRequirements_v11060 +#define PFN_cuArrayGetPlane PFN_cuArrayGetPlane_v11020 +#define PFN_cuArrayDestroy PFN_cuArrayDestroy_v2000 +#define PFN_cuArray3DCreate PFN_cuArray3DCreate_v3020 +#define PFN_cuArray3DGetDescriptor PFN_cuArray3DGetDescriptor_v3020 +#define PFN_cuMipmappedArrayCreate PFN_cuMipmappedArrayCreate_v5000 +#define PFN_cuMipmappedArrayGetLevel PFN_cuMipmappedArrayGetLevel_v5000 +#define PFN_cuMipmappedArrayDestroy PFN_cuMipmappedArrayDestroy_v5000 +#define PFN_cuMemAddressReserve PFN_cuMemAddressReserve_v10020 +#define PFN_cuMemAddressFree PFN_cuMemAddressFree_v10020 +#define PFN_cuMemCreate PFN_cuMemCreate_v10020 +#define PFN_cuMemRelease PFN_cuMemRelease_v10020 +#define PFN_cuMemMap PFN_cuMemMap_v10020 +#define PFN_cuMemMapArrayAsync __API_TYPEDEF_PTSZ(PFN_cuMemMapArrayAsync, 11010, 11010) +#define PFN_cuMemUnmap PFN_cuMemUnmap_v10020 +#define PFN_cuMemSetAccess PFN_cuMemSetAccess_v10020 +#define PFN_cuMemGetAccess PFN_cuMemGetAccess_v10020 +#define PFN_cuMemExportToShareableHandle PFN_cuMemExportToShareableHandle_v10020 +#define PFN_cuMemImportFromShareableHandle 
PFN_cuMemImportFromShareableHandle_v10020 +#define PFN_cuMemGetAllocationGranularity PFN_cuMemGetAllocationGranularity_v10020 +#define PFN_cuMemGetAllocationPropertiesFromHandle PFN_cuMemGetAllocationPropertiesFromHandle_v10020 +#define PFN_cuMemRetainAllocationHandle PFN_cuMemRetainAllocationHandle_v11000 +#define PFN_cuMemFreeAsync __API_TYPEDEF_PTSZ(PFN_cuMemFreeAsync, 11020, 11020) +#define PFN_cuMemAllocAsync __API_TYPEDEF_PTSZ(PFN_cuMemAllocAsync, 11020, 11020) +#define PFN_cuMemPoolTrimTo PFN_cuMemPoolTrimTo_v11020 +#define PFN_cuMemPoolSetAttribute PFN_cuMemPoolSetAttribute_v11020 +#define PFN_cuMemPoolGetAttribute PFN_cuMemPoolGetAttribute_v11020 +#define PFN_cuMemPoolSetAccess PFN_cuMemPoolSetAccess_v11020 +#define PFN_cuMemPoolGetAccess PFN_cuMemPoolGetAccess_v11020 +#define PFN_cuMemPoolCreate PFN_cuMemPoolCreate_v11020 +#define PFN_cuMemPoolDestroy PFN_cuMemPoolDestroy_v11020 +#define PFN_cuMemAllocFromPoolAsync __API_TYPEDEF_PTSZ(PFN_cuMemAllocFromPoolAsync, 11020, 11020) +#define PFN_cuMemPoolExportToShareableHandle PFN_cuMemPoolExportToShareableHandle_v11020 +#define PFN_cuMemPoolImportFromShareableHandle PFN_cuMemPoolImportFromShareableHandle_v11020 +#define PFN_cuMemPoolExportPointer PFN_cuMemPoolExportPointer_v11020 +#define PFN_cuMemPoolImportPointer PFN_cuMemPoolImportPointer_v11020 +#define PFN_cuPointerGetAttribute PFN_cuPointerGetAttribute_v4000 +#define PFN_cuMemPrefetchAsync __API_TYPEDEF_PTSZ(PFN_cuMemPrefetchAsync, 8000, 8000) +#define PFN_cuMemAdvise PFN_cuMemAdvise_v8000 +#define PFN_cuMemRangeGetAttribute PFN_cuMemRangeGetAttribute_v8000 +#define PFN_cuMemRangeGetAttributes PFN_cuMemRangeGetAttributes_v8000 +#define PFN_cuPointerSetAttribute PFN_cuPointerSetAttribute_v6000 +#define PFN_cuPointerGetAttributes PFN_cuPointerGetAttributes_v7000 +#define PFN_cuStreamCreate PFN_cuStreamCreate_v2000 +#define PFN_cuStreamCreateWithPriority PFN_cuStreamCreateWithPriority_v5050 +#define PFN_cuStreamGetPriority __API_TYPEDEF_PTSZ(PFN_cuStreamGetPriority, 5050, 7000) +#define PFN_cuStreamGetFlags __API_TYPEDEF_PTSZ(PFN_cuStreamGetFlags, 5050, 7000) +#define PFN_cuStreamGetCtx __API_TYPEDEF_PTSZ(PFN_cuStreamGetCtx, 9020, 9020) +#define PFN_cuStreamWaitEvent __API_TYPEDEF_PTSZ(PFN_cuStreamWaitEvent, 3020, 7000) +#define PFN_cuStreamAddCallback __API_TYPEDEF_PTSZ(PFN_cuStreamAddCallback, 5000, 7000) +#define PFN_cuStreamBeginCapture __API_TYPEDEF_PTSZ(PFN_cuStreamBeginCapture, 10010, 10010) +#define PFN_cuThreadExchangeStreamCaptureMode PFN_cuThreadExchangeStreamCaptureMode_v10010 +#define PFN_cuStreamEndCapture __API_TYPEDEF_PTSZ(PFN_cuStreamEndCapture, 10000, 10000) +#define PFN_cuStreamIsCapturing __API_TYPEDEF_PTSZ(PFN_cuStreamIsCapturing, 10000, 10000) +#define PFN_cuStreamGetCaptureInfo __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 10010, 10010) +#define PFN_cuStreamGetCaptureInfo_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamGetCaptureInfo, 11030, 11030) +#define PFN_cuStreamUpdateCaptureDependencies __API_TYPEDEF_PTSZ(PFN_cuStreamUpdateCaptureDependencies, 11030, 11030) +#define PFN_cuStreamAttachMemAsync __API_TYPEDEF_PTSZ(PFN_cuStreamAttachMemAsync, 6000, 7000) +#define PFN_cuStreamQuery __API_TYPEDEF_PTSZ(PFN_cuStreamQuery, 2000, 7000) +#define PFN_cuStreamSynchronize __API_TYPEDEF_PTSZ(PFN_cuStreamSynchronize, 2000, 7000) +#define PFN_cuStreamDestroy PFN_cuStreamDestroy_v4000 +#define PFN_cuStreamCopyAttributes __API_TYPEDEF_PTSZ(PFN_cuStreamCopyAttributes, 11000, 11000) +#define PFN_cuStreamGetAttribute __API_TYPEDEF_PTSZ(PFN_cuStreamGetAttribute, 11000, 11000) +#define 
PFN_cuStreamSetAttribute __API_TYPEDEF_PTSZ(PFN_cuStreamSetAttribute, 11000, 11000) +#define PFN_cuEventCreate PFN_cuEventCreate_v2000 +#define PFN_cuEventRecord __API_TYPEDEF_PTSZ(PFN_cuEventRecord, 2000, 7000) +#define PFN_cuEventRecordWithFlags __API_TYPEDEF_PTSZ(PFN_cuEventRecordWithFlags, 11010, 11010) +#define PFN_cuEventQuery PFN_cuEventQuery_v2000 +#define PFN_cuEventSynchronize PFN_cuEventSynchronize_v2000 +#define PFN_cuEventDestroy PFN_cuEventDestroy_v4000 +#define PFN_cuEventElapsedTime PFN_cuEventElapsedTime_v2000 +#define PFN_cuImportExternalMemory PFN_cuImportExternalMemory_v10000 +#define PFN_cuExternalMemoryGetMappedBuffer PFN_cuExternalMemoryGetMappedBuffer_v10000 +#define PFN_cuExternalMemoryGetMappedMipmappedArray PFN_cuExternalMemoryGetMappedMipmappedArray_v10000 +#define PFN_cuDestroyExternalMemory PFN_cuDestroyExternalMemory_v10000 +#define PFN_cuImportExternalSemaphore PFN_cuImportExternalSemaphore_v10000 +#define PFN_cuSignalExternalSemaphoresAsync __API_TYPEDEF_PTSZ(PFN_cuSignalExternalSemaphoresAsync, 10000, 10000) +#define PFN_cuWaitExternalSemaphoresAsync __API_TYPEDEF_PTSZ(PFN_cuWaitExternalSemaphoresAsync, 10000, 10000) +#define PFN_cuDestroyExternalSemaphore PFN_cuDestroyExternalSemaphore_v10000 +#define PFN_cuStreamWaitValue32 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 8000, 8000) +#define PFN_cuStreamWaitValue64 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 9000, 9000) +#define PFN_cuStreamWriteValue32 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 8000, 8000) +#define PFN_cuStreamWriteValue64 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 9000, 9000) +#define PFN_cuStreamBatchMemOp __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 8000, 8000) +#define PFN_cuStreamWaitValue32_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue32, 11070, 11070) +#define PFN_cuStreamWaitValue64_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWaitValue64, 11070, 11070) +#define PFN_cuStreamWriteValue32_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue32, 11070, 11070) +#define PFN_cuStreamWriteValue64_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamWriteValue64, 11070, 11070) +#define PFN_cuStreamBatchMemOp_v2 __API_TYPEDEF_PTSZ(PFN_cuStreamBatchMemOp, 11070, 11070) +#define PFN_cuFuncGetAttribute PFN_cuFuncGetAttribute_v2020 +#define PFN_cuFuncSetAttribute PFN_cuFuncSetAttribute_v9000 +#define PFN_cuFuncSetCacheConfig PFN_cuFuncSetCacheConfig_v3000 +#define PFN_cuFuncSetSharedMemConfig PFN_cuFuncSetSharedMemConfig_v4020 +#define PFN_cuLaunchKernel __API_TYPEDEF_PTSZ(PFN_cuLaunchKernel, 4000, 7000) +#define PFN_cuLaunchKernelEx __API_TYPEDEF_PTSZ(PFN_cuLaunchKernelEx, 11060, 11060) +#define PFN_cuLaunchCooperativeKernel __API_TYPEDEF_PTSZ(PFN_cuLaunchCooperativeKernel, 9000, 9000) +#define PFN_cuLaunchCooperativeKernelMultiDevice PFN_cuLaunchCooperativeKernelMultiDevice_v9000 +#define PFN_cuLaunchHostFunc __API_TYPEDEF_PTSZ(PFN_cuLaunchHostFunc, 10000, 10000) +#define PFN_cuFuncSetBlockShape PFN_cuFuncSetBlockShape_v2000 +#define PFN_cuFuncSetSharedSize PFN_cuFuncSetSharedSize_v2000 +#define PFN_cuParamSetSize PFN_cuParamSetSize_v2000 +#define PFN_cuParamSeti PFN_cuParamSeti_v2000 +#define PFN_cuParamSetf PFN_cuParamSetf_v2000 +#define PFN_cuParamSetv PFN_cuParamSetv_v2000 +#define PFN_cuLaunch PFN_cuLaunch_v2000 +#define PFN_cuLaunchGrid PFN_cuLaunchGrid_v2000 +#define PFN_cuLaunchGridAsync PFN_cuLaunchGridAsync_v2000 +#define PFN_cuParamSetTexRef PFN_cuParamSetTexRef_v2000 +#define PFN_cuGraphCreate PFN_cuGraphCreate_v10000 +#define PFN_cuGraphAddKernelNode PFN_cuGraphAddKernelNode_v10000 +#define 
PFN_cuGraphKernelNodeGetParams PFN_cuGraphKernelNodeGetParams_v10000 +#define PFN_cuGraphKernelNodeSetParams PFN_cuGraphKernelNodeSetParams_v10000 +#define PFN_cuGraphAddMemcpyNode PFN_cuGraphAddMemcpyNode_v10000 +#define PFN_cuGraphMemcpyNodeGetParams PFN_cuGraphMemcpyNodeGetParams_v10000 +#define PFN_cuGraphMemcpyNodeSetParams PFN_cuGraphMemcpyNodeSetParams_v10000 +#define PFN_cuGraphAddMemsetNode PFN_cuGraphAddMemsetNode_v10000 +#define PFN_cuGraphMemsetNodeGetParams PFN_cuGraphMemsetNodeGetParams_v10000 +#define PFN_cuGraphMemsetNodeSetParams PFN_cuGraphMemsetNodeSetParams_v10000 +#define PFN_cuGraphAddHostNode PFN_cuGraphAddHostNode_v10000 +#define PFN_cuGraphHostNodeGetParams PFN_cuGraphHostNodeGetParams_v10000 +#define PFN_cuGraphHostNodeSetParams PFN_cuGraphHostNodeSetParams_v10000 +#define PFN_cuGraphAddChildGraphNode PFN_cuGraphAddChildGraphNode_v10000 +#define PFN_cuGraphChildGraphNodeGetGraph PFN_cuGraphChildGraphNodeGetGraph_v10000 +#define PFN_cuGraphAddEmptyNode PFN_cuGraphAddEmptyNode_v10000 +#define PFN_cuGraphAddEventRecordNode PFN_cuGraphAddEventRecordNode_v11010 +#define PFN_cuGraphEventRecordNodeGetEvent PFN_cuGraphEventRecordNodeGetEvent_v11010 +#define PFN_cuGraphEventRecordNodeSetEvent PFN_cuGraphEventRecordNodeSetEvent_v11010 +#define PFN_cuGraphAddEventWaitNode PFN_cuGraphAddEventWaitNode_v11010 +#define PFN_cuGraphEventWaitNodeGetEvent PFN_cuGraphEventWaitNodeGetEvent_v11010 +#define PFN_cuGraphEventWaitNodeSetEvent PFN_cuGraphEventWaitNodeSetEvent_v11010 +#define PFN_cuGraphAddExternalSemaphoresSignalNode PFN_cuGraphAddExternalSemaphoresSignalNode_v11020 +#define PFN_cuGraphExternalSemaphoresSignalNodeGetParams PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020 +#define PFN_cuGraphExternalSemaphoresSignalNodeSetParams PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020 +#define PFN_cuGraphAddExternalSemaphoresWaitNode PFN_cuGraphAddExternalSemaphoresWaitNode_v11020 +#define PFN_cuGraphExternalSemaphoresWaitNodeGetParams PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020 +#define PFN_cuGraphExternalSemaphoresWaitNodeSetParams PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020 +#define PFN_cuGraphAddBatchMemOpNode PFN_cuGraphAddBatchMemOpNode_v11070 +#define PFN_cuGraphBatchMemOpNodeGetParams PFN_cuGraphBatchMemOpNodeGetParams_v11070 +#define PFN_cuGraphBatchMemOpNodeSetParams PFN_cuGraphBatchMemOpNodeSetParams_v11070 +#define PFN_cuGraphExecBatchMemOpNodeSetParams PFN_cuGraphExecBatchMemOpNodeSetParams_v11070 +#define PFN_cuGraphClone PFN_cuGraphClone_v10000 +#define PFN_cuGraphNodeFindInClone PFN_cuGraphNodeFindInClone_v10000 +#define PFN_cuGraphNodeGetType PFN_cuGraphNodeGetType_v10000 +#define PFN_cuGraphGetNodes PFN_cuGraphGetNodes_v10000 +#define PFN_cuGraphGetRootNodes PFN_cuGraphGetRootNodes_v10000 +#define PFN_cuGraphGetEdges PFN_cuGraphGetEdges_v10000 +#define PFN_cuGraphNodeGetDependencies PFN_cuGraphNodeGetDependencies_v10000 +#define PFN_cuGraphNodeGetDependentNodes PFN_cuGraphNodeGetDependentNodes_v10000 +#define PFN_cuGraphAddDependencies PFN_cuGraphAddDependencies_v10000 +#define PFN_cuGraphRemoveDependencies PFN_cuGraphRemoveDependencies_v10000 +#define PFN_cuGraphDestroyNode PFN_cuGraphDestroyNode_v10000 +#define PFN_cuGraphInstantiate PFN_cuGraphInstantiate_v11000 +#define PFN_cuGraphInstantiateWithFlags PFN_cuGraphInstantiateWithFlags_v11040 +#define PFN_cuGraphExecKernelNodeSetParams PFN_cuGraphExecKernelNodeSetParams_v10010 +#define PFN_cuGraphExecMemcpyNodeSetParams PFN_cuGraphExecMemcpyNodeSetParams_v10020 +#define
PFN_cuGraphExecMemsetNodeSetParams PFN_cuGraphExecMemsetNodeSetParams_v10020 +#define PFN_cuGraphExecHostNodeSetParams PFN_cuGraphExecHostNodeSetParams_v10020 +#define PFN_cuGraphExecChildGraphNodeSetParams PFN_cuGraphExecChildGraphNodeSetParams_v11010 +#define PFN_cuGraphExecEventRecordNodeSetEvent PFN_cuGraphExecEventRecordNodeSetEvent_v11010 +#define PFN_cuGraphExecEventWaitNodeSetEvent PFN_cuGraphExecEventWaitNodeSetEvent_v11010 +#define PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020 +#define PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020 +#define PFN_cuGraphUpload __API_TYPEDEF_PTSZ(PFN_cuGraphUpload, 11010, 11010) +#define PFN_cuGraphLaunch __API_TYPEDEF_PTSZ(PFN_cuGraphLaunch, 10000, 10000) +#define PFN_cuGraphExecDestroy PFN_cuGraphExecDestroy_v10000 +#define PFN_cuGraphDestroy PFN_cuGraphDestroy_v10000 +#define PFN_cuGraphExecUpdate PFN_cuGraphExecUpdate_v10020 +#define PFN_cuGraphKernelNodeCopyAttributes PFN_cuGraphKernelNodeCopyAttributes_v11000 +#define PFN_cuGraphKernelNodeGetAttribute PFN_cuGraphKernelNodeGetAttribute_v11000 +#define PFN_cuGraphKernelNodeSetAttribute PFN_cuGraphKernelNodeSetAttribute_v11000 +#define PFN_cuGraphDebugDotPrint PFN_cuGraphDebugDotPrint_v11030 +#define PFN_cuGraphAddMemAllocNode PFN_cuGraphAddMemAllocNode_v11040 +#define PFN_cuGraphMemAllocNodeGetParams PFN_cuGraphMemAllocNodeGetParams_v11040 +#define PFN_cuGraphAddMemFreeNode PFN_cuGraphAddMemFreeNode_v11040 +#define PFN_cuGraphMemFreeNodeGetParams PFN_cuGraphMemFreeNodeGetParams_v11040 +#define PFN_cuGraphNodeSetEnabled PFN_cuGraphNodeSetEnabled_v11060 +#define PFN_cuGraphNodeGetEnabled PFN_cuGraphNodeGetEnabled_v11060 +#define PFN_cuDeviceGraphMemTrim PFN_cuDeviceGraphMemTrim_v11040 +#define PFN_cuDeviceGetGraphMemAttribute PFN_cuDeviceGetGraphMemAttribute_v11040 +#define PFN_cuDeviceSetGraphMemAttribute PFN_cuDeviceSetGraphMemAttribute_v11040 +#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050 +#define PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000 +#define PFN_cuOccupancyMaxPotentialBlockSize PFN_cuOccupancyMaxPotentialBlockSize_v6050 +#define PFN_cuOccupancyMaxPotentialBlockSizeWithFlags PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000 +#define PFN_cuOccupancyAvailableDynamicSMemPerBlock PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020 +#define PFN_cuOccupancyMaxPotentialClusterSize PFN_cuOccupancyMaxPotentialClusterSize_v11070 +#define PFN_cuOccupancyMaxActiveClusters PFN_cuOccupancyMaxActiveClusters_v11070 +#define PFN_cuTexRefSetArray PFN_cuTexRefSetArray_v2000 +#define PFN_cuTexRefSetMipmappedArray PFN_cuTexRefSetMipmappedArray_v5000 +#define PFN_cuTexRefSetAddress PFN_cuTexRefSetAddress_v3020 +#define PFN_cuTexRefSetAddress2D PFN_cuTexRefSetAddress2D_v4010 +#define PFN_cuTexRefSetFormat PFN_cuTexRefSetFormat_v2000 +#define PFN_cuTexRefSetAddressMode PFN_cuTexRefSetAddressMode_v2000 +#define PFN_cuTexRefSetFilterMode PFN_cuTexRefSetFilterMode_v2000 +#define PFN_cuTexRefSetMipmapFilterMode PFN_cuTexRefSetMipmapFilterMode_v5000 +#define PFN_cuTexRefSetMipmapLevelBias PFN_cuTexRefSetMipmapLevelBias_v5000 +#define PFN_cuTexRefSetMipmapLevelClamp PFN_cuTexRefSetMipmapLevelClamp_v5000 +#define PFN_cuTexRefSetMaxAnisotropy PFN_cuTexRefSetMaxAnisotropy_v5000 +#define PFN_cuTexRefSetBorderColor PFN_cuTexRefSetBorderColor_v8000 +#define 
PFN_cuTexRefSetFlags PFN_cuTexRefSetFlags_v2000 +#define PFN_cuTexRefGetAddress PFN_cuTexRefGetAddress_v3020 +#define PFN_cuTexRefGetArray PFN_cuTexRefGetArray_v2000 +#define PFN_cuTexRefGetMipmappedArray PFN_cuTexRefGetMipmappedArray_v5000 +#define PFN_cuTexRefGetAddressMode PFN_cuTexRefGetAddressMode_v2000 +#define PFN_cuTexRefGetFilterMode PFN_cuTexRefGetFilterMode_v2000 +#define PFN_cuTexRefGetFormat PFN_cuTexRefGetFormat_v2000 +#define PFN_cuTexRefGetMipmapFilterMode PFN_cuTexRefGetMipmapFilterMode_v5000 +#define PFN_cuTexRefGetMipmapLevelBias PFN_cuTexRefGetMipmapLevelBias_v5000 +#define PFN_cuTexRefGetMipmapLevelClamp PFN_cuTexRefGetMipmapLevelClamp_v5000 +#define PFN_cuTexRefGetMaxAnisotropy PFN_cuTexRefGetMaxAnisotropy_v5000 +#define PFN_cuTexRefGetBorderColor PFN_cuTexRefGetBorderColor_v8000 +#define PFN_cuTexRefGetFlags PFN_cuTexRefGetFlags_v2000 +#define PFN_cuTexRefCreate PFN_cuTexRefCreate_v2000 +#define PFN_cuTexRefDestroy PFN_cuTexRefDestroy_v2000 +#define PFN_cuSurfRefSetArray PFN_cuSurfRefSetArray_v3000 +#define PFN_cuSurfRefGetArray PFN_cuSurfRefGetArray_v3000 +#define PFN_cuTexObjectCreate PFN_cuTexObjectCreate_v5000 +#define PFN_cuTexObjectDestroy PFN_cuTexObjectDestroy_v5000 +#define PFN_cuTexObjectGetResourceDesc PFN_cuTexObjectGetResourceDesc_v5000 +#define PFN_cuTexObjectGetTextureDesc PFN_cuTexObjectGetTextureDesc_v5000 +#define PFN_cuTexObjectGetResourceViewDesc PFN_cuTexObjectGetResourceViewDesc_v5000 +#define PFN_cuSurfObjectCreate PFN_cuSurfObjectCreate_v5000 +#define PFN_cuSurfObjectDestroy PFN_cuSurfObjectDestroy_v5000 +#define PFN_cuSurfObjectGetResourceDesc PFN_cuSurfObjectGetResourceDesc_v5000 +#define PFN_cuDeviceCanAccessPeer PFN_cuDeviceCanAccessPeer_v4000 +#define PFN_cuCtxEnablePeerAccess PFN_cuCtxEnablePeerAccess_v4000 +#define PFN_cuCtxDisablePeerAccess PFN_cuCtxDisablePeerAccess_v4000 +#define PFN_cuDeviceGetP2PAttribute PFN_cuDeviceGetP2PAttribute_v8000 +#define PFN_cuGraphicsUnregisterResource PFN_cuGraphicsUnregisterResource_v3000 +#define PFN_cuGraphicsSubResourceGetMappedArray PFN_cuGraphicsSubResourceGetMappedArray_v3000 +#define PFN_cuGraphicsResourceGetMappedMipmappedArray PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000 +#define PFN_cuGraphicsResourceGetMappedPointer PFN_cuGraphicsResourceGetMappedPointer_v3020 +#define PFN_cuGraphicsResourceSetMapFlags PFN_cuGraphicsResourceSetMapFlags_v6050 +#define PFN_cuGraphicsMapResources __API_TYPEDEF_PTSZ(PFN_cuGraphicsMapResources, 3000, 7000) +#define PFN_cuGraphicsUnmapResources __API_TYPEDEF_PTSZ(PFN_cuGraphicsUnmapResources, 3000, 7000) +#define PFN_cuGetExportTable PFN_cuGetExportTable_v3000 +#define PFN_cuFuncGetModule PFN_cuFuncGetModule_v11000 +#define PFN_cuFlushGPUDirectRDMAWrites PFN_cuFlushGPUDirectRDMAWrites_v11030 +#define PFN_cuGetProcAddress PFN_cuGetProcAddress_v11030 +#define PFN_cuUserObjectCreate PFN_cuUserObjectCreate_v11030 +#define PFN_cuUserObjectRetain PFN_cuUserObjectRetain_v11030 +#define PFN_cuUserObjectRelease PFN_cuUserObjectRelease_v11030 +#define PFN_cuGraphRetainUserObject PFN_cuGraphRetainUserObject_v11030 +#define PFN_cuGraphReleaseUserObject PFN_cuGraphReleaseUserObject_v11030 +#define PFN_cuModuleGetLoadingMode PFN_cuModuleGetLoadingMode_v11070 +#define PFN_cuMemGetHandleForAddressRange PFN_cuMemGetHandleForAddressRange_v11070 + +/* + * Type definitions for functions defined in cuda.h + */ +typedef CUresult (CUDAAPI *PFN_cuGetErrorString_v6000)(CUresult error, const char **pStr); +typedef CUresult (CUDAAPI *PFN_cuGetErrorName_v6000)(CUresult error, const 
char **pStr); +typedef CUresult (CUDAAPI *PFN_cuInit_v2000)(unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuDriverGetVersion_v2020)(int *driverVersion); +typedef CUresult (CUDAAPI *PFN_cuDeviceGet_v2000)(CUdevice_v1 *device, int ordinal); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetCount_v2000)(int *count); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetName_v2000)(char *name, int len, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v9020)(CUuuid *uuid, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetUuid_v11040)(CUuuid *uuid, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetLuid_v10000)(char *luid, unsigned int *deviceNodeMask, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v3020)(size_t *bytes, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetTexture1DLinearMaxWidth_v11010)(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetAttribute_v2000)(int *pi, CUdevice_attribute attrib, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetNvSciSyncAttributes_v10020)(void *nvSciSyncAttrList, CUdevice_v1 dev, int flags); +typedef CUresult (CUDAAPI *PFN_cuDeviceSetMemPool_v11020)(CUdevice_v1 dev, CUmemoryPool pool); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetMemPool_v11020)(CUmemoryPool *pool, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetDefaultMemPool_v11020)(CUmemoryPool *pool_out, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetProperties_v2000)(CUdevprop_v1 *prop, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDeviceComputeCapability_v2000)(int *major, int *minor, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRetain_v7000)(CUcontext *pctx, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v11000)(CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v11000)(CUdevice_v1 dev, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxGetState_v7000)(CUdevice_v1 dev, unsigned int *flags, int *active); +typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v11000)(CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetExecAffinitySupport_v11040)(int *pi, CUexecAffinityType type, CUdevice dev); +typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v3020)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v11040)(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v4000)(CUcontext ctx); +typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v4000)(CUcontext ctx); +typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v4000)(CUcontext *pctx); +typedef CUresult (CUDAAPI *PFN_cuCtxSetCurrent_v4000)(CUcontext ctx); +typedef CUresult (CUDAAPI *PFN_cuCtxGetCurrent_v4000)(CUcontext *pctx); +typedef CUresult (CUDAAPI *PFN_cuCtxGetDevice_v2000)(CUdevice_v1 *device); +typedef CUresult (CUDAAPI *PFN_cuCtxGetFlags_v7000)(unsigned int *flags); +typedef CUresult (CUDAAPI *PFN_cuCtxSynchronize_v2000)(void); +typedef CUresult (CUDAAPI *PFN_cuCtxSetLimit_v3010)(CUlimit limit, size_t value); +typedef CUresult (CUDAAPI *PFN_cuCtxGetLimit_v3010)(size_t *pvalue, CUlimit limit); +typedef CUresult (CUDAAPI *PFN_cuCtxGetCacheConfig_v3020)(CUfunc_cache *pconfig); +typedef CUresult (CUDAAPI *PFN_cuCtxSetCacheConfig_v3020)(CUfunc_cache config); +typedef CUresult (CUDAAPI *PFN_cuCtxGetSharedMemConfig_v4020)(CUsharedconfig 
*pConfig); +typedef CUresult (CUDAAPI *PFN_cuCtxSetSharedMemConfig_v4020)(CUsharedconfig config); +typedef CUresult (CUDAAPI *PFN_cuCtxGetApiVersion_v3020)(CUcontext ctx, unsigned int *version); +typedef CUresult (CUDAAPI *PFN_cuCtxGetStreamPriorityRange_v5050)(int *leastPriority, int *greatestPriority); +typedef CUresult (CUDAAPI *PFN_cuCtxResetPersistingL2Cache_v11000)(void); +typedef CUresult (CUDAAPI *PFN_cuCtxAttach_v2000)(CUcontext *pctx, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuCtxDetach_v2000)(CUcontext ctx); +typedef CUresult (CUDAAPI *PFN_cuCtxGetExecAffinity_v11040)(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type); +typedef CUresult (CUDAAPI *PFN_cuModuleLoad_v2000)(CUmodule *module, const char *fname); +typedef CUresult (CUDAAPI *PFN_cuModuleLoadData_v2000)(CUmodule *module, const void *image); +typedef CUresult (CUDAAPI *PFN_cuModuleLoadDataEx_v2010)(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); +typedef CUresult (CUDAAPI *PFN_cuModuleLoadFatBinary_v2000)(CUmodule *module, const void *fatCubin); +typedef CUresult (CUDAAPI *PFN_cuModuleUnload_v2000)(CUmodule hmod); +typedef CUresult (CUDAAPI *PFN_cuModuleGetFunction_v2000)(CUfunction *hfunc, CUmodule hmod, const char *name); +typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v3020)(CUdeviceptr_v2 *dptr, size_t *bytes, CUmodule hmod, const char *name); +typedef CUresult (CUDAAPI *PFN_cuModuleGetTexRef_v2000)(CUtexref *pTexRef, CUmodule hmod, const char *name); +typedef CUresult (CUDAAPI *PFN_cuModuleGetSurfRef_v3000)(CUsurfref *pSurfRef, CUmodule hmod, const char *name); +typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v6050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); +typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v6050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues); +typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v6050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues); +typedef CUresult (CUDAAPI *PFN_cuLinkComplete_v5050)(CUlinkState state, void **cubinOut, size_t *sizeOut); +typedef CUresult (CUDAAPI *PFN_cuLinkDestroy_v5050)(CUlinkState state); +typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v3020)(size_t *free, size_t *total); +typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v3020)(CUdeviceptr_v2 *dptr, size_t bytesize); +typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v3020)(CUdeviceptr_v2 *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes); +typedef CUresult (CUDAAPI *PFN_cuMemFree_v3020)(CUdeviceptr_v2 dptr); +typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v3020)(CUdeviceptr_v2 *pbase, size_t *psize, CUdeviceptr_v2 dptr); +typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v3020)(void **pp, size_t bytesize); +typedef CUresult (CUDAAPI *PFN_cuMemFreeHost_v2000)(void *p); +typedef CUresult (CUDAAPI *PFN_cuMemHostAlloc_v2020)(void **pp, size_t bytesize, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v3020)(CUdeviceptr_v2 *pdptr, void *p, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuMemHostGetFlags_v2030)(unsigned int *pFlags, void *p); +typedef CUresult (CUDAAPI *PFN_cuMemAllocManaged_v6000)(CUdeviceptr_v2 *dptr, size_t bytesize, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetByPCIBusId_v4010)(CUdevice_v1 *dev, const char 
*pciBusId); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetPCIBusId_v4010)(char *pciBusId, int len, CUdevice_v1 dev); +typedef CUresult (CUDAAPI *PFN_cuIpcGetEventHandle_v4010)(CUipcEventHandle_v1 *pHandle, CUevent event); +typedef CUresult (CUDAAPI *PFN_cuIpcOpenEventHandle_v4010)(CUevent *phEvent, CUipcEventHandle_v1 handle); +typedef CUresult (CUDAAPI *PFN_cuIpcGetMemHandle_v4010)(CUipcMemHandle_v1 *pHandle, CUdeviceptr_v2 dptr); +typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v11000)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuIpcCloseMemHandle_v4010)(CUdeviceptr_v2 dptr); +typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v6050)(void *p, size_t bytesize, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuMemHostUnregister_v4000)(void *p); +typedef CUresult (CUDAAPI *PFN_cuMemcpy_v7000_ptds)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v7000_ptds)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v7000_ptds)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v7000_ptds)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v7000_ptds)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy); +typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v7000_ptds)(const CUDA_MEMCPY2D_v2 *pCopy); +typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v7000_ptds)(const CUDA_MEMCPY3D_v2 *pCopy); +typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v7000_ptds)(const CUDA_MEMCPY3D_PEER_v1 *pCopy); +typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v7000_ptsz)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v7000_ptsz)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v7000_ptsz)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v7000_ptsz)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v7000_ptsz)(void *dstHost, CUarray srcArray, size_t srcOffset, 
size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v7000_ptsz)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v7000_ptsz)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v7000_ptsz)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N); +typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N); +typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v7000_ptds)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N); +typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); +typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); +typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v7000_ptds)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); +typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v7000_ptsz)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v2 *pAllocateArray); +typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v3020)(CUDA_ARRAY_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray); +typedef CUresult (CUDAAPI *PFN_cuArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUarray array); +typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetSparseProperties_v11010)(CUDA_ARRAY_SPARSE_PROPERTIES_v1 *sparseProperties, CUmipmappedArray mipmap); +typedef CUresult (CUDAAPI *PFN_cuArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUarray array, CUdevice device); +typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayGetMemoryRequirements_v11060)(CUDA_ARRAY_MEMORY_REQUIREMENTS_v1 *memoryRequirements, CUmipmappedArray mipmap, CUdevice device); +typedef CUresult (CUDAAPI *PFN_cuArrayGetPlane_v11020)(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx); +typedef CUresult (CUDAAPI *PFN_cuArrayDestroy_v2000)(CUarray hArray); +typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v3020)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pAllocateArray); +typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v3020)(CUDA_ARRAY3D_DESCRIPTOR_v2 *pArrayDescriptor, CUarray hArray); +typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayCreate_v5000)(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v2 *pMipmappedArrayDesc, unsigned int numMipmapLevels); +typedef 
CUresult (CUDAAPI *PFN_cuMipmappedArrayGetLevel_v5000)(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level); +typedef CUresult (CUDAAPI *PFN_cuMipmappedArrayDestroy_v5000)(CUmipmappedArray hMipmappedArray); +typedef CUresult (CUDAAPI *PFN_cuMemAddressReserve_v10020)(CUdeviceptr_v2 *ptr, size_t size, size_t alignment, CUdeviceptr_v2 addr, unsigned long long flags); +typedef CUresult (CUDAAPI *PFN_cuMemAddressFree_v10020)(CUdeviceptr_v2 ptr, size_t size); +typedef CUresult (CUDAAPI *PFN_cuMemCreate_v10020)(CUmemGenericAllocationHandle_v1 *handle, size_t size, const CUmemAllocationProp_v1 *prop, unsigned long long flags); +typedef CUresult (CUDAAPI *PFN_cuMemRelease_v10020)(CUmemGenericAllocationHandle_v1 handle); +typedef CUresult (CUDAAPI *PFN_cuMemMap_v10020)(CUdeviceptr_v2 ptr, size_t size, size_t offset, CUmemGenericAllocationHandle_v1 handle, unsigned long long flags); +typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010_ptsz)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemUnmap_v10020)(CUdeviceptr_v2 ptr, size_t size); +typedef CUresult (CUDAAPI *PFN_cuMemSetAccess_v10020)(CUdeviceptr_v2 ptr, size_t size, const CUmemAccessDesc_v1 *desc, size_t count); +typedef CUresult (CUDAAPI *PFN_cuMemGetAccess_v10020)(unsigned long long *flags, const CUmemLocation_v1 *location, CUdeviceptr_v2 ptr); +typedef CUresult (CUDAAPI *PFN_cuMemExportToShareableHandle_v10020)(void *shareableHandle, CUmemGenericAllocationHandle_v1 handle, CUmemAllocationHandleType handleType, unsigned long long flags); +typedef CUresult (CUDAAPI *PFN_cuMemImportFromShareableHandle_v10020)(CUmemGenericAllocationHandle_v1 *handle, void *osHandle, CUmemAllocationHandleType shHandleType); +typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationGranularity_v10020)(size_t *granularity, const CUmemAllocationProp_v1 *prop, CUmemAllocationGranularity_flags option); +typedef CUresult (CUDAAPI *PFN_cuMemGetAllocationPropertiesFromHandle_v10020)(CUmemAllocationProp_v1 *prop, CUmemGenericAllocationHandle_v1 handle); +typedef CUresult (CUDAAPI *PFN_cuMemRetainAllocationHandle_v11000)(CUmemGenericAllocationHandle_v1 *handle, void *addr); +typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020_ptsz)(CUdeviceptr_v2 dptr, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemPoolTrimTo_v11020)(CUmemoryPool pool, size_t minBytesToKeep); +typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value); +typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAttribute_v11020)(CUmemoryPool pool, CUmemPool_attribute attr, void *value); +typedef CUresult (CUDAAPI *PFN_cuMemPoolSetAccess_v11020)(CUmemoryPool pool, const CUmemAccessDesc_v1 *map, size_t count); +typedef CUresult (CUDAAPI *PFN_cuMemPoolGetAccess_v11020)(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation_v1 *location); +typedef CUresult (CUDAAPI *PFN_cuMemPoolCreate_v11020)(CUmemoryPool *pool, const CUmemPoolProps_v1 *poolProps); +typedef CUresult (CUDAAPI *PFN_cuMemPoolDestroy_v11020)(CUmemoryPool pool); +typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020_ptsz)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemPoolExportToShareableHandle_v11020)(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags); +typedef 
CUresult (CUDAAPI *PFN_cuMemPoolImportFromShareableHandle_v11020)(CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType, unsigned long long flags); +typedef CUresult (CUDAAPI *PFN_cuMemPoolExportPointer_v11020)(CUmemPoolPtrExportData_v1 *shareData_out, CUdeviceptr_v2 ptr); +typedef CUresult (CUDAAPI *PFN_cuMemPoolImportPointer_v11020)(CUdeviceptr_v2 *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData_v1 *shareData); +typedef CUresult (CUDAAPI *PFN_cuPointerGetAttribute_v4000)(void *data, CUpointer_attribute attribute, CUdeviceptr_v2 ptr); +typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000_ptsz)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemAdvise_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUmem_advise advice, CUdevice_v1 device); +typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttribute_v8000)(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr_v2 devPtr, size_t count); +typedef CUresult (CUDAAPI *PFN_cuMemRangeGetAttributes_v8000)(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr_v2 devPtr, size_t count); +typedef CUresult (CUDAAPI *PFN_cuPointerSetAttribute_v6000)(const void *value, CUpointer_attribute attribute, CUdeviceptr_v2 ptr); +typedef CUresult (CUDAAPI *PFN_cuPointerGetAttributes_v7000)(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr_v2 ptr); +typedef CUresult (CUDAAPI *PFN_cuStreamCreate_v2000)(CUstream *phStream, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuStreamCreateWithPriority_v5050)(CUstream *phStream, unsigned int flags, int priority); +typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v7000_ptsz)(CUstream hStream, int *priority); +typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v7000_ptsz)(CUstream hStream, unsigned int *flags); +typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020_ptsz)(CUstream hStream, CUcontext *pctx); +typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v7000_ptsz)(CUstream hStream, CUevent hEvent, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v7000_ptsz)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010_ptsz)(CUstream hStream, CUstreamCaptureMode mode); +typedef CUresult (CUDAAPI *PFN_cuThreadExchangeStreamCaptureMode_v10010)(CUstreamCaptureMode *mode); +typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000_ptsz)(CUstream hStream, CUgraph *phGraph); +typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus); +typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); +typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030_ptsz)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); +typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030_ptsz)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v7000_ptsz)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v7000_ptsz)(CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v7000_ptsz)(CUstream hStream); +typedef 
CUresult (CUDAAPI *PFN_cuStreamDestroy_v4000)(CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000_ptsz)(CUstream dst, CUstream src); +typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value_out); +typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000_ptsz)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *value); +typedef CUresult (CUDAAPI *PFN_cuEventCreate_v2000)(CUevent *phEvent, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuEventRecord_v7000_ptsz)(CUevent hEvent, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010_ptsz)(CUevent hEvent, CUstream hStream, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuEventQuery_v2000)(CUevent hEvent); +typedef CUresult (CUDAAPI *PFN_cuEventSynchronize_v2000)(CUevent hEvent); +typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v4000)(CUevent hEvent); +typedef CUresult (CUDAAPI *PFN_cuEventElapsedTime_v2000)(float *pMilliseconds, CUevent hStart, CUevent hEnd); +typedef CUresult (CUDAAPI *PFN_cuImportExternalMemory_v10000)(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1 *memHandleDesc); +typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedBuffer_v10000)(CUdeviceptr_v2 *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC_v1 *bufferDesc); +typedef CUresult (CUDAAPI *PFN_cuExternalMemoryGetMappedMipmappedArray_v10000)(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1 *mipmapDesc); +typedef CUresult (CUDAAPI *PFN_cuDestroyExternalMemory_v10000)(CUexternalMemory extMem); +typedef CUresult (CUDAAPI *PFN_cuImportExternalSemaphore_v10000)(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1 *semHandleDesc); +typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream); +typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000_ptsz)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream); +typedef CUresult (CUDAAPI *PFN_cuDestroyExternalSemaphore_v10000)(CUexternalSemaphore extSem); +typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams_v1 *paramArray, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); +typedef 
CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070_ptsz)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070_ptsz)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuFuncGetAttribute_v2020)(int *pi, CUfunction_attribute attrib, CUfunction hfunc); +typedef CUresult (CUDAAPI *PFN_cuFuncSetAttribute_v9000)(CUfunction hfunc, CUfunction_attribute attrib, int value); +typedef CUresult (CUDAAPI *PFN_cuFuncSetCacheConfig_v3000)(CUfunction hfunc, CUfunc_cache config); +typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedMemConfig_v4020)(CUfunction hfunc, CUsharedconfig config); +typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v7000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); +typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060_ptsz)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra); +typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000_ptsz)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); +typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernelMultiDevice_v9000)(CUDA_LAUNCH_PARAMS_v1 *launchParamsList, unsigned int numDevices, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000_ptsz)(CUstream hStream, CUhostFn fn, void *userData); +typedef CUresult (CUDAAPI *PFN_cuFuncSetBlockShape_v2000)(CUfunction hfunc, int x, int y, int z); +typedef CUresult (CUDAAPI *PFN_cuFuncSetSharedSize_v2000)(CUfunction hfunc, unsigned int bytes); +typedef CUresult (CUDAAPI *PFN_cuParamSetSize_v2000)(CUfunction hfunc, unsigned int numbytes); +typedef CUresult (CUDAAPI *PFN_cuParamSeti_v2000)(CUfunction hfunc, int offset, unsigned int value); +typedef CUresult (CUDAAPI *PFN_cuParamSetf_v2000)(CUfunction hfunc, int offset, float value); +typedef CUresult (CUDAAPI *PFN_cuParamSetv_v2000)(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes); +typedef CUresult (CUDAAPI *PFN_cuLaunch_v2000)(CUfunction f); +typedef CUresult (CUDAAPI *PFN_cuLaunchGrid_v2000)(CUfunction f, int grid_width, int grid_height); +typedef CUresult (CUDAAPI *PFN_cuLaunchGridAsync_v2000)(CUfunction f, int grid_width, int grid_height, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuParamSetTexRef_v2000)(CUfunction hfunc, int texunit, CUtexref hTexRef); +typedef CUresult (CUDAAPI *PFN_cuGraphCreate_v10000)(CUgraph *phGraph, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuGraphAddKernelNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetParams_v10000)(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphAddMemcpyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx); +typedef CUresult (CUDAAPI 
*PFN_cuGraphMemcpyNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMCPY3D_v2 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphMemcpyNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphAddMemsetNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx); +typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeGetParams_v10000)(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphMemsetNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphAddHostNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeGetParams_v10000)(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphHostNodeSetParams_v10000)(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphAddChildGraphNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph); +typedef CUresult (CUDAAPI *PFN_cuGraphChildGraphNodeGetGraph_v10000)(CUgraphNode hNode, CUgraph *phGraph); +typedef CUresult (CUDAAPI *PFN_cuGraphAddEmptyNode_v10000)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies); +typedef CUresult (CUDAAPI *PFN_cuGraphAddEventRecordNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); +typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out); +typedef CUresult (CUDAAPI *PFN_cuGraphEventRecordNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event); +typedef CUresult (CUDAAPI *PFN_cuGraphAddEventWaitNode_v11010)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event); +typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeGetEvent_v11010)(CUgraphNode hNode, CUevent *event_out); +typedef CUresult (CUDAAPI *PFN_cuGraphEventWaitNodeSetEvent_v11010)(CUgraphNode hNode, CUevent event); +typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresSignalNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *params_out); +typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphAddExternalSemaphoresWaitNode_v11020)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeGetParams_v11020)(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *params_out); +typedef CUresult (CUDAAPI *PFN_cuGraphExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphAddBatchMemOpNode_v11070)(CUgraphNode 
*phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeGetParams_v11070)(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out); +typedef CUresult (CUDAAPI *PFN_cuGraphBatchMemOpNodeSetParams_v11070)(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphExecBatchMemOpNodeSetParams_v11070)(CUgraphExec graphExec, CUgraphNode node, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphClone_v10000)(CUgraph *phGraphClone, CUgraph originalGraph); +typedef CUresult (CUDAAPI *PFN_cuGraphNodeFindInClone_v10000)(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph); +typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetType_v10000)(CUgraphNode hNode, CUgraphNodeType *type); +typedef CUresult (CUDAAPI *PFN_cuGraphGetNodes_v10000)(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes); +typedef CUresult (CUDAAPI *PFN_cuGraphGetRootNodes_v10000)(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes); +typedef CUresult (CUDAAPI *PFN_cuGraphGetEdges_v10000)(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges); +typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependencies_v10000)(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies); +typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetDependentNodes_v10000)(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes); +typedef CUresult (CUDAAPI *PFN_cuGraphAddDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); +typedef CUresult (CUDAAPI *PFN_cuGraphRemoveDependencies_v10000)(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies); +typedef CUresult (CUDAAPI *PFN_cuGraphDestroyNode_v10000)(CUgraphNode hNode); +typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v11000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); +typedef CUresult (CUDAAPI *PFN_cuGraphInstantiateWithFlags_v11040)(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags); +typedef CUresult (CUDAAPI *PFN_cuGraphExecKernelNodeSetParams_v10010)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphExecMemcpyNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D_v2 *copyParams, CUcontext ctx); +typedef CUresult (CUDAAPI *PFN_cuGraphExecMemsetNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS_v1 *memsetParams, CUcontext ctx); +typedef CUresult (CUDAAPI *PFN_cuGraphExecHostNodeSetParams_v10020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphExecChildGraphNodeSetParams_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph); +typedef CUresult (CUDAAPI *PFN_cuGraphExecEventRecordNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); +typedef CUresult (CUDAAPI *PFN_cuGraphExecEventWaitNodeSetEvent_v11010)(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event); +typedef CUresult (CUDAAPI *PFN_cuGraphExecExternalSemaphoresSignalNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI 
*PFN_cuGraphExecExternalSemaphoresWaitNodeSetParams_v11020)(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1 *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010_ptsz)(CUgraphExec hGraphExec, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000_ptsz)(CUgraphExec hGraphExec, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuGraphExecDestroy_v10000)(CUgraphExec hGraphExec); +typedef CUresult (CUDAAPI *PFN_cuGraphDestroy_v10000)(CUgraph hGraph); +typedef CUresult (CUDAAPI *PFN_cuGraphExecUpdate_v10020)(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out); +typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeCopyAttributes_v11000)(CUgraphNode dst, CUgraphNode src); +typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeGetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue_v1 *value_out); +typedef CUresult (CUDAAPI *PFN_cuGraphKernelNodeSetAttribute_v11000)(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue_v1 *value); +typedef CUresult (CUDAAPI *PFN_cuGraphDebugDotPrint_v11030)(CUgraph hGraph, const char *path, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuGraphAddMemAllocNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams); +typedef CUresult (CUDAAPI *PFN_cuGraphMemAllocNodeGetParams_v11040)(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out); +typedef CUresult (CUDAAPI *PFN_cuGraphAddMemFreeNode_v11040)(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr); +typedef CUresult (CUDAAPI *PFN_cuGraphMemFreeNodeGetParams_v11040)(CUgraphNode hNode, CUdeviceptr *dptr_out); +typedef CUresult (CUDAAPI *PFN_cuGraphNodeSetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled); +typedef CUresult (CUDAAPI *PFN_cuGraphNodeGetEnabled_v11060)(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled); +typedef CUresult (CUDAAPI *PFN_cuDeviceGraphMemTrim_v11040)(CUdevice device); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value); +typedef CUresult (CUDAAPI *PFN_cuDeviceSetGraphMemAttribute_v11040)(CUdevice device, CUgraphMem_attribute attr, void* value); +typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessor_v6050)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize); +typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000)(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSize_v6050)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit); +typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialBlockSizeWithFlags_v7000)(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuOccupancyAvailableDynamicSMemPerBlock_v10020)(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize); +typedef CUresult (CUDAAPI *PFN_cuOccupancyMaxPotentialClusterSize_v11070)(int *clusterSize, CUfunction func, const CUlaunchConfig *config); +typedef CUresult (CUDAAPI 
*PFN_cuOccupancyMaxActiveClusters_v11070)(int *numClusters, CUfunction func, const CUlaunchConfig *config); +typedef CUresult (CUDAAPI *PFN_cuTexRefSetArray_v2000)(CUtexref hTexRef, CUarray hArray, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmappedArray_v5000)(CUtexref hTexRef, CUmipmappedArray hMipmappedArray, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v3020)(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr_v2 dptr, size_t bytes); +typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v4010)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch); +typedef CUresult (CUDAAPI *PFN_cuTexRefSetFormat_v2000)(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents); +typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddressMode_v2000)(CUtexref hTexRef, int dim, CUaddress_mode am); +typedef CUresult (CUDAAPI *PFN_cuTexRefSetFilterMode_v2000)(CUtexref hTexRef, CUfilter_mode fm); +typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapFilterMode_v5000)(CUtexref hTexRef, CUfilter_mode fm); +typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelBias_v5000)(CUtexref hTexRef, float bias); +typedef CUresult (CUDAAPI *PFN_cuTexRefSetMipmapLevelClamp_v5000)(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp); +typedef CUresult (CUDAAPI *PFN_cuTexRefSetMaxAnisotropy_v5000)(CUtexref hTexRef, unsigned int maxAniso); +typedef CUresult (CUDAAPI *PFN_cuTexRefSetBorderColor_v8000)(CUtexref hTexRef, float *pBorderColor); +typedef CUresult (CUDAAPI *PFN_cuTexRefSetFlags_v2000)(CUtexref hTexRef, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v3020)(CUdeviceptr_v2 *pdptr, CUtexref hTexRef); +typedef CUresult (CUDAAPI *PFN_cuTexRefGetArray_v2000)(CUarray *phArray, CUtexref hTexRef); +typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmappedArray_v5000)(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef); +typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddressMode_v2000)(CUaddress_mode *pam, CUtexref hTexRef, int dim); +typedef CUresult (CUDAAPI *PFN_cuTexRefGetFilterMode_v2000)(CUfilter_mode *pfm, CUtexref hTexRef); +typedef CUresult (CUDAAPI *PFN_cuTexRefGetFormat_v2000)(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef); +typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapFilterMode_v5000)(CUfilter_mode *pfm, CUtexref hTexRef); +typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelBias_v5000)(float *pbias, CUtexref hTexRef); +typedef CUresult (CUDAAPI *PFN_cuTexRefGetMipmapLevelClamp_v5000)(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef); +typedef CUresult (CUDAAPI *PFN_cuTexRefGetMaxAnisotropy_v5000)(int *pmaxAniso, CUtexref hTexRef); +typedef CUresult (CUDAAPI *PFN_cuTexRefGetBorderColor_v8000)(float *pBorderColor, CUtexref hTexRef); +typedef CUresult (CUDAAPI *PFN_cuTexRefGetFlags_v2000)(unsigned int *pFlags, CUtexref hTexRef); +typedef CUresult (CUDAAPI *PFN_cuTexRefCreate_v2000)(CUtexref *pTexRef); +typedef CUresult (CUDAAPI *PFN_cuTexRefDestroy_v2000)(CUtexref hTexRef); +typedef CUresult (CUDAAPI *PFN_cuSurfRefSetArray_v3000)(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuSurfRefGetArray_v3000)(CUarray *phArray, CUsurfref hSurfRef); +typedef CUresult (CUDAAPI *PFN_cuTexObjectCreate_v5000)(CUtexObject_v1 *pTexObject, const CUDA_RESOURCE_DESC_v1 *pResDesc, const CUDA_TEXTURE_DESC_v1 *pTexDesc, const CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc); +typedef CUresult (CUDAAPI *PFN_cuTexObjectDestroy_v5000)(CUtexObject_v1 
texObject); +typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUtexObject_v1 texObject); +typedef CUresult (CUDAAPI *PFN_cuTexObjectGetTextureDesc_v5000)(CUDA_TEXTURE_DESC_v1 *pTexDesc, CUtexObject_v1 texObject); +typedef CUresult (CUDAAPI *PFN_cuTexObjectGetResourceViewDesc_v5000)(CUDA_RESOURCE_VIEW_DESC_v1 *pResViewDesc, CUtexObject_v1 texObject); +typedef CUresult (CUDAAPI *PFN_cuSurfObjectCreate_v5000)(CUsurfObject_v1 *pSurfObject, const CUDA_RESOURCE_DESC_v1 *pResDesc); +typedef CUresult (CUDAAPI *PFN_cuSurfObjectDestroy_v5000)(CUsurfObject_v1 surfObject); +typedef CUresult (CUDAAPI *PFN_cuSurfObjectGetResourceDesc_v5000)(CUDA_RESOURCE_DESC_v1 *pResDesc, CUsurfObject_v1 surfObject); +typedef CUresult (CUDAAPI *PFN_cuDeviceCanAccessPeer_v4000)(int *canAccessPeer, CUdevice_v1 dev, CUdevice_v1 peerDev); +typedef CUresult (CUDAAPI *PFN_cuCtxEnablePeerAccess_v4000)(CUcontext peerContext, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuCtxDisablePeerAccess_v4000)(CUcontext peerContext); +typedef CUresult (CUDAAPI *PFN_cuDeviceGetP2PAttribute_v8000)(int *value, CUdevice_P2PAttribute attrib, CUdevice_v1 srcDevice, CUdevice_v1 dstDevice); +typedef CUresult (CUDAAPI *PFN_cuGraphicsUnregisterResource_v3000)(CUgraphicsResource resource); +typedef CUresult (CUDAAPI *PFN_cuGraphicsSubResourceGetMappedArray_v3000)(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel); +typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedMipmappedArray_v5000)(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource); +typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3020)(CUdeviceptr_v2 *pDevPtr, size_t *pSize, CUgraphicsResource resource); +typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v6050)(CUgraphicsResource resource, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v7000_ptsz)(unsigned int count, CUgraphicsResource *resources, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuGetExportTable_v3000)(const void **ppExportTable, const CUuuid *pExportTableId); +typedef CUresult (CUDAAPI *PFN_cuFuncGetModule_v11000)(CUmodule *hmod, CUfunction hfunc); +typedef CUresult (CUDAAPI *PFN_cuGetProcAddress_v11030)(const char *symbol, void **pfn, int driverVersion, cuuint64_t flags); +typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v3020)(CUarray dstArray, size_t dstOffset, CUdeviceptr_v2 srcDevice, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v3020)(CUdeviceptr_v2 dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v3020)(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount); +typedef CUresult (CUDAAPI 
*PFN_cuMemcpyHtoAAsync_v3020)(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v3020)(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpy2D_v3020)(const CUDA_MEMCPY2D_v2 *pCopy); +typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v3020)(const CUDA_MEMCPY2D_v2 *pCopy); +typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v3020)(const CUDA_MEMCPY3D_v2 *pCopy); +typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v3020)(void *dstHost, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3020)(CUdeviceptr_v2 dstDevice, CUdeviceptr_v2 srcDevice, size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v3020)(const CUDA_MEMCPY2D_v2 *pCopy, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v3020)(const CUDA_MEMCPY3D_v2 *pCopy, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N); +typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N); +typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N); +typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height); +typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height); +typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height); +typedef CUresult (CUDAAPI *PFN_cuMemcpy_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyAsync_v4000)(CUdeviceptr_v2 dst, CUdeviceptr_v2 src, size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpyPeer_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount); +typedef CUresult (CUDAAPI *PFN_cuMemcpyPeerAsync_v4000)(CUdeviceptr_v2 dstDevice, CUcontext dstContext, CUdeviceptr_v2 srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeer_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy); +typedef CUresult (CUDAAPI *PFN_cuMemcpy3DPeerAsync_v4000)(const CUDA_MEMCPY3D_PEER_v1 *pCopy, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemsetD8Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned char uc, size_t N, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemsetD16Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned short us, size_t N, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemsetD32Async_v3020)(CUdeviceptr_v2 dstDevice, unsigned int ui, size_t N, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32Async_v3020)(CUdeviceptr_v2 dstDevice, size_t dstPitch, unsigned int ui, size_t Width, 
size_t Height, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuStreamGetPriority_v5050)(CUstream hStream, int *priority); +typedef CUresult (CUDAAPI *PFN_cuStreamGetFlags_v5050)(CUstream hStream, unsigned int *flags); +typedef CUresult (CUDAAPI *PFN_cuStreamGetCtx_v9020)(CUstream hStream, CUcontext *pctx); +typedef CUresult (CUDAAPI *PFN_cuStreamWaitEvent_v3020)(CUstream hStream, CUevent hEvent, unsigned int Flags); +typedef CUresult (CUDAAPI *PFN_cuStreamAddCallback_v5000)(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamAttachMemAsync_v6000)(CUstream hStream, CUdeviceptr_v2 dptr, size_t length, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamQuery_v2000)(CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuStreamSynchronize_v2000)(CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuEventRecord_v2000)(CUevent hEvent, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuEventRecordWithFlags_v11010)(CUevent hEvent, CUstream hStream, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuLaunchKernel_v4000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra); +typedef CUresult (CUDAAPI *PFN_cuLaunchKernelEx_v11060)(const CUlaunchConfig *config, CUfunction f, void **kernelParams, void **extra); +typedef CUresult (CUDAAPI *PFN_cuLaunchHostFunc_v10000)(CUstream hStream, CUhostFn fn, void *userData); +typedef CUresult (CUDAAPI *PFN_cuGraphicsMapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuGraphicsUnmapResources_v3000)(unsigned int count, CUgraphicsResource *resources, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v8000)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v9000)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v8000)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue32_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint32_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamWriteValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamWaitValue64_v11070)(CUstream stream, CUdeviceptr_v2 addr, cuuint64_t value, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuStreamBatchMemOp_v11070)(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuMemPrefetchAsync_v8000)(CUdeviceptr_v2 devPtr, size_t count, CUdevice_v1 dstDevice, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuLaunchCooperativeKernel_v9000)(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, 
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams); +typedef CUresult (CUDAAPI *PFN_cuSignalExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream); +typedef CUresult (CUDAAPI *PFN_cuWaitExternalSemaphoresAsync_v10000)(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1 *paramsArray, unsigned int numExtSems, CUstream stream); +typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10010)(CUstream hStream, CUstreamCaptureMode mode); +typedef CUresult (CUDAAPI *PFN_cuStreamEndCapture_v10000)(CUstream hStream, CUgraph *phGraph); +typedef CUresult (CUDAAPI *PFN_cuStreamIsCapturing_v10000)(CUstream hStream, CUstreamCaptureStatus *captureStatus); +typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v10010)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out); +typedef CUresult (CUDAAPI *PFN_cuStreamGetCaptureInfo_v11030)(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, const CUgraphNode **dependencies_out, size_t *numDependencies_out); +typedef CUresult (CUDAAPI *PFN_cuStreamUpdateCaptureDependencies_v11030)(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuGraphUpload_v11010)(CUgraphExec hGraph, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuGraphLaunch_v10000)(CUgraphExec hGraph, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuStreamCopyAttributes_v11000)(CUstream dstStream, CUstream srcStream); +typedef CUresult (CUDAAPI *PFN_cuStreamGetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue_v1 *value); +typedef CUresult (CUDAAPI *PFN_cuStreamSetAttribute_v11000)(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue_v1 *param); +typedef CUresult (CUDAAPI *PFN_cuMemMapArrayAsync_v11010)(CUarrayMapInfo_v1 *mapInfoList, unsigned int count, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemFreeAsync_v11020)(CUdeviceptr_v2 dptr, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemAllocAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuMemAllocFromPoolAsync_v11020)(CUdeviceptr_v2 *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream); +typedef CUresult (CUDAAPI *PFN_cuFlushGPUDirectRDMAWrites_v11030)(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope); +typedef CUresult (CUDAAPI *PFN_cuUserObjectCreate_v11030)(CUuserObject *object_out, void *ptr, CUhostFn destroy, unsigned int initialRefcount, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuUserObjectRetain_v11030)(CUuserObject object, unsigned int count); +typedef CUresult (CUDAAPI *PFN_cuUserObjectRelease_v11030)(CUuserObject object, unsigned int count); +typedef CUresult (CUDAAPI *PFN_cuGraphRetainUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags); +typedef CUresult (CUDAAPI *PFN_cuGraphReleaseUserObject_v11030)(CUgraph graph, CUuserObject object, unsigned int count); +typedef CUresult (CUDAAPI *PFN_cuModuleGetLoadingMode_v11070)(CUmoduleLoadingMode *mode); +typedef CUresult (CUDAAPI *PFN_cuMemGetHandleForAddressRange_v11070)(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags); + +/* + * Type definitions for older versioned 
functions in cuda.h + */ +#if defined(__CUDA_API_VERSION_INTERNAL) + typedef CUresult (CUDAAPI *PFN_cuMemHostRegister_v4000)(void *p, size_t bytesize, unsigned int Flags); + typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceSetMapFlags_v3000)(CUgraphicsResource resource, unsigned int flags); + typedef CUresult (CUDAAPI *PFN_cuLinkCreate_v5050)(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut); + typedef CUresult (CUDAAPI *PFN_cuLinkAddData_v5050)(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues); + typedef CUresult (CUDAAPI *PFN_cuLinkAddFile_v5050)(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues); + typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v3020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v2 *desc, CUdeviceptr_v2 dptr, size_t Pitch); + typedef CUresult (CUDAAPI *PFN_cuDeviceTotalMem_v2000)(unsigned int *bytes, CUdevice_v1 dev); + typedef CUresult (CUDAAPI *PFN_cuCtxCreate_v2000)(CUcontext *pctx, unsigned int flags, CUdevice_v1 dev); + typedef CUresult (CUDAAPI *PFN_cuModuleGetGlobal_v2000)(CUdeviceptr_v1 *dptr, unsigned int *bytes, CUmodule hmod, const char *name); + typedef CUresult (CUDAAPI *PFN_cuMemGetInfo_v2000)(unsigned int *free, unsigned int *total); + typedef CUresult (CUDAAPI *PFN_cuMemAlloc_v2000)(CUdeviceptr_v1 *dptr, unsigned int bytesize); + typedef CUresult (CUDAAPI *PFN_cuMemAllocPitch_v2000)(CUdeviceptr_v1 *dptr, unsigned int *pPitch, unsigned int WidthInBytes, unsigned int Height, unsigned int ElementSizeBytes); + typedef CUresult (CUDAAPI *PFN_cuMemFree_v2000)(CUdeviceptr_v1 dptr); + typedef CUresult (CUDAAPI *PFN_cuMemGetAddressRange_v2000)(CUdeviceptr_v1 *pbase, unsigned int *psize, CUdeviceptr_v1 dptr); + typedef CUresult (CUDAAPI *PFN_cuMemAllocHost_v2000)(void **pp, unsigned int bytesize); + typedef CUresult (CUDAAPI *PFN_cuMemHostGetDevicePointer_v2020)(CUdeviceptr_v1 *pdptr, void *p, unsigned int Flags); + typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoD_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount); + typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoH_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); + typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoD_v2000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); + typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUdeviceptr_v1 srcDevice, unsigned int ByteCount); + typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoD_v2000)(CUdeviceptr_v1 dstDevice, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); + typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoA_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount); + typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoH_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); + typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoA_v2000)(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount); + typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoAAsync_v2000)(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream); + typedef CUresult (CUDAAPI *PFN_cuMemcpyAtoHAsync_v2000)(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream); + typedef CUresult (CUDAAPI 
*PFN_cuMemcpy2D_v2000)(const CUDA_MEMCPY2D_v1 *pCopy); + typedef CUresult (CUDAAPI *PFN_cuMemcpy2DUnaligned_v2000)(const CUDA_MEMCPY2D_v1 *pCopy); + typedef CUresult (CUDAAPI *PFN_cuMemcpy3D_v2000)(const CUDA_MEMCPY3D_v1 *pCopy); + typedef CUresult (CUDAAPI *PFN_cuMemcpyHtoDAsync_v2000)(CUdeviceptr_v1 dstDevice, const void *srcHost, unsigned int ByteCount, CUstream hStream); + typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoHAsync_v2000)(void *dstHost, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); + typedef CUresult (CUDAAPI *PFN_cuMemcpyDtoDAsync_v3000)(CUdeviceptr_v1 dstDevice, CUdeviceptr_v1 srcDevice, unsigned int ByteCount, CUstream hStream); + typedef CUresult (CUDAAPI *PFN_cuMemcpy2DAsync_v2000)(const CUDA_MEMCPY2D_v1 *pCopy, CUstream hStream); + typedef CUresult (CUDAAPI *PFN_cuMemcpy3DAsync_v2000)(const CUDA_MEMCPY3D_v1 *pCopy, CUstream hStream); + typedef CUresult (CUDAAPI *PFN_cuMemsetD8_v2000)(CUdeviceptr_v1 dstDevice, unsigned char uc, unsigned int N); + typedef CUresult (CUDAAPI *PFN_cuMemsetD16_v2000)(CUdeviceptr_v1 dstDevice, unsigned short us, unsigned int N); + typedef CUresult (CUDAAPI *PFN_cuMemsetD32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int ui, unsigned int N); + typedef CUresult (CUDAAPI *PFN_cuMemsetD2D8_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned char uc, unsigned int Width, unsigned int Height); + typedef CUresult (CUDAAPI *PFN_cuMemsetD2D16_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned short us, unsigned int Width, unsigned int Height); + typedef CUresult (CUDAAPI *PFN_cuMemsetD2D32_v2000)(CUdeviceptr_v1 dstDevice, unsigned int dstPitch, unsigned int ui, unsigned int Width, unsigned int Height); + typedef CUresult (CUDAAPI *PFN_cuArrayCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR_v1 *pAllocateArray); + typedef CUresult (CUDAAPI *PFN_cuArrayGetDescriptor_v2000)(CUDA_ARRAY_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); + typedef CUresult (CUDAAPI *PFN_cuArray3DCreate_v2000)(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR_v1 *pAllocateArray); + typedef CUresult (CUDAAPI *PFN_cuArray3DGetDescriptor_v2000)(CUDA_ARRAY3D_DESCRIPTOR_v1 *pArrayDescriptor, CUarray hArray); + typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress_v2000)(unsigned int *ByteOffset, CUtexref hTexRef, CUdeviceptr_v1 dptr, unsigned int bytes); + typedef CUresult (CUDAAPI *PFN_cuTexRefSetAddress2D_v2020)(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR_v1 *desc, CUdeviceptr_v1 dptr, unsigned int Pitch); + typedef CUresult (CUDAAPI *PFN_cuTexRefGetAddress_v2000)(CUdeviceptr_v1 *pdptr, CUtexref hTexRef); + typedef CUresult (CUDAAPI *PFN_cuGraphicsResourceGetMappedPointer_v3000)(CUdeviceptr_v1 *pDevPtr, unsigned int *pSize, CUgraphicsResource resource); + typedef CUresult (CUDAAPI *PFN_cuCtxDestroy_v2000)(CUcontext ctx); + typedef CUresult (CUDAAPI *PFN_cuCtxPopCurrent_v2000)(CUcontext *pctx); + typedef CUresult (CUDAAPI *PFN_cuCtxPushCurrent_v2000)(CUcontext ctx); + typedef CUresult (CUDAAPI *PFN_cuStreamDestroy_v2000)(CUstream hStream); + typedef CUresult (CUDAAPI *PFN_cuEventDestroy_v2000)(CUevent hEvent); + typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxRelease_v7000)(CUdevice_v1 dev); + typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxReset_v7000)(CUdevice_v1 dev); + typedef CUresult (CUDAAPI *PFN_cuDevicePrimaryCtxSetFlags_v7000)(CUdevice_v1 dev, unsigned int flags); + typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000)(CUstream hStream); + typedef CUresult (CUDAAPI *PFN_cuStreamBeginCapture_v10000_ptsz)(CUstream 
hStream); + typedef CUresult (CUDAAPI *PFN_cuIpcOpenMemHandle_v4010)(CUdeviceptr_v2 *pdptr, CUipcMemHandle_v1 handle, unsigned int Flags); + typedef CUresult (CUDAAPI *PFN_cuGraphInstantiate_v10000)(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize); +#endif + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // file guard diff --git a/ext/cudart/include/cuda_awbarrier.h b/ext/cudart/include/cuda_awbarrier.h new file mode 100644 index 0000000000000000000000000000000000000000..3a7fe8a370330454f8a49e083899a50f7dc527ce --- /dev/null +++ b/ext/cudart/include/cuda_awbarrier.h @@ -0,0 +1,227 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CUDA_AWBARRIER_H_ +# define _CUDA_AWBARRIER_H_ + +# include "cuda_awbarrier_primitives.h" + +# if !defined(_CUDA_AWBARRIER_SM_TARGET) +# error This file requires compute capability 7.0 or greater. +# endif + +# if !defined(_CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER) +# error This file requires compiler support for the ISO C++ 2011 standard. 
This support must be enabled with the \ + -std=c++11 compiler option. +# endif + +_CUDA_AWBARRIER_BEGIN_NAMESPACE + +class awbarrier { +public: + class arrival_token { + public: + arrival_token() = default; + ~arrival_token() = default; + _CUDA_AWBARRIER_QUALIFIER uint32_t pending_count() const; + private: + _CUDA_AWBARRIER_QUALIFIER arrival_token(uint64_t token); + uint64_t token; + friend awbarrier; + }; + awbarrier() = default; + awbarrier(const awbarrier&) = delete; + awbarrier& operator=(const awbarrier&) = delete; + ~awbarrier() = default; + + _CUDA_AWBARRIER_QUALIFIER arrival_token arrive(); + _CUDA_AWBARRIER_QUALIFIER arrival_token arrive_and_drop(); + _CUDA_AWBARRIER_QUALIFIER bool timed_wait(arrival_token token, uint32_t hint_cycles); + _CUDA_AWBARRIER_QUALIFIER void wait(arrival_token token); + _CUDA_AWBARRIER_QUALIFIER void arrive_and_wait(); + _CUDA_AWBARRIER_STATIC_QUALIFIER __host__ constexpr uint32_t max(); +private: + uint64_t barrier; + friend _CUDA_AWBARRIER_QUALIFIER void init(awbarrier* barrier, uint32_t expected_count); + friend _CUDA_AWBARRIER_QUALIFIER void inval(awbarrier* barrier); + friend class pipeline; +}; + +_CUDA_AWBARRIER_QUALIFIER +uint32_t awbarrier::arrival_token::pending_count() const +{ + const uint32_t pending_count = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(this->token); +#if (__CUDA_ARCH__ >= 900) + return pending_count; +#else + return (pending_count >> 15); +#endif +} + +_CUDA_AWBARRIER_QUALIFIER +awbarrier::arrival_token::arrival_token(uint64_t token) + : token(token) +{ +} + +_CUDA_AWBARRIER_QUALIFIER +void init(awbarrier* barrier, uint32_t expected_count) +{ + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT); + +#if (__CUDA_ARCH__ >= 900) + const uint32_t init_count = expected_count; +#else + const uint32_t init_count = (expected_count << 15) + expected_count; +#endif + + _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(&barrier->barrier, init_count); +} + +_CUDA_AWBARRIER_QUALIFIER +void inval(awbarrier* barrier) +{ + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + + _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(&barrier->barrier); +} + +_CUDA_AWBARRIER_QUALIFIER +awbarrier::arrival_token awbarrier::arrive() +{ + _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier)); + + #if (__CUDA_ARCH__ < 900) + const uint32_t arrive_count = 1 << 15; + const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<false>(&this->barrier, arrive_count); + (void) +#else + const uint64_t token = + #endif + _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier); + + return arrival_token(token); +} + +_CUDA_AWBARRIER_QUALIFIER +awbarrier::arrival_token awbarrier::arrive_and_drop() +{ + _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier)); + + #if (__CUDA_ARCH__ < 900) + const uint32_t arrive_count = 1 << 15; + const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<true>(&this->barrier, arrive_count); + (void) +#else + const uint64_t token = + #endif + _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier); + + return arrival_token(token); +} + +_CUDA_AWBARRIER_QUALIFIER +bool awbarrier::timed_wait(arrival_token token, uint32_t hint_cycles) +{ + constexpr uint64_t max_busy_wait_cycles = 1024; + constexpr uint32_t max_sleep_ns = 1 << 20; + + _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier)); + + if 
(_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) { + return true; + } + + uint64_t start_cycles = clock64(); + uint64_t elapsed_cycles = 0; + uint32_t sleep_ns = 32; + while (elapsed_cycles < hint_cycles) { + if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) { + return true; + } + + if (elapsed_cycles > max_busy_wait_cycles) { + __nanosleep(sleep_ns); + if (sleep_ns < max_sleep_ns) { + sleep_ns *= 2; + } + } + + elapsed_cycles = clock64() - start_cycles; + } + + return false; +} + +_CUDA_AWBARRIER_QUALIFIER +void awbarrier::wait(arrival_token token) +{ + _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier)); + + while (!timed_wait(token, ~0u)); +} + +_CUDA_AWBARRIER_QUALIFIER +void awbarrier::arrive_and_wait() +{ + _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier)); + + this->wait(this->arrive()); +} + +_CUDA_AWBARRIER_QUALIFIER __host__ +constexpr uint32_t awbarrier::max() +{ + return _CUDA_AWBARRIER_MAX_COUNT; +} + +_CUDA_AWBARRIER_END_NAMESPACE + +#endif /* !_CUDA_AWBARRIER_H_ */ diff --git a/ext/cudart/include/cuda_awbarrier_helpers.h b/ext/cudart/include/cuda_awbarrier_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..a112fea7830daf2934afff4aa6c14f0787a9f161 --- /dev/null +++ b/ext/cudart/include/cuda_awbarrier_helpers.h @@ -0,0 +1,350 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 
227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CUDA_AWBARRIER_HELPERS_H_ +#define _CUDA_AWBARRIER_HELPERS_H_ + +#define _CUDA_AWBARRIER_NAMESPACE nvcuda::experimental +#define _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace nvcuda { namespace experimental { +#define _CUDA_AWBARRIER_END_NAMESPACE } } + +#define _CUDA_AWBARRIER_INTERNAL_NAMESPACE _CUDA_AWBARRIER_NAMESPACE::__awbarrier_internal +#define _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace __awbarrier_internal { +#define _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE } _CUDA_AWBARRIER_END_NAMESPACE + +# if !defined(_CUDA_AWBARRIER_QUALIFIER) +# define _CUDA_AWBARRIER_QUALIFIER inline __device__ +# endif +# if !defined(_CUDA_AWBARRIER_STATIC_QUALIFIER) +# define _CUDA_AWBARRIER_STATIC_QUALIFIER static inline __device__ +#endif + +#if defined(__CUDA_ARCH__) +#if (__CUDA_ARCH__ >= 800) +# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_80 +#elif (__CUDA_ARCH__ >= 700) +# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70 +#endif // No support < 700 +#else // !defined(__CUDA_ARCH__) +# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70 +#endif // defined(__CUDA_ARCH__) + +#define _CUDA_AWBARRIER_MAX_COUNT ((1 << 14) - 1) + +#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900))) +# define _CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER +#endif + +#if !defined(_CUDA_AWBARRIER_DEBUG) +# if defined(__CUDACC_DEBUG__) +# define _CUDA_AWBARRIER_DEBUG 1 +# else +# define _CUDA_AWBARRIER_DEBUG 0 +# endif +#endif + +#if defined(_CUDA_AWBARRIER_DEBUG) && (_CUDA_AWBARRIER_DEBUG == 1) && !defined(NDEBUG) +# if !defined(__CUDACC_RTC__) +# include <cassert> +# endif +# define _CUDA_AWBARRIER_ASSERT(x) assert((x)); +# define _CUDA_AWBARRIER_ABORT() assert(0); +#else +# define _CUDA_AWBARRIER_ASSERT(x) +# define _CUDA_AWBARRIER_ABORT() __trap(); +#endif + +#if defined(__CUDACC_RTC__) +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +typedef uint64_t uintptr_t; +#else +# include <stdint.h> +#endif + +#if defined(_CUDA_AWBARRIER_SM_TARGET) + +typedef uint64_t __mbarrier_t; +typedef uint64_t __mbarrier_token_t; + +_CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE + +extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *); + +namespace _CUDA_AWBARRIER_SM_70 { + union AWBarrier { + struct { + uint32_t expected; + uint32_t pending; + } split; + uint64_t raw; + }; + + _CUDA_AWBARRIER_STATIC_QUALIFIER + void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29)); + + AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier); + + awbarrier->split.expected = 0x40000000 - expected_count; + awbarrier->split.pending = 0x80000000 - expected_count; + } + + _CUDA_AWBARRIER_STATIC_QUALIFIER + void __awbarrier_inval(uint64_t* barrier) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + } + + _CUDA_AWBARRIER_STATIC_QUALIFIER + uint32_t __awbarrier_token_pending_count(uint64_t token) { + const uint32_t pending = token >> 32; + return 0x80000000 - (pending & 0x7fffffff); + } + 
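The helpers in this namespace back the nvcuda::experimental::awbarrier class from cuda_awbarrier.h above: on SM 7.x the barrier is emulated with the packed expected/pending counters shown here, while the SM 8.x path further below maps onto the hardware mbarrier instructions. For orientation, a minimal device-code sketch of the intended usage pattern follows; it assumes a device of compute capability 7.0 or newer, and the kernel name, block size, and data layout are illustrative assumptions, not taken from the headers.

    #include <cuda_awbarrier.h>

    using nvcuda::experimental::awbarrier;
    using nvcuda::experimental::init;

    // Two-phase block-wide synchronization: every thread produces into shared
    // memory, arrives at the barrier, and only consumes after all have arrived.
    __global__ void two_phase_example(float* data)      // launch with blockDim.x == 256
    {
        __shared__ awbarrier barrier;
        __shared__ float staging[256];
        float* block_data = data + (size_t)blockIdx.x * blockDim.x;

        if (threadIdx.x == 0) {
            init(&barrier, blockDim.x);   // every thread of the block participates
        }
        __syncthreads();                  // make the initialized barrier visible

        staging[threadIdx.x] = block_data[threadIdx.x] * 2.0f;   // phase 1: produce
        awbarrier::arrival_token token = barrier.arrive();       // signal completion
        // ...independent work could overlap with other threads' arrivals here...
        barrier.wait(token);                                     // all threads arrived

        block_data[threadIdx.x] = staging[(threadIdx.x + 1) % blockDim.x]; // phase 2: consume
    }

The single init() plus __syncthreads() before the first arrive() reflects the header's requirement that the barrier live in shared memory and be initialized once per block before use.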
+ template<bool _Drop> + _CUDA_AWBARRIER_STATIC_QUALIFIER + uint64_t __awbarrier_arrive_drop(uint64_t* barrier) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + + AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier); + + while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0); + + if (_Drop) { + (void)atomicAdd_block(&awbarrier->split.expected, 1); + } + + __threadfence_block(); + + const uint32_t old_pending = atomicAdd_block(&awbarrier->split.pending, 1); + const uint32_t new_pending = old_pending + 1; + const bool reset = (old_pending ^ new_pending) & 0x80000000; + + if (reset) { + __threadfence_block(); + + uint32_t new_expected = *reinterpret_cast<volatile uint32_t*>(&awbarrier->split.expected); + new_expected &= ~0x40000000; + if (new_expected & 0x20000000) { + new_expected |= 0x40000000; + } + atomicAdd_block(&awbarrier->split.pending, new_expected); + } + + return static_cast<uint64_t>(old_pending) << 32; + } + + template<bool _Drop> + _CUDA_AWBARRIER_STATIC_QUALIFIER + uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29)); + + AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier); + + while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0); + + if (_Drop) { + (void)atomicAdd_block(&awbarrier->split.expected, count); + } + + return static_cast<uint64_t>(atomicAdd_block(&awbarrier->split.pending, count)) << 32; + } + + _CUDA_AWBARRIER_STATIC_QUALIFIER + bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + + volatile AWBarrier* awbarrier = reinterpret_cast<volatile AWBarrier*>(barrier); + + return ((token >> 32) ^ awbarrier->split.pending) & 0x80000000; + } +}; // namespace _CUDA_AWBARRIER_SM_70 + +namespace _CUDA_AWBARRIER_SM_80 { + _CUDA_AWBARRIER_STATIC_QUALIFIER + void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29)); + + asm volatile ("mbarrier.init.shared.b64 [%0], %1;" + : + : "r"(__nvvm_get_smem_pointer(barrier)), "r"(expected_count) + : "memory"); + } + + _CUDA_AWBARRIER_STATIC_QUALIFIER + void __awbarrier_inval(uint64_t* barrier) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + + asm volatile ("mbarrier.inval.shared.b64 [%0];" + : + : "r"(__nvvm_get_smem_pointer(barrier)) + : "memory"); + } + + _CUDA_AWBARRIER_STATIC_QUALIFIER + uint32_t __awbarrier_token_pending_count(uint64_t token) { + uint32_t __pending_count; + + asm ("mbarrier.pending_count.b64 %0, %1;" + : "=r"(__pending_count) + : "l"(token)); + return __pending_count; + } + + template<bool _Drop> + _CUDA_AWBARRIER_STATIC_QUALIFIER + uint64_t __awbarrier_arrive_drop(uint64_t* barrier) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + + uint64_t token; + + if (_Drop) { + asm volatile ("mbarrier.arrive_drop.shared.b64 %0, [%1];" + : "=l"(token) + : "r"(__nvvm_get_smem_pointer(barrier)) + : "memory"); + } else { + asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];" + : "=l"(token) + : "r"(__nvvm_get_smem_pointer(barrier)) + : "memory"); + } + + return token; + } + + template<bool _Drop> + _CUDA_AWBARRIER_STATIC_QUALIFIER + uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 
29)); + + uint64_t token; + + if (_Drop) { + asm volatile ("mbarrier.arrive_drop.noComplete.shared.b64 %0, [%1], %2;" + : "=l"(token) + : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count) + : "memory"); + } else { + asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;" + : "=l"(token) + : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count) + : "memory"); + } + + return token; + } + + _CUDA_AWBARRIER_STATIC_QUALIFIER + bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) { + _CUDA_AWBARRIER_ASSERT(__isShared(barrier)); + + uint16_t __wait_complete; + + asm volatile ("{" + " .reg .pred %%p;" + " mbarrier.test_wait.shared.b64 %%p, [%1], %2;" + " selp.u16 %0, 1, 0, %%p;" + "}" + : "=h"(__wait_complete) + : "r"(__nvvm_get_smem_pointer(barrier)), "l"(token) + : "memory"); + return bool(__wait_complete); + } + +}; // namespace _CUDA_AWBARRIER_SM_80 + +_CUDA_AWBARRIER_QUALIFIER +void awbarrier_init(uint64_t* barrier, uint32_t expected_count) +{ + _CUDA_AWBARRIER_SM_TARGET::__awbarrier_init(barrier, expected_count); +} + +_CUDA_AWBARRIER_QUALIFIER +void awbarrier_inval(uint64_t* barrier) +{ + _CUDA_AWBARRIER_SM_TARGET::__awbarrier_inval(barrier); +} + +_CUDA_AWBARRIER_QUALIFIER +uint32_t awbarrier_token_pending_count(uint64_t token) +{ + return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_token_pending_count(token); +} + +template<bool _Drop> +_CUDA_AWBARRIER_QUALIFIER +uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t arrive_count) +{ + return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop_no_complete<_Drop>(barrier, arrive_count); +} + +template<bool _Drop> +_CUDA_AWBARRIER_QUALIFIER +uint64_t awbarrier_arrive_drop(uint64_t* barrier) +{ + return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop<_Drop>(barrier); +} + +_CUDA_AWBARRIER_QUALIFIER +bool awbarrier_test_wait(uint64_t* barrier, uint64_t token) +{ + return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_test_wait(barrier, token); +} + +_CUDA_AWBARRIER_END_INTERNAL_NAMESPACE + +#endif /* defined(_CUDA_AWBARRIER_SM_TARGET) */ + +#endif /* !_CUDA_AWBARRIER_HELPERS_H_ */ diff --git a/ext/cudart/include/cuda_awbarrier_primitives.h b/ext/cudart/include/cuda_awbarrier_primitives.h new file mode 100644 index 0000000000000000000000000000000000000000..647110a3351477cdd8a197f53c5877648964ab8e --- /dev/null +++ b/ext/cudart/include/cuda_awbarrier_primitives.h @@ -0,0 +1,94 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CUDA_AWBARRIER_PRIMITIVES_H_ +#define _CUDA_AWBARRIER_PRIMITIVES_H_ + +#include "cuda_awbarrier_helpers.h" + +#if !defined(_CUDA_AWBARRIER_SM_TARGET) +# error This file requires compute capability 7.0 or greater. +#endif + +_CUDA_AWBARRIER_STATIC_QUALIFIER __host__ +uint32_t __mbarrier_maximum_count() { + return _CUDA_AWBARRIER_MAX_COUNT; +} + +_CUDA_AWBARRIER_STATIC_QUALIFIER +void __mbarrier_init(__mbarrier_t* barrier, uint32_t expected_count) { + _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(barrier, expected_count); +} + +_CUDA_AWBARRIER_STATIC_QUALIFIER +void __mbarrier_inval(__mbarrier_t* barrier) { + _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(barrier); +} + +_CUDA_AWBARRIER_STATIC_QUALIFIER +__mbarrier_token_t __mbarrier_arrive(__mbarrier_t* barrier) { + return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(barrier); +} + +_CUDA_AWBARRIER_STATIC_QUALIFIER +__mbarrier_token_t __mbarrier_arrive_and_drop(__mbarrier_t* barrier) { + return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(barrier); +} + +_CUDA_AWBARRIER_STATIC_QUALIFIER +bool __mbarrier_test_wait(__mbarrier_t* barrier, __mbarrier_token_t token) { + return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(barrier, token); +} + +_CUDA_AWBARRIER_STATIC_QUALIFIER +uint32_t __mbarrier_token_pending_count(__mbarrier_token_t token) { + return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(token); +} + +#endif /* !_CUDA_AWBARRIER_PRIMITIVES_H_ */ diff --git a/ext/cudart/include/cuda_bf16.h b/ext/cudart/include/cuda_bf16.h new file mode 100644 index 0000000000000000000000000000000000000000..695863c5122984c5d343e9aa70e583d5e19f6d2e --- /dev/null +++ b/ext/cudart/include/cuda_bf16.h @@ -0,0 +1,3749 @@ +/* +* Copyright 1993-2021 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. 
+* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_BFLOAT16 Bfloat16 Precision Intrinsics +* This section describes nv_bfloat16 precision intrinsic functions that are +* only supported in device code. +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_ARITHMETIC Bfloat16 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_ARITHMETIC Bfloat162 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_COMPARISON Bfloat16 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_COMPARISON Bfloat162 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. 
+*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_MISC Bfloat16 Precision Conversion and Data Movement +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT16_FUNCTIONS Bfloat16 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__BFLOAT162_FUNCTIONS Bfloat162 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_BFLOAT16 +* To use these functions, include the header file \p cuda_bf16.h in your program. +*/ + +#ifndef __CUDA_BF16_H__ +#define __CUDA_BF16_H__ + +#define ___CUDA_BF16_STRINGIFY_INNERMOST(x) #x +#define __CUDA_BF16_STRINGIFY(x) ___CUDA_BF16_STRINGIFY_INNERMOST(x) + +#if defined(__cplusplus) +#if defined(__CUDACC__) +#define __CUDA_BF16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__ +#else +#define __CUDA_HOSTDEVICE_BF16_DECL__ static +#endif /* defined(__CUDACC__) */ + +#define __CUDA_BF16_TYPES_EXIST__ + +/* Forward-declaration of structures defined in "cuda_bf16.hpp" */ + +/** + * \brief nv_bfloat16 datatype + * + * \details This structure implements the datatype for storing + * nv_bfloat16 floating-point numbers. The structure implements + * assignment operators and type conversions. 16 bits are being + * used in total: 1 sign bit, 8 bits for the exponent, and + * the significand is being stored in 7 bits. The total + * precision is 8 bits. + * + */ +struct __nv_bfloat16; + +/** + * \brief nv_bfloat162 datatype + * + * \details This structure implements the datatype for storing two + * nv_bfloat16 floating-point numbers. + * The structure implements assignment operators and type conversions. + * + */ +struct __nv_bfloat162; + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts double number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts double number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-to-nearest-even mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-towards-zero mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-towards-zero mode. +* \param[in] a - float. Is only being read. +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-down mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-down mode. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts float number to nv_bfloat16 precision in round-up mode +* and returns \p nv_bfloat16 with converted value. +* +* \details Converts float number \p a to nv_bfloat16 precision in round-up mode. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat16 +* - \p a converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts \p nv_bfloat16 number to float. +* +* \details Converts nv_bfloat16 number \p a to float. +* \param[in] a - float. Is only being read. +* +* \returns float +* - \p a converted to float. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts input to nv_bfloat16 precision in round-to-nearest-even mode and +* populates both halves of \p nv_bfloat162 with converted value. +* +* \details Converts input \p a to nv_bfloat16 precision in round-to-nearest-even mode and +* populates both halves of \p nv_bfloat162 with converted value. +* \param[in] a - float. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 value with both halves equal to the converted nv_bfloat16 +* precision number. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both input floats to nv_bfloat16 precision in round-to-nearest-even +* mode and returns \p nv_bfloat162 with converted values. +* +* \details Converts both input floats to nv_bfloat16 precision in round-to-nearest-even mode +* and combines the results into one \p nv_bfloat162 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \param[in] a - float. Is only being read. 
+* \param[in] b - float. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 value with corresponding halves equal to the +* converted input floats. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts low 16 bits of \p nv_bfloat162 to float and returns the result +* +* \details Converts low 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float +* - The low 16 bits of \p a converted to float. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts high 16 bits of \p nv_bfloat162 to float and returns the result +* +* \details Converts high 16 bits of \p nv_bfloat162 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float +* - The high 16 bits of \p a converted to float. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a); + +#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both components of float2 number to nv_bfloat16 precision in +* round-to-nearest-even mode and returns \p nv_bfloat162 with converted values. +* +* \details Converts both components of float2 to nv_bfloat16 precision in round-to-nearest +* mode and combines the results into one \p nv_bfloat162 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 which has corresponding halves equal to the +* converted float2 components. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Converts both halves of \p nv_bfloat162 to float2 and returns the result. +* +* \details Converts both halves of \p nv_bfloat162 input \p a to float2 and returns the +* result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns float2 +* - \p a converted to float2. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-to-nearest-even mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed integer in +* round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-to-nearest-even mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed integer to a nv_bfloat16 in round-up mode. 
+* +* \details Convert the signed integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed short integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-towards-zero mode. 
+* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed short integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed short integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-to-nearest-even mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned integer +* in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. 
Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-to-nearest-even mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned short integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-towards-zero +* mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned short integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-down mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to an unsigned 64-bit integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to an unsigned 64-bit +* integer in round-up mode. NaN inputs return 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-towards-zero +* mode. 
+* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert an unsigned 64-bit integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-down mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a nv_bfloat16 to a signed 64-bit integer in round-up mode. +* +* \details Convert the nv_bfloat16 floating-point value \p h to a signed 64-bit +* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-to-nearest-even +* mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-to-nearest-even mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-towards-zero mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-towards-zero mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-down mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-down mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Convert a signed 64-bit integer to a nv_bfloat16 in round-up mode. +* +* \details Convert the signed 64-bit integer value \p i to a nv_bfloat16 floating-point +* value in round-up mode. +* \param[in] i - long long int. Is only being read. +* +* \returns nv_bfloat16 +* - \p i converted to nv_bfloat16. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Truncate input argument to the integral part. +* +* \details Round \p h to the nearest integer value that does not exceed \p h in +* magnitude. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The truncated integer value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculate ceiling of the input argument. 
+* +* \details Compute the smallest integer value not less than \p h. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The smallest integer value not less than \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details Calculate the largest integer value which is less than or equal to \p h. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The largest integer value which is less than or equal to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Round input to nearest integer value in nv_bfloat16 floating-point +* number. +* +* \details Round \p h to the nearest integer value in nv_bfloat16 floating-point +* format, with bfloat16way cases rounded to the nearest even integer value. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The nearest integer to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h); + +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Truncate \p nv_bfloat162 vector input argument to the integral part. +* +* \details Round each component of vector \p h to the nearest integer value that does +* not exceed \p h in magnitude. +* \param[in] h - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The truncated \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculate \p nv_bfloat162 vector ceiling of the input argument. +* +* \details For each component of vector \p h compute the smallest integer value not less +* than \p h. +* \param[in] h - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector of smallest integers not less than \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details For each component of vector \p h calculate the largest integer value which +* is less than or equal to \p h. +* \param[in] h - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector of largest integers which is less than or equal to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Round input to nearest integer value in nv_bfloat16 floating-point +* number. +* +* \details Round each component of \p nv_bfloat162 vector \p h to the nearest integer value in +* nv_bfloat16 floating-point format, with bfloat16way cases rounded to the +* nearest even integer value. +* \param[in] h - nv_bfloat162. Is only being read. 
+* +* \returns nv_bfloat162 +* - The vector of rounded integer values. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns \p nv_bfloat162 with both halves equal to the input value. +* +* \details Returns \p nv_bfloat162 number with both halves equal to the input \p a \p nv_bfloat16 +* number. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat162 +* - The vector which has both its halves equal to the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Swaps both halves of the \p nv_bfloat162 input. +* +* \details Swaps both halves of the \p nv_bfloat162 input and returns a new \p nv_bfloat162 number +* with swapped halves. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - \p a with its halves being swapped. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines +* into one \p nv_bfloat162 number. +* +* \details Extracts low 16 bits from each of the two \p nv_bfloat162 inputs and combines into +* one \p nv_bfloat162 number. Low 16 bits from input \p a is stored in low 16 bits of +* the return value, low 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The low 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and +* combines into one \p nv_bfloat162 number. +* +* \details Extracts high 16 bits from each of the two \p nv_bfloat162 inputs and combines into +* one \p nv_bfloat162 number. High 16 bits from input \p a is stored in low 16 bits of +* the return value, high 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The high 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns high 16 bits of \p nv_bfloat162 input. +* +* \details Returns high 16 bits of \p nv_bfloat162 input \p a. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat16 +* - The high 16 bits of the input. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Returns low 16 bits of \p nv_bfloat162 input. +* +* \details Returns low 16 bits of \p nv_bfloat162 input \p a. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat16 +* - Returns \p nv_bfloat16 which contains low 16 bits of the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Checks if the input \p nv_bfloat16 number is infinite. +* +* \details Checks if the input \p nv_bfloat16 number \p a is infinite. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns int +* - -1 iff \p a is equal to negative infinity, +* - 1 iff \p a is equal to positive infinity, +* - 0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ int __hisinf(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Combines two \p nv_bfloat16 numbers into one \p nv_bfloat162 number. +* +* \details Combines two input \p nv_bfloat16 number \p a and \p b into one \p nv_bfloat162 number. +* Input \p a is stored in low 16 bits of the return value, input \p b is stored +* in high 16 bits of the return value. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with one nv_bfloat16 equal to \p a and the other to \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts low 16 bits from \p nv_bfloat162 input. +* +* \details Extracts low 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 +* number which has both halves equal to the extracted bits. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with both halves equal to the low 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Extracts high 16 bits from \p nv_bfloat162 input. +* +* \details Extracts high 16 bits from \p nv_bfloat162 input \p a and returns a new \p nv_bfloat162 +* number which has both halves equal to the extracted bits. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with both halves equal to the high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a \p nv_bfloat16 as a signed short integer. +* +* \details Reinterprets the bits in the nv_bfloat16 floating-point number \p h +* as a signed short integer. +* \param[in] h - nv_bfloat16. Is only being read. 
+* +* \returns short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a \p nv_bfloat16 as an unsigned short integer. +* +* \details Reinterprets the bits in the nv_bfloat16 floating-point \p h +* as an unsigned short number. +* \param[in] h - nv_bfloat16. Is only being read. +* +* \returns unsigned short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in a signed short integer as a \p nv_bfloat16. +* +* \details Reinterprets the bits in the signed short integer \p i as a +* nv_bfloat16 floating-point number. +* \param[in] i - short int. Is only being read. +* +* \returns nv_bfloat16 +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p nv_bfloat16. +* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* nv_bfloat16 floating-point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns nv_bfloat16 +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i); + +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of var held by the thread whose ID is given by delta. +* If width is less than warpSize then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], +* the value returned corresponds to the value of var held by the delta modulo width (i.e. +* within the same subsection). width must have a value which is a power of 2; +* results are undefined if width is not a power of 2, or is a number greater than +* warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. 
+* +* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. +* The value of var held by the resulting lane ID is returned: in effect, var is shifted up +* the warp by delta threads. If width is less than warpSize then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. +* width must have a value which is a power of 2; results are undefined if width is not a power of 2, +* or is a number greater than warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding delta to the caller's thread ID. +* The value of var held by the resulting thread ID is returned: this has the effect +* of shifting var down the warp by delta threads. If width is less than warpSize then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of width and so the upper delta threads +* will remain unchanged. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - nv_bfloat162. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: +* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each +* group of width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - nv_bfloat162. 
Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as nv_bfloat162. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of var held by the thread whose ID is given by delta. +* If width is less than warpSize then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], +* the value returned corresponds to the value of var held by the delta modulo width (i.e. +* within the same subsection). width must have a value which is a power of 2; +* results are undefined if width is not a power of 2, or is a number greater than +* warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. +* The value of var held by the resulting lane ID is returned: in effect, var is shifted up +* the warp by delta threads. If width is less than warpSize then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. +* width must have a value which is a power of 2; results are undefined if width is not a power of 2, +* or is a number greater than warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
+* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding delta to the caller's thread ID. +* The value of var held by the resulting thread ID is returned: this has the effect +* of shifting var down the warp by delta threads. If width is less than warpSize then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of width and so the upper delta threads +* will remain unchanged. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: +* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each +* group of width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - nv_bfloat16. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as nv_bfloat16. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width = warpSize); + +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.nc` load instruction. 
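The bfloat16 and bfloat162 warp-shuffle overloads declared above are typically combined with the packed arithmetic declared further down in this header. A sketch of a full-warp sum reduction over __nv_bfloat162 pairs, assuming sm_80 or newer; __hadd2 is the packed add documented later in this file, and the kernel is illustrative rather than part of the vendored header.

#include <cuda_bf16.h>

// Sum one __nv_bfloat162 value per lane across a full warp.
// Lane 0 of each warp ends up holding the packed (low, high) totals.
__device__ __nv_bfloat162 warp_sum_bf162(__nv_bfloat162 v)
{
    const unsigned full_mask = 0xffffffffu;
    for (int offset = warpSize / 2; offset > 0; offset /= 2) {
        // Each lane reads the value held by the lane `offset` positions above it.
        __nv_bfloat162 other = __shfl_down_sync(full_mask, v, offset, warpSize);
        v = __hadd2(v, other);
    }
    return v;
}

__global__ void reduce_demo(const __nv_bfloat162* in, __nv_bfloat162* out)
{
    __nv_bfloat162 partial = in[blockIdx.x * blockDim.x + threadIdx.x];
    partial = warp_sum_bf162(partial);
    if ((threadIdx.x & (warpSize - 1)) == 0) {
        out[(blockIdx.x * blockDim.x + threadIdx.x) / warpSize] = partial;
    }
}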
+* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr); + +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cg` store instruction. 
+* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value); +/** +* \ingroup CUDA_MATH__BFLOAT16_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value); + +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs nv_bfloat162 vector if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 result of less-equal comparison of vectors \p a and \p b. 
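The cache-hint load and store overloads declared above map directly to the PTX qualifiers named in their briefs. A small sketch of the common patterns, assuming sm_80 or newer; __hmul is the scalar multiply declared further down in this header, and both kernels are illustrative only, not part of the vendored file.

#include <cuda_bf16.h>

// Copy a bfloat16x2 buffer while hinting that the data is streamed once:
// __ldcs maps to ld.global.cs (evict-first), __stcs to st.global.cs.
__global__ void streaming_copy(const __nv_bfloat162* __restrict__ src,
                               __nv_bfloat162* __restrict__ dst,
                               int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        __nv_bfloat162 v = __ldcs(src + i);  // ld.global.cs
        __stcs(dst + i, v);                  // st.global.cs
    }
}

// Read-only data reused by many threads can instead go through __ldg,
// which maps to ld.global.nc (the non-coherent, read-only data path).
__global__ void scale_by_constant(const __nv_bfloat16* __restrict__ scale,
                                  __nv_bfloat16* data, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        data[i] = __hmul(data[i], __ldg(scale));
    }
}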
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 vector result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. 
+* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison. +* +* Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 vector result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The vector result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p nv_bfloat16 results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The \p nv_bfloat162 vector result of unordered greater-than comparison of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Determine whether \p nv_bfloat162 argument is a NaN. +* +* \details Determine whether each nv_bfloat16 of input \p nv_bfloat162 number \p a is a NaN. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The nv_bfloat162 with the corresponding \p nv_bfloat16 results set to +* 1.0 for NaN, 0.0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The sum of vectors \p a and \p b. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+add +* or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector division in round-to-nearest-even mode. +* +* \details Divides \p nv_bfloat162 input vector \p a by input vector \p b in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise division of \p a with \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Calculates the absolute value of both halves of the input \p nv_bfloat162 number and +* returns the result. +* +* \details Calculates the absolute value of both halves of the input \p nv_bfloat162 number and +* returns the result. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns bfloat2 +* - Returns \p a with the absolute value of both halves. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector add of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. 
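Because the packed comparisons above return 1.0 or 0.0 in each half, they compose with the packed arithmetic into branch-free per-half selects. A sketch under the same assumptions as before (sm_80 or newer, __float2bfloat16 from this header); the helper and kernel names are illustrative, not part of the vendored file.

#include <cuda_bf16.h>

// Per-half max(a, b) built from a packed comparison mask:
//   mask   = (a > b) ? 1.0 : 0.0   (per half, NaN compares false)
//   result = mask * a + (1 - mask) * b
__device__ __nv_bfloat162 select_greater(__nv_bfloat162 a, __nv_bfloat162 b)
{
    const __nv_bfloat162 one = __halves2bfloat162(__float2bfloat16(1.0f),
                                                  __float2bfloat16(1.0f));
    __nv_bfloat162 mask   = __hgt2(a, b);              // 1.0 / 0.0 per half
    __nv_bfloat162 keep_a = __hmul2(mask, a);
    __nv_bfloat162 keep_b = __hmul2(__hsub2(one, mask), b);
    return __hadd2(keep_a, keep_b);
}

__global__ void elementwise_demo(const __nv_bfloat162* x,
                                 const __nv_bfloat162* y,
                                 __nv_bfloat162* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // |x| / y per half, then keep the per-half larger of the ratio and y.
        __nv_bfloat162 ratio = __h2div(__habs2(x[i]), y[i]);
        out[i] = select_greater(ratio, y[i]);
    }
}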
+* +* \returns nv_bfloat162 +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector subtraction in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Subtracts \p nv_bfloat162 input vector \p b from input vector \p a in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The subtraction of vector \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. 
+* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Negates both halves of the input \p nv_bfloat162 number and returns the +* result. +* +* \details Negates both halves of the input \p nv_bfloat162 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - Returns \p a with both halves negated. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Calculates the absolute value of input \p nv_bfloat16 number and returns the result. +* +* \details Calculates the absolute value of input \p nv_bfloat16 number and returns the result. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The absolute value of a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b. +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode. 
+* +* \details Performs \p nv_bfloat16 addition of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest +* mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest +* mode. Prevents floating-point contractions of mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b. +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 division in round-to-nearest-even mode. +* +* \details Divides \p nv_bfloat16 input \p a by input \p b in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of dividing \p a by \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. 
+* +* \details Subtracts \p nv_bfloat16 input \p b from input \p a in round-to-nearest +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 multiplication of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of multiplying \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode. +* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-96 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Negates input \p nv_bfloat16 number and returns the result. +* +* \details Negates input \p nv_bfloat16 number and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-100 +* \endinternal +* \param[in] a - nv_bfloat16. Is only being read. 
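The scalar arithmetic declared above covers the usual add/sub/mul/div plus fused multiply-add and saturating variants. A short sketch of an axpy-style kernel and a [0, 1] blend using the _sat forms, assuming sm_80 or newer; both kernels are illustrative, not part of the vendored file.

#include <cuda_bf16.h>

// y[i] = a * x[i] + y[i], rounded once via the fused multiply-add.
__global__ void bf16_axpy(__nv_bfloat16 a,
                          const __nv_bfloat16* __restrict__ x,
                          __nv_bfloat16* y, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        y[i] = __hfma(a, x[i], y[i]);
    }
}

// out[i] = clamp(t * x[i] + (1 - t) * y[i], 0, 1) using the _sat variants,
// which clamp to [0.0, 1.0] and flush NaN results to +0.0.
__global__ void bf16_blend(__nv_bfloat16 t,
                           const __nv_bfloat16* x,
                           const __nv_bfloat16* y,
                           __nv_bfloat16* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        __nv_bfloat16 one_minus_t = __hsub(__float2bfloat16(1.0f), t);
        __nv_bfloat16 weighted_y  = __hmul_sat(one_minus_t, y[i]);
        out[i] = __hfma_sat(t, x[i], weighted_y);
    }
}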
+* +* \returns nv_bfloat16 +* - minus a +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector if-equal comparison and returns boolean true +* iff both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of if-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector not-equal comparison and returns boolean +* true iff both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of not-equal comparison +* of vectors \p a and \p b are true, +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-equal comparison and returns boolean +* true iff both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of less-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-equal comparison and returns boolean +* true iff both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. 
+* +* \returns bool +* - true if both \p nv_bfloat16 results of greater-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector less-than comparison and returns boolean +* true iff both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of less-than comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector greater-than comparison and returns boolean +* true iff both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of greater-than +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered if-equal comparison and returns +* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered not-equal comparison and returns +* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 not-equal comparisons +* evaluate to true, or false otherwise. 
+* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-equal comparison and returns +* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-equal comparison and +* returns boolean true iff both \p nv_bfloat16 results are true, boolean false +* otherwise. +* +* \details Performs \p nv_bfloat162 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered less-than comparison and returns +* boolean true iff both \p nv_bfloat16 results are true, boolean false otherwise. +* +* \details Performs \p nv_bfloat162 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered less-than comparison of +* vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Performs \p nv_bfloat162 vector unordered greater-than comparison and +* returns boolean true iff both \p nv_bfloat16 results are true, boolean false +* otherwise. 
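The __hb*2 predicates above collapse a packed comparison into a single bool that is true only when both halves pass, which makes them convenient for whole-pair validity checks. A small illustrative sketch (sm_80 or newer assumed, kernel name hypothetical, not part of the vendored file):

#include <cuda_bf16.h>

// Count how many packed pairs lie entirely inside [-limit, +limit].
// __hble2 is an ordered comparison reduced over both halves, so a NaN in
// either half makes the pair count as out of range.
__global__ void count_in_range(const __nv_bfloat162* v, int n,
                               __nv_bfloat16 limit, int* in_range)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        __nv_bfloat162 bound = __halves2bfloat162(limit, limit);
        if (__hble2(__habs2(v[i]), bound)) {
            atomicAdd(in_range, 1);
        }
    }
}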
+* +* \details Performs \p nv_bfloat162 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p nv_bfloat16 greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns bool +* - true if both \p nv_bfloat16 results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 if-equal comparison. +* +* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 not-equal comparison. +* +* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of not-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 less-equal comparison. +* +* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 greater-equal comparison. +* +* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 less-than comparison. +* +* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
+* +* \returns bool +* - The boolean result of less-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 greater-than comparison. +* +* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of greater-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered if-equal comparison. +* +* \details Performs \p nv_bfloat16 if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered not-equal comparison. +* +* \details Performs \p nv_bfloat16 not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered not-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered less-equal comparison. +* +* \details Performs \p nv_bfloat16 less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered greater-equal comparison. +* +* \details Performs \p nv_bfloat16 greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered less-than comparison. 
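+*
+* A brief sketch of the ordered/unordered distinction (illustrative; the helper
+* name is hypothetical):
+* \code
+* __device__ bool not_greater_equal(const __nv_bfloat16 a, const __nv_bfloat16 b)
+* {
+*     // __hltu(a, b) is the logical negation of the ordered __hge(a, b):
+*     // it is true when a < b and also when either input is NaN.
+*     return __hltu(a, b);
+* }
+* \endcode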
+* +* \details Performs \p nv_bfloat16 less-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-than comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Performs \p nv_bfloat16 unordered greater-than comparison. +* +* \details Performs \p nv_bfloat16 greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-than comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Determine whether \p nv_bfloat16 argument is a NaN. +* +* \details Determine whether \p nv_bfloat16 value \p a is a NaN. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns bool +* - true iff argument is NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 maximum of two input values. +* +* \details Calculates \p nv_bfloat16 max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 minimum of two input values. +* +* \details Calculates \p nv_bfloat16 min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 maximum of two input values, NaNs pass through. +* +* \details Calculates \p nv_bfloat16 max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. 
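+*
+* A minimal reduction-step sketch (illustrative; names are hypothetical):
+* \code
+* __device__ __nv_bfloat16 running_max(const __nv_bfloat16 acc, const __nv_bfloat16 x)
+* {
+*     // Unlike __hmax, a NaN in either input propagates into the result,
+*     // so invalid samples are not silently dropped from the reduction.
+*     return __hmax_nan(acc, x);
+* }
+* \endcode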
+* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_COMPARISON +* \brief Calculates \p nv_bfloat16 minimum of two input values, NaNs pass through. +* +* \details Calculates \p nv_bfloat16 min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b); +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Performs \p nv_bfloat16 fused multiply-add in round-to-nearest-even mode with relu saturation. +* +* \details Performs \p nv_bfloat16 multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat16 add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - nv_bfloat16. Is only being read. +* \param[in] b - nv_bfloat16. Is only being read. +* \param[in] c - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector maximum of two inputs. +* +* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise maximum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector minimum of two inputs. +* +* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. 
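+*
+* A minimal usage sketch (illustrative; the helper name is hypothetical):
+* \code
+* __device__ __nv_bfloat162 saturate2(const __nv_bfloat162 v)
+* {
+*     // Clamp both halves of v to [0, 1] with elementwise max then min.
+*     const __nv_bfloat162 lo = __float2bfloat162_rn(0.0f);
+*     const __nv_bfloat162 hi = __float2bfloat162_rn(1.0f);
+*     return __hmin2(__hmax2(v, lo), hi);
+* }
+* \endcode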
+* +* \returns nv_bfloat162 +* - The result of elementwise minimum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector maximum of two inputs, NaNs pass through. +* +* \details Calculates \p nv_bfloat162 vector max(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_COMPARISON +* \brief Calculates \p nv_bfloat162 vector minimum of two inputs, NaNs pass through. +* +* \details Calculates \p nv_bfloat162 vector min(\p a, \p b). +* Elementwise \p nv_bfloat16 operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs \p nv_bfloat162 vector fused multiply-add in round-to-nearest-even +* mode with relu saturation. +* +* \details Performs \p nv_bfloat162 vector multiply on inputs \p a and \p b, +* then performs a \p nv_bfloat162 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Performs fast complex multiply-accumulate +* +* \details Interprets vector \p nv_bfloat162 input pairs \p a, \p b, and \p c as +* complex numbers in \p nv_bfloat16 precision and performs +* complex multiply-accumulate operation: a*b + c +* \param[in] a - nv_bfloat162. Is only being read. +* \param[in] b - nv_bfloat162. Is only being read. +* \param[in] c - nv_bfloat162. Is only being read. 
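+*
+* A minimal usage sketch (illustrative; the helper name is hypothetical, and each
+* __nv_bfloat162 is assumed here to hold one complex number as x = real, y = imaginary):
+* \code
+* __device__ __nv_bfloat162 cmul_acc(const __nv_bfloat162 a,
+*                                    const __nv_bfloat162 b,
+*                                    const __nv_bfloat162 acc)
+* {
+*     // Complex fused multiply-accumulate: a * b + acc.
+*     return __hcmadd(a, b, acc);
+* }
+* \endcode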
+* +* \returns nv_bfloat162 +* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c); + +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 square root in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 square root of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat16 reciprocal square root of input \p a in round-to-nearest +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The reciprocal square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 reciprocal of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The reciprocal of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 natural logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The natural logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 binary logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The binary logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 decimal logarithm in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 decimal logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The decimal logarithm of \p a. 
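+*
+* A minimal usage sketch (illustrative; the helper name is hypothetical):
+* \code
+* __device__ __nv_bfloat16 to_decibels(const __nv_bfloat16 power_ratio)
+* {
+*     // dB = 10 * log10(ratio), computed entirely in bfloat16 precision.
+*     return __hmul(__float2bfloat16(10.0f), hlog10(power_ratio));
+* }
+* \endcode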
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 natural exponential function in round-to-nearest +* mode. +* +* \details Calculates \p nv_bfloat16 natural exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The natural exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 binary exponential function in round-to-nearest +* mode. +* +* \details Calculates \p nv_bfloat16 binary exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The binary exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 decimal exponential function in round-to-nearest +* mode. +* +* \details Calculates \p nv_bfloat16 decimal exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The decimal exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 cosine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 cosine of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The cosine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT16_FUNCTIONS +* \brief Calculates \p nv_bfloat16 sine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat16 sine of input \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat16. Is only being read. +* +* \returns nv_bfloat16 +* - The sine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 square root of input vector \p a in round-to-nearest +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector reciprocal square root in round-to-nearest +* mode. 
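+*
+* A minimal usage sketch (illustrative; the helper name is hypothetical):
+* \code
+* __device__ __nv_bfloat162 scale_by_inv_sqrt(const __nv_bfloat162 v, const __nv_bfloat162 s)
+* {
+*     // Multiply each half of v by the reciprocal square root of the
+*     // corresponding half of s.
+*     return __hmul2(v, h2rsqrt(s));
+* }
+* \endcode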
+* +* \details Calculates \p nv_bfloat162 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise reciprocal square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 reciprocal of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise reciprocal on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector natural logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise natural logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector binary logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 binary logarithm of input vector \p a in round-to-nearest +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise binary logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector decimal logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p nv_bfloat162 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise decimal logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector exponential function in round-to-nearest +* mode. +* +* \details Calculates \p nv_bfloat162 exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector binary exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 binary exponential function of input vector \p a in +* round-to-nearest-even mode. 
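+*
+* A minimal usage sketch (illustrative; the helper name is hypothetical):
+* \code
+* __device__ __nv_bfloat162 scale_by_pow2(const __nv_bfloat162 x, const __nv_bfloat162 e)
+* {
+*     // x * 2^e, evaluated independently for each half of the vector.
+*     return __hmul2(x, h2exp2(e));
+* }
+* \endcode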
+* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise binary exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise decimal exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector cosine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 cosine of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise cosine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a); +/** +* \ingroup CUDA_MATH__BFLOAT162_FUNCTIONS +* \brief Calculates \p nv_bfloat162 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p nv_bfloat162 sine of input vector \p a in round-to-nearest-even mode. +* \param[in] a - nv_bfloat162. Is only being read. +* +* \returns nv_bfloat162 +* - The elementwise sine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a); + +/** +* \ingroup CUDA_MATH__BFLOAT162_ARITHMETIC +* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this +* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the +* two nv_bfloat16 elements; the entire __nv_bfloat162 is not guaranteed to be atomic as a single 32-bit access. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is only supported by devices of compute capability 8.x and higher. +* +* \param[in] address - __nv_bfloat162*. An address in global or shared memory. +* \param[in] val - __nv_bfloat162. The value to be added. +* +* \returns __nv_bfloat162 +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val); + +/** +* \ingroup CUDA_MATH__BFLOAT16_ARITHMETIC +* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value +* back to \p address. This operation is performed in one atomic operation. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is only supported by devices of compute capability 8.x and higher. +* +* \param[in] address - __nv_bfloat16*. An address in global or shared memory. +* \param[in] val - __nv_bfloat16. The value to be added. 
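+*
+* A minimal usage sketch (illustrative; the kernel and its parameters are
+* hypothetical), assuming a device of compute capability 8.x or higher and a
+* \p sum pointer that refers to global memory:
+* \code
+* __global__ void accumulate(__nv_bfloat16 *sum, const __nv_bfloat16 *data, int n)
+* {
+*     const int i = blockIdx.x * blockDim.x + threadIdx.x;
+*     if (i < n) {
+*         // Each thread atomically adds one element into a single running total.
+*         (void)atomicAdd(sum, data[i]);
+*     }
+* }
+* \endcode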
+* +* \returns __nv_bfloat16 +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val); + +#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */ + +#undef __CUDA_BF16_DECL__ +#undef __CUDA_HOSTDEVICE_BF16_DECL__ + +#endif /* defined(__cplusplus) */ + +/* Note the .hpp file is included even for host-side compilation, to capture the "nv_bfloat16" & "nv_bfloat162" definitions */ +#include "cuda_bf16.hpp" +#undef ___CUDA_BF16_STRINGIFY_INNERMOST +#undef __CUDA_BF16_STRINGIFY + +#endif /* end of include guard: __CUDA_BF16_H__ */ diff --git a/ext/cudart/include/cuda_bf16.hpp b/ext/cudart/include/cuda_bf16.hpp new file mode 100644 index 0000000000000000000000000000000000000000..1e3858dc86d7a42270edff1c4ba801007494a38b --- /dev/null +++ b/ext/cudart/include/cuda_bf16.hpp @@ -0,0 +1,2683 @@ +/* +* Copyright 1993-2022 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. 
+*/ + +#if !defined(__CUDA_BF16_HPP__) +#define __CUDA_BF16_HPP__ + +#if !defined(__CUDA_BF16_H__) +#error "Do not include this file directly. Instead, include cuda_bf16.h." +#endif + +#if !defined(_MSC_VER) && __cplusplus >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L +# define __CPP_VERSION_AT_LEAST_11_BF16 +#endif + +/* C++11 header for std::move. + * In RTC mode, std::move is provided implicitly; don't include the header + */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) +#include <utility> +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) */ + +/* C++ header for std::memcpy (used for type punning in host-side implementations). + * When compiling as a CUDA source file memcpy is provided implicitly. + * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__). + */ +#if defined(__cplusplus) && !defined(__CUDACC__) +#include <cstring> +#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ + + +/* Set up function decorations */ +#if defined(__CUDACC__) +#define __CUDA_BF16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__ +#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ +#define __CUDA_HOSTDEVICE__ __host__ __device__ +#else /* !defined(__CUDACC__) */ +#if defined(__GNUC__) +#define __CUDA_HOSTDEVICE_BF16_DECL__ static __attribute__ ((unused)) +#else +#define __CUDA_HOSTDEVICE_BF16_DECL__ static +#endif /* defined(__GNUC__) */ +#define __CUDA_HOSTDEVICE__ +#endif /* defined(__CUDACC_) */ + +/* Set up structure-alignment attribute */ +#if defined(__CUDACC__) +#define __CUDA_ALIGN__(align) __align__(align) +#else +/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ +#else /* defined(__CPP_VERSION_AT_LEAST_11_BF16)*/ +#if defined(__GNUC__) +#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +#define __CUDA_ALIGN__(n) __declspec(align(n)) +#else +#define __CUDA_ALIGN__(n) +#endif /* defined(__GNUC__) */ +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ +#endif /* defined(__CUDACC__) */ + +/* Macros to allow nv_bfloat16 & nv_bfloat162 to be used by inline assembly */ +#define __BFLOAT16_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var))) +#define __BFLOAT16_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var))) +#define __BFLOAT162_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var))) +#define __BFLOAT162_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var))) + +/** +* Types which allow static initialization of "nv_bfloat16" and "nv_bfloat162" until +* these become an actual builtin. Note this initialization is as a +* bitfield representation of "nv_bfloat16", and not a conversion from short->nv_bfloat16. +* Such a representation will be deprecated in a future version of CUDA. 
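+*
+* A minimal sketch of such a bit-level initialization (illustrative; the constant
+* name is hypothetical, 0x3F80 being the bfloat16 bit pattern of 1.0f):
+* \code
+* static const __nv_bfloat16_raw kOneRaw = { 0x3F80 };
+* \endcode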
+* (Note these are visible to non-nvcc compilers, including C-only compilation) +*/ +typedef struct __CUDA_ALIGN__(2) { + unsigned short x; +} __nv_bfloat16_raw; + +typedef struct __CUDA_ALIGN__(4) { + unsigned short x; + unsigned short y; +} __nv_bfloat162_raw; + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Weffc++" +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +/* class' : multiple assignment operators specified + The class has multiple assignment operators of a single type. This warning is informational */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( push ) +#pragma warning( disable:4522 ) +#endif /* defined(__GNUC__) */ + +struct __CUDA_ALIGN__(2) __nv_bfloat16 { +protected: + unsigned short __x; + +public: +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat16() = default; +#else + __CUDA_HOSTDEVICE__ __nv_bfloat16() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + + /* Convert to/from __nv_bfloat16_raw */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const __nv_bfloat16_raw &hr) : __x(hr.x) { } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const volatile __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const { __nv_bfloat16_raw ret; ret.x = __x; return ret; } + __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const volatile { __nv_bfloat16_raw ret; ret.x = __x; return ret; } + +#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) + /* Construct from float/double */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(const float f) { __x = __float2bfloat16(f).__x; } + __CUDA_HOSTDEVICE__ __nv_bfloat16(const double f) { __x = __double2bfloat16(f).__x; } + + __CUDA_HOSTDEVICE__ operator float() const { return __bfloat162float(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const float f) { __x = __float2bfloat16(f).__x; return *this; } + + /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const double f) { __x = __double2bfloat16(f).__x; return *this; } + +/* Member functions only available to nvcc compilation so far */ +#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) + /* Allow automatic construction from types supported natively in hardware */ + /* Note we do avoid constructor init-list because of special host/device compilation rules */ + __CUDA_HOSTDEVICE__ __nv_bfloat16(short val) { __x = __short2bfloat16_rn(val).__x; } + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; } + __CUDA_HOSTDEVICE__ __nv_bfloat16(int val) { __x = __int2bfloat16_rn(val).__x; } + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; } + __CUDA_HOSTDEVICE__ __nv_bfloat16(long long val) { __x = __ll2bfloat16_rn(val).__x; } + __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned long long val) { __x = 
__ull2bfloat16_rn(val).__x; } + + /* Allow automatic casts to supported builtin types, matching all that are permitted with float */ + __CUDA_HOSTDEVICE__ operator short() const { return __bfloat162short_rz(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(short val) { __x = __short2bfloat16_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned short() const { return __bfloat162ushort_rz(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator int() const { return __bfloat162int_rz(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(int val) { __x = __int2bfloat16_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned int() const { return __bfloat162uint_rz(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator long long() const { return __bfloat162ll_rz(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(long long val) { __x = __ll2bfloat16_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __bfloat162ull_rz(*this); } + __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; return *this; } + + /* Boolean conversion - note both 0 and -0 must return false */ + __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFF) != 0; } +#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */ +#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */ +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) +#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__) +/* Some basic arithmetic operations expected of a builtin */ +__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hadd(lh, rh); } +__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hsub(lh, rh); } +__device__ __forceinline__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hmul(lh, rh); } +__device__ __forceinline__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hdiv(lh, rh); } + +__device__ __forceinline__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hadd(lh, rh); return lh; } +__device__ __forceinline__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hsub(lh, rh); return lh; } +__device__ __forceinline__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hmul(lh, rh); return lh; } +__device__ __forceinline__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hdiv(lh, rh); return lh; } + +/* Note for increment and decrement we use the raw value 0x3F80 equating to nv_bfloat16(1.0f), to avoid the extra conversion */ +__device__ __forceinline__ __nv_bfloat16 &operator++(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 0x3F80; h += one; return h; } +__device__ __forceinline__ __nv_bfloat16 &operator--(__nv_bfloat16 &h) { __nv_bfloat16_raw one; one.x = 0x3F80; h -= one; return h; } +__device__ __forceinline__ __nv_bfloat16 operator++(__nv_bfloat16 &h, const int ignored) +{ + // ignored on purpose. 
Parameter only needed to distinguish the function declaration from other types of operators. + static_cast<void>(ignored); + + const __nv_bfloat16 ret = h; + __nv_bfloat16_raw one; + one.x = 0x3F80; + h += one; + return ret; +} +__device__ __forceinline__ __nv_bfloat16 operator--(__nv_bfloat16 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast<void>(ignored); + + const __nv_bfloat16 ret = h; + __nv_bfloat16_raw one; + one.x = 0x3F80; + h -= one; + return ret; +} +/* Unary plus and inverse operators */ +__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16 &h) { return h; } +__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16 &h) { return __hneg(h); } + +/* Some basic comparison operations to make it look like a builtin */ +__device__ __forceinline__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __heq(lh, rh); } +__device__ __forceinline__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hneu(lh, rh); } +__device__ __forceinline__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hgt(lh, rh); } +__device__ __forceinline__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hlt(lh, rh); } +__device__ __forceinline__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hge(lh, rh); } +__device__ __forceinline__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hle(lh, rh); } +#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */ +#endif /* __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) */ +#endif /* defined(__CUDACC__) */ + +/* __nv_bfloat162 is visible to non-nvcc host compilers */ +struct __CUDA_ALIGN__(4) __nv_bfloat162 { + __nv_bfloat16 x; + __nv_bfloat16 y; + + // All construct/copy/assign/move +public: +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) + __nv_bfloat162() = default; + __CUDA_HOSTDEVICE__ __nv_bfloat162(__nv_bfloat162 &&src) { __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src)); } + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(__nv_bfloat162 &&src) { __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src)); return *this; } +#else + __CUDA_HOSTDEVICE__ __nv_bfloat162() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat16 &a, const __nv_bfloat16 &b) : x(a), y(b) { } + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162 &src) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); } + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162 &src) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); return *this; } + + /* Convert to/from __nv_bfloat162_raw */ + __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162_raw &h2r ) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); } + __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162_raw &h2r) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); return *this; } + __CUDA_HOSTDEVICE__ operator __nv_bfloat162_raw() const { __nv_bfloat162_raw ret; ret.x = 0U; ret.y = 0U; __BFLOAT162_TO_UI(ret) = __BFLOAT162_TO_CUI(*this); return ret; } +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) && !defined(__CUDA_NO_BFLOAT162_OPERATORS__) + +__device__ __forceinline__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const 
__nv_bfloat162 &rh) { return __hadd2(lh, rh); } +__device__ __forceinline__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hsub2(lh, rh); } +__device__ __forceinline__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hmul2(lh, rh); } +__device__ __forceinline__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __h2div(lh, rh); } + +__device__ __forceinline__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hadd2(lh, rh); return lh; } +__device__ __forceinline__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hsub2(lh, rh); return lh; } +__device__ __forceinline__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hmul2(lh, rh); return lh; } +__device__ __forceinline__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __h2div(lh, rh); return lh; } + +__device__ __forceinline__ __nv_bfloat162 &operator++(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80; one.y = 0x3F80; h = __hadd2(h, one); return h; } +__device__ __forceinline__ __nv_bfloat162 &operator--(__nv_bfloat162 &h) { __nv_bfloat162_raw one; one.x = 0x3F80; one.y = 0x3F80; h = __hsub2(h, one); return h; } +__device__ __forceinline__ __nv_bfloat162 operator++(__nv_bfloat162 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast<void>(ignored); + + const __nv_bfloat162 ret = h; + __nv_bfloat162_raw one; + one.x = 0x3F80; + one.y = 0x3F80; + h = __hadd2(h, one); + return ret; +} +__device__ __forceinline__ __nv_bfloat162 operator--(__nv_bfloat162 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
+ static_cast<void>(ignored); + + const __nv_bfloat162 ret = h; + __nv_bfloat162_raw one; + one.x = 0x3F80; + one.y = 0x3F80; + h = __hsub2(h, one); + return ret; +} +__device__ __forceinline__ __nv_bfloat162 operator+(const __nv_bfloat162 &h) { return h; } +__device__ __forceinline__ __nv_bfloat162 operator-(const __nv_bfloat162 &h) { return __hneg2(h); } + +__device__ __forceinline__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbeq2(lh, rh); } +__device__ __forceinline__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbneu2(lh, rh); } +__device__ __forceinline__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbgt2(lh, rh); } +__device__ __forceinline__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hblt2(lh, rh); } +__device__ __forceinline__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbge2(lh, rh); } +__device__ __forceinline__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hble2(lh, rh); } + +#endif /* __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) */ +#endif /* defined(__CUDACC__) */ + +/* Restore warning for multiple assignment operators */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( pop ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +/* Restore -Weffc++ warnings from here on */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic pop +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_ALIGN__ + +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short __internal_float2bfloat16(const float f, unsigned int &sign, unsigned int &remainder) +{ + unsigned int x; + +#if defined(__CUDA_ARCH__) + x = __float_as_uint(f); +#elif defined(__CUDACC__) + (void)memcpy(&x, &f, sizeof(f)); +#else + (void)std::memcpy(&x, &f, sizeof(f)); +#endif + + if ((x & 0x7fffffffU) > 0x7f800000U) { + sign = 0U; + remainder = 0U; + return static_cast<unsigned short>(0x7fffU); + } + sign = x >> 31U; + remainder = x << 16U; + return static_cast<unsigned short>(x >> 16U); +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double x) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("{ cvt.rn.bf16.f64 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "d"(x)); + return val; +#else + + float f = static_cast<float>(x); + const double d = static_cast<double>(f); + unsigned int u; + +#if defined(__CUDA_ARCH__) + u = __float_as_uint(f); +#elif defined(__CUDACC__) + (void)memcpy(&u, &f, sizeof(f)); +#else + (void)std::memcpy(&u, &f, sizeof(f)); +#endif + bool x_is_not_nan = ((u << (unsigned)1U) <= (unsigned)0xFF000000U); + + + if ((x > 0.0) && (d > x)) { + u--; + } + if ((x < 0.0) && (d < x)) { + u--; + } + if ((d != x) && x_is_not_nan) { + u |= 1U; + } + +#if defined(__CUDA_ARCH__) + f = __int_as_float(static_cast<int>(u)); +#elif defined(__CUDACC__) + (void)memcpy(&f, &u, sizeof(f)); +#else + (void)std::memcpy(&f, &u, sizeof(f)); +#endif + + return __float2bfloat16(f); + +#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) +} + +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a) +{ + __nv_bfloat16 val; +#if __CUDA_ARCH__ >= 800 + asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +#else + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; 
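+    // Truncate to bfloat16, then round to nearest even: increment the kept bits when
+    // the discarded low 16 bits are above one half ULP, or exactly one half and the
+    // kept value is odd.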
+ r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a) +{ + __nv_bfloat16 val; +#if __CUDA_ARCH__ >= 800 + asm("{ cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +#else + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a) +{ + __nv_bfloat16 val; +#if __CUDA_ARCH__ >= 800 + asm("{ cvt.rz.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); +#else + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + val = r; +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("{ cvt.rm.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); + return val; +#else + __nv_bfloat16 val; + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder != 0U) && (sign != 0U)) { + r.x++; + } + val = r; + return val; +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("{ cvt.rp.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a)); + return val; +#else + __nv_bfloat16 val; + __nv_bfloat16_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2bfloat16(a, sign, remainder); + if ((remainder != 0U) && (sign == 0U)) { + r.x++; + } + val = r; + return val; +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a) +{ + __nv_bfloat162 val; +#if __CUDA_ARCH__ >= 800 + asm("{.reg .b16 low;\n" + " cvt.rn.bf16.f32 low, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a)); +#else + val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(a)); +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b) +{ + __nv_bfloat162 val; +#if __CUDA_ARCH__ >= 800 + asm("{ cvt.rn.bf16x2.f32 %0, %2, %1;}\n" + : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a), "f"(b)); +#else + val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(b)); +#endif + return val; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_bfloat162float(const unsigned short h) +{ + float f; +#if defined(__CUDA_ARCH__) + #if (__CUDA_ARCH__ >= 900) + asm("{ cvt.f32.bf16 %0, %1;}\n" : "=f"(f) : "h"(h)); + #else + asm("{ mov.b32 %0, {0,%1};}\n" : "=f"(f) : "h"(h)); + #endif +#else + unsigned int u = static_cast<unsigned int>(h) << 16; + #if defined(__CUDACC__) + (void)memcpy(&f, &u, sizeof(f)); + #else + (void)std::memcpy(&f, &u, sizeof(f)); + #endif +#endif + return f; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a) +{ + return __internal_bfloat162float(static_cast<__nv_bfloat16_raw>(a).x); +} +__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a) +{ + return 
__internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).x); +} + +__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a) +{ + return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).y); +} + +#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) + +/* CUDA vector-types compatible vector creation function (note returns __nv_bfloat162, not nv_bfloat162) */ +__VECTOR_FUNCTIONS_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y) +{ + __nv_bfloat162 t; t.x = x; t.y = y; return t; +} +#undef __VECTOR_FUNCTIONS_DECL__ + + +/* Definitions of intrinsics */ +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a) +{ + __nv_bfloat162 val = __floats2bfloat162_rn(a.x, a.y); + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a) +{ + float hi_float; + float lo_float; + lo_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).x); + hi_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).y); + return make_float2(lo_float, hi_float); +} +__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + int val; + asm("{ cvt.rni.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + return __float2int_rn(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + int val; + asm("{ cvt.rzi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + const float f = __bfloat162float(h); + int i; + i = static_cast<int>(f); +#if !(defined __CUDA_ARCH__) + const int max_val = (int)0x7fffffffU; + const int min_val = (int)0x80000000U; + const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0; + } else if (f >= static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } +#endif + return i; +#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) +} +__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + int val; + asm("{ cvt.rmi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + return __float2int_rd(__bfloat162float(h)); +#endif +} +__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + int val; + asm("{ cvt.rpi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + return __float2int_ru(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i) +{ +#if (defined __CUDA_ARCH__) + #if (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rn.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; + #else + const float ru = __int2float_ru(i); + const float rd = __int2float_rd(i); + float rz = __int2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); + #endif +#else + const double d = static_cast<double>(i); + return __double2bfloat16(d); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 
val; + asm("cvt.rz.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#else + return __float2bfloat16_rz(__int2float_rz(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rm.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#else + return __float2bfloat16_rd(__int2float_rd(i)); +#endif +} + +__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rp.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#else + return __float2bfloat16_ru(__int2float_ru(i)); +#endif +} + +__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h) +{ + short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rni.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rni.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#endif + return val; +} + +__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h) +{ + short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rzi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#elif (defined __CUDA_ARCH__) + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rzi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + const float f = __bfloat162float(h); + val = static_cast<short int>(f); + const short int max_val = (short int)0x7fffU; + const short int min_val = (short int)0x8000U; + const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + val = 0; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + val = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + val = min_val; + } +#endif + return val; +} +__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h) +{ + short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rmi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rmi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#endif + return val; +} +__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h) +{ + short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rpi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rpi.s16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rn.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + const float f = static_cast<float>(i); + return __float2bfloat16_rn(f); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rz.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + 
return val; +#else + return __float2bfloat16_rz(__int2float_rz(static_cast<int>(i))); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rm.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + return __float2bfloat16_rd(__int2float_rd(static_cast<int>(i))); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rp.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + return __float2bfloat16_ru(__int2float_ru(static_cast<int>(i))); +#endif +} + +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned int val; + asm("{ cvt.rni.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + return __float2uint_rn(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned int val; + asm("{ cvt.rzi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + + const float f = __bfloat162float(h); + unsigned int i; + i = static_cast<unsigned int>(f); +#if !(defined __CUDA_ARCH__) + const unsigned int max_val = 0xffffffffU; + const unsigned int min_val = 0U; + const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0U; + } else if (f >= static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } +#endif + return i; + +#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) +} +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned int val; + asm("{ cvt.rmi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + return __float2uint_rd(__bfloat162float(h)); +#endif +} +__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned int val; + asm("{ cvt.rpi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h))); + return val; +#else + return __float2uint_ru(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rn.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#elif (defined __CUDA_ARCH__) + const float ru = __uint2float_ru(i); + const float rd = __uint2float_rd(i); + float rz = __uint2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +#else + const double d = static_cast<double>(i); + return __double2bfloat16(d); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rz.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#else + return __float2bfloat16_rz(__uint2float_rz(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const 
unsigned int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rm.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#else + return __float2bfloat16_rd(__uint2float_rd(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rp.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i)); + return val; +#else + return __float2bfloat16_ru(__uint2float_ru(i)); +#endif +} + +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h) +{ + unsigned short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rni.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rni.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h) +{ + unsigned short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rzi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#elif (defined __CUDA_ARCH__) + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rzi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + const float f = __bfloat162float(h); + val = static_cast<unsigned short int>(f); + const unsigned short int max_val = 0xffffU; + const unsigned short int min_val = 0U; + const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + val = 0U; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + val = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + val = min_val; + } +#endif + return val; +} +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h) +{ + unsigned short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rmi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rmi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#endif + return val; +} +__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h) +{ + unsigned short int val; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rpi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#else + asm("{ .reg.f32 f;\n" + " mov.b32 f, {0,%1};\n" + " cvt.rpi.u16.f32 %0,f;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h))); +#endif + return val; +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rn.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + const float f = static_cast<float>(i); + return __float2bfloat16_rn(f); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rz.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + return __float2bfloat16_rz(__uint2float_rz(static_cast<unsigned int>(i))); +#endif 
+} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rm.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + return __float2bfloat16_rd(__uint2float_rd(static_cast<unsigned int>(i))); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 val; + asm("cvt.rp.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i)); + return val; +#else + return __float2bfloat16_ru(__uint2float_ru(static_cast<unsigned int>(i))); +#endif +} + +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned long long int i; + asm("cvt.rni.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +#else + return __float2ull_rn(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h) +{ + unsigned long long int i; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rzi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +#else + const float f = __bfloat162float(h); + i = static_cast<unsigned long long int>(f); +#if !(defined __CUDA_ARCH__) + const unsigned long long int max_val = 0xffffffffffffffffULL; + const unsigned long long int min_val = 0ULL; + const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = 0x8000000000000000ULL; + } else if (f >= static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } +#endif +#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + return i; +} +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned long long int i; + asm("cvt.rmi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +#else + return __float2ull_rd(__bfloat162float(h)); +#endif +} +__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned long long int i; + asm("cvt.rpi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +#else + return __float2ull_ru(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rn.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#elif (defined __CUDA_ARCH__) + const float ru = __ull2float_ru(i); + const float rd = __ull2float_rd(i); + float rz = __ull2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +#else + float f = static_cast<float>(i); + const unsigned long long int uf = static_cast<unsigned long long int>(f); + unsigned int u; + + #if defined(__CUDA_ARCH__) + u = __float_as_uint(f); + #elif defined(__CUDACC__) + (void)memcpy(&u, &f, sizeof(f)); + #else + (void)std::memcpy(&u, &f, sizeof(f)); + #endif + + // round up happened here + // note: no need to handle round up to f == 0x1.p64 specially + if (uf > i) { + u--; + } + if (uf != i) { + 
u |= 1U; + } + + #if defined(__CUDA_ARCH__) + f = __int_as_float(static_cast<int>(u)); + #elif defined(__CUDACC__) + (void)memcpy(&f, &u, sizeof(f)); + #else + (void)std::memcpy(&f, &u, sizeof(f)); + #endif + + return __float2bfloat16_rn(f); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rz.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#else + return __float2bfloat16_rz(__ull2float_rz(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rm.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#else + return __float2bfloat16_rd(__ull2float_rd(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rp.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#else + return __float2bfloat16_ru(__ull2float_ru(i)); +#endif +} +__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + long long int i; + asm("cvt.rni.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +#else + return __float2ll_rn(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h) +{ + long long int i; +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm("cvt.rzi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); +#else + const float f = __bfloat162float(h); + i = static_cast<long long int>(f); +#if !(defined __CUDA_ARCH__) + const long long int max_val = (long long int)0x7fffffffffffffffULL; + const long long int min_val = (long long int)0x8000000000000000ULL; + const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xFF00U) { + // NaN + i = min_val; + } else if (f >= static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } +#endif +#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + return i; +} +__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + long long int i; + asm("cvt.rmi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +#else + return __float2ll_rd(__bfloat162float(h)); +#endif +} +__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + long long int i; + asm("cvt.rpi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h))); + return i; +#else + return __float2ll_ru(__bfloat162float(h)); +#endif +} +__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rn.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#elif (defined __CUDA_ARCH__) + const float ru = __ll2float_ru(i); + const float rd = __ll2float_rd(i); + float rz = __ll2float_rz(i); + if (ru != rd) { + rz = __uint_as_float(__float_as_uint(rz) | 1U); + } + return __float2bfloat16_rn(rz); +#else + float f = static_cast<float>(i); + const long 
long int lf = static_cast<long long int>(f); + unsigned int u; + + #if defined(__CUDA_ARCH__) + u = __float_as_uint(f); + #elif defined(__CUDACC__) + (void)memcpy(&u, &f, sizeof(f)); + #else + (void)std::memcpy(&u, &f, sizeof(f)); + #endif + + if ((f > 0.0f) && (lf > i)) { + u--; + } + if ((f < 0.0f) && (lf < i)) { + u--; + } + if (lf != i) { + u |= 1U; + } + + #if defined(__CUDA_ARCH__) + f = __int_as_float(static_cast<int>(u)); + #elif defined(__CUDACC__) + (void)memcpy(&f, &u, sizeof(f)); + #else + (void)std::memcpy(&f, &u, sizeof(f)); + #endif + + return __float2bfloat16_rn(f); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rz.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#else + return __float2bfloat16_rz(__ll2float_rz(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rm.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#else + return __float2bfloat16_rd(__ll2float_rd(i)); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 h; + asm("cvt.rp.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i)); + return h; +#else + return __float2bfloat16_ru(__ll2float_ru(i)); +#endif +} + +__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 r; + asm("cvt.rzi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +#else + return __float2bfloat16_rz(truncf(__bfloat162float(h))); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 r; + asm("cvt.rpi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +#else + return __float2bfloat16_ru(ceilf(__bfloat162float(h))); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 r; + asm("cvt.rmi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +#else + return __float2bfloat16_rd(floorf(__bfloat162float(h))); +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 r; + asm("cvt.rni.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h))); + return r; +#else + return __float2bfloat16_rn(rintf(__bfloat162float(h))); +#endif +} + +__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = __float2bfloat16_rz(truncf(__low2float(h))); + const __nv_bfloat16 high = __float2bfloat16_rz(truncf(__high2float(h))); + return __nv_bfloat162(low, high); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = __float2bfloat16_ru(ceilf(__low2float(h))); + const __nv_bfloat16 high = __float2bfloat16_ru(ceilf(__high2float(h))); + return __nv_bfloat162(low, high); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h) +{ + const __nv_bfloat16 low = __float2bfloat16_rd(floorf(__low2float(h))); + const __nv_bfloat16 high = __float2bfloat16_rd(floorf(__high2float(h))); + return __nv_bfloat162(low, high); +} + 
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h) +{ + return __halves2bfloat162(hrint(__low2bfloat16(h)), hrint(__high2bfloat16(h))); +} +__CUDA_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm("{.reg .b16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {alow,blow};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm("{.reg .b16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a) +{ + __nv_bfloat16 ret; + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, low;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a))); + return ret; +} +__CUDA_BF16_DECL__ int __hisinf(const __nv_bfloat16 a) +{ + int retval; + if (__BFLOAT16_TO_CUS(a) == 0xFF80U) { + retval = -1; + } else if (__BFLOAT16_TO_CUS(a) == 0x7F80U) { + retval = 1; + } else { + retval = 0; + } + return retval; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a) +{ + __nv_bfloat162 val; + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a) +{ + __nv_bfloat162 val; + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,high};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a) +{ + __nv_bfloat16 ret; + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, high;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a))); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat162 val; + asm("{ mov.b32 %0, {%1,%2};}\n" + : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a) +{ + __nv_bfloat162 val; + asm("{ mov.b32 %0, {%1,%1};}\n" + : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a) +{ + __nv_bfloat162 val; + asm("{.reg .b16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +__CUDA_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h) +{ + return static_cast<short int>(__BFLOAT16_TO_CUS(h)); +} +__CUDA_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h) +{ + return __BFLOAT16_TO_CUS(h); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i) +{ + __nv_bfloat16 h; + __BFLOAT16_TO_US(h) = static_cast<unsigned short int>(i); + return h; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i) +{ + __nv_bfloat16 h; + __BFLOAT16_TO_US(h) = i; + return h; +} + 
+/****************************************************************************** +* __nv_bfloat16, __nv_bfloat162 warp shuffle * +******************************************************************************/ +#define __SHUFFLE_SYNC_BFLOAT162_MACRO(name) /* do */ {\ + __nv_bfloat162 r; \ + asm volatile ("{" __CUDA_BF16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \ + :"=r"(__BFLOAT162_TO_UI(r)): "r"(__BFLOAT162_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ + return r; \ +} /* while(0) */ + +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.idx.b32) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.up.b32) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.down.b32) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.bfly.b32) +} + +#undef __SHUFFLE_SYNC_BFLOAT162_MACRO + +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_up_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} +__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width) +{ + const __nv_bfloat162 temp1 = __halves2bfloat162(var, var); + const __nv_bfloat162 temp2 = __shfl_xor_sync(mask, temp1, delta, width); + return __low2bfloat16(temp2); +} + +/****************************************************************************** +* __nv_bfloat16 and __nv_bfloat162 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if defined(__cplusplus) +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" 
+#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ +__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const __nv_bfloat162 *const ptr) +{ + __nv_bfloat162 ret; + asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr) +{ + __nv_bfloat16 ret; + asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} + +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} 
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value) +{ + asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory"); +} +__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value) +{ + asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__BFLOAT16_TO_CUS(value)) : "memory"); +} + +#undef __LDG_PTR +#endif /*defined(__cplusplus) */ +/****************************************************************************** +* __nv_bfloat162 comparison * +******************************************************************************/ +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) +#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\ + __nv_bfloat162 val; \ + asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; \ +} +#else +#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\ + __nv_bfloat162 val; \ + asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ + " and.b32 high_a, %1, 0xffff0000U;\n"\ + " and.b32 high_b, %2, 0xffff0000U;\n"\ + " shl.b32 low_a, %1, 16;\n"\ + " shl.b32 low_b, %2, 16;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\ + " shr.u32 low_res, low_res, 16;\n"\ + " or.b32 %0, high_res, low_res;}\n"\ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; \ +} +#endif + +__CUDA_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.eq) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ne) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.le) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ge) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.lt) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.gt) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.equ) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.neu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.leu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.geu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.ltu) +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __COMPARISON_OP_BFLOAT162_MACRO(set.gtu) +} +#undef __COMPARISON_OP_BFLOAT162_MACRO + +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) +#define 
__BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\ + __nv_bfloat162 val; \ + bool retval; \ + asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + if (__BFLOAT162_TO_CUI(val) == 0x3F803F80U) {\ + retval = true; \ + } else { \ + retval = false; \ + }\ + return retval;\ +} +#else + +#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\ + unsigned int val; \ + asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ + " and.b32 high_a, %1, 0xffff0000U;\n"\ + " and.b32 high_b, %2, 0xffff0000U;\n"\ + " shl.b32 low_a, %1, 16;\n"\ + " shl.b32 low_b, %2, 16;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\ + " and.b32 %0, high_res, low_res;}\n"\ + :"=r"(val) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return (val != 0U) ? true : false; \ +} +#endif + +__CUDA_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.eq) +} +__CUDA_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ne) +} +__CUDA_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.le) +} +__CUDA_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ge) +} +__CUDA_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.lt) +} +__CUDA_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gt) +} +__CUDA_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.equ) +} +__CUDA_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.neu) +} +__CUDA_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.leu) +} +__CUDA_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.geu) +} +__CUDA_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ltu) +} +__CUDA_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gtu) +} +#undef __BOOL_COMPARISON_OP_BFLOAT162_MACRO +/****************************************************************************** +* __nv_bfloat16 comparison * +******************************************************************************/ +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) +#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp." __CUDA_BF16_STRINGIFY(name) ".bf16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); \ + return (val != 0U) ? true : false; \ +} +#else +#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\ + unsigned int val; \ + asm( "{.reg .b32 a,b;\n"\ + " mov.b32 a, {0, %1};\n"\ + " mov.b32 b, {0, %2};\n"\ + " set." __CUDA_BF16_STRINGIFY(name) ".f32.f32 %0, a, b;}\n"\ + :"=r"(val) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return (val != 0U) ? 
true : false; \ +} +#endif +__CUDA_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(eq) +} +__CUDA_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(ne) +} +__CUDA_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(le) +} +__CUDA_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(ge) +} +__CUDA_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(lt) +} +__CUDA_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(gt) +} +__CUDA_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(equ) +} +__CUDA_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(neu) +} +__CUDA_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(leu) +} +__CUDA_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(geu) +} +__CUDA_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(ltu) +} +__CUDA_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __COMPARISON_OP_BFLOAT16_MACRO(gtu) +} +#undef __COMPARISON_OP_BFLOAT16_MACRO +/****************************************************************************** +* __nv_bfloat162 arithmetic * +******************************************************************************/ +#define __BINARY_OP_BFLOAT162_MACRO(name) /* do */ {\ + __nv_bfloat162 val; \ + asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\ + " .reg .b16 low,high;\n"\ + " and.b32 high_a, %1, 0xffff0000U;\n"\ + " and.b32 high_b, %2, 0xffff0000U;\n"\ + " shl.b32 low_a, %1, 16;\n"\ + " shl.b32 low_b, %2, 16;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32 low_res, low_a, low_b;\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32 high_res, high_a, high_b;\n"\ + " cvt.rn.bf16.f32 low, low_res;\n"\ + " cvt.rn.bf16.f32 high, high_res;\n"\ + " mov.b32 %0, {low,high};}\n"\ + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; \ +} /* while(0) */ + +__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ add.bf16x2 %0,%1,%2; }\n" +#else + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x3f803f80U;\n" + " fma.rn.bf16x2 %0,%1,c,%2;}\n" +#endif + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ sub.bf16x2 %0,%1,%2; }\n" +#else + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0xbf80bf80U;\n" + " fma.rn.bf16x2 %0,%2,c,%1;}\n" +#endif + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ mul.bf16x2 %0,%1,%2; }\n" +#else + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x80008000U;\n" + " fma.rn.bf16x2 %0,%1,%2,c;}\n" 
+#endif + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ add.rn.bf16x2 %0,%1,%2; }\n" +#else + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x3f803f80U;\n" + " fma.rn.bf16x2 %0,%1,c,%2;}\n" +#endif + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ sub.rn.bf16x2 %0,%1,%2; }\n" +#else + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0xbf80bf80U;\n" + " fma.rn.bf16x2 %0,%2,c,%1;}\n" +#endif + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ mul.rn.bf16x2 %0,%1,%2; }\n" +#else + asm( "{.reg .b32 c;\n" + " mov.b32 c, 0x80008000U;\n" + " fma.rn.bf16x2 %0,%1,%2,c;}\n" +#endif + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{.reg .b32 f, one, zero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " fma.rn.bf16x2 f,%1,one,%2;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{.reg .b32 f, one, zero, mone;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " mov.b32 mone, 0xbf80bf80U;\n" + " fma.rn.bf16x2 f,%2,mone,%1;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{.reg .b32 f, one, zero, mzero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " mov.b32 mzero, 0x80008000U;\n" + " fma.rn.bf16x2 f,%1,%2,mzero;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{fma.rn.bf16x2 %0,%1,%2,%3;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{ .reg .b32 f, one, zero;\n" + " mov.b32 one, 0x3f803f80U;\n" + " mov.b32 zero, 0;\n" + " fma.rn.bf16x2 f, %1, %2, %3;\n" + " max.bf16x2 f, f, zero;\n" + " min.bf16x2 %0, f, one;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 
__h2div(const __nv_bfloat162 a, const __nv_bfloat162 b) { + __nv_bfloat16 ha, hb; + + ha = __low2bfloat16(a); + hb = __low2bfloat16(b); + + const __nv_bfloat16 v1 = __hdiv(ha, hb); + + ha = __high2bfloat16(a); + hb = __high2bfloat16(b); + + const __nv_bfloat16 v2 = __hdiv(ha, hb); + + return __halves2bfloat162(v1, v2); +} +/****************************************************************************** +* __nv_bfloat16 arithmetic * +******************************************************************************/ +#define __BINARY_OP_BFLOAT16_MACRO(name) /* do */ {\ + __nv_bfloat16 val; \ + asm( "{.reg .b32 a,b,res;\n"\ + " mov.b32 a, {0,%1};\n"\ + " mov.b32 b, {0,%2};\n"\ + " " __CUDA_BF16_STRINGIFY(name) ".f32 res, a, b;\n"\ + " cvt.rn.bf16.f32 %0, res;}\n"\ + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; \ +} /* while(0) */ + +__CUDA_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ add.bf16 %0,%1,%2; }\n" +#else + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x3f80U;\n" + " fma.rn.bf16 %0,%1,c,%2;}\n" +#endif + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ sub.bf16 %0,%1,%2; }\n" +#else + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0xbf80U;\n" + " fma.rn.bf16 %0,%2,c,%1;}\n" +#endif + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ mul.bf16 %0,%1,%2; }\n" +#else + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x8000U;\n" + " fma.rn.bf16 %0,%1,%2,c;}\n" +#endif + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ add.rn.bf16 %0,%1,%2; }\n" +#else + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x3f80U;\n" + " fma.rn.bf16 %0,%1,c,%2;}\n" +#endif + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ sub.rn.bf16 %0,%1,%2; }\n" +#else + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0xbf80U;\n" + " fma.rn.bf16 %0,%2,c,%1;}\n" +#endif + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + asm( "{ mul.rn.bf16 %0,%1,%2; }\n" +#else + asm( "{.reg .b16 c;\n" + " mov.b16 c, 0x8000U;\n" + " fma.rn.bf16 %0,%1,%2,c;}\n" +#endif + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \ + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ .reg .b16 f, one, zero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " 
fma.rn.bf16 f, %1, one, %2;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ .reg .b16 f, one, zero, mone;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " mov.b16 mone, 0xbf80U;\n" + " fma.rn.bf16 f, %2, mone, %1;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ .reg .b16 f, one, zero, mzero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " mov.b16 mzero, 0x8000U;\n" + " fma.rn.bf16 f, %1, %2, mzero;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ + __nv_bfloat16 val; + asm( "{fma.rn.bf16 %0,%1,%2,%3;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ + __nv_bfloat16 val; + asm( "{ .reg .b16 f, one, zero;\n" + " mov.b16 one, 0x3f80U;\n" + " mov.b16 zero, 0;\n" + " fma.rn.bf16 f, %1, %2, %3;\n" + " max.bf16 f, f, zero;\n" + " min.bf16 %0, f, one;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) { + __BINARY_OP_BFLOAT16_MACRO(div.rn) +} + +/****************************************************************************** +* __nv_bfloat162 functions * +******************************************************************************/ +#define __APPROX_FCAST(fun) /* do */ {\ + __nv_bfloat16 val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " mov.b32 f,{0,r}; \n"\ + " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 f,f; \n"\ + " cvt.rn.bf16.f32 r,f; \n"\ + " mov.b16 %0,r; \n"\ + "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));\ + return val;\ +} /* while(0) */ +#define __APPROX_FCAST2(fun) /* do */ {\ + __nv_bfloat162 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " mov.b32 fl, {0,hl}; \n"\ + " mov.b32 fu, {0,hu}; \n"\ + " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fl, fl; \n"\ + " " __CUDA_BF16_STRINGIFY(fun) ".approx.f32 fu, fu; \n"\ + " cvt.rn.bf16.f32 hl, fl; \n"\ + " cvt.rn.bf16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); \ + return val;\ +} /* while(0) */ +__CUDA_BF16_DECL__ __nv_bfloat16 __hsin_internal(const __nv_bfloat16 a) { + float f = __bfloat162float(a); + f = sinf(f); + return __float2bfloat16_rn(f); +} +__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a) { + return __hsin_internal(a); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a) { + const __nv_bfloat16 l = __low2bfloat16(a); + const __nv_bfloat16 h = __high2bfloat16(a); + return __halves2bfloat162(__hsin_internal(l), __hsin_internal(h)); +} +__CUDA_BF16_DECL__ 
__nv_bfloat16 __hcos_internal(const __nv_bfloat16 a) { + float f = __bfloat162float(a); + f = cosf(f); + return __float2bfloat16_rn(f); +} +__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a) { + return __hcos_internal(a); +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a) { + const __nv_bfloat16 l = __low2bfloat16(a); + const __nv_bfloat16 h = __high2bfloat16(a); + return __halves2bfloat162(__hcos_internal(l), __hcos_internal(h)); +} + +#define __BF16_SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, p;\n"\ + " mov.b32 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\ + " mov.b32 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16x2.f16x2 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.bf16x2 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n" +#define __BF16_SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\ + " mov.b16 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16.f16 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.bf16 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n" + +__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a) { + __nv_bfloat16 val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " mov.b32 f,{0,h}; \n" + " mov.b32 C, 0x3FB8AA3CU; \n" + " mul.f32 f,f,C; \n" + " ex2.approx.f32 f,f; \n" + " cvt.rn.bf16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a) { + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " mov.b32 C, 0x3FB8AA3CU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a) { + __APPROX_FCAST(ex2) +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a) { + __APPROX_FCAST2(ex2) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a) { + __nv_bfloat16 val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " mov.b32 f, {0,h}; \n" + " mov.b32 C, 0x40549A78U; \n" + " mul.f32 f,f,C; \n" + " ex2.approx.f32 f, f; \n" + " cvt.rn.bf16.f32 r, f; \n" + __BF16_SPEC_CASE(%1, r, 0xBC95U,0xBF00U) + " mov.b16 %0, r; \n" + "}":"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a) { + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu, C; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " mov.b32 C, 0x40549A78U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " ex2.approx.f32 fl, fl; \n" + " ex2.approx.f32 fu, fu; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __BF16_SPEC_CASE2(%1, r, 0xBC95BC95U,0xBF00BF00U) + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a) { + __APPROX_FCAST(lg2) +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a) { + 
__APPROX_FCAST2(lg2) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a) { + __nv_bfloat16 val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + " mov.b32 f,{0,h}; \n" + " lg2.approx.f32 f,f; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.bf16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a) { + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a) { + __nv_bfloat16 val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " mov.b32 f, {0,h}; \n" + " lg2.approx.f32 f, f; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.bf16.f32 r, f; \n" + " mov.b16 %0, r; \n" + "}":"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a) { + __nv_bfloat162 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " mov.b32 fl, {0,hl}; \n" + " mov.b32 fu, {0,hu}; \n" + " lg2.approx.f32 fl, fl; \n" + " lg2.approx.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.bf16.f32 hl, fl; \n" + " cvt.rn.bf16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + " mov.b32 %0, r; \n" + "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a))); + return val; +} +#undef __BF16_SPEC_CASE2 +#undef __BF16_SPEC_CASE +__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a) { + __APPROX_FCAST2(rcp) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a) { + __APPROX_FCAST(rcp) +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a) { + __APPROX_FCAST2(rsqrt) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a) { + __APPROX_FCAST(rsqrt) +} +__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a) { + __APPROX_FCAST2(sqrt) +} +__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a) { + __APPROX_FCAST(sqrt) +} +#undef __APPROX_FCAST +#undef __APPROX_FCAST2 +__CUDA_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a) +{ +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat162 r; + asm("{set.nan.bf16x2.bf16x2 %0,%1,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); + return r; +#else + const __nv_bfloat162 b = a; + __BINARY_OP_BFLOAT162_MACRO(set.nan.f32) +#endif +} +__CUDA_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a) +{ +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 r; + asm("{set.nan.bf16.bf16 %0,%1,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return __BFLOAT16_TO_CUS(r) != 0U; +#else + unsigned int r; + asm( "{.reg .b32 a;\n" + " mov.b32 a, {0,%1};\n" + " set.nan.f32.f32 %0, a, a;}\n" + :"=r"(r) : "h"(__BFLOAT16_TO_CUS(a))); + return r != 0U; +#endif +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hneg2(const 
__nv_bfloat162 a) +{ + __nv_bfloat162 r; + asm("{neg.bf16x2 %0,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); + return r; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a) +{ + __nv_bfloat16 r; + asm("{neg.bf16 %0,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return r; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a) +{ + __nv_bfloat162 r; + asm("{abs.bf16x2 %0,%1;\n}" + :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a))); + return r; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a) +{ + __nv_bfloat16 r; + asm("{abs.bf16 %0,%1;\n}" + :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a))); + return r; +} +/****************************************************************************** +* __nv_bfloat16 arithmetic * +******************************************************************************/ +__CUDA_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ max.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ min.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ max.NaN.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b) +{ + __nv_bfloat16 val; + asm( "{ min.NaN.bf16 %0,%1,%2;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c) +{ + __nv_bfloat16 val; + asm( "{ fma.rn.relu.bf16 %0,%1,%2,%3;\n}" + :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c))); + return val; +} +/****************************************************************************** +* __nv_bfloat162 arithmetic * +******************************************************************************/ +__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{ max.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{ min.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{ max.NaN.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b) +{ + __nv_bfloat162 val; + asm( "{ min.NaN.bf16x2 %0,%1,%2;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); + return val; +} +__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const 
__nv_bfloat162 b, const __nv_bfloat162 c) +{ + __nv_bfloat162 val; + asm( "{ fma.rn.relu.bf16x2 %0,%1,%2,%3;\n}" + :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c))); + return val; +} + +__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c) +{ + // fast version of complex multiply-accumulate + // (a.re, a.im) * (b.re, b.im) + (c.re, c.im) + // acc.re = (c.re + a.re*b.re) - a.im*b.im + // acc.im = (c.im + a.re*b.im) + a.im*b.re + __nv_bfloat16 real_tmp = __hfma(a.x, b.x, c.x); + __nv_bfloat16 img_tmp = __hfma(a.x, b.y, c.y); + real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); + img_tmp = __hfma(a.y, b.x, img_tmp); + return make_bfloat162(real_tmp, img_tmp); +} + + +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ + +__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat162 r; + asm volatile ("{ atom.add.noftz.bf16x2 %0,[%1],%2; }\n" + : "=r"(__BFLOAT162_TO_UI(r)) : __PTR(address), "r"(__BFLOAT162_TO_CUI(val)) + : "memory"); + return r; +#else + unsigned int* address_as_uint = (unsigned int*)address; + unsigned int old = *address_as_uint, assumed; + do { + assumed = old; + __nv_bfloat162 new_val = __hadd2(val, *(__nv_bfloat162*)&assumed); + old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val); + } while (assumed != old); + return *(__nv_bfloat162*)&old; +#endif +} + +__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val) +{ +#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + __nv_bfloat16 r; + asm volatile ("{ atom.add.noftz.bf16 %0,[%1],%2; }\n" + : "=h"(__BFLOAT16_TO_US(r)) + : __PTR(address), "h"(__BFLOAT16_TO_CUS(val)) + : "memory"); + return r; +#else + unsigned short int* address_as_us = (unsigned short int*)address; + unsigned short int old = *address_as_us, assumed; + do { + assumed = old; + old = atomicCAS(address_as_us, assumed, + __bfloat16_as_ushort(__hadd(val, __ushort_as_bfloat16(assumed)))); + } while (assumed != old); + return __ushort_as_bfloat16(old); +#endif +} + +#undef __PTR +#undef __CUDA_BF16_DECL__ +#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */ +#endif /* defined(__cplusplus) */ + +#undef __BINARY_OP_BFLOAT162_MACRO +#undef __BINARY_OP_BFLOAT16_MACRO + +#undef __CUDA_HOSTDEVICE_BF16_DECL__ +#undef __CUDA_BF16_DECL__ + +/* Define first-class types "nv_bfloat16" and "nv_bfloat162", unless user specifies otherwise via "#define CUDA_NO_BFLOAT16" */ +/* C cannot ever have these types defined here, because __nv_bfloat16 and __nv_bfloat162 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) +typedef __nv_bfloat16 nv_bfloat16; +typedef __nv_bfloat162 nv_bfloat162; + +#endif /* defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_BF16) +#undef __CPP_VERSION_AT_LEAST_11_BF16 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */ + +#endif /* end of include guard: __CUDA_BF16_HPP__ */ diff --git a/ext/cudart/include/cuda_d3d10_interop.h b/ext/cudart/include/cuda_d3d10_interop.h new file mode 100644 index 
0000000000000000000000000000000000000000..8f485047c8dccbe4080452bc24e4e46ba87c2adb --- /dev/null +++ b/ext/cudart/include/cuda_d3d10_interop.h @@ -0,0 +1,724 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__CUDA_D3D10_INTEROP_H__) +#define __CUDA_D3D10_INTEROP_H__ + +#include "cuda_runtime_api.h" + +/** \cond impl_private */ +#if !defined(__dv) + +#if defined(__cplusplus) + +#define __dv(v) \ + = v + +#else /* __cplusplus */ + +#define __dv(v) + +#endif /* __cplusplus */ + +#endif /* !__dv */ +/** \endcond impl_private */ + +#include <d3d10_1.h> + +/** \cond impl_private */ +#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif +/** \endcond impl_private */ + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +/** + * \addtogroup CUDART_D3D10 Direct3D 10 Interoperability + * This section describes the Direct3D 10 interoperability functions of the CUDA + * runtime application programming interface. Note that mapping of Direct3D 10 + * resources is performed with the graphics API agnostic, resource mapping + * interface described in \ref CUDART_INTEROP "Graphics Interopability". + * + * @{ + */ + +/** + * CUDA devices corresponding to a D3D10 device + */ +enum cudaD3D10DeviceList +{ + cudaD3D10DeviceListAll = 1, /**< The CUDA devices for all GPUs used by a D3D10 device */ + cudaD3D10DeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by a D3D10 device in its currently rendering frame */ + cudaD3D10DeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by a D3D10 device in the next frame */ +}; + +/** + * \brief Registers a Direct3D 10 resource for access by CUDA + * + * Registers the Direct3D 10 resource \p pD3DResource for access by CUDA. + * + * If this call is successful, then the application will be able to map and + * unmap this resource until it is unregistered through + * ::cudaGraphicsUnregisterResource(). Also on success, this call will increase the + * internal reference count on \p pD3DResource. This reference count will be + * decremented when this resource is unregistered through + * ::cudaGraphicsUnregisterResource(). + * + * This call potentially has a high-overhead and should not be called every frame + * in interactive applications. + * + * The type of \p pD3DResource must be one of the following. + * + * - ::ID3D10Buffer: may be accessed via a device pointer + * - ::ID3D10Texture1D: individual subresources of the texture may be accessed via arrays + * - ::ID3D10Texture2D: individual subresources of the texture may be accessed via arrays + * - ::ID3D10Texture3D: individual subresources of the texture may be accessed via arrays + * + * The \p flags argument may be used to specify additional parameters at register + * time. The valid values for this parameter are + * + * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this + * resource will be used. + * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will + * bind this resource to a surface reference. + * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform + * texture gather operations on this resource. + * + * Not all Direct3D resources of the above types may be used for + * interoperability with CUDA. The following are some limitations. + * + * - The primary rendertarget may not be registered with CUDA. + * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16, + * or 32-bit integer or floating-point data cannot be shared. 
+ * - Surfaces of depth or stencil formats cannot be shared. + * + * A complete list of supported DXGI formats is as follows. For compactness the + * notation A_{B,C,D} represents A_B, A_C, and A_D. + * - DXGI_FORMAT_A8_UNORM + * - DXGI_FORMAT_B8G8R8A8_UNORM + * - DXGI_FORMAT_B8G8R8X8_UNORM + * - DXGI_FORMAT_R16_FLOAT + * - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R32_FLOAT + * - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT} + * - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT} + * - DXGI_FORMAT_R32_{SINT,UINT} + * - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB} + * - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM} + * + * If \p pD3DResource is of incorrect type or is already registered, then + * ::cudaErrorInvalidResourceHandle is returned. + * If \p pD3DResource cannot be registered, then ::cudaErrorUnknown is returned. + * + * \param resource - Pointer to returned resource handle + * \param pD3DResource - Direct3D resource to register + * \param flags - Parameters for resource registration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsUnregisterResource, + * ::cudaGraphicsMapResources, + * ::cudaGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsResourceGetMappedPointer, + * ::cuGraphicsD3D10RegisterResource + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsD3D10RegisterResource(struct cudaGraphicsResource **resource, ID3D10Resource *pD3DResource, unsigned int flags); + +/** + * \brief Gets the device number for an adapter + * + * Returns in \p *device the CUDA-compatible device corresponding to the + * adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters. This call + * will succeed only if a device on adapter \p pAdapter is CUDA-compatible. + * + * \param device - Returns the device corresponding to pAdapter + * \param pAdapter - D3D10 adapter to get device for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsD3D10RegisterResource, + * ::cuD3D10GetDevice + */ +extern __host__ cudaError_t CUDARTAPI cudaD3D10GetDevice(int *device, IDXGIAdapter *pAdapter); + +/** + * \brief Gets the CUDA devices corresponding to a Direct3D 10 device + * + * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding + * to the Direct3D 10 device \p pD3D10Device. + * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the the CUDA-compatible devices + * corresponding to the Direct3D 10 device \p pD3D10Device. + * + * If any of the GPUs being used to render \p pDevice are not CUDA capable then the + * call will return ::cudaErrorNoDevice. + * + * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D10Device + * \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D10Device + * \param cudaDeviceCount - The size of the output device array \p pCudaDevices + * \param pD3D10Device - Direct3D 10 device to query for CUDA devices + * \param deviceList - The set of devices to return. 
This set may be + * ::cudaD3D10DeviceListAll for all devices, + * ::cudaD3D10DeviceListCurrentFrame for the devices used to + * render the current frame (in SLI), or + * ::cudaD3D10DeviceListNextFrame for the devices used to + * render the next frame (in SLI). + * + * \return + * ::cudaSuccess, + * ::cudaErrorNoDevice, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsUnregisterResource, + * ::cudaGraphicsMapResources, + * ::cudaGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsResourceGetMappedPointer, + * ::cuD3D10GetDevices + */ +extern __host__ cudaError_t CUDARTAPI cudaD3D10GetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, ID3D10Device *pD3D10Device, enum cudaD3D10DeviceList deviceList); + +/** @} */ /* END CUDART_D3D10 */ + +/** + * \addtogroup CUDART_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED] + * This section describes deprecated Direct3D 10 interoperability functions. + * + * @{ + */ + +/** + * CUDA D3D10 Register Flags + */ +enum cudaD3D10RegisterFlags +{ + cudaD3D10RegisterFlagsNone = 0, /**< Default; Resource can be accessed through a void* */ + cudaD3D10RegisterFlagsArray = 1 /**< Resource can be accessed through a CUarray* */ +}; + +/** + * CUDA D3D10 Map Flags + */ +enum cudaD3D10MapFlags +{ + cudaD3D10MapFlagsNone = 0, /**< Default; Assume resource can be read/written */ + cudaD3D10MapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */ + cudaD3D10MapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */ +}; + +/** + * \brief Gets the Direct3D device against which the current CUDA context was + * created + * + * \deprecated This function is deprecated as of CUDA 5.0. + * + * This function is deprecated and should no longer be used. It is + * no longer necessary to associate a CUDA device with a D3D10 + * device in order to achieve maximum interoperability performance. + * + * \param ppD3D10Device - Returns the Direct3D device for this thread + * + * \return + * ::cudaSuccess, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaD3D10SetDirect3DDevice + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10GetDirect3DDevice(ID3D10Device **ppD3D10Device); + +/** + * \brief Sets the Direct3D 10 device to use for interoperability with + * a CUDA device + * + * \deprecated This function is deprecated as of CUDA 5.0. + * + * This function is deprecated and should no longer be used. It is + * no longer necessary to associate a CUDA device with a D3D10 + * device in order to achieve maximum interoperability performance. + * + * \param pD3D10Device - Direct3D device to use for interoperability + * \param device - The CUDA device to use. This device must be among the devices + * returned when querying ::cudaD3D10DeviceListAll from ::cudaD3D10GetDevices, + * may be set to -1 to automatically select an appropriate CUDA device. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidValue, + * ::cudaErrorSetOnActiveProcess + * \notefnerr + * + * \sa + * ::cudaD3D10GetDevice, + * ::cudaGraphicsD3D10RegisterResource, + * ::cudaDeviceReset + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10SetDirect3DDevice(ID3D10Device *pD3D10Device, int device __dv(-1)); + +/** + * \brief Registers a Direct3D 10 resource for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Registers the Direct3D resource \p pResource for access by CUDA. 
+ * + * If this call is successful, then the application will be able to map and + * unmap this resource until it is unregistered through + * ::cudaD3D10UnregisterResource(). Also on success, this call will increase + * the internal reference count on \p pResource. This reference count will be + * decremented when this resource is unregistered through + * ::cudaD3D10UnregisterResource(). + * + * This call potentially has a high-overhead and should not be called every frame + * in interactive applications. + * + * The type of \p pResource must be one of the following: + * + * - ::ID3D10Buffer: Cannot be used with \p flags set to + * \p cudaD3D10RegisterFlagsArray. + * - ::ID3D10Texture1D: No restrictions. + * - ::ID3D10Texture2D: No restrictions. + * - ::ID3D10Texture3D: No restrictions. + * + * The \p flags argument specifies the mechanism through which CUDA will + * access the Direct3D resource. The following values are allowed. + * + * - ::cudaD3D10RegisterFlagsNone: Specifies that CUDA will access this + * resource through a \p void*. The pointer, size, and pitch for each + * subresource of this resource may be queried through + * ::cudaD3D10ResourceGetMappedPointer(), ::cudaD3D10ResourceGetMappedSize(), + * and ::cudaD3D10ResourceGetMappedPitch() respectively. This option is valid + * for all resource types. + * - ::cudaD3D10RegisterFlagsArray: Specifies that CUDA will access this + * resource through a \p CUarray queried on a sub-resource basis through + * ::cudaD3D10ResourceGetMappedArray(). This option is only valid for resources + * of type ::ID3D10Texture1D, ::ID3D10Texture2D, and ::ID3D10Texture3D. + * + * Not all Direct3D resources of the above types may be used for + * interoperability with CUDA. The following are some limitations. + * + * - The primary rendertarget may not be registered with CUDA. + * - Resources allocated as shared may not be registered with CUDA. + * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16, + * or 32-bit integer or floating-point data cannot be shared. + * - Surfaces of depth or stencil formats cannot be shared. + * + * If Direct3D interoperability is not initialized on this context then + * ::cudaErrorInvalidDevice is returned. If \p pResource is of incorrect type + * or is already registered then ::cudaErrorInvalidResourceHandle is returned. + * If \p pResource cannot be registered then ::cudaErrorUnknown is returned. + * + * \param pResource - Resource to register + * \param flags - Parameters for resource registration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa ::cudaGraphicsD3D10RegisterResource + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10RegisterResource(ID3D10Resource *pResource, unsigned int flags); + +/** + * \brief Unregisters a Direct3D resource + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Unregisters the Direct3D resource \p resource so it is not accessible by + * CUDA unless registered again. + * + * If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle + * is returned. 
+ * + * \param pResource - Resource to unregister + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa ::cudaGraphicsUnregisterResource + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10UnregisterResource(ID3D10Resource *pResource); + +/** + * \brief Maps Direct3D Resources for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Maps the \p count Direct3D resources in \p ppResources for access by CUDA. + * + * The resources in \p ppResources may be accessed in CUDA kernels until they + * are unmapped. Direct3D should not access any resources while they are + * mapped by CUDA. If an application does so, the results are undefined. + * + * This function provides the synchronization guarantee that any Direct3D + * calls issued before ::cudaD3D10MapResources() will complete before any CUDA + * kernels issued after ::cudaD3D10MapResources() begin. + * + * If any of \p ppResources have not been registered for use with CUDA or if + * \p ppResources contains any duplicate entries then ::cudaErrorInvalidResourceHandle + * is returned. If any of \p ppResources are presently mapped for access by + * CUDA then ::cudaErrorUnknown is returned. + * + * \param count - Number of resources to map for CUDA + * \param ppResources - Resources to map for CUDA + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa ::cudaGraphicsMapResources + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10MapResources(int count, ID3D10Resource **ppResources); + +/** + * \brief Unmaps Direct3D resources + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Unmaps the \p count Direct3D resource in \p ppResources. + * + * This function provides the synchronization guarantee that any CUDA kernels + * issued before ::cudaD3D10UnmapResources() will complete before any Direct3D + * calls issued after ::cudaD3D10UnmapResources() begin. + * + * If any of \p ppResources have not been registered for use with CUDA or if + * \p ppResources contains any duplicate entries, then + * ::cudaErrorInvalidResourceHandle is returned. If any of \p ppResources are + * not presently mapped for access by CUDA then ::cudaErrorUnknown is returned. + * + * \param count - Number of resources to unmap for CUDA + * \param ppResources - Resources to unmap for CUDA + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa ::cudaGraphicsUnmapResources + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10UnmapResources(int count, ID3D10Resource **ppResources); + +/** + * \brief Gets an array through which to access a subresource of a Direct3D + * resource which has been mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *ppArray an array through which the subresource of the mapped + * Direct3D resource \p pResource which corresponds to \p subResource may be + * accessed. The value set in \p ppArray may change every time that + * \p pResource is mapped. + * + * If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is + * returned. If \p pResource was not registered with usage flags + * ::cudaD3D10RegisterFlagsArray, then ::cudaErrorInvalidResourceHandle is + * returned. If \p pResource is not mapped then ::cudaErrorUnknown is returned. 
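+ *
+ * A minimal sketch of this deprecated flow (assuming \p pResource was
+ * registered with ::cudaD3D10RegisterFlagsArray; error checking omitted):
+ * \code
+ * cudaD3D10MapResources(1, &pResource);
+ * cudaArray *array = NULL;
+ * cudaD3D10ResourceGetMappedArray(&array, pResource, 0);
+ * // ... read or write the array, e.g. with cudaMemcpy2DFromArray ...
+ * cudaD3D10UnmapResources(1, &pResource);
+ * \endcode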
+ * + * For usage requirements of the \p subResource parameter, see + * ::cudaD3D10ResourceGetMappedPointer(). + * + * \param ppArray - Returned array corresponding to subresource + * \param pResource - Mapped resource to access + * \param subResource - Subresource of pResource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa ::cudaGraphicsSubResourceGetMappedArray + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedArray(cudaArray **ppArray, ID3D10Resource *pResource, unsigned int subResource); + +/** + * \brief Set usage flags for mapping a Direct3D resource + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Set usage flags for mapping the Direct3D resource \p pResource. + * + * Changes to flags will take effect the next time \p pResource is mapped. + * The \p flags argument may be any of the following: + * + * - ::cudaD3D10MapFlagsNone: Specifies no hints about how this resource will + * be used. It is therefore assumed that this resource will be read from and + * written to by CUDA kernels. This is the default value. + * - ::cudaD3D10MapFlagsReadOnly: Specifies that CUDA kernels which access + * this resource will not write to this resource. + * - ::cudaD3D10MapFlagsWriteDiscard: Specifies that CUDA kernels which access + * this resource will not read from this resource and will write over the + * entire contents of the resource, so none of the data previously stored in + * the resource will be preserved. + * + * If \p pResource has not been registered for use with CUDA then + * ::cudaErrorInvalidHandle is returned. If \p pResource is presently mapped + * for access by CUDA then ::cudaErrorUnknown is returned. + * + * \param pResource - Registered resource to set flags for + * \param flags - Parameters for resource mapping + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown, + * \notefnerr + * + * \sa ::cudaGraphicsResourceSetMapFlags + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceSetMapFlags(ID3D10Resource *pResource, unsigned int flags); + +/** + * \brief Gets the dimensions of a registered Direct3D surface + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the + * subresource of the mapped Direct3D resource \p pResource which corresponds + * to \p subResource. + * + * Since anti-aliased surfaces may have multiple samples per pixel, it is + * possible that the dimensions of a resource will be an integer factor larger + * than the dimensions reported by the Direct3D runtime. + * + * The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D + * surfaces, the value returned in \p *pDepth will be 0. + * + * If \p pResource is not of type ::ID3D10Texture1D, ::ID3D10Texture2D, or + * ::ID3D10Texture3D, or if \p pResource has not been registered for use with + * CUDA, then ::cudaErrorInvalidHandle is returned. + + * For usage requirements of \p subResource parameters see + * ::cudaD3D10ResourceGetMappedPointer(). 
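+ *
+ * A minimal sketch (assuming \p pResource is a registered ::ID3D10Texture2D;
+ * error checking omitted):
+ * \code
+ * size_t width = 0, height = 0, depth = 0;
+ * cudaD3D10ResourceGetSurfaceDimensions(&width, &height, &depth, pResource, 0);
+ * \endcode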
+ * + * \param pWidth - Returned width of surface + * \param pHeight - Returned height of surface + * \param pDepth - Returned depth of surface + * \param pResource - Registered resource to access + * \param subResource - Subresource of pResource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * \notefnerr + * + * \sa ::cudaGraphicsSubResourceGetMappedArray + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, ID3D10Resource *pResource, unsigned int subResource); + +/** + * \brief Gets a pointer through which to access a subresource of a Direct3D + * resource which has been mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pPointer the base pointer of the subresource of the mapped + * Direct3D resource \p pResource which corresponds to \p subResource. The + * value set in \p pPointer may change every time that \p pResource is mapped. + * + * If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is + * returned. If \p pResource was not registered with usage flags + * ::cudaD3D9RegisterFlagsNone, then ::cudaErrorInvalidResourceHandle is + * returned. If \p pResource is not mapped then ::cudaErrorUnknown is returned. + * + * If \p pResource is of type ::ID3D10Buffer then \p subResource must be 0. + * If \p pResource is of any other type, then the value of \p subResource must + * come from the subresource calculation in ::D3D10CalcSubResource(). + * + * \param pPointer - Returned pointer corresponding to subresource + * \param pResource - Mapped resource to access + * \param subResource - Subresource of pResource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa ::cudaGraphicsResourceGetMappedPointer + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedPointer(void **pPointer, ID3D10Resource *pResource, unsigned int subResource); + +/** + * \brief Gets the size of a subresource of a Direct3D resource which has been + * mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pSize the size of the subresource of the mapped Direct3D + * resource \p pResource which corresponds to \p subResource. The value set in + * \p pSize may change every time that \p pResource is mapped. + * + * If \p pResource has not been registered for use with CUDA then + * ::cudaErrorInvalidHandle is returned. If \p pResource was not registered + * with usage flags ::cudaD3D10RegisterFlagsNone, then + * ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped for + * access by CUDA then ::cudaErrorUnknown is returned. + * + * For usage requirements of the \p subResource parameter see + * ::cudaD3D10ResourceGetMappedPointer(). 
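+ *
+ * A minimal sketch combining the mapped pointer and size queries (assuming
+ * \p pResource was registered with ::cudaD3D10RegisterFlagsNone and is
+ * currently mapped; error checking omitted):
+ * \code
+ * void *devPtr = NULL;
+ * size_t size = 0;
+ * cudaD3D10ResourceGetMappedPointer(&devPtr, pResource, 0);
+ * cudaD3D10ResourceGetMappedSize(&size, pResource, 0);
+ * cudaMemset(devPtr, 0, size);
+ * \endcode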
+ * + * \param pSize - Returned size of subresource + * \param pResource - Mapped resource to access + * \param subResource - Subresource of pResource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa ::cudaGraphicsResourceGetMappedPointer + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedSize(size_t *pSize, ID3D10Resource *pResource, unsigned int subResource); + +/** + * \brief Gets the pitch of a subresource of a Direct3D resource which has been + * mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of + * the subresource of the mapped Direct3D resource \p pResource, which + * corresponds to \p subResource. The values set in \p pPitch and + * \p pPitchSlice may change every time that \p pResource is mapped. + * + * The pitch and Z-slice pitch values may be used to compute the location of a + * sample on a surface as follows. + * + * For a 2D surface, the byte offset of the sample at position \b x, \b y from + * the base pointer of the surface is: + * + * \b y * \b pitch + (<b>bytes per pixel</b>) * \b x + * + * For a 3D surface, the byte offset of the sample at position \b x, \b y, + * \b z from the base pointer of the surface is: + * + * \b z* \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x + * + * Both parameters \p pPitch and \p pPitchSlice are optional and may be set to + * NULL. + * + * If \p pResource is not of type ::ID3D10Texture1D, ::ID3D10Texture2D, or + * ::ID3D10Texture3D, or if \p pResource has not been registered for use with + * CUDA, then ::cudaErrorInvalidResourceHandle is returned. If \p pResource was + * not registered with usage flags ::cudaD3D10RegisterFlagsNone, then + * ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped + * for access by CUDA then ::cudaErrorUnknown is returned. + * + * For usage requirements of the \p subResource parameter see + * ::cudaD3D10ResourceGetMappedPointer(). + * + * \param pPitch - Returned pitch of subresource + * \param pPitchSlice - Returned Z-slice pitch of subresource + * \param pResource - Mapped resource to access + * \param subResource - Subresource of pResource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa ::cudaGraphicsSubResourceGetMappedArray + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D10ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, ID3D10Resource *pResource, unsigned int subResource); + +/** @} */ /* END CUDART_D3D10_DEPRECATED */ + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#undef __dv +#undef __CUDA_DEPRECATED + +#endif /* __CUDA_D3D10_INTEROP_H__ */ diff --git a/ext/cudart/include/cuda_d3d11_interop.h b/ext/cudart/include/cuda_d3d11_interop.h new file mode 100644 index 0000000000000000000000000000000000000000..7e0c14cf8d2d0e4b739cc9af0651d52b4afc4b5a --- /dev/null +++ b/ext/cudart/include/cuda_d3d11_interop.h @@ -0,0 +1,323 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_D3D11_INTEROP_H__) +#define __CUDA_D3D11_INTEROP_H__ + +#include "cuda_runtime_api.h" + +/** \cond impl_private */ +#if !defined(__dv) + +#if defined(__cplusplus) + +#define __dv(v) \ + = v + +#else /* __cplusplus */ + +#define __dv(v) + +#endif /* __cplusplus */ + +#endif /* !__dv */ +/** \endcond impl_private */ + +#include <d3d11.h> + +/** \cond impl_private */ +#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif +/** \endcond impl_private */ + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +/** + * \addtogroup CUDART_D3D11 Direct3D 11 Interoperability + * This section describes the Direct3D 11 interoperability functions of the CUDA + * runtime application programming interface. Note that mapping of Direct3D 11 + * resources is performed with the graphics API agnostic, resource mapping + * interface described in \ref CUDART_INTEROP "Graphics Interopability". 
+ * + * @{ + */ + +/** + * CUDA devices corresponding to a D3D11 device + */ +enum cudaD3D11DeviceList +{ + cudaD3D11DeviceListAll = 1, /**< The CUDA devices for all GPUs used by a D3D11 device */ + cudaD3D11DeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by a D3D11 device in its currently rendering frame */ + cudaD3D11DeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by a D3D11 device in the next frame */ +}; + +/** + * \brief Register a Direct3D 11 resource for access by CUDA + * + * Registers the Direct3D 11 resource \p pD3DResource for access by CUDA. + * + * If this call is successful, then the application will be able to map and + * unmap this resource until it is unregistered through + * ::cudaGraphicsUnregisterResource(). Also on success, this call will increase the + * internal reference count on \p pD3DResource. This reference count will be + * decremented when this resource is unregistered through + * ::cudaGraphicsUnregisterResource(). + * + * This call potentially has a high-overhead and should not be called every frame + * in interactive applications. + * + * The type of \p pD3DResource must be one of the following. + * + * - ::ID3D11Buffer: may be accessed via a device pointer + * - ::ID3D11Texture1D: individual subresources of the texture may be accessed via arrays + * - ::ID3D11Texture2D: individual subresources of the texture may be accessed via arrays + * - ::ID3D11Texture3D: individual subresources of the texture may be accessed via arrays + * + * The \p flags argument may be used to specify additional parameters at register + * time. The valid values for this parameter are + * + * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this + * resource will be used. + * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will + * bind this resource to a surface reference. + * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform + * texture gather operations on this resource. + * + * Not all Direct3D resources of the above types may be used for + * interoperability with CUDA. The following are some limitations. + * + * - The primary rendertarget may not be registered with CUDA. + * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16, + * or 32-bit integer or floating-point data cannot be shared. + * - Surfaces of depth or stencil formats cannot be shared. + * + * A complete list of supported DXGI formats is as follows. For compactness the + * notation A_{B,C,D} represents A_B, A_C, and A_D. + * - DXGI_FORMAT_A8_UNORM + * - DXGI_FORMAT_B8G8R8A8_UNORM + * - DXGI_FORMAT_B8G8R8X8_UNORM + * - DXGI_FORMAT_R16_FLOAT + * - DXGI_FORMAT_R16G16B16A16_{FLOAT,SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R16G16_{FLOAT,SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R16_{SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R32_FLOAT + * - DXGI_FORMAT_R32G32B32A32_{FLOAT,SINT,UINT} + * - DXGI_FORMAT_R32G32_{FLOAT,SINT,UINT} + * - DXGI_FORMAT_R32_{SINT,UINT} + * - DXGI_FORMAT_R8G8B8A8_{SINT,SNORM,UINT,UNORM,UNORM_SRGB} + * - DXGI_FORMAT_R8G8_{SINT,SNORM,UINT,UNORM} + * - DXGI_FORMAT_R8_{SINT,SNORM,UINT,UNORM} + * + * If \p pD3DResource is of incorrect type or is already registered, then + * ::cudaErrorInvalidResourceHandle is returned. + * If \p pD3DResource cannot be registered, then ::cudaErrorUnknown is returned. 
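+ *
+ * A minimal usage sketch (assuming pBuffer is an ::ID3D11Buffer created on a
+ * CUDA-capable device; error checking omitted):
+ * \code
+ * struct cudaGraphicsResource *cudaRes = NULL;
+ * cudaGraphicsD3D11RegisterResource(&cudaRes, pBuffer, cudaGraphicsRegisterFlagsNone);
+ * cudaGraphicsMapResources(1, &cudaRes, 0);
+ * void *devPtr = NULL;
+ * size_t size = 0;
+ * cudaGraphicsResourceGetMappedPointer(&devPtr, &size, cudaRes);
+ * // ... launch kernels that read or write devPtr ...
+ * cudaGraphicsUnmapResources(1, &cudaRes, 0);
+ * cudaGraphicsUnregisterResource(cudaRes);
+ * \endcode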
+ * + * \param resource - Pointer to returned resource handle + * \param pD3DResource - Direct3D resource to register + * \param flags - Parameters for resource registration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsUnregisterResource, + * ::cudaGraphicsMapResources, + * ::cudaGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsResourceGetMappedPointer, + * ::cuGraphicsD3D11RegisterResource + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsD3D11RegisterResource(struct cudaGraphicsResource **resource, ID3D11Resource *pD3DResource, unsigned int flags); + +/** + * \brief Gets the device number for an adapter + * + * Returns in \p *device the CUDA-compatible device corresponding to the + * adapter \p pAdapter obtained from ::IDXGIFactory::EnumAdapters. This call + * will succeed only if a device on adapter \p pAdapter is CUDA-compatible. + * + * \param device - Returns the device corresponding to pAdapter + * \param pAdapter - D3D11 adapter to get device for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsUnregisterResource, + * ::cudaGraphicsMapResources, + * ::cudaGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsResourceGetMappedPointer, + * ::cuD3D11GetDevice + */ +extern __host__ cudaError_t CUDARTAPI cudaD3D11GetDevice(int *device, IDXGIAdapter *pAdapter); + +/** + * \brief Gets the CUDA devices corresponding to a Direct3D 11 device + * + * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding + * to the Direct3D 11 device \p pD3D11Device. + * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the the CUDA-compatible devices + * corresponding to the Direct3D 11 device \p pD3D11Device. + * + * If any of the GPUs being used to render \p pDevice are not CUDA capable then the + * call will return ::cudaErrorNoDevice. + * + * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D11Device + * \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D11Device + * \param cudaDeviceCount - The size of the output device array \p pCudaDevices + * \param pD3D11Device - Direct3D 11 device to query for CUDA devices + * \param deviceList - The set of devices to return. This set may be + * ::cudaD3D11DeviceListAll for all devices, + * ::cudaD3D11DeviceListCurrentFrame for the devices used to + * render the current frame (in SLI), or + * ::cudaD3D11DeviceListNextFrame for the devices used to + * render the next frame (in SLI). + * + * \return + * ::cudaSuccess, + * ::cudaErrorNoDevice, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsUnregisterResource, + * ::cudaGraphicsMapResources, + * ::cudaGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsResourceGetMappedPointer, + * ::cuD3D11GetDevices + */ +extern __host__ cudaError_t CUDARTAPI cudaD3D11GetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, ID3D11Device *pD3D11Device, enum cudaD3D11DeviceList deviceList); + +/** @} */ /* END CUDART_D3D11 */ + +/** + * \addtogroup CUDART_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED] + * This section describes deprecated Direct3D 11 interoperability functions. 
+ * + * @{ + */ + +/** + * \brief Gets the Direct3D device against which the current CUDA context was + * created + * + * \deprecated This function is deprecated as of CUDA 5.0. + * + * This function is deprecated and should no longer be used. It is + * no longer necessary to associate a CUDA device with a D3D11 + * device in order to achieve maximum interoperability performance. + * + * \param ppD3D11Device - Returns the Direct3D device for this thread + * + * \return + * ::cudaSuccess, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaD3D11SetDirect3DDevice + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D11GetDirect3DDevice(ID3D11Device **ppD3D11Device); + +/** + * \brief Sets the Direct3D 11 device to use for interoperability with + * a CUDA device + * + * \deprecated This function is deprecated as of CUDA 5.0. + * + * This function is deprecated and should no longer be used. It is + * no longer necessary to associate a CUDA device with a D3D11 + * device in order to achieve maximum interoperability performance. + * + * \param pD3D11Device - Direct3D device to use for interoperability + * \param device - The CUDA device to use. This device must be among the devices + * returned when querying ::cudaD3D11DeviceListAll from ::cudaD3D11GetDevices, + * may be set to -1 to automatically select an appropriate CUDA device. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidValue, + * ::cudaErrorSetOnActiveProcess + * \notefnerr + * + * \sa + * ::cudaD3D11GetDevice, + * ::cudaGraphicsD3D11RegisterResource, + * ::cudaDeviceReset + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D11SetDirect3DDevice(ID3D11Device *pD3D11Device, int device __dv(-1)); + +/** @} */ /* END CUDART_D3D11_DEPRECATED */ + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#undef __dv +#undef __CUDA_DEPRECATED + +#endif /* __CUDA_D3D11_INTEROP_H__ */ diff --git a/ext/cudart/include/cuda_d3d9_interop.h b/ext/cudart/include/cuda_d3d9_interop.h new file mode 100644 index 0000000000000000000000000000000000000000..5bac47db597c48bcfab5971cd47645679348294a --- /dev/null +++ b/ext/cudart/include/cuda_d3d9_interop.h @@ -0,0 +1,782 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_D3D9_INTEROP_H__) +#define __CUDA_D3D9_INTEROP_H__ + +#include "cuda_runtime_api.h" + +/** \cond impl_private */ +#if !defined(__dv) + +#if defined(__cplusplus) + +#define __dv(v) \ + = v + +#else /* __cplusplus */ + +#define __dv(v) + +#endif /* __cplusplus */ + +#endif /* !__dv */ +/** \endcond impl_private */ + +#include <d3d9.h> + +/** \cond impl_private */ +#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif +/** \endcond impl_private */ + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +/** + * \addtogroup CUDART_D3D9 Direct3D 9 Interoperability + * This section describes the Direct3D 9 interoperability functions of the CUDA + * runtime application programming interface. Note that mapping of Direct3D 9 + * resources is performed with the graphics API agnostic, resource mapping + * interface described in \ref CUDART_INTEROP "Graphics Interopability". + * + * @{ + */ + +/** + * CUDA devices corresponding to a D3D9 device + */ +enum cudaD3D9DeviceList +{ + cudaD3D9DeviceListAll = 1, /**< The CUDA devices for all GPUs used by a D3D9 device */ + cudaD3D9DeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by a D3D9 device in its currently rendering frame */ + cudaD3D9DeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by a D3D9 device in the next frame */ +}; + +/** + * \brief Gets the Direct3D device against which the current CUDA context was + * created + * + * Returns in \p *ppD3D9Device the Direct3D device against which this CUDA + * context was created in ::cudaD3D9SetDirect3DDevice(). 
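+ *
+ * A minimal sketch (error checking omitted):
+ * \code
+ * IDirect3DDevice9 *pD3D9Device = NULL;
+ * cudaD3D9GetDirect3DDevice(&pD3D9Device);
+ * \endcode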
+ * + * \param ppD3D9Device - Returns the Direct3D device for this thread + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidGraphicsContext, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaD3D9SetDirect3DDevice, + * ::cuD3D9GetDirect3DDevice + */ +extern __host__ cudaError_t CUDARTAPI cudaD3D9GetDirect3DDevice(IDirect3DDevice9 **ppD3D9Device); + +/** + * \brief Register a Direct3D 9 resource for access by CUDA + * + * Registers the Direct3D 9 resource \p pD3DResource for access by CUDA. + * + * If this call is successful then the application will be able to map and + * unmap this resource until it is unregistered through + * ::cudaGraphicsUnregisterResource(). Also on success, this call will increase the + * internal reference count on \p pD3DResource. This reference count will be + * decremented when this resource is unregistered through + * ::cudaGraphicsUnregisterResource(). + * + * This call potentially has a high-overhead and should not be called every frame + * in interactive applications. + * + * The type of \p pD3DResource must be one of the following. + * + * - ::IDirect3DVertexBuffer9: may be accessed through a device pointer + * - ::IDirect3DIndexBuffer9: may be accessed through a device pointer + * - ::IDirect3DSurface9: may be accessed through an array. + * Only stand-alone objects of type ::IDirect3DSurface9 + * may be explicitly shared. In particular, individual mipmap levels and faces + * of cube maps may not be registered directly. To access individual surfaces + * associated with a texture, one must register the base texture object. + * - ::IDirect3DBaseTexture9: individual surfaces on this texture may be accessed + * through an array. + * + * The \p flags argument may be used to specify additional parameters at register + * time. The valid values for this parameter are + * + * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this + * resource will be used. + * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will + * bind this resource to a surface reference. + * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform + * texture gather operations on this resource. + * + * Not all Direct3D resources of the above types may be used for + * interoperability with CUDA. The following are some limitations. + * + * - The primary rendertarget may not be registered with CUDA. + * - Resources allocated as shared may not be registered with CUDA. + * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16, + * or 32-bit integer or floating-point data cannot be shared. + * - Surfaces of depth or stencil formats cannot be shared. + * + * A complete list of supported formats is as follows: + * - D3DFMT_L8 + * - D3DFMT_L16 + * - D3DFMT_A8R8G8B8 + * - D3DFMT_X8R8G8B8 + * - D3DFMT_G16R16 + * - D3DFMT_A8B8G8R8 + * - D3DFMT_A8 + * - D3DFMT_A8L8 + * - D3DFMT_Q8W8V8U8 + * - D3DFMT_V16U16 + * - D3DFMT_A16B16G16R16F + * - D3DFMT_A16B16G16R16 + * - D3DFMT_R32F + * - D3DFMT_G16R16F + * - D3DFMT_A32B32G32R32F + * - D3DFMT_G32R32F + * - D3DFMT_R16F + * + * If \p pD3DResource is of incorrect type or is already registered, then + * ::cudaErrorInvalidResourceHandle is returned. + * If \p pD3DResource cannot be registered, then ::cudaErrorUnknown is returned. 
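+ *
+ * A minimal usage sketch for a texture (assuming pTexture is an
+ * ::IDirect3DBaseTexture9 and Direct3D 9 interoperability has been set up on
+ * the current device; error checking omitted):
+ * \code
+ * struct cudaGraphicsResource *cudaRes = NULL;
+ * cudaGraphicsD3D9RegisterResource(&cudaRes, pTexture, cudaGraphicsRegisterFlagsNone);
+ * cudaGraphicsMapResources(1, &cudaRes, 0);
+ * cudaArray_t array = NULL;
+ * cudaGraphicsSubResourceGetMappedArray(&array, cudaRes, 0, 0);
+ * // ... read or write the array ...
+ * cudaGraphicsUnmapResources(1, &cudaRes, 0);
+ * \endcode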
+ * + * \param resource - Pointer to returned resource handle + * \param pD3DResource - Direct3D resource to register + * \param flags - Parameters for resource registration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaD3D9SetDirect3DDevice, + * ::cudaGraphicsUnregisterResource, + * ::cudaGraphicsMapResources, + * ::cudaGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsResourceGetMappedPointer, + * ::cuGraphicsD3D9RegisterResource + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsD3D9RegisterResource(struct cudaGraphicsResource **resource, IDirect3DResource9 *pD3DResource, unsigned int flags); + +/** + * \brief Gets the device number for an adapter + * + * Returns in \p *device the CUDA-compatible device corresponding to the + * adapter name \p pszAdapterName obtained from ::EnumDisplayDevices or + * ::IDirect3D9::GetAdapterIdentifier(). If no device on the adapter with name + * \p pszAdapterName is CUDA-compatible then the call will fail. + * + * \param device - Returns the device corresponding to pszAdapterName + * \param pszAdapterName - D3D9 adapter to get device for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaD3D9SetDirect3DDevice, + * ::cudaGraphicsD3D9RegisterResource, + * ::cuD3D9GetDevice + */ +extern __host__ cudaError_t CUDARTAPI cudaD3D9GetDevice(int *device, const char *pszAdapterName); + +/** + * \brief Gets the CUDA devices corresponding to a Direct3D 9 device + * + * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices corresponding + * to the Direct3D 9 device \p pD3D9Device. + * Also returns in \p *pCudaDevices at most \p cudaDeviceCount of the the CUDA-compatible devices + * corresponding to the Direct3D 9 device \p pD3D9Device. + * + * If any of the GPUs being used to render \p pDevice are not CUDA capable then the + * call will return ::cudaErrorNoDevice. + * + * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to \p pD3D9Device + * \param pCudaDevices - Returned CUDA devices corresponding to \p pD3D9Device + * \param cudaDeviceCount - The size of the output device array \p pCudaDevices + * \param pD3D9Device - Direct3D 9 device to query for CUDA devices + * \param deviceList - The set of devices to return. This set may be + * ::cudaD3D9DeviceListAll for all devices, + * ::cudaD3D9DeviceListCurrentFrame for the devices used to + * render the current frame (in SLI), or + * ::cudaD3D9DeviceListNextFrame for the devices used to + * render the next frame (in SLI). + * + * \return + * ::cudaSuccess, + * ::cudaErrorNoDevice, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsUnregisterResource, + * ::cudaGraphicsMapResources, + * ::cudaGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsResourceGetMappedPointer, + * ::cuD3D9GetDevices + */ +extern __host__ cudaError_t CUDARTAPI cudaD3D9GetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, IDirect3DDevice9 *pD3D9Device, enum cudaD3D9DeviceList deviceList); + +/** + * \brief Sets the Direct3D 9 device to use for interoperability with + * a CUDA device + * + * Records \p pD3D9Device as the Direct3D 9 device to use for Direct3D 9 + * interoperability with the CUDA device \p device and sets \p device as + * the current device for the calling host thread. 
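+ *
+ * A minimal sketch of selecting the CUDA device explicitly (assuming a valid
+ * \p pD3D9Device; error checking omitted):
+ * \code
+ * unsigned int deviceCount = 0;
+ * int cudaDevice = -1;
+ * cudaD3D9GetDevices(&deviceCount, &cudaDevice, 1, pD3D9Device, cudaD3D9DeviceListAll);
+ * cudaD3D9SetDirect3DDevice(pD3D9Device, cudaDevice);
+ * \endcode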
+ * + * If \p device has already been initialized then this call will fail with + * the error ::cudaErrorSetOnActiveProcess. In this case it is necessary + * to reset \p device using ::cudaDeviceReset() before Direct3D 9 + * interoperability on \p device may be enabled. + * + * Successfully initializing CUDA interoperability with \p pD3D9Device + * will increase the internal reference count on \p pD3D9Device. This + * reference count will be decremented when \p device is reset using + * ::cudaDeviceReset(). + * + * Note that this function is never required for correct functionality. Use of + * this function will result in accelerated interoperability only when the + * operating system is Windows Vista or Windows 7, and the device \p pD3DDdevice + * is not an IDirect3DDevice9Ex. In all other cirumstances, this function is + * not necessary. + * + * \param pD3D9Device - Direct3D device to use for this thread + * \param device - The CUDA device to use. This device must be among the devices + * returned when querying ::cudaD3D9DeviceListAll from ::cudaD3D9GetDevices, + * may be set to -1 to automatically select an appropriate CUDA device. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidValue, + * ::cudaErrorSetOnActiveProcess + * \notefnerr + * + * \sa + * ::cudaD3D9GetDevice, + * ::cudaGraphicsD3D9RegisterResource, + * ::cudaDeviceReset + */ +extern __host__ cudaError_t CUDARTAPI cudaD3D9SetDirect3DDevice(IDirect3DDevice9 *pD3D9Device, int device __dv(-1)); + +/** @} */ /* END CUDART_D3D9 */ + +/** + * \addtogroup CUDART_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED] + * This section describes deprecated Direct3D 9 interoperability functions. + * + * @{ + */ + +/** + * CUDA D3D9 Register Flags + */ +enum cudaD3D9RegisterFlags +{ + cudaD3D9RegisterFlagsNone = 0, /**< Default; Resource can be accessed througa void* */ + cudaD3D9RegisterFlagsArray = 1 /**< Resource can be accessed through a CUarray* */ +}; + +/** + * CUDA D3D9 Map Flags + */ +enum cudaD3D9MapFlags +{ + cudaD3D9MapFlagsNone = 0, /**< Default; Assume resource can be read/written */ + cudaD3D9MapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */ + cudaD3D9MapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */ +}; + +/** + * \brief Registers a Direct3D resource for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Registers the Direct3D resource \p pResource for access by CUDA. + * + * If this call is successful, then the application will be able to map and + * unmap this resource until it is unregistered through + * ::cudaD3D9UnregisterResource(). Also on success, this call will increase + * the internal reference count on \p pResource. This reference count will be + * decremented when this resource is unregistered through + * ::cudaD3D9UnregisterResource(). + * + * This call potentially has a high-overhead and should not be called every frame + * in interactive applications. + * + * The type of \p pResource must be one of the following. + * + * - ::IDirect3DVertexBuffer9: No notes. + * - ::IDirect3DIndexBuffer9: No notes. + * - ::IDirect3DSurface9: Only stand-alone objects of type ::IDirect3DSurface9 + * may be explicitly shared. In particular, individual mipmap levels and faces + * of cube maps may not be registered directly. To access individual surfaces + * associated with a texture, one must register the base texture object. 
+ * - ::IDirect3DBaseTexture9: When a texture is registered, all surfaces + * associated with all mipmap levels of all faces of the texture will be + * accessible to CUDA. + * + * The \p flags argument specifies the mechanism through which CUDA will + * access the Direct3D resource. The following value is allowed: + * + * - ::cudaD3D9RegisterFlagsNone: Specifies that CUDA will access this + * resource through a \p void*. The pointer, size, and pitch for each + * subresource of this resource may be queried through + * ::cudaD3D9ResourceGetMappedPointer(), ::cudaD3D9ResourceGetMappedSize(), + * and ::cudaD3D9ResourceGetMappedPitch() respectively. This option is valid + * for all resource types. + * + * Not all Direct3D resources of the above types may be used for + * interoperability with CUDA. The following are some limitations: + * + * - The primary rendertarget may not be registered with CUDA. + * - Resources allocated as shared may not be registered with CUDA. + * - Any resources allocated in ::D3DPOOL_SYSTEMMEM or ::D3DPOOL_MANAGED may + * not be registered with CUDA. + * - Textures which are not of a format which is 1, 2, or 4 channels of 8, 16, + * or 32-bit integer or floating-point data cannot be shared. + * - Surfaces of depth or stencil formats cannot be shared. + * + * If Direct3D interoperability is not initialized on this context, then + * ::cudaErrorInvalidDevice is returned. If \p pResource is of incorrect type + * (e.g, is a non-stand-alone ::IDirect3DSurface9) or is already registered, + * then ::cudaErrorInvalidResourceHandle is returned. If \p pResource cannot + * be registered then ::cudaErrorUnknown is returned. + * + * \param pResource - Resource to register + * \param flags - Parameters for resource registration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsD3D9RegisterResource + */ +extern __host__ cudaError_t CUDARTAPI cudaD3D9RegisterResource(IDirect3DResource9 *pResource, unsigned int flags); + +/** + * \brief Unregisters a Direct3D resource for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Unregisters the Direct3D resource \p pResource so it is not accessible by + * CUDA unless registered again. + * + * If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is + * returned. + * + * \param pResource - Resource to unregister + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsUnregisterResource + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnregisterResource(IDirect3DResource9 *pResource); + +/** + * \brief Map Direct3D resources for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Maps the \p count Direct3D resources in \p ppResources for access by CUDA. + * + * The resources in \p ppResources may be accessed in CUDA kernels until they + * are unmapped. Direct3D should not access any resources while they are + * mapped by CUDA. If an application does so, the results are undefined. + * + * This function provides the synchronization guarantee that any Direct3D + * calls issued before ::cudaD3D9MapResources() will complete before any CUDA + * kernels issued after ::cudaD3D9MapResources() begin. 
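+ *
+ * Illustrative only (not part of the original NVIDIA documentation): a
+ * minimal map/unmap sketch around CUDA work, assuming \p pVB is a resource
+ * previously registered with ::cudaD3D9RegisterResource(). New code should
+ * prefer the non-deprecated ::cudaGraphicsMapResources() path.
+ * \code
+ * IDirect3DResource9 *resources[1] = { pVB };
+ * cudaD3D9MapResources(1, resources);   // prior Direct3D work completes first
+ * // ... launch CUDA kernels that access the mapped resource ...
+ * cudaD3D9UnmapResources(1, resources); // CUDA work completes before later Direct3D calls
+ * \endcode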
+ * + * If any of \p ppResources have not been registered for use with CUDA or if + * \p ppResources contains any duplicate entries then + * ::cudaErrorInvalidResourceHandle is returned. If any of \p ppResources are + * presently mapped for access by CUDA then ::cudaErrorUnknown is returned. + * + * \param count - Number of resources to map for CUDA + * \param ppResources - Resources to map for CUDA + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsMapResources + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9MapResources(int count, IDirect3DResource9 **ppResources); + +/** + * \brief Unmap Direct3D resources for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Unmaps the \p count Direct3D resources in \p ppResources. + * + * This function provides the synchronization guarantee that any CUDA kernels + * issued before ::cudaD3D9UnmapResources() will complete before any Direct3D + * calls issued after ::cudaD3D9UnmapResources() begin. + * + * If any of \p ppResources have not been registered for use with CUDA or if + * \p ppResources contains any duplicate entries, then + * ::cudaErrorInvalidResourceHandle is returned. If any of \p ppResources are + * not presently mapped for access by CUDA then ::cudaErrorUnknown is returned. + * + * \param count - Number of resources to unmap for CUDA + * \param ppResources - Resources to unmap for CUDA + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsUnmapResources + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnmapResources(int count, IDirect3DResource9 **ppResources); + +/** + * \brief Set usage flags for mapping a Direct3D resource + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Set flags for mapping the Direct3D resource \p pResource. + * + * Changes to flags will take effect the next time \p pResource is mapped. + * The \p flags argument may be any of the following: + * + * - ::cudaD3D9MapFlagsNone: Specifies no hints about how this resource will + * be used. It is therefore assumed that this resource will be read from and + * written to by CUDA kernels. This is the default value. + * - ::cudaD3D9MapFlagsReadOnly: Specifies that CUDA kernels which access this + * resource will not write to this resource. + * - ::cudaD3D9MapFlagsWriteDiscard: Specifies that CUDA kernels which access + * this resource will not read from this resource and will write over the + * entire contents of the resource, so none of the data previously stored in + * the resource will be preserved. + * + * If \p pResource has not been registered for use with CUDA, then + * ::cudaErrorInvalidResourceHandle is returned. If \p pResource is presently + * mapped for access by CUDA, then ::cudaErrorUnknown is returned. + * + * \param pResource - Registered resource to set flags for + * \param flags - Parameters for resource mapping + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaInteropResourceSetMapFlags + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceSetMapFlags(IDirect3DResource9 *pResource, unsigned int flags); + +/** + * \brief Get the dimensions of a registered Direct3D surface + * + * \deprecated This function is deprecated as of CUDA 3.0. 
+ * + * Returns in \p *pWidth, \p *pHeight, and \p *pDepth the dimensions of the + * subresource of the mapped Direct3D resource \p pResource which corresponds + * to \p face and \p level. + * + * Since anti-aliased surfaces may have multiple samples per pixel, it is + * possible that the dimensions of a resource will be an integer factor larger + * than the dimensions reported by the Direct3D runtime. + * + * The parameters \p pWidth, \p pHeight, and \p pDepth are optional. For 2D + * surfaces, the value returned in \p *pDepth will be 0. + * + * If \p pResource is not of type ::IDirect3DBaseTexture9 or + * ::IDirect3DSurface9 or if \p pResource has not been registered for use with + * CUDA, then ::cudaErrorInvalidResourceHandle is returned. + * + * For usage requirements of \p face and \p level parameters, see + * ::cudaD3D9ResourceGetMappedPointer. + * + * \param pWidth - Returned width of surface + * \param pHeight - Returned height of surface + * \param pDepth - Returned depth of surface + * \param pResource - Registered resource to access + * \param face - Face of resource to access + * \param level - Level of resource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * \notefnerr + * + * \sa + * ::cudaGraphicsSubResourceGetMappedArray + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetSurfaceDimensions(size_t *pWidth, size_t *pHeight, size_t *pDepth, IDirect3DResource9 *pResource, unsigned int face, unsigned int level); + +/** + * \brief Get an array through which to access a subresource of a Direct3D + * resource which has been mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pArray an array through which the subresource of the mapped + * Direct3D resource \p pResource, which corresponds to \p face and \p level + * may be accessed. The value set in \p pArray may change every time that + * \p pResource is mapped. + * + * If \p pResource is not registered then ::cudaErrorInvalidResourceHandle is + * returned. If \p pResource was not registered with usage flags + * ::cudaD3D9RegisterFlagsArray, then ::cudaErrorInvalidResourceHandle is + * returned. If \p pResource is not mapped, then ::cudaErrorUnknown is + * returned. + * + * For usage requirements of \p face and \p level parameters, see + * ::cudaD3D9ResourceGetMappedPointer(). + * + * \param ppArray - Returned array corresponding to subresource + * \param pResource - Mapped resource to access + * \param face - Face of resource to access + * \param level - Level of resource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsSubResourceGetMappedArray + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedArray(cudaArray **ppArray, IDirect3DResource9 *pResource, unsigned int face, unsigned int level); + +/** + * \brief Get a pointer through which to access a subresource of a Direct3D + * resource which has been mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pPointer the base pointer of the subresource of the mapped + * Direct3D resource \p pResource, which corresponds to \p face and \p level. + * The value set in \p pPointer may change every time that \p pResource is + * mapped. + * + * If \p pResource is not registered, then ::cudaErrorInvalidResourceHandle is + * returned. 
If \p pResource was not registered with usage flags + * ::cudaD3D9RegisterFlagsNone, then ::cudaErrorInvalidResourceHandle is + * returned. If \p pResource is not mapped, then ::cudaErrorUnknown is + * returned. + * + * If \p pResource is of type ::IDirect3DCubeTexture9, then \p face must one + * of the values enumerated by type ::D3DCUBEMAP_FACES. For all other types, + * \p face must be 0. If \p face is invalid, then ::cudaErrorInvalidValue is + * returned. + * + * If \p pResource is of type ::IDirect3DBaseTexture9, then \p level must + * correspond to a valid mipmap level. Only mipmap level 0 is supported for + * now. For all other types \p level must be 0. If \p level is invalid, then + * ::cudaErrorInvalidValue is returned. + * + * \param pPointer - Returned pointer corresponding to subresource + * \param pResource - Mapped resource to access + * \param face - Face of resource to access + * \param level - Level of resource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsResourceGetMappedPointer + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedPointer(void **pPointer, IDirect3DResource9 *pResource, unsigned int face, unsigned int level); + +/** + * \brief Get the size of a subresource of a Direct3D resource which has been + * mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pSize the size of the subresource of the mapped Direct3D + * resource \p pResource, which corresponds to \p face and \p level. The value + * set in \p pSize may change every time that \p pResource is mapped. + * + * If \p pResource has not been registered for use with CUDA then + * ::cudaErrorInvalidResourceHandle is returned. If \p pResource was not + * registered with usage flags ::cudaD3D9RegisterFlagsNone, then + * ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped + * for access by CUDA then ::cudaErrorUnknown is returned. + * + * For usage requirements of \p face and \p level parameters, see + * ::cudaD3D9ResourceGetMappedPointer(). + * + * \param pSize - Returned size of subresource + * \param pResource - Mapped resource to access + * \param face - Face of resource to access + * \param level - Level of resource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsResourceGetMappedPointer + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedSize(size_t *pSize, IDirect3DResource9 *pResource, unsigned int face, unsigned int level); + +/** + * \brief Get the pitch of a subresource of a Direct3D resource which has been + * mapped for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Returns in \p *pPitch and \p *pPitchSlice the pitch and Z-slice pitch of + * the subresource of the mapped Direct3D resource \p pResource, which + * corresponds to \p face and \p level. The values set in \p pPitch and + * \p pPitchSlice may change every time that \p pResource is mapped. + * + * The pitch and Z-slice pitch values may be used to compute the location of a + * sample on a surface as follows. 
+ * + * For a 2D surface, the byte offset of the sample at position \b x, \b y from + * the base pointer of the surface is: + * + * \b y * \b pitch + (<b>bytes per pixel</b>) * \b x + * + * For a 3D surface, the byte offset of the sample at position \b x, \b y, + * \b z from the base pointer of the surface is: + * + * \b z* \b slicePitch + \b y * \b pitch + (<b>bytes per pixel</b>) * \b x + * + * Both parameters \p pPitch and \p pPitchSlice are optional and may be set to + * NULL. + * + * If \p pResource is not of type ::IDirect3DBaseTexture9 or one of its + * sub-types or if \p pResource has not been registered for use with CUDA, + * then ::cudaErrorInvalidResourceHandle is returned. If \p pResource was not + * registered with usage flags ::cudaD3D9RegisterFlagsNone, then + * ::cudaErrorInvalidResourceHandle is returned. If \p pResource is not mapped + * for access by CUDA then ::cudaErrorUnknown is returned. + * + * For usage requirements of \p face and \p level parameters, see + * ::cudaD3D9ResourceGetMappedPointer(). + * + * \param pPitch - Returned pitch of subresource + * \param pPitchSlice - Returned Z-slice pitch of subresource + * \param pResource - Mapped resource to access + * \param face - Face of resource to access + * \param level - Level of resource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsResourceGetMappedPointer + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9ResourceGetMappedPitch(size_t *pPitch, size_t *pPitchSlice, IDirect3DResource9 *pResource, unsigned int face, unsigned int level); + +/* D3D9 1.x interop interface */ + +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9Begin(IDirect3DDevice9 *pDevice); +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9End(void); +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9RegisterVertexBuffer(IDirect3DVertexBuffer9 *pVB); +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnregisterVertexBuffer(IDirect3DVertexBuffer9 *pVB); +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9MapVertexBuffer(void **dptr, IDirect3DVertexBuffer9 *pVB); +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaD3D9UnmapVertexBuffer(IDirect3DVertexBuffer9 *pVB); + +/** @} */ /* END CUDART_D3D9_DEPRECATED */ + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#undef __dv +#undef __CUDA_DEPRECATED + +#endif /* __CUDA_D3D9_INTEROP_H__ */ diff --git a/ext/cudart/include/cuda_device_runtime_api.h b/ext/cudart/include/cuda_device_runtime_api.h new file mode 100644 index 0000000000000000000000000000000000000000..e3bc812d777c2d2ba612e6b55f5f101c75fb24e3 --- /dev/null +++ b/ext/cudart/include/cuda_device_runtime_api.h @@ -0,0 +1,268 @@ +/* + * Copyright 1993-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__CUDA_DEVICE_RUNTIME_API_H__) +#define __CUDA_DEVICE_RUNTIME_API_H__ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#if !defined(__CUDACC_RTC__) + +#if !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) && !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) + +#if defined(__cplusplus) +extern "C" { +#endif + +struct cudaFuncAttributes; + + +inline __device__ cudaError_t CUDARTAPI cudaMalloc(void **p, size_t s) +{ + return cudaErrorUnknown; +} + +inline __device__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *p, const void *c) +{ + return cudaErrorUnknown; +} + +inline __device__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) +{ + return cudaErrorUnknown; +} + +inline __device__ cudaError_t CUDARTAPI cudaGetDevice(int *device) +{ + return cudaErrorUnknown; +} + +inline __device__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize) +{ + return cudaErrorUnknown; +} + +inline __device__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags) +{ + return cudaErrorUnknown; +} + + +#if defined(__cplusplus) +} +#endif + +#endif /* !defined(__CUDACC_INTERNAL_NO_STUBS__) && !defined(__CUDACC_RDC__) && !defined(__CUDACC_EWP__) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) && !defined(__CUDADEVRT_INTERNAL__) */ + +#endif /* !defined(__CUDACC_RTC__) */ + +#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +# define __DEPRECATED__(msg) +#elif defined(_WIN32) +# define __DEPRECATED__(msg) __declspec(deprecated(msg)) +#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) +# define __DEPRECATED__(msg) __attribute__((deprecated)) +#else +# define __DEPRECATED__(msg) __attribute__((deprecated(msg))) +#endif + +#if defined(__CUDA_ARCH__) && !defined(__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING) +# define __CDPRT_DEPRECATED(func_name) __DEPRECATED__("Use of "#func_name" from device code is deprecated. Moreover, such use will cause this module to fail to load on sm_90+ devices. If calls to "#func_name" from device code cannot be removed for older devices at this time, you may guard them with __CUDA_ARCH__ macros to remove them only for sm_90+ devices, making sure to generate code for compute_90 for the macros to take effect. Note that this mitigation will no longer work when support for "#func_name" from device code is eventually dropped for all devices. 
Disable this warning with -D__CDPRT_SUPPRESS_SYNC_DEPRECATION_WARNING.") +#else +# define __CDPRT_DEPRECATED(func_name) +#endif + +#if defined(__cplusplus) && defined(__CUDACC__) /* Visible to nvcc front-end only */ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) // Visible to SM>=3.5 and "__host__ __device__" only + +#include "driver_types.h" +#include "crt/host_defines.h" + +extern "C" +{ +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig); +#if (__CUDA_ARCH__ < 900) +// cudaDeviceSynchronize is removed on sm_90+ +extern __device__ __cudart_builtin__ __CDPRT_DEPRECATED(cudaDeviceSynchronize) cudaError_t CUDARTAPI cudaDeviceSynchronize(void); +#endif +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI __cudaDeviceSynchronizeDeprecationAvoidance(void); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void); +extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error); +extern __device__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent_ptsz(cudaStream_t stream, cudaEvent_t event, unsigned int flags); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord_ptsz(cudaEvent_t event, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream, unsigned int flags); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags_ptsz(cudaEvent_t event, cudaStream_t stream, unsigned int flags); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); +extern __device__ 
__cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync_ptsz(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync_ptsz(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync_ptsz(const struct cudaMemcpy3DParms *p, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync_ptsz(void *devPtr, int value, size_t count, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync_ptsz(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync_ptsz(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion); + +/** + * \ingroup CUDART_EXECUTION + * \brief Obtains a parameter buffer + * + * Obtains a parameter buffer which can be filled with parameters for a kernel launch. + * Parameters passed to ::cudaLaunchDevice must be allocated via this function. + * + * This is a low level API and can only be accessed from Parallel Thread Execution (PTX). + * CUDA user code should use <<< >>> to launch kernels. + * + * \param alignment - Specifies alignment requirement of the parameter buffer + * \param size - Specifies size requirement in bytes + * + * \return + * Returns pointer to the allocated parameterBuffer + * \notefnerr + * + * \sa cudaLaunchDevice + */ +extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBuffer(size_t alignment, size_t size); + +/** + * \ingroup CUDART_EXECUTION + * \brief Launches a specified kernel + * + * Launches a specified kernel with the specified parameter buffer. A parameter buffer can be obtained + * by calling ::cudaGetParameterBuffer(). + * + * This is a low level API and can only be accessed from Parallel Thread Execution (PTX). + * CUDA user code should use <<< >>> to launch the kernels. + * + * \param func - Pointer to the kernel to be launched + * \param parameterBuffer - Holds the parameters to the launched kernel. parameterBuffer can be NULL. 
(Optional) + * \param gridDimension - Specifies grid dimensions + * \param blockDimension - Specifies block dimensions + * \param sharedMemSize - Specifies size of shared memory + * \param stream - Specifies the stream to be used + * + * \return + * ::cudaSuccess, ::cudaErrorInvalidDevice, ::cudaErrorLaunchMaxDepthExceeded, ::cudaErrorInvalidConfiguration, + * ::cudaErrorStartupFailure, ::cudaErrorLaunchPendingCountExceeded, ::cudaErrorLaunchOutOfResources + * \notefnerr + * \n Please refer to Execution Configuration and Parameter Buffer Layout from the CUDA Programming + * Guide for the detailed descriptions of launch configuration and parameter layout respectively. + * + * \sa cudaGetParameterBuffer + */ +extern __device__ __cudart_builtin__ void * CUDARTAPI cudaGetParameterBufferV2(void *func, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice_ptsz(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2_ptsz(void *parameterBuffer, cudaStream_t stream); + +#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) && defined(__CUDA_ARCH__) + // When compiling for the device and per thread default stream is enabled, add + // a static inline redirect to the per thread stream entry points. + + static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI + cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream) + { + return cudaLaunchDevice_ptsz(func, parameterBuffer, gridDimension, blockDimension, sharedMemSize, stream); + } + + static __inline__ __device__ __cudart_builtin__ cudaError_t CUDARTAPI + cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream) + { + return cudaLaunchDeviceV2_ptsz(parameterBuffer, stream); + } +#else + extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDevice(void *func, void *parameterBuffer, dim3 gridDimension, dim3 blockDimension, unsigned int sharedMemSize, cudaStream_t stream); + extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaLaunchDeviceV2(void *parameterBuffer, cudaStream_t stream); +#endif + +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSmemSize, unsigned int flags); + +extern __device__ __cudart_builtin__ unsigned long long CUDARTAPI cudaCGGetIntrinsicHandle(enum cudaCGScope scope); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronize(unsigned long long handle, unsigned int flags); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGSynchronizeGrid(unsigned long long handle, unsigned int flags); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetSize(unsigned int *numThreads, unsigned int *numGrids, unsigned long long handle); +extern __device__ __cudart_builtin__ cudaError_t CUDARTAPI cudaCGGetRank(unsigned int *threadRank, unsigned int *gridRank, unsigned long long handle); +} + +template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaMalloc(T **devPtr, size_t size); +template <typename T> static 
__inline__ __device__ __cudart_builtin__ cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, T *entry); +template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize); +template <typename T> static __inline__ __device__ __cudart_builtin__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, T func, int blockSize, size_t dynamicSmemSize, unsigned int flags); + + +#endif // !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350) +#endif /* defined(__cplusplus) && defined(__CUDACC__) */ + +#undef __DEPRECATED__ +#undef __CDPRT_DEPRECATED + +#endif /* !__CUDA_DEVICE_RUNTIME_API_H__ */ diff --git a/ext/cudart/include/cuda_egl_interop.h b/ext/cudart/include/cuda_egl_interop.h new file mode 100644 index 0000000000000000000000000000000000000000..40ab01b33e0e9bec536192676c2a804809276fc4 --- /dev/null +++ b/ext/cudart/include/cuda_egl_interop.h @@ -0,0 +1,642 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. 
Government End + * Users Notice. + */ + +#if !defined(__CUDA_EGL_INTEROP_H__) +#define __CUDA_EGL_INTEROP_H__ + +#include "cuda_runtime_api.h" +#include "cuda_runtime.h" +#include "cudart_platform.h" +#include "EGL/egl.h" +#include "EGL/eglext.h" + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +/** + * \addtogroup CUDART_TYPES + * @{ + */ + + /** + * Maximum number of planes per frame + */ +#define CUDA_EGL_MAX_PLANES 3 + +/** + * CUDA EglFrame type - array or pointer + */ +typedef enum cudaEglFrameType_enum +{ + cudaEglFrameTypeArray = 0, /**< Frame type CUDA array */ + cudaEglFrameTypePitch = 1, /**< Frame type CUDA pointer */ +} cudaEglFrameType; + +/** + * Resource location flags- sysmem or vidmem + * + * For CUDA context on iGPU, since video and system memory are equivalent - + * these flags will not have an effect on the execution. + * + * For CUDA context on dGPU, applications can use the flag ::cudaEglResourceLocationFlags + * to give a hint about the desired location. + * + * ::cudaEglResourceLocationSysmem - the frame data is made resident on the system memory + * to be accessed by CUDA. + * + * ::cudaEglResourceLocationVidmem - the frame data is made resident on the dedicated + * video memory to be accessed by CUDA. + * + * There may be an additional latency due to new allocation and data migration, + * if the frame is produced on a different memory. + */ +typedef enum cudaEglResourceLocationFlags_enum { + cudaEglResourceLocationSysmem = 0x00, /**< Resource location sysmem */ + cudaEglResourceLocationVidmem = 0x01, /**< Resource location vidmem */ +} cudaEglResourceLocationFlags; + +/** + * CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops. + */ +typedef enum cudaEglColorFormat_enum { + cudaEglColorFormatYUV420Planar = 0, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatYUV420SemiPlanar = 1, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */ + cudaEglColorFormatYUV422Planar = 2, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */ + cudaEglColorFormatYUV422SemiPlanar = 3, /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */ + cudaEglColorFormatARGB = 6, /**< R/G/B/A four channels in one surface with BGRA byte ordering. */ + cudaEglColorFormatRGBA = 7, /**< R/G/B/A four channels in one surface with ABGR byte ordering. */ + cudaEglColorFormatL = 8, /**< single luminance channel in one surface. */ + cudaEglColorFormatR = 9, /**< single color channel in one surface. */ + cudaEglColorFormatYUV444Planar = 10, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */ + cudaEglColorFormatYUV444SemiPlanar = 11, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */ + cudaEglColorFormatYUYV422 = 12, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */ + cudaEglColorFormatUYVY422 = 13, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */ + cudaEglColorFormatABGR = 14, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */ + cudaEglColorFormatBGRA = 15, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */ + cudaEglColorFormatA = 16, /**< Alpha color format - one channel in one surface. 
*/ + cudaEglColorFormatRG = 17, /**< R/G color format - two channels in one surface with GR byte ordering */ + cudaEglColorFormatAYUV = 18, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */ + cudaEglColorFormatYVU444SemiPlanar = 19, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */ + cudaEglColorFormatYVU422SemiPlanar = 20, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */ + cudaEglColorFormatYVU420SemiPlanar = 21, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatY10V10U10_444SemiPlanar = 22, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */ + cudaEglColorFormatY10V10U10_420SemiPlanar = 23, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatY12V12U12_444SemiPlanar = 24, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */ + cudaEglColorFormatY12V12U12_420SemiPlanar = 25, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatVYUY_ER = 26, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */ + cudaEglColorFormatUYVY_ER = 27, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */ + cudaEglColorFormatYUYV_ER = 28, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */ + cudaEglColorFormatYVYU_ER = 29, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */ + cudaEglColorFormatYUVA_ER = 31, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */ + cudaEglColorFormatAYUV_ER = 32, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */ + cudaEglColorFormatYUV444Planar_ER = 33, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */ + cudaEglColorFormatYUV422Planar_ER = 34, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */ + cudaEglColorFormatYUV420Planar_ER = 35, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatYUV444SemiPlanar_ER = 36, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */ + cudaEglColorFormatYUV422SemiPlanar_ER = 37, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */ + cudaEglColorFormatYUV420SemiPlanar_ER = 38, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatYVU444Planar_ER = 39, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */ + cudaEglColorFormatYVU422Planar_ER = 40, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */ + cudaEglColorFormatYVU420Planar_ER = 41, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. 
*/ + cudaEglColorFormatYVU444SemiPlanar_ER = 42, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */ + cudaEglColorFormatYVU422SemiPlanar_ER = 43, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */ + cudaEglColorFormatYVU420SemiPlanar_ER = 44, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatBayerRGGB = 45, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */ + cudaEglColorFormatBayerBGGR = 46, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */ + cudaEglColorFormatBayerGRBG = 47, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */ + cudaEglColorFormatBayerGBRG = 48, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */ + cudaEglColorFormatBayer10RGGB = 49, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */ + cudaEglColorFormatBayer10BGGR = 50, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */ + cudaEglColorFormatBayer10GRBG = 51, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */ + cudaEglColorFormatBayer10GBRG = 52, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */ + cudaEglColorFormatBayer12RGGB = 53, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */ + cudaEglColorFormatBayer12BGGR = 54, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */ + cudaEglColorFormatBayer12GRBG = 55, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */ + cudaEglColorFormatBayer12GBRG = 56, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */ + cudaEglColorFormatBayer14RGGB = 57, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */ + cudaEglColorFormatBayer14BGGR = 58, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */ + cudaEglColorFormatBayer14GRBG = 59, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */ + cudaEglColorFormatBayer14GBRG = 60, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */ + cudaEglColorFormatBayer20RGGB = 61, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */ + cudaEglColorFormatBayer20BGGR = 62, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */ + cudaEglColorFormatBayer20GRBG = 63, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. 
*/ + cudaEglColorFormatBayer20GBRG = 64, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */ + cudaEglColorFormatYVU444Planar = 65, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */ + cudaEglColorFormatYVU422Planar = 66, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */ + cudaEglColorFormatYVU420Planar = 67, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatBayerIspRGGB = 68, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */ + cudaEglColorFormatBayerIspBGGR = 69, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */ + cudaEglColorFormatBayerIspGRBG = 70, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */ + cudaEglColorFormatBayerIspGBRG = 71, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */ + cudaEglColorFormatBayerBCCR = 72, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */ + cudaEglColorFormatBayerRCCB = 73, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */ + cudaEglColorFormatBayerCRBC = 74, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */ + cudaEglColorFormatBayerCBRC = 75, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */ + cudaEglColorFormatBayer10CCCC = 76, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */ + cudaEglColorFormatBayer12BCCR = 77, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */ + cudaEglColorFormatBayer12RCCB = 78, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */ + cudaEglColorFormatBayer12CRBC = 79, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */ + cudaEglColorFormatBayer12CBRC = 80, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */ + cudaEglColorFormatBayer12CCCC = 81, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */ + cudaEglColorFormatY = 82, /**< Color format for single Y plane. */ + cudaEglColorFormatYUV420SemiPlanar_2020 = 83, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatYVU420SemiPlanar_2020 = 84, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatYUV420Planar_2020 = 85, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatYVU420Planar_2020 = 86, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. 
*/ + cudaEglColorFormatYUV420SemiPlanar_709 = 87, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatYVU420SemiPlanar_709 = 88, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatYUV420Planar_709 = 89, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatYVU420Planar_709 = 90, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatY10V10U10_420SemiPlanar_709 = 91, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatY10V10U10_420SemiPlanar_2020 = 92, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatY10V10U10_422SemiPlanar_2020 = 93, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */ + cudaEglColorFormatY10V10U10_422SemiPlanar = 94, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */ + cudaEglColorFormatY10V10U10_422SemiPlanar_709 = 95, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */ + cudaEglColorFormatY_ER = 96, /**< Extended Range Color format for single Y plane. */ + cudaEglColorFormatY_709_ER = 97, /**< Extended Range Color format for single Y plane. */ + cudaEglColorFormatY10_ER = 98, /**< Extended Range Color format for single Y10 plane. */ + cudaEglColorFormatY10_709_ER = 99, /**< Extended Range Color format for single Y10 plane. */ + cudaEglColorFormatY12_ER = 100, /**< Extended Range Color format for single Y12 plane. */ + cudaEglColorFormatY12_709_ER = 101, /**< Extended Range Color format for single Y12 plane. */ + cudaEglColorFormatYUVA = 102, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */ + cudaEglColorFormatYVYU = 104, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */ + cudaEglColorFormatVYUY = 105, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */ + cudaEglColorFormatY10V10U10_420SemiPlanar_ER = 106, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatY10V10U10_420SemiPlanar_709_ER = 107, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatY10V10U10_444SemiPlanar_ER = 108, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */ + cudaEglColorFormatY10V10U10_444SemiPlanar_709_ER = 109, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */ + cudaEglColorFormatY12V12U12_420SemiPlanar_ER = 110, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatY12V12U12_420SemiPlanar_709_ER = 111, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */ + cudaEglColorFormatY12V12U12_444SemiPlanar_ER = 112, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. 
*/ + cudaEglColorFormatY12V12U12_444SemiPlanar_709_ER = 113, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */ +} cudaEglColorFormat; + +/** + * CUDA EGL Plane Descriptor - structure defining each plane of a CUDA EGLFrame + */ +typedef struct cudaEglPlaneDesc_st { + unsigned int width; /**< Width of plane */ + unsigned int height; /**< Height of plane */ + unsigned int depth; /**< Depth of plane */ + unsigned int pitch; /**< Pitch of plane */ + unsigned int numChannels; /**< Number of channels for the plane */ + struct cudaChannelFormatDesc channelDesc; /**< Channel Format Descriptor */ + unsigned int reserved[4]; /**< Reserved for future use */ +} cudaEglPlaneDesc; + +/** + * CUDA EGLFrame Descriptor - structure defining one frame of EGL. + * + * Each frame may contain one or more planes depending on whether the surface is Multiplanar or not. + * Each plane of EGLFrame is represented by ::cudaEglPlaneDesc which is defined as: + * \code + * typedef struct cudaEglPlaneDesc_st { + * unsigned int width; + * unsigned int height; + * unsigned int depth; + * unsigned int pitch; + * unsigned int numChannels; + * struct cudaChannelFormatDesc channelDesc; + * unsigned int reserved[4]; + * } cudaEglPlaneDesc; + * \endcode + +*/ +typedef struct cudaEglFrame_st { + union { + cudaArray_t pArray[CUDA_EGL_MAX_PLANES]; /**< Array of CUDA arrays corresponding to each plane*/ + struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES]; /**< Array of Pointers corresponding to each plane*/ + } frame; + cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES]; /**< CUDA EGL Plane Descriptor ::cudaEglPlaneDesc*/ + unsigned int planeCount; /**< Number of planes */ + cudaEglFrameType frameType; /**< Array or Pitch */ + cudaEglColorFormat eglColorFormat; /**< CUDA EGL Color Format*/ +} cudaEglFrame; + +/** + * CUDA EGLSream Connection + */ +typedef struct CUeglStreamConnection_st *cudaEglStreamConnection; + +/** @} */ /* END CUDART_TYPES */ + +/** + * \addtogroup CUDART_EGL EGL Interoperability + * This section describes the EGL interoperability functions of the CUDA + * runtime application programming interface. + * + * @{ + */ + +/** + * \brief Registers an EGL image + * + * Registers the EGLImageKHR specified by \p image for access by + * CUDA. A handle to the registered object is returned as \p pCudaResource. + * Additional Mapping/Unmapping is not required for the registered resource and + * ::cudaGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource. + * + * The application will be responsible for synchronizing access to shared objects. + * The application must ensure that any pending operation which access the objects have completed + * before passing control to CUDA. This may be accomplished by issuing and waiting for + * glFinish command on all GLcontexts (for OpenGL and likewise for other APIs). + * The application will be also responsible for ensuring that any pending operation on the + * registered CUDA resource has completed prior to executing subsequent commands in other APIs + * accesing the same memory objects. + * This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably). + * + * The surface's intended usage is specified using \p flags, as follows: + * + * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this + * resource will be used. It is therefore assumed that this resource will be + * read from and written to by CUDA. This is the default value. 
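The cudaEglPlaneDesc / cudaEglFrame structures above describe how each plane of an EGL frame is laid out. As a rough illustration of how a producer might fill them in, here is a minimal sketch; it assumes a Linux/EGL build against the CUDA runtime, makeArgbEglFrame is a hypothetical helper, and the single-plane pitch-linear ARGB layout is just an example (cudaEglFrameTypePitch and cudaEglColorFormatARGB are declared earlier in this header):

```cuda
// Minimal sketch: wrap a cudaMallocPitch allocation in a single-plane,
// pitch-linear ARGB cudaEglFrame so it could later be handed to
// cudaEGLStreamProducerPresentFrame. Width/height values are illustrative.
#include <cuda_runtime.h>
#include <cuda_egl_interop.h>
#include <string.h>

cudaError_t makeArgbEglFrame(unsigned int width, unsigned int height,
                             cudaEglFrame *outFrame)      // hypothetical helper
{
    void  *devPtr = NULL;
    size_t pitch  = 0;
    cudaError_t err = cudaMallocPitch(&devPtr, &pitch, (size_t)width * 4, height);
    if (err != cudaSuccess) return err;

    memset(outFrame, 0, sizeof(*outFrame));
    outFrame->frameType      = cudaEglFrameTypePitch;     // pitch-linear, not a CUDA array
    outFrame->eglColorFormat = cudaEglColorFormatARGB;    // one interleaved 8:8:8:8 plane
    outFrame->planeCount     = 1;
    outFrame->frame.pPitch[0] = make_cudaPitchedPtr(devPtr, pitch, width, height);

    cudaEglPlaneDesc *plane = &outFrame->planeDesc[0];
    plane->width       = width;
    plane->height      = height;
    plane->depth       = 1;
    plane->pitch       = (unsigned int)pitch;
    plane->numChannels = 4;
    plane->channelDesc = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
    return cudaSuccess;
}
```

A frame built this way can be presented over an EGLStream with the producer functions documented further below.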
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA + * will not write to this resource. + * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that + * CUDA will not read from this resource and will write over the + * entire contents of the resource, so none of the data previously + * stored in the resource will be preserved. + * + * The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer. + * typedef void* EGLImageKHR + * + * \param pCudaResource - Pointer to the returned object handle + * \param image - An EGLImageKHR image which can be used to create target resource. + * \param flags - Map flags + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown + * + * \sa + * ::cudaGraphicsUnregisterResource, + * ::cudaGraphicsResourceGetMappedEglFrame, + * ::cuGraphicsEGLRegisterImage + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsEGLRegisterImage(struct cudaGraphicsResource **pCudaResource, EGLImageKHR image, unsigned int flags); + +/** + * \brief Connect CUDA to EGLStream as a consumer. + * + * Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream. + * + * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one + * API to another. + * + * \param conn - Pointer to the returned connection handle + * \param eglStream - EGLStreamKHR handle + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown + * + * \sa + * ::cudaEGLStreamConsumerDisconnect, + * ::cudaEGLStreamConsumerAcquireFrame, + * ::cudaEGLStreamConsumerReleaseFrame, + * ::cuEGLStreamConsumerConnect + */ +extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnect(cudaEglStreamConnection *conn, EGLStreamKHR eglStream); + +/** + * \brief Connect CUDA to EGLStream as a consumer with given flags. + * + * Connect CUDA as a consumer to EGLStreamKHR specified by \p stream with specified \p flags defined by + * ::cudaEglResourceLocationFlags. + * + * The flags specify whether the consumer wants to access frames from system memory or video memory. + * Default is ::cudaEglResourceLocationVidmem. + * + * \param conn - Pointer to the returned connection handle + * \param eglStream - EGLStreamKHR handle + * \param flags - Flags denote intended location - system or video. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown + * + * \sa + * ::cudaEGLStreamConsumerDisconnect, + * ::cudaEGLStreamConsumerAcquireFrame, + * ::cudaEGLStreamConsumerReleaseFrame, + * ::cuEGLStreamConsumerConnectWithFlags + */ +extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnectWithFlags(cudaEglStreamConnection *conn, EGLStreamKHR eglStream, unsigned int flags); + +/** + * \brief Disconnect CUDA as a consumer to EGLStream . + * + * Disconnect CUDA as a consumer to EGLStreamKHR. + * + * \param conn - Conection to disconnect. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown + * + * \sa + * ::cudaEGLStreamConsumerConnect, + * ::cudaEGLStreamConsumerAcquireFrame, + * ::cudaEGLStreamConsumerReleaseFrame, + * ::cuEGLStreamConsumerDisconnect + */ +extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerDisconnect(cudaEglStreamConnection *conn); + +/** + * \brief Acquire an image frame from the EGLStream with CUDA as a consumer. + * + * Acquire an image frame from EGLStreamKHR. 
+ * ::cudaGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get + * ::cudaEglFrame. + * + * \param conn - Connection on which to acquire + * \param pCudaResource - CUDA resource on which the EGLStream frame will be mapped for use. + * \param pStream - CUDA stream for synchronization and any data migrations + * implied by ::cudaEglResourceLocationFlags. + * \param timeout - Desired timeout in usec. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * ::cudaErrorLaunchTimeout + * + * \sa + * ::cudaEGLStreamConsumerConnect, + * ::cudaEGLStreamConsumerDisconnect, + * ::cudaEGLStreamConsumerReleaseFrame, + * ::cuEGLStreamConsumerAcquireFrame + */ + +extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerAcquireFrame(cudaEglStreamConnection *conn, + cudaGraphicsResource_t *pCudaResource, cudaStream_t *pStream, unsigned int timeout); +/** + * \brief Releases the last frame acquired from the EGLStream. + * + * Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR. + * + * \param conn - Connection on which to release + * \param pCudaResource - CUDA resource whose corresponding frame is to be released + * \param pStream - CUDA stream on which release will be done. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown + * + * \sa + * ::cudaEGLStreamConsumerConnect, + * ::cudaEGLStreamConsumerDisconnect, + * ::cudaEGLStreamConsumerAcquireFrame, + * ::cuEGLStreamConsumerReleaseFrame + */ +extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerReleaseFrame(cudaEglStreamConnection *conn, + cudaGraphicsResource_t pCudaResource, cudaStream_t *pStream); + +/** + * \brief Connect CUDA to EGLStream as a producer. + * + * Connect CUDA as a producer to EGLStreamKHR specified by \p stream. + * + * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one + * API to another. + * + * \param conn - Pointer to the returned connection handle + * \param eglStream - EGLStreamKHR handle + * \param width - width of the image to be submitted to the stream + * \param height - height of the image to be submitted to the stream + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown + * + * \sa + * ::cudaEGLStreamProducerDisconnect, + * ::cudaEGLStreamProducerPresentFrame, + * ::cudaEGLStreamProducerReturnFrame, + * ::cuEGLStreamProducerConnect + */ +extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerConnect(cudaEglStreamConnection *conn, + EGLStreamKHR eglStream, EGLint width, EGLint height); + +/** + * \brief Disconnect CUDA as a producer to EGLStream . + * + * Disconnect CUDA as a producer to EGLStreamKHR. + * + * \param conn - Conection to disconnect. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown + * + * \sa + * ::cudaEGLStreamProducerConnect, + * ::cudaEGLStreamProducerPresentFrame, + * ::cudaEGLStreamProducerReturnFrame, + * ::cuEGLStreamProducerDisconnect + */ +extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerDisconnect(cudaEglStreamConnection *conn); + +/** + * \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer. 
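Taken together, the consumer-side functions above form a connect / acquire / release / disconnect cycle. A minimal sketch, assuming the EGLStreamKHR has already been created by the application's EGL code and that consumeFrames is a hypothetical wrapper driving the loop:

```cuda
// Minimal consumer-side sketch: connect to an EGLStream, acquire frames as
// they arrive, inspect them via cudaGraphicsResourceGetMappedEglFrame, and
// hand them back. Error handling is abbreviated.
#include <cuda_runtime.h>
#include <cuda_egl_interop.h>
#include <stdio.h>

void consumeFrames(EGLStreamKHR eglStream, int numFrames)   // hypothetical wrapper
{
    cudaEglStreamConnection conn;
    if (cudaEGLStreamConsumerConnect(&conn, eglStream) != cudaSuccess) return;

    for (int i = 0; i < numFrames; ++i) {
        cudaGraphicsResource_t resource = NULL;
        cudaStream_t stream = 0;                      // default stream
        // Wait up to ~16 ms for the producer (timeout is in microseconds).
        if (cudaEGLStreamConsumerAcquireFrame(&conn, &resource, &stream, 16000) != cudaSuccess)
            continue;                                 // e.g. cudaErrorLaunchTimeout: retry

        cudaEglFrame frame;
        if (cudaGraphicsResourceGetMappedEglFrame(&frame, resource, 0, 0) == cudaSuccess)
            printf("frame %d: %u plane(s), %ux%u\n", i, frame.planeCount,
                   frame.planeDesc[0].width, frame.planeDesc[0].height);

        // ... launch kernels on `stream` that read the frame here ...

        cudaEGLStreamConsumerReleaseFrame(&conn, resource, &stream);
    }
    cudaEGLStreamConsumerDisconnect(&conn);
}
```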
+ * + * The ::cudaEglFrame is defined as: + * \code + * typedef struct cudaEglFrame_st { + * union { + * cudaArray_t pArray[CUDA_EGL_MAX_PLANES]; + * struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES]; + * } frame; + * cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES]; + * unsigned int planeCount; + * cudaEglFrameType frameType; + * cudaEglColorFormat eglColorFormat; + * } cudaEglFrame; + * \endcode + * + * For ::cudaEglFrame of type ::cudaEglFrameTypePitch, the application may present sub-region of a memory + * allocation. In that case, ::cudaPitchedPtr::ptr will specify the start address of the sub-region in + * the allocation and ::cudaEglPlaneDesc will specify the dimensions of the sub-region. + * + * \param conn - Connection on which to present the CUDA array + * \param eglframe - CUDA Eglstream Proucer Frame handle to be sent to the consumer over EglStream. + * \param pStream - CUDA stream on which to present the frame. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown + * + * \sa + * ::cudaEGLStreamProducerConnect, + * ::cudaEGLStreamProducerDisconnect, + * ::cudaEGLStreamProducerReturnFrame, + * ::cuEGLStreamProducerPresentFrame + */ +extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection *conn, + cudaEglFrame eglframe, cudaStream_t *pStream); + +/** + * \brief Return the CUDA eglFrame to the EGLStream last released by the consumer. + * + * This API can potentially return cudaErrorLaunchTimeout if the consumer has not + * returned a frame to EGL stream. If timeout is returned the application can retry. + * + * \param conn - Connection on which to present the CUDA array + * \param eglframe - CUDA Eglstream Proucer Frame handle returned from the consumer over EglStream. + * \param pStream - CUDA stream on which to return the frame. + * + * \return + * ::cudaSuccess, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown + * + * \sa + * ::cudaEGLStreamProducerConnect, + * ::cudaEGLStreamProducerDisconnect, + * ::cudaEGLStreamProducerPresentFrame, + * ::cuEGLStreamProducerReturnFrame + */ +extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection *conn, + cudaEglFrame *eglframe, cudaStream_t *pStream); + +/** + * \brief Get an eglFrame through which to access a registered EGL graphics resource. + * + * Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource + * \p resource may be accessed. + * This API can only be called for EGL graphics resources. + * + * The ::cudaEglFrame is defined as + * \code + * typedef struct cudaEglFrame_st { + * union { + * cudaArray_t pArray[CUDA_EGL_MAX_PLANES]; + * struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES]; + * } frame; + * cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES]; + * unsigned int planeCount; + * cudaEglFrameType frameType; + * cudaEglColorFormat eglColorFormat; + * } cudaEglFrame; + * \endcode + * + * + * \param eglFrame - Returned eglFrame. + * \param resource - Registered resource to access. + * \param index - Index for cubemap surfaces. + * \param mipLevel - Mipmap level for the subresource to access. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown + * + * \note Note that in case of multiplanar \p *eglFrame, pitch of only first plane (unsigned int cudaEglPlaneDesc::pitch) is to be considered by the application. 
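The producer side mirrors this: frames are handed to the consumer with cudaEGLStreamProducerPresentFrame and recycled with cudaEGLStreamProducerReturnFrame once the consumer releases them. A minimal sketch, assuming the hypothetical produceFrames wrapper and the makeArgbEglFrame helper sketched earlier:

```cuda
// Minimal producer-side sketch: connect as producer, present a frame, then
// take it back for reuse once the consumer releases it.
#include <cuda_runtime.h>
#include <cuda_egl_interop.h>

cudaError_t makeArgbEglFrame(unsigned int w, unsigned int h,
                             cudaEglFrame *f);          // hypothetical helper (sketched above)

void produceFrames(EGLStreamKHR eglStream, unsigned int width, unsigned int height,
                   int numFrames)                       // hypothetical wrapper
{
    cudaEglStreamConnection conn;
    if (cudaEGLStreamProducerConnect(&conn, eglStream, (EGLint)width, (EGLint)height) != cudaSuccess)
        return;

    cudaEglFrame frame;
    if (makeArgbEglFrame(width, height, &frame) != cudaSuccess) {
        cudaEGLStreamProducerDisconnect(&conn);
        return;
    }

    cudaStream_t stream = 0;                            // default stream
    for (int i = 0; i < numFrames; ++i) {
        // ... fill frame.frame.pPitch[0] with new pixel data on `stream` ...
        if (cudaEGLStreamProducerPresentFrame(&conn, frame, &stream) != cudaSuccess)
            break;

        // A timeout means the consumer still holds the frame; simply retry.
        while (cudaEGLStreamProducerReturnFrame(&conn, &frame, &stream) == cudaErrorLaunchTimeout) {
            /* retry */
        }
    }
    cudaEGLStreamProducerDisconnect(&conn);
}
```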
+ * + * \sa + * ::cudaGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsResourceGetMappedPointer, + * ::cuGraphicsResourceGetMappedEglFrame + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame, + cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel); + +/** + * \brief Creates an event from EGLSync object + * + * Creates an event *phEvent from an EGLSyncKHR eglSync with the flages specified + * via \p flags. Valid flags include: + * - ::cudaEventDefault: Default event creation flag. + * - ::cudaEventBlockingSync: Specifies that the created event should use blocking + * synchronization. A CPU thread that uses ::cudaEventSynchronize() to wait on + * an event created with this flag will block until the event has actually + * been completed. + * + * ::cudaEventRecord and TimingData are not supported for events created from EGLSync. + * + * The EGLSyncKHR is an opaque handle to an EGL sync object. + * typedef void* EGLSyncKHR + * + * \param phEvent - Returns newly created event + * \param eglSync - Opaque handle to EGLSync object + * \param flags - Event creation flags + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError, + * ::cudaErrorInvalidValue, + * ::cudaErrorLaunchFailure, + * ::cudaErrorMemoryAllocation + * + * \sa + * ::cudaEventQuery, + * ::cudaEventSynchronize, + * ::cudaEventDestroy + */ +extern __host__ cudaError_t CUDARTAPI cudaEventCreateFromEGLSync(cudaEvent_t *phEvent, EGLSyncKHR eglSync, unsigned int flags); + +/** @} */ /* END CUDART_EGL */ + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#endif /* __CUDA_EGL_INTEROP_H__ */ + diff --git a/ext/cudart/include/cuda_fp16.h b/ext/cudart/include/cuda_fp16.h new file mode 100644 index 0000000000000000000000000000000000000000..8dc1c29c78d23517ab9f9c5ebfb9da3a531b0240 --- /dev/null +++ b/ext/cudart/include/cuda_fp16.h @@ -0,0 +1,3794 @@ +/* +* Copyright 1993-2021 NVIDIA Corporation. All rights reserved. +* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
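For a standalone EGLImageKHR (as opposed to an EGLStream), the EGL-interop functions above reduce to a register-then-query pattern: no map/unmap step is needed. A minimal sketch, assuming the image was created beforehand via eglCreateImageKHR and that dumpEglImageInfo is a hypothetical helper:

```cuda
// Minimal sketch: register an existing EGLImageKHR with CUDA and inspect the
// frame that backs it.
#include <cuda_runtime.h>
#include <cuda_egl_interop.h>
#include <stdio.h>

cudaError_t dumpEglImageInfo(EGLImageKHR image)         // hypothetical helper
{
    struct cudaGraphicsResource *resource = NULL;
    cudaError_t err = cudaGraphicsEGLRegisterImage(&resource, image,
                                                   cudaGraphicsRegisterFlagsReadOnly);
    if (err != cudaSuccess) return err;

    cudaEglFrame frame;
    err = cudaGraphicsResourceGetMappedEglFrame(&frame, resource, 0, 0);
    if (err == cudaSuccess) {
        printf("EGLImage: %u plane(s), %s layout\n", frame.planeCount,
               frame.frameType == cudaEglFrameTypePitch ? "pitch" : "array");
        // ... launch kernels that read frame.frame.pArray[0] / pPitch[0] here ...
    }
    cudaGraphicsUnregisterResource(resource);
    return err;
}
```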
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +/** +* \defgroup CUDA_MATH_INTRINSIC_HALF Half Precision Intrinsics +* This section describes half precision intrinsic functions that are +* only supported in device code. +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_ARITHMETIC Half Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_ARITHMETIC Half2 Arithmetic Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_COMPARISON Half Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_COMPARISON Half2 Comparison Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_MISC Half Precision Conversion and Data Movement +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF_FUNCTIONS Half Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. +*/ + +/** +* \defgroup CUDA_MATH__HALF2_FUNCTIONS Half2 Math Functions +* \ingroup CUDA_MATH_INTRINSIC_HALF +* To use these functions, include the header file \p cuda_fp16.h in your program. 
+*/ + +#ifndef __CUDA_FP16_H__ +#define __CUDA_FP16_H__ + +#define ___CUDA_FP16_STRINGIFY_INNERMOST(x) #x +#define __CUDA_FP16_STRINGIFY(x) ___CUDA_FP16_STRINGIFY_INNERMOST(x) + +#if defined(__cplusplus) +#if defined(__CUDACC__) +#define __CUDA_FP16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__ +#else +#define __CUDA_HOSTDEVICE_FP16_DECL__ static +#endif /* defined(__CUDACC__) */ + +#define __CUDA_FP16_TYPES_EXIST__ + +/* Forward-declaration of structures defined in "cuda_fp16.hpp" */ + +/** + * \brief half datatype + * + * \details This structure implements the datatype for storing + * half-precision floating-point numbers. The structure implements + * assignment operators and type conversions. + * 16 bits are being used in total: 1 sign bit, 5 bits for the exponent, + * and the significand is being stored in 10 bits. + * The total precision is 11 bits. There are 15361 representable + * numbers within the interval [0.0, 1.0], endpoints included. + * On average we have log10(2**11) ~ 3.311 decimal digits. + * + * \internal + * \req IEEE 754-2008 compliant implementation of half-precision + * floating-point numbers. + * \endinternal + */ +struct __half; + +/** + * \brief half2 datatype + * + * \details This structure implements the datatype for storing two + * half-precision floating-point numbers. + * The structure implements assignment operators and type conversions. + * + * \internal + * \req Vectorified version of half. + * \endinternal + */ +struct __half2; + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts double number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts double number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - double. Is only being read. +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-to-nearest-even mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-to-nearest-even mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-towards-zero mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-towards-zero mode. +* \param[in] a - float. Is only being read. +* \returns half +* - \p a converted to half. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-down mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-down mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts float number to half precision in round-up mode +* and returns \p half with converted value. +* +* \details Converts float number \p a to half precision in round-up mode. +* \param[in] a - float. Is only being read. +* +* \returns half +* - \p a converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts \p half number to float. +* +* \details Converts half number \p a to float. +* \param[in] a - float. Is only being read. +* +* \returns float +* - \p a converted to float. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts input to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* +* \details Converts input \p a to half precision in round-to-nearest-even mode and +* populates both halves of \p half2 with converted value. +* \param[in] a - float. Is only being read. +* +* \returns half2 +* - The \p half2 value with both halves equal to the converted half +* precision number. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both input floats to half precision in round-to-nearest-even +* mode and returns \p half2 with converted values. +* +* \details Converts both input floats to half precision in round-to-nearest-even mode +* and combines the results into one \p half2 number. Low 16 bits of the return +* value correspond to the input \p a, high 16 bits correspond to the input \p +* b. +* \param[in] a - float. Is only being read. +* \param[in] b - float. Is only being read. +* +* \returns half2 +* - The \p half2 value with corresponding halves equal to the +* converted input floats. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts low 16 bits of \p half2 to float and returns the result +* +* \details Converts low 16 bits of \p half2 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* - The low 16 bits of \p a converted to float. 
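The _rn/_rz/_rd/_ru suffixes select the rounding mode of the conversion. A minimal host-side sketch (these particular conversions are __host__ __device__, so the file can be compiled with nvcc and run entirely on the CPU; 0.1f is just an example of a float that is not exactly representable in half):

```cuda
// Minimal sketch: convert the same float under each rounding mode and print
// the half value that is actually stored; the results need not be identical.
#include <cuda_fp16.h>
#include <cstdio>

int main()
{
    const float x = 0.1f;
    __half rn = __float2half_rn(x);   // round to nearest even (same as __float2half)
    __half rz = __float2half_rz(x);   // round towards zero
    __half rd = __float2half_rd(x);   // round down (towards -infinity)
    __half ru = __float2half_ru(x);   // round up (towards +infinity)
    printf("rn=%.8f rz=%.8f rd=%.8f ru=%.8f\n",
           __half2float(rn), __half2float(rz), __half2float(rd), __half2float(ru));
    return 0;
}
```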
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts high 16 bits of \p half2 to float and returns the result +* +* \details Converts high 16 bits of \p half2 input \p a to 32-bit floating-point number +* and returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns float +* - The high 16 bits of \p a converted to float. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-towards-zero mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-towards-zero mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-towards-zero mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-towards-zero +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-towards-zero mode. NaN inputs return 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h); + +#if defined(__CUDACC__) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both components of float2 number to half precision in +* round-to-nearest-even mode and returns \p half2 with converted values. +* +* \details Converts both components of float2 to half precision in round-to-nearest +* mode and combines the results into one \p half2 number. Low 16 bits of the +* return value correspond to \p a.x and high 16 bits of the return value +* correspond to \p a.y. +* \param[in] a - float2. Is only being read. +* +* \returns half2 +* - The \p half2 which has corresponding halves equal to the +* converted float2 components. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Converts both halves of \p half2 to float2 and returns the result. +* +* \details Converts both halves of \p half2 input \p a to float2 and returns the +* result. +* \param[in] a - half2. Is only being read. +* +* \returns float2 +* - \p a converted to float2. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed integer in +* round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns int +* - \p h converted to a signed integer. 
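__float22half2_rn and __half22float2 are the vectorized counterparts, packing or unpacking one float2 per call. A minimal device-side sketch with illustrative kernel and array names:

```cuda
// Minimal sketch: pack pairs of floats into __half2 and unpack them again,
// demonstrating the float2 <-> __half2 conversions inside a kernel.
#include <cuda_fp16.h>

__global__ void roundTripHalf2(const float2 *in, float2 *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        __half2 packed = __float22half2_rn(in[i]);  // .x -> low half, .y -> high half
        out[i] = __half22float2(packed);            // back to float2, now at half precision
    }
}
```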
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __half2int_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-to-nearest-even mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-towards-zero mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_rz(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-down mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_rd(const int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed integer to a half in round-up mode. +* +* \details Convert the signed integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __int2half_ru(const int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed short integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. 
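The same rounding-direction suffixes appear on the half-to-integer conversions; note that the _rn/_rd/_ru variants here are device-only, while _rz is also available on the host. A minimal kernel sketch with illustrative names that writes all four roundings of each input:

```cuda
// Minimal sketch: convert each half input to int under all four rounding
// directions and store the results side by side.
#include <cuda_runtime.h>
#include <cuda_fp16.h>

__global__ void halfToIntModes(const __half *in, int4 *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        __half h = in[i];
        out[i] = make_int4(__half2int_rn(h),   // nearest-even
                           __half2int_rz(h),   // towards zero
                           __half2int_rd(h),   // towards -infinity
                           __half2int_ru(h));  // towards +infinity
    }
}
```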
+* +* \returns short int +* - \p h converted to a signed short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half2short_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-towards-zero mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_rz(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-down mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_rd(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed short integer to a half in round-up mode. +* +* \details Convert the signed short integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short2half_ru(const short int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-to-nearest-even mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned integer in round-up mode. 
+* +* \details Convert the half-precision floating-point value \p h to an unsigned integer +* in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned int +* - \p h converted to an unsigned integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-to-nearest-even mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-towards-zero mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-down mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned integer to a half in round-up mode. +* +* \details Convert the unsigned integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-to-nearest-even mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-down mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. 
+*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned short integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned short +* integer in round-up mode. NaN inputs are converted to 0. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - \p h converted to an unsigned short integer. +*/ +__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-towards-zero +* mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-down mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned short integer to a half in round-up mode. +* +* \details Convert the unsigned short integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-to-nearest-even mode. NaN inputs return 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-down mode. 
+* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-down mode. NaN inputs return 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to an unsigned 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to an unsigned 64-bit +* integer in round-up mode. NaN inputs return 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns unsigned long long int +* - \p h converted to an unsigned 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-towards-zero +* mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-down mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert an unsigned 64-bit integer to a half in round-up mode. +* +* \details Convert the unsigned 64-bit integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - unsigned long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-to-nearest-even +* mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-to-nearest-even mode. 
NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-down mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-down mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a half to a signed 64-bit integer in round-up mode. +* +* \details Convert the half-precision floating-point value \p h to a signed 64-bit +* integer in round-up mode. NaN inputs return a long long int with hex value of 0x8000000000000000. +* \param[in] h - half. Is only being read. +* +* \returns long long int +* - \p h converted to a signed 64-bit integer. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-to-nearest-even +* mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point +* value in round-to-nearest-even mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-towards-zero mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point +* value in round-towards-zero mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +*/ +__CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-down mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point +* value in round-down mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* - \p i converted to half. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Convert a signed 64-bit integer to a half in round-up mode. +* +* \details Convert the signed 64-bit integer value \p i to a half-precision floating-point +* value in round-up mode. +* \param[in] i - long long int. Is only being read. +* +* \returns half +* - \p i converted to half. 
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i); + +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Truncate input argument to the integral part. +* +* \details Round \p h to the nearest integer value that does not exceed \p h in +* magnitude. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The truncated integer value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half htrunc(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate ceiling of the input argument. +* +* \details Compute the smallest integer value not less than \p h. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The smallest integer value not less than \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hceil(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details Calculate the largest integer value which is less than or equal to \p h. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The largest integer value which is less than or equal to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hfloor(const __half h); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating-point +* number. +* +* \details Round \p h to the nearest integer value in half-precision floating-point +* format, with halfway cases rounded to the nearest even integer value. +* \param[in] h - half. Is only being read. +* +* \returns half +* - The nearest integer to \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrint(const __half h); + +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Truncate \p half2 vector input argument to the integral part. +* +* \details Round each component of vector \p h to the nearest integer value that does +* not exceed \p h in magnitude. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The truncated \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate \p half2 vector ceiling of the input argument. +* +* \details For each component of vector \p h compute the smallest integer value not less +* than \p h. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of smallest integers not less than \p h. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculate the largest integer less than or equal to \p h. +* +* \details For each component of vector \p h calculate the largest integer value which +* is less than or equal to \p h. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of largest integers which is less than or equal to \p h. 
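htrunc, hceil, hfloor and hrint differ only in rounding direction and are device-only. A minimal kernel sketch with illustrative names:

```cuda
// Minimal sketch: apply the four half-precision rounding functions to each
// input element and store the results in separate output arrays.
#include <cuda_runtime.h>
#include <cuda_fp16.h>

__global__ void roundHalf(const __half *in, __half *truncOut, __half *ceilOut,
                          __half *floorOut, __half *rintOut, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        __half h = in[i];
        truncOut[i] = htrunc(h);  // drop the fractional part
        ceilOut[i]  = hceil(h);   // smallest integer >= h
        floorOut[i] = hfloor(h);  // largest integer <= h
        rintOut[i]  = hrint(h);   // nearest integer, ties to even
    }
}
```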
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Round input to nearest integer value in half-precision floating-point +* number. +* +* \details Round each component of \p half2 vector \p h to the nearest integer value in +* half-precision floating-point format, with halfway cases rounded to the +* nearest even integer value. +* \param[in] h - half2. Is only being read. +* +* \returns half2 +* - The vector of rounded integer values. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns \p half2 with both halves equal to the input value. +* +* \details Returns \p half2 number with both halves equal to the input \p a \p half +* number. +* \param[in] a - half. Is only being read. +* +* \returns half2 +* - The vector which has both its halves equal to the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __half2half2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Swaps both halves of the \p half2 input. +* +* \details Swaps both halves of the \p half2 input and returns a new \p half2 number +* with swapped halves. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - \p a with its halves being swapped. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts low 16 bits from each of the two \p half2 inputs and combines +* into one \p half2 number. +* +* \details Extracts low 16 bits from each of the two \p half2 inputs and combines into +* one \p half2 number. Low 16 bits from input \p a is stored in low 16 bits of +* the return value, low 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The low 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts high 16 bits from each of the two \p half2 inputs and +* combines into one \p half2 number. +* +* \details Extracts high 16 bits from each of the two \p half2 inputs and combines into +* one \p half2 number. High 16 bits from input \p a is stored in low 16 bits of +* the return value, high 16 bits from input \p b is stored in high 16 bits of +* the return value. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The high 16 bits of \p a and of \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns high 16 bits of \p half2 input. +* +* \details Returns high 16 bits of \p half2 input \p a. +* \param[in] a - half2. Is only being read. +* +* \returns half +* - The high 16 bits of the input. 
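+* \par Example
+* Editor's sketch, not part of the upstream header: the helper names are
+* hypothetical; __halves2half2 and __low2half are documented just below, and
+* __hadd requires a device of compute capability 5.3 or higher.
+* \code
+* #include <cuda_fp16.h>
+*
+* // Pack two scalars into a half2 and reduce a half2 back to one scalar.
+* __device__ __half2 pack_pair(__half lo, __half hi)
+* {
+*     return __halves2half2(lo, hi);   // lo -> low 16 bits, hi -> high 16 bits
+* }
+*
+* __device__ __half sum_pair(__half2 p)
+* {
+*     return __hadd(__low2half(p), __high2half(p));
+* }
+* \endcode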
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __high2half(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Returns low 16 bits of \p half2 input. +* +* \details Returns low 16 bits of \p half2 input \p a. +* \param[in] a - half2. Is only being read. +* +* \returns half +* - Returns \p half which contains low 16 bits of the input \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __low2half(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Checks if the input \p half number is infinite. +* +* \details Checks if the input \p half number \p a is infinite. +* \param[in] a - half. Is only being read. +* +* \returns int +* - -1 iff \p a is equal to negative infinity, +* - 1 iff \p a is equal to positive infinity, +* - 0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ int __hisinf(const __half a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Combines two \p half numbers into one \p half2 number. +* +* \details Combines two input \p half number \p a and \p b into one \p half2 number. +* Input \p a is stored in low 16 bits of the return value, input \p b is stored +* in high 16 bits of the return value. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half2 +* - The half2 with one half equal to \p a and the other to \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts low 16 bits from \p half2 input. +* +* \details Extracts low 16 bits from \p half2 input \p a and returns a new \p half2 +* number which has both halves equal to the extracted bits. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The half2 with both halves equal to the low 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Extracts high 16 bits from \p half2 input. +* +* \details Extracts high 16 bits from \p half2 input \p a and returns a new \p half2 +* number which has both halves equal to the extracted bits. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The half2 with both halves equal to the high 16 bits of the input. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a); + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as a signed short integer. +* +* \details Reinterprets the bits in the half-precision floating-point number \p h +* as a signed short integer. +* \param[in] h - half. Is only being read. +* +* \returns short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ short int __half_as_short(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a \p half as an unsigned short integer. 
+* +* \details Reinterprets the bits in the half-precision floating-point \p h +* as an unsigned short number. +* \param[in] h - half. Is only being read. +* +* \returns unsigned short int +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in a signed short integer as a \p half. +* +* \details Reinterprets the bits in the signed short integer \p i as a +* half-precision floating-point number. +* \param[in] i - short int. Is only being read. +* +* \returns half +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __short_as_half(const short int i); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Reinterprets bits in an unsigned short integer as a \p half. +* +* \details Reinterprets the bits in the unsigned short integer \p i as a +* half-precision floating-point number. +* \param[in] i - unsigned short int. Is only being read. +* +* \returns half +* - The reinterpreted value. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmax(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values. +* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs. +* +* \details Calculates \p half2 vector max(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
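+* \par Example
+* Editor's sketch, not part of the upstream header: the helper name is
+* hypothetical and the code relies only on the __hmax/__hmin semantics
+* documented above (a NaN input yields the other operand).
+* \code
+* #include <cuda_fp16.h>
+*
+* // Branch-free clamp of x into [lo, hi]. Because __hmax/__hmin return the
+* // non-NaN operand, a NaN x comes out as lo rather than as NaN.
+* __device__ __half clamp_half(__half x, __half lo, __half hi)
+* {
+*     return __hmin(__hmax(x, lo), hi);
+* }
+* \endcode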
+* +* \returns half2 +* - The result of elementwise maximum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs. +* +* \details Calculates \p half2 vector min(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, the other input is returned. +* - If both inputs are NaNs, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise minimum of vectors \p a and \p b +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b); + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +#if defined(_WIN32) +# define __DEPRECATED__(msg) __declspec(deprecated(msg)) +#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) +# define __DEPRECATED__(msg) __attribute__((deprecated)) +#else +# define __DEPRECATED__(msg) __attribute__((deprecated(msg))) +#endif + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 +#define __WSB_DEPRECATION_MESSAGE(x) __CUDA_FP16_STRINGIFY(x) "() is deprecated in favor of " __CUDA_FP16_STRINGIFY(x) "_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)." + +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half2 __shfl(const __half2 var, const int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down))__half2 __shfl_down(const __half2 var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half2 __shfl_xor(const __half2 var, const int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) __half __shfl(const __half var, const int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) __half __shfl_up(const __half var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) __half __shfl_down(const __half var, const unsigned int delta, const int width = warpSize); +__CUDA_FP16_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) __half __shfl_xor(const __half var, const int delta, const int width = warpSize); +#endif + +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of var held by the thread whose ID is given by delta. +* If width is less than warpSize then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. 
If delta is outside the range [0:width-1], +* the value returned corresponds to the value of var held by the delta modulo width (i.e. +* within the same subsection). width must have a value which is a power of 2; +* results are undefined if width is not a power of 2, or is a number greater than +* warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* +* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. +* The value of var held by the resulting lane ID is returned: in effect, var is shifted up +* the warp by delta threads. If width is less than warpSize then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. The source thread index +* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. +* width must have a value which is a power of 2; results are undefined if width is not a power of 2, +* or is a number greater than warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding delta to the caller's thread ID. +* The value of var held by the resulting thread ID is returned: this has the effect +* of shifting var down the warp by delta threads. If width is less than warpSize then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of width and so the upper delta threads +* will remain unchanged. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
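+* \par Example
+* Editor's sketch, not part of the upstream header: it assumes a full warp of
+* 32 active lanes and a device of compute capability 5.3 or higher for
+* __hadd2; the helper name is hypothetical.
+* \code
+* #include <cuda_fp16.h>
+*
+* // Tree reduction over one warp: after the loop, lane 0 holds the sum of the
+* // low lanes and the sum of the high lanes of all 32 packed inputs.
+* __device__ __half2 warp_sum2(__half2 v)
+* {
+*     for (int offset = 16; offset > 0; offset >>= 1)
+*         v = __hadd2(v, __shfl_down_sync(0xffffffffu, v, offset));
+*     return v;
+* }
+* \endcode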
+* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: +* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each +* group of width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half2. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 4-byte word referenced by var from the source thread ID as half2. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Direct copy from indexed thread. +* +* \details Returns the value of var held by the thread whose ID is given by delta. +* If width is less than warpSize then each subsection of the warp behaves as a separate +* entity with a starting logical thread ID of 0. If delta is outside the range [0:width-1], +* the value returned corresponds to the value of var held by the delta modulo width (i.e. +* within the same subsection). width must have a value which is a power of 2; +* results are undefined if width is not a power of 2, or is a number greater than +* warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with lower ID relative to the caller. +* \details Calculates a source thread ID by subtracting delta from the caller's lane ID. +* The value of var held by the resulting lane ID is returned: in effect, var is shifted up +* the warp by delta threads. If width is less than warpSize then each subsection of the warp +* behaves as a separate entity with a starting logical thread ID of 0. 
The source thread index +* will not wrap around the value of width, so effectively the lower delta threads will be unchanged. +* width must have a value which is a power of 2; results are undefined if width is not a power of 2, +* or is a number greater than warpSize. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread with higher ID relative to the caller. +* +* \details Calculates a source thread ID by adding delta to the caller's thread ID. +* The value of var held by the resulting thread ID is returned: this has the effect +* of shifting var down the warp by delta threads. If width is less than warpSize then +* each subsection of the warp behaves as a separate entity with a starting logical +* thread ID of 0. As for __shfl_up_sync(), the ID number of the source thread +* will not wrap around the value of width and so the upper delta threads +* will remain unchanged. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. +* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width = warpSize); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Exchange a variable between threads within a warp. Copy from a thread based on bitwise XOR of own thread ID. +* +* \details Calculates a source thread ID by performing a bitwise XOR of the caller's thread ID with mask: +* the value of var held by the resulting thread ID is returned. If width is less than warpSize then each +* group of width consecutive threads are able to access elements from earlier groups of threads, +* however if they attempt to access elements from later groups of threads their own value of var +* will be returned. This mode implements a butterfly addressing pattern such as is used in tree +* reduction and broadcast. +* \param[in] mask - unsigned int. Is only being read. +* \param[in] var - half. Is only being read. +* \param[in] delta - int. Is only being read. +* \param[in] width - int. Is only being read. +* +* \returns Returns the 2-byte word referenced by var from the source thread ID as half. +* If the source thread ID is out of range or the source thread has exited, the calling thread's own var is returned. 
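+* \par Example
+* Editor's sketch, not part of the upstream header: it assumes a full warp of
+* 32 active lanes; __hmax is declared earlier in this header (CUDA 11 and
+* later) and the helper name is hypothetical.
+* \code
+* #include <cuda_fp16.h>
+*
+* // Butterfly all-reduce: after the loop every lane of the warp holds the
+* // warp-wide maximum of the 32 input values.
+* __device__ __half warp_max(__half v)
+* {
+*     for (int lane_mask = 16; lane_mask > 0; lane_mask >>= 1)
+*         v = __hmax(v, __shfl_xor_sync(0xffffffffu, v, lane_mask));
+*     return v;
+* }
+* \endcode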
+* \note_ref_guide_warp_shuffle +* \internal +* \exception-guarantee no-throw guarantee +* \behavior not reentrant, not thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var, const int delta, const int width = warpSize); + +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif +#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) */ + +#if defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) ) +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.nc` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cg` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.ca` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cs` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.lu` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `ld.global.cv` load instruction. +* \param[in] ptr - memory location +* \returns The value pointed by `ptr` +*/ +__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. 
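+* \par Example
+* Editor's sketch, not part of the upstream header: the kernel name is
+* hypothetical; __ldg needs compute capability 3.2 or higher and __hmul2 needs
+* 5.3 or higher.
+* \code
+* #include <cuda_fp16.h>
+*
+* // Elementwise scale that hints the memory system: __ldg issues a read-only
+* // (ld.global.nc) load and __stcs a streaming (st.global.cs) store, useful
+* // when the output will not be re-read by this kernel.
+* __global__ void scale2(const __half2 *__restrict__ in, __half2 *out,
+*                        __half2 s, int n2)
+* {
+*     int i = blockIdx.x * blockDim.x + threadIdx.x;
+*     if (i < n2)
+*         __stcs(out + i, __hmul2(__ldg(in + i), s));
+* }
+* \endcode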
+* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wb` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cg` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.cs` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value); +/** +* \ingroup CUDA_MATH__HALF_MISC +* \brief Generates a `st.global.wt` store instruction. +* \param[out] ptr - memory location +* \param[in] value - the value to be stored +*/ +__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value); +#endif /*defined(__cplusplus) && ( !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320) )*/ + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs half2 vector if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison. 
+* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 result of less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The half2 vector result of less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered if-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. 
+* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered not-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison. +* +* Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered less-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 vector result of unordered greater-equal comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The vector result of unordered less-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The corresponding \p half results are set to 1.0 for true, or 0.0 for false. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The \p half2 vector result of unordered greater-than comparison of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Determine whether \p half2 argument is a NaN. +* +* \details Determine whether each half of input \p half2 number \p a is a NaN. +* \param[in] a - half2. Is only being read. 
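+* \par Example
+* Editor's sketch, not part of the upstream header: the helper name is
+* hypothetical, __float2half2_rn is declared elsewhere in this header, and a
+* device of compute capability 5.3 or higher is assumed.
+* \code
+* #include <cuda_fp16.h>
+*
+* // Branch-free packed ReLU: __hgt2 yields 1.0 in lanes where x > 0 and 0.0
+* // elsewhere, so the multiply keeps positive lanes and zeroes the rest.
+* // NaN lanes compare false but still propagate NaN through the multiply.
+* __device__ __half2 relu2(__half2 x)
+* {
+*     return __hmul2(x, __hgt2(x, __float2half2_rn(0.0f)));
+* }
+* \endcode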
+* +* \returns half2 +* - The half2 with the corresponding \p half results set to +* 1.0 for NaN, 0.0 otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-95 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode. Prevents floating-point contractions of mul+sub +* into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-104 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a. 
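+* \par Example
+* Editor's sketch, not part of the upstream header: the helper names are
+* hypothetical and a device of compute capability 5.3 or higher is assumed.
+* \code
+* #include <cuda_fp16.h>
+*
+* // The plain multiply+add may be contracted into a single fma by the
+* // compiler; the _rn variants keep the two separately rounded operations.
+* __device__ __half2 axpy2_fusable(__half2 a, __half2 x, __half2 y)
+* {
+*     return __hadd2(__hmul2(a, x), y);
+* }
+*
+* __device__ __half2 axpy2_unfused(__half2 a, __half2 x, __half2 y)
+* {
+*     return __hadd2_rn(__hmul2_rn(a, x), y);
+* }
+* \endcode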
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode. Prevents floating-point contractions of +* mul+add or sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-102 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplying the vectors \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector division in round-to-nearest-even mode. +* +* \details Divides \p half2 input vector \p a by input vector \p b in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-103 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The elementwise division of \p a with \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* +* \details Calculates the absolute value of both halves of the input \p half2 number and +* returns the result. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - Returns \p a with the absolute value of both halves. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __habs2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector add of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the results to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector subtraction in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Subtracts \p half2 input vector \p b from input vector \p a in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The subtraction of vector \p b from \p a, with respect to saturation. 
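+* \par Example
+* Editor's sketch, not part of the upstream header: the helper name and the
+* error-accumulation use case are hypothetical; compute capability 5.3 or
+* higher is assumed.
+* \code
+* #include <cuda_fp16.h>
+*
+* // Accumulate a per-lane absolute error, clamped to [0.0, 1.0]; NaN results
+* // of the saturating add are flushed to +0.0 as documented above.
+* __device__ __half2 accumulate_error(__half2 acc, __half2 err)
+* {
+*     return __hadd2_sat(acc, __habs2(err));
+* }
+* \endcode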
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector multiplication in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiplication of inputs \p a and \p b, in +* round-to-nearest-even mode, and clamps the results to range [0.0, 1.0]. NaN +* results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise multiplication of vectors \p a and \p b, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-105 +* \endinternal +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode, with saturation to [0.0, 1.0]. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the +* results to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c, +* with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Negates both halves of the input \p half2 number and returns the +* result. +* +* \details Negates both halves of the input \p half2 number \p a and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-101 +* \endinternal +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - Returns \p a with both halves negated. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Calculates the absolute value of input \p half number and returns the result. +* +* \details Calculates the absolute value of input \p half number and returns the result. +* \param[in] a - half. Is only being read. 
+* +* \returns half +* - The absolute value of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __habs(const __half a); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode. +* +* \details Performs \p half addition of inputs \p a and \p b, in round-to-nearest-even +* mode. Prevents floating-point contractions of mul+add into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-94 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest +* mode. Prevents floating-point contractions of mul+sub into fma. +* \internal +* \req DEEPLEARN-SRM_REQ-97 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtracting \p b from \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode. Prevents floating-point contractions of mul+add or sub into fma. 
+* \internal +* \req DEEPLEARN-SRM_REQ-99 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b. +*/ +__CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half division in round-to-nearest-even mode. +* +* \details Divides \p half input \p a by input \p b in round-to-nearest +* mode. +* \internal +* \req DEEPLEARN-SRM_REQ-98 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of dividing \p a by \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half addition in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half add of inputs \p a and \p b, in round-to-nearest-even mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The sum of \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half subtraction in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Subtracts \p half input \p b from input \p a in round-to-nearest +* mode, +* and clamps the result to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of subtraction of \p b from \p a, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half multiplication in round-to-nearest-even mode, with +* saturation to [0.0, 1.0]. +* +* \details Performs \p half multiplication of inputs \p a and \p b, in round-to-nearest +* mode, and clamps the result to range [0.0, 1.0]. NaN results are flushed to +* +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* - The result of multiplying \p a and \p b, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* \internal +* \req DEEPLEARN-SRM_REQ-96 +* \endinternal +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c. 
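+* \par Example
+* Editor's sketch, not part of the upstream header: the helper name is
+* hypothetical and a device of compute capability 5.3 or higher is assumed.
+* \code
+* #include <cuda_fp16.h>
+*
+* // Horner evaluation of c0 + c1*x + c2*x^2 + c3*x^3 with one rounding per
+* // step, using the fused multiply-add documented above.
+* __device__ __half poly3(__half x, __half c0, __half c1, __half c2, __half c3)
+* {
+*     __half r = __hfma(c3, x, c2);
+*     r = __hfma(r, x, c1);
+*     return __hfma(r, x, c0);
+* }
+* \endcode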
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode, +* with saturation to [0.0, 1.0]. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode, and clamps the result +* to range [0.0, 1.0]. NaN results are flushed to +0.0. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c, with respect to saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Negates input \p half number and returns the result. +* +* \details Negates input \p half number and returns the result. +* \internal +* \req DEEPLEARN-SRM_REQ-100 +* \endinternal +* \param[in] a - half. Is only being read. +* +* \returns half +* - minus a +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hneg(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector if-equal comparison and returns boolean true +* iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of if-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector not-equal comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of not-equal comparison +* of vectors \p a and \p b are true, +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-equal comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. 
+* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of less-equal comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-equal comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of greater-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector less-than comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of less-than comparison +* of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector greater-than comparison and returns boolean +* true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate false results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of greater-than +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered if-equal comparison and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector if-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half if-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. 
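+*
+* Illustrative sketch, not part of the original header (function name is an
+* assumption): the ordered and unordered forms differ only in how NaN lanes
+* are treated.
+* \code
+* // hypothetical helper comparing both lanes of two packed values
+* __device__ void compare_both_lanes(const __half2 a, const __half2 b, bool result[2])
+* {
+*     result[0] = __hbeq2(a, b);   // ordered: any NaN lane makes this false
+*     result[1] = __hbequ2(a, b);  // unordered: a NaN lane counts as equal
+* }
+* \endcode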
+* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered if-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered not-equal comparison and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector not-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half not-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered not-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-equal comparison and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered less-equal +* comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-equal comparison and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-equal comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-equal comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered +* greater-equal comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered less-than comparison and returns +* boolean true iff both \p half results are true, boolean false otherwise. +* +* \details Performs \p half2 vector less-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half less-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. 
+* +* \returns bool +* - true if both \p half results of unordered less-than comparison of +* vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Performs \p half2 vector unordered greater-than comparison and +* returns boolean true iff both \p half results are true, boolean false +* otherwise. +* +* \details Performs \p half2 vector greater-than comparison of inputs \p a and \p b. +* The bool result is set to true only if both \p half greater-than comparisons +* evaluate to true, or false otherwise. +* NaN inputs generate true results. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns bool +* - true if both \p half results of unordered +* greater-than comparison of vectors \p a and \p b are true; +* - false otherwise. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of if-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of not-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of less-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of greater-equal comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half less-than comparison. 
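+*
+* Illustrative sketch, not part of the original header (function name is an
+* assumption): ordered comparisons return false when either operand is NaN,
+* so a NaN input falls through this clamp unchanged.
+* \code
+* // hypothetical clamp helper built on __hlt / __hgt
+* __device__ __half clamp_fp16(const __half v, const __half lo, const __half hi)
+* {
+*     if (__hlt(v, lo)) { return lo; }
+*     if (__hgt(v, hi)) { return hi; }
+*     return v;   // also reached when v is NaN
+* }
+* \endcode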
+* +* \details Performs \p half less-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of less-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate false results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of greater-than comparison of \p a and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered if-equal comparison. +* +* \details Performs \p half if-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered if-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered not-equal comparison. +* +* \details Performs \p half not-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered not-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-equal comparison. +* +* \details Performs \p half less-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-equal comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-equal comparison. +* +* \details Performs \p half greater-equal comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-equal comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered less-than comparison. +* +* \details Performs \p half less-than comparison of inputs \p a and \p b. 
+* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered less-than comparison of \p a and +* \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Performs \p half unordered greater-than comparison. +* +* \details Performs \p half greater-than comparison of inputs \p a and \p b. +* NaN inputs generate true results. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns bool +* - The boolean result of unordered greater-than comparison of \p a +* and \p b. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Determine whether \p half argument is a NaN. +* +* \details Determine whether \p half value \p a is a NaN. +* \param[in] a - half. Is only being read. +* +* \returns bool +* - true iff argument is NaN. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ bool __hisnan(const __half a); +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half maximum of two input values, NaNs pass through. +* +* \details Calculates \p half max(\p a, \p b) +* defined as (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_COMPARISON +* \brief Calculates \p half minimum of two input values, NaNs pass through. +* +* \details Calculates \p half min(\p a, \p b) +* defined as (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* +* \returns half +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b); +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Performs \p half fused multiply-add in round-to-nearest-even mode with relu saturation. +* +* \details Performs \p half multiply on inputs \p a and \p b, +* then performs a \p half add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half. Is only being read. +* \param[in] b - half. Is only being read. +* \param[in] c - half. Is only being read. +* +* \returns half +* - The result of fused multiply-add operation on \p +* a, \p b, and \p c with relu saturation. 
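+*
+* Illustrative sketch, not part of the original header (kernel and parameter
+* names are assumptions): a fused multiply-add followed by ReLU in a single
+* intrinsic, available under the same compute-capability 8.0 guard as this
+* declaration.
+* \code
+* // hypothetical example kernel: y[i] = max(w[i] * x[i] + b[i], 0)
+* __global__ void dense_relu_fp16(const __half* w, const __half* x,
+*                                 const __half* b, __half* y, const int n)
+* {
+*     const int i = blockIdx.x * blockDim.x + threadIdx.x;
+*     if (i < n) {
+*         y[i] = __hfma_relu(w[i], x[i], b[i]);
+*     }
+* }
+* \endcode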
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector maximum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector max(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a > \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise maximum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_COMPARISON +* \brief Calculates \p half2 vector minimum of two inputs, NaNs pass through. +* +* \details Calculates \p half2 vector min(\p a, \p b). +* Elementwise \p half operation is defined as +* (\p a < \p b) ? \p a : \p b. +* - If either of inputs is NaN, then canonical NaN is returned. +* - If values of both inputs are 0.0, then +0.0 > -0.0 +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise minimum of vectors \p a and \p b, with NaNs pass through +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b); +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs \p half2 vector fused multiply-add in round-to-nearest-even +* mode with relu saturation. +* +* \details Performs \p half2 vector multiply on inputs \p a and \p b, +* then performs a \p half2 vector add of the result with \p c, +* rounding the result once in round-to-nearest-even mode. +* Then negative result is clamped to 0. +* NaN result is converted to canonical NaN. +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. +* +* \returns half2 +* - The result of elementwise fused multiply-add operation on vectors \p a, \p b, and \p c with relu saturation. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c); +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) */ +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Performs fast complex multiply-accumulate +* +* \details Interprets vector \p half2 input pairs \p a, \p b, and \p c as +* complex numbers in \p half precision and performs +* complex multiply-accumulate operation: a*b + c +* \param[in] a - half2. Is only being read. +* \param[in] b - half2. Is only being read. +* \param[in] c - half2. Is only being read. 
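+*
+* Illustrative sketch, not part of the original header (function name is an
+* assumption): each __half2 carries one complex value, real part in the low
+* half and imaginary part in the high half.
+* \code
+* // hypothetical helper: convert float complex operands and accumulate a*b + c
+* __device__ __half2 complex_mac_fp16(const float2 a, const float2 b, const float2 c)
+* {
+*     const __half2 ah = __floats2half2_rn(a.x, a.y);   // (re, im)
+*     const __half2 bh = __floats2half2_rn(b.x, b.y);
+*     const __half2 ch = __floats2half2_rn(c.x, c.y);
+*     return __hcmadd(ah, bh, ch);
+* }
+* \endcode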
+* +* \returns half2 +* - The result of complex multiply-accumulate operation on complex numbers \p a, \p b, and \p c +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half square root in round-to-nearest-even mode. +* +* \details Calculates \p half square root of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal square root in round-to-nearest-even +* mode. +* +* \details Calculates \p half reciprocal square root of input \p a in round-to-nearest +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The reciprocal square root of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrsqrt(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half reciprocal of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The reciprocal of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hrcp(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half natural logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The natural logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half binary logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The binary logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal logarithm in round-to-nearest-even mode. +* +* \details Calculates \p half decimal logarithm of input \p a in round-to-nearest-even +* mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The decimal logarithm of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hlog10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half natural exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half natural exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The natural exponential function on \p a. 
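+*
+* Illustrative sketch, not part of the original header (function name is an
+* assumption): the scalar math functions compose in the usual way, for
+* example a logistic sigmoid built from hexp and hrcp.
+* \code
+* // hypothetical helper: 1 / (1 + exp(-x)), each step rounded to half precision
+* __device__ __half sigmoid_fp16(const __half x)
+* {
+*     const __half one = __float2half(1.0f);
+*     return hrcp(__hadd(one, hexp(__hneg(x))));
+* }
+* \endcode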
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half binary exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half binary exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The binary exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp2(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half decimal exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half decimal exponential function of input \p a in +* round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The decimal exponential function on \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hexp10(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half cosine in round-to-nearest-even mode. +* +* \details Calculates \p half cosine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The cosine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hcos(const __half a); +/** +* \ingroup CUDA_MATH__HALF_FUNCTIONS +* \brief Calculates \p half sine in round-to-nearest-even mode. +* +* \details Calculates \p half sine of input \p a in round-to-nearest-even mode. +* \param[in] a - half. Is only being read. +* +* \returns half +* - The sine of \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half hsin(const __half a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector square root in round-to-nearest-even mode. +* +* \details Calculates \p half2 square root of input vector \p a in round-to-nearest +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal square root in round-to-nearest +* mode. +* +* \details Calculates \p half2 reciprocal square root of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise reciprocal square root on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector reciprocal in round-to-nearest-even mode. +* +* \details Calculates \p half2 reciprocal of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise reciprocal on vector \p a. 
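+*
+* Illustrative sketch, not part of the original header (kernel and parameter
+* names are assumptions): the h2* functions operate lane-wise, so each call
+* processes two independent scalars.
+* \code
+* // hypothetical example kernel over n2 packed pairs
+* __global__ void reciprocal_fp16x2(const __half2* in, __half2* out, const int n2)
+* {
+*     const int i = blockIdx.x * blockDim.x + threadIdx.x;
+*     if (i < n2) {
+*         // out[i].x = 1 / in[i].x and out[i].y = 1 / in[i].y
+*         out[i] = h2rcp(in[i]);
+*     }
+* }
+* \endcode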
+* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector natural logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 natural logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise natural logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 binary logarithm of input vector \p a in round-to-nearest +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise binary logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal logarithm in round-to-nearest-even +* mode. +* +* \details Calculates \p half2 decimal logarithm of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise decimal logarithm on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector exponential function in round-to-nearest +* mode. +* +* \details Calculates \p half2 exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector binary exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 binary exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise binary exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector decimal exponential function in +* round-to-nearest-even mode. +* +* \details Calculates \p half2 decimal exponential function of input vector \p a in +* round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise decimal exponential function on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector cosine in round-to-nearest-even mode. 
+* +* \details Calculates \p half2 cosine of input vector \p a in round-to-nearest-even +* mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise cosine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a); +/** +* \ingroup CUDA_MATH__HALF2_FUNCTIONS +* \brief Calculates \p half2 vector sine in round-to-nearest-even mode. +* +* \details Calculates \p half2 sine of input vector \p a in round-to-nearest-even mode. +* \param[in] a - half2. Is only being read. +* +* \returns half2 +* - The elementwise sine on vector \p a. +* \internal +* \exception-guarantee no-throw guarantee +* \behavior reentrant, thread safe +* \endinternal +*/ +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a); + +#endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)*/ + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600) + +/** +* \ingroup CUDA_MATH__HALF2_ARITHMETIC +* \brief Vector add \p val to the value stored at \p address in global or shared memory, and writes this +* value back to \p address. The atomicity of the add operation is guaranteed separately for each of the +* two __half elements; the entire __half2 is not guaranteed to be atomic as a single 32-bit access. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is only supported by devices of compute capability 6.x and higher. +* +* \param[in] address - half2*. An address in global or shared memory. +* \param[in] val - half2. The value to be added. +* +* \returns half2 +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val); + +#endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)*/ + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) + +/** +* \ingroup CUDA_MATH__HALF_ARITHMETIC +* \brief Adds \p val to the value stored at \p address in global or shared memory, and writes this value +* back to \p address. This operation is performed in one atomic operation. +* +* \details The location of \p address must be in global or shared memory. This operation has undefined +* behavior otherwise. This operation is only supported by devices of compute capability 7.x and higher. +* +* \param[in] address - half*. An address in global or shared memory. +* \param[in] val - half. The value to be added. +* +* \returns half +* - The old value read from \p address. +* +* \note_ref_guide_atomic +*/ +__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val); + +#endif /*if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700)*/ + +#endif /* defined(__CUDACC__) */ + +#undef __CUDA_FP16_DECL__ +#undef __CUDA_HOSTDEVICE_FP16_DECL__ + +#endif /* defined(__cplusplus) */ + +/* Note the .hpp file is included even for host-side compilation, to capture the "half" & "half2" definitions */ +#include "cuda_fp16.hpp" +#undef ___CUDA_FP16_STRINGIFY_INNERMOST +#undef __CUDA_FP16_STRINGIFY + +#endif /* end of include guard: __CUDA_FP16_H__ */ diff --git a/ext/cudart/include/cuda_fp16.hpp b/ext/cudart/include/cuda_fp16.hpp new file mode 100644 index 0000000000000000000000000000000000000000..e15f46b35a90fc7940d152e02bc2cec0fe87f857 --- /dev/null +++ b/ext/cudart/include/cuda_fp16.hpp @@ -0,0 +1,2614 @@ +/* +* Copyright 1993-2021 NVIDIA Corporation. All rights reserved. 
+* +* NOTICE TO LICENSEE: +* +* This source code and/or documentation ("Licensed Deliverables") are +* subject to NVIDIA intellectual property rights under U.S. and +* international Copyright laws. +* +* These Licensed Deliverables contained herein is PROPRIETARY and +* CONFIDENTIAL to NVIDIA and is being provided under the terms and +* conditions of a form of NVIDIA software license agreement by and +* between NVIDIA and Licensee ("License Agreement") or electronically +* accepted by Licensee. Notwithstanding any terms or conditions to +* the contrary in the License Agreement, reproduction or disclosure +* of the Licensed Deliverables to any third party without the express +* written consent of NVIDIA is prohibited. +* +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE +* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS +* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. +* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED +* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, +* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. +* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE +* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY +* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY +* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS +* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE +* OF THESE LICENSED DELIVERABLES. +* +* U.S. Government End Users. These Licensed Deliverables are a +* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT +* 1995), consisting of "commercial computer software" and "commercial +* computer software documentation" as such terms are used in 48 +* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government +* only as a commercial end item. Consistent with 48 C.F.R.12.212 and +* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all +* U.S. Government End Users acquire the Licensed Deliverables with +* only those rights set forth herein. +* +* Any use of the Licensed Deliverables in individual and commercial +* software must include, in the user documentation and internal +* comments to the code, the above Disclaimer and U.S. Government End +* Users Notice. +*/ + +#if !defined(__CUDA_FP16_HPP__) +#define __CUDA_FP16_HPP__ + +#if !defined(__CUDA_FP16_H__) +#error "Do not include this file directly. Instead, include cuda_fp16.h." +#endif + +#if !defined(_MSC_VER) && __cplusplus >= 201103L +# define __CPP_VERSION_AT_LEAST_11_FP16 +#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L +# define __CPP_VERSION_AT_LEAST_11_FP16 +#endif + +/* C++11 header for std::move. + * In RTC mode, std::move is provided implicitly; don't include the header + */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) && !defined(__CUDACC_RTC__) +#include <utility> +#endif /* __cplusplus >= 201103L && !defined(__CUDACC_RTC__) */ + +/* C++ header for std::memcpy (used for type punning in host-side implementations). + * When compiling as a CUDA source file memcpy is provided implicitly. + * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__). 
+ */ +#if defined(__cplusplus) && !defined(__CUDACC__) +#include <cstring> +#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ + + +/* Set up function decorations */ +#if defined(__CUDACC__) +#define __CUDA_FP16_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __host__ __device__ __inline__ +#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ +#define __CUDA_HOSTDEVICE__ __host__ __device__ +#else /* !defined(__CUDACC__) */ +#if defined(__GNUC__) +#define __CUDA_HOSTDEVICE_FP16_DECL__ static __attribute__ ((unused)) +#else +#define __CUDA_HOSTDEVICE_FP16_DECL__ static +#endif /* defined(__GNUC__) */ +#define __CUDA_HOSTDEVICE__ +#endif /* defined(__CUDACC_) */ + +/* Set up structure-alignment attribute */ +#if defined(__CUDACC__) +#define __CUDA_ALIGN__(align) __align__(align) +#else +/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */ +#if __cplusplus >= 201103L +#define __CUDA_ALIGN__(n) alignas(n) /* C++11 kindly gives us a keyword for this */ +#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP16)*/ +#if defined(__GNUC__) +#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +#define __CUDA_ALIGN__(n) __declspec(align(n)) +#else +#define __CUDA_ALIGN__(n) +#endif /* defined(__GNUC__) */ +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ +#endif /* defined(__CUDACC__) */ + +/* Macros to allow half & half2 to be used by inline assembly */ +#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var))) +#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var))) +#define __HALF2_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var))) +#define __HALF2_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var))) + +/* Macros for half & half2 binary arithmetic */ +#define __BINARY_OP_HALF_MACRO(name) /* do */ {\ + __half val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b))); \ + return val; \ +} /* while(0) */ +#define __BINARY_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +#define __TERNARY_OP_HALF_MACRO(name) /* do */ {\ + __half val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16 %0,%1,%2,%3;\n}" \ + :"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)),"h"(__HALF_TO_CUS(b)),"h"(__HALF_TO_CUS(c))); \ + return val; \ +} /* while(0) */ +#define __TERNARY_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{" __CUDA_FP16_STRINGIFY(name) ".f16x2 %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b)),"r"(__HALF2_TO_CUI(c))); \ + return val; \ +} /* while(0) */ + +/** +* Types which allow static initialization of "half" and "half2" until +* these become an actual builtin. Note this initialization is as a +* bitfield representation of "half", and not a conversion from short->half. +* Such a representation will be deprecated in a future version of CUDA. 
+* (Note these are visible to non-nvcc compilers, including C-only compilation) +*/ +typedef struct __CUDA_ALIGN__(2) { + unsigned short x; +} __half_raw; + +typedef struct __CUDA_ALIGN__(4) { + unsigned short x; + unsigned short y; +} __half2_raw; + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/* Hide GCC member initialization list warnings because of host/device in-function init requirement */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#pragma GCC diagnostic ignored "-Weffc++" +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +/* class' : multiple assignment operators specified + The class has multiple assignment operators of a single type. This warning is informational */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( push ) +#pragma warning( disable:4522 ) +#endif /* defined(__GNUC__) */ + +struct __CUDA_ALIGN__(2) __half { +protected: + unsigned short __x; + +public: +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + __half() = default; +#else + __CUDA_HOSTDEVICE__ __half() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + + /* Convert to/from __half_raw */ + __CUDA_HOSTDEVICE__ __half(const __half_raw &hr) : __x(hr.x) { } + __CUDA_HOSTDEVICE__ __half &operator=(const __half_raw &hr) { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ volatile __half &operator=(const __half_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ volatile __half &operator=(const volatile __half_raw &hr) volatile { __x = hr.x; return *this; } + __CUDA_HOSTDEVICE__ operator __half_raw() const { __half_raw ret; ret.x = __x; return ret; } + __CUDA_HOSTDEVICE__ operator __half_raw() const volatile { __half_raw ret; ret.x = __x; return ret; } + +#if !defined(__CUDA_NO_HALF_CONVERSIONS__) + + /* Construct from float/double */ + __CUDA_HOSTDEVICE__ __half(const float f) { __x = __float2half(f).__x; } + __CUDA_HOSTDEVICE__ __half(const double f) { __x = __double2half(f).__x; } + + __CUDA_HOSTDEVICE__ operator float() const { return __half2float(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const float f) { __x = __float2half(f).__x; return *this; } + + /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */ + __CUDA_HOSTDEVICE__ __half &operator=(const double f) { __x = __double2half(f).__x; return *this; } + +/* Member functions only available to nvcc compilation so far */ +#if defined(__CUDACC__) + /* Allow automatic construction from types supported natively in hardware */ + /* Note we do avoid constructor init-list because of special host/device compilation rules */ + __CUDA_HOSTDEVICE__ __half(const short val) { __x = __short2half_rn(val).__x; } + __CUDA_HOSTDEVICE__ __half(const unsigned short val) { __x = __ushort2half_rn(val).__x; } + __CUDA_HOSTDEVICE__ __half(const int val) { __x = __int2half_rn(val).__x; } + __CUDA_HOSTDEVICE__ __half(const unsigned int val) { __x = __uint2half_rn(val).__x; } + __CUDA_HOSTDEVICE__ __half(const long long val) { __x = __ll2half_rn(val).__x; } + __CUDA_HOSTDEVICE__ __half(const unsigned long long val) { __x = __ull2half_rn(val).__x; } + + /* Allow automatic casts to supported builtin types, matching all that are permitted with float */ + __CUDA_HOSTDEVICE__ operator short() const { return __half2short_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const short val) { __x = 
__short2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned short() const { return __half2ushort_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned short val) { __x = __ushort2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator int() const { return __half2int_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const int val) { __x = __int2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned int() const { return __half2uint_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned int val) { __x = __uint2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator long long() const { return __half2ll_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const long long val) { __x = __ll2half_rn(val).__x; return *this; } + + __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __half2ull_rz(*this); } + __CUDA_HOSTDEVICE__ __half &operator=(const unsigned long long val) { __x = __ull2half_rn(val).__x; return *this; } + + /* Boolean conversion - note both 0 and -0 must return false */ + __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFFU) != 0U; } +#endif /* defined(__CUDACC__) */ +#endif /* !defined(__CUDA_NO_HALF_CONVERSIONS__) */ +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +/* Arithmetic FP16 operations only supported on arch >= 5.3 */ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) +#if !defined(__CUDA_NO_HALF_OPERATORS__) +/* Some basic arithmetic operations expected of a builtin */ +__device__ __forceinline__ __half operator+(const __half &lh, const __half &rh) { return __hadd(lh, rh); } +__device__ __forceinline__ __half operator-(const __half &lh, const __half &rh) { return __hsub(lh, rh); } +__device__ __forceinline__ __half operator*(const __half &lh, const __half &rh) { return __hmul(lh, rh); } +__device__ __forceinline__ __half operator/(const __half &lh, const __half &rh) { return __hdiv(lh, rh); } + +__device__ __forceinline__ __half &operator+=(__half &lh, const __half &rh) { lh = __hadd(lh, rh); return lh; } +__device__ __forceinline__ __half &operator-=(__half &lh, const __half &rh) { lh = __hsub(lh, rh); return lh; } +__device__ __forceinline__ __half &operator*=(__half &lh, const __half &rh) { lh = __hmul(lh, rh); return lh; } +__device__ __forceinline__ __half &operator/=(__half &lh, const __half &rh) { lh = __hdiv(lh, rh); return lh; } + +/* Note for increment and decrement we use the raw value 0x3C00U equating to half(1.0F), to avoid the extra conversion */ +__device__ __forceinline__ __half &operator++(__half &h) { __half_raw one; one.x = 0x3C00U; h += one; return h; } +__device__ __forceinline__ __half &operator--(__half &h) { __half_raw one; one.x = 0x3C00U; h -= one; return h; } +__device__ __forceinline__ __half operator++(__half &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast<void>(ignored); + + const __half ret = h; + __half_raw one; + one.x = 0x3C00U; + h += one; + return ret; +} +__device__ __forceinline__ __half operator--(__half &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. 
+ static_cast<void>(ignored); + + const __half ret = h; + __half_raw one; + one.x = 0x3C00U; + h -= one; + return ret; +} + +/* Unary plus and inverse operators */ +__device__ __forceinline__ __half operator+(const __half &h) { return h; } +__device__ __forceinline__ __half operator-(const __half &h) { return __hneg(h); } + +/* Some basic comparison operations to make it look like a builtin */ +__device__ __forceinline__ bool operator==(const __half &lh, const __half &rh) { return __heq(lh, rh); } +__device__ __forceinline__ bool operator!=(const __half &lh, const __half &rh) { return __hneu(lh, rh); } +__device__ __forceinline__ bool operator> (const __half &lh, const __half &rh) { return __hgt(lh, rh); } +__device__ __forceinline__ bool operator< (const __half &lh, const __half &rh) { return __hlt(lh, rh); } +__device__ __forceinline__ bool operator>=(const __half &lh, const __half &rh) { return __hge(lh, rh); } +__device__ __forceinline__ bool operator<=(const __half &lh, const __half &rh) { return __hle(lh, rh); } +#endif /* !defined(__CUDA_NO_HALF_OPERATORS__) */ +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */ +#endif /* defined(__CUDACC__) */ + +/* __half2 is visible to non-nvcc host compilers */ +struct __CUDA_ALIGN__(4) __half2 { + __half x; + __half y; + + // All construct/copy/assign/move +public: +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) + __half2() = default; + __CUDA_HOSTDEVICE__ __half2(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); } + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &&src) { __HALF2_TO_UI(*this) = std::move(__HALF2_TO_CUI(src)); return *this; } +#else + __CUDA_HOSTDEVICE__ __half2() { } +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + __CUDA_HOSTDEVICE__ __half2(const __half &a, const __half &b) : x(a), y(b) { } + __CUDA_HOSTDEVICE__ __half2(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); } + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2 &src) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(src); return *this; } + + /* Convert to/from __half2_raw */ + __CUDA_HOSTDEVICE__ __half2(const __half2_raw &h2r ) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); } + __CUDA_HOSTDEVICE__ __half2 &operator=(const __half2_raw &h2r) { __HALF2_TO_UI(*this) = __HALF2_TO_CUI(h2r); return *this; } + __CUDA_HOSTDEVICE__ operator __half2_raw() const { __half2_raw ret; ret.x = 0U; ret.y = 0U; __HALF2_TO_UI(ret) = __HALF2_TO_CUI(*this); return ret; } +}; + +/* Global-space operator functions are only available to nvcc compilation */ +#if defined(__CUDACC__) + +/* Arithmetic FP16x2 operations only supported on arch >= 5.3 */ +#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)) && !defined(__CUDA_NO_HALF2_OPERATORS__) + +__device__ __forceinline__ __half2 operator+(const __half2 &lh, const __half2 &rh) { return __hadd2(lh, rh); } +__device__ __forceinline__ __half2 operator-(const __half2 &lh, const __half2 &rh) { return __hsub2(lh, rh); } +__device__ __forceinline__ __half2 operator*(const __half2 &lh, const __half2 &rh) { return __hmul2(lh, rh); } +__device__ __forceinline__ __half2 operator/(const __half2 &lh, const __half2 &rh) { return __h2div(lh, rh); } + +__device__ __forceinline__ __half2& operator+=(__half2 &lh, const __half2 &rh) { lh = __hadd2(lh, rh); return lh; } +__device__ __forceinline__ __half2& operator-=(__half2 &lh, const __half2 &rh) { lh = __hsub2(lh, rh); return lh; } +__device__ __forceinline__ __half2& operator*=(__half2 &lh, const __half2 &rh) { lh = __hmul2(lh, rh); return lh; } 
+__device__ __forceinline__ __half2& operator/=(__half2 &lh, const __half2 &rh) { lh = __h2div(lh, rh); return lh; } + +__device__ __forceinline__ __half2 &operator++(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hadd2(h, one); return h; } +__device__ __forceinline__ __half2 &operator--(__half2 &h) { __half2_raw one; one.x = 0x3C00U; one.y = 0x3C00U; h = __hsub2(h, one); return h; } +__device__ __forceinline__ __half2 operator++(__half2 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast<void>(ignored); + + const __half2 ret = h; + __half2_raw one; + one.x = 0x3C00U; + one.y = 0x3C00U; + h = __hadd2(h, one); + return ret; +} +__device__ __forceinline__ __half2 operator--(__half2 &h, const int ignored) +{ + // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators. + static_cast<void>(ignored); + + const __half2 ret = h; + __half2_raw one; + one.x = 0x3C00U; + one.y = 0x3C00U; + h = __hsub2(h, one); + return ret; +} + +__device__ __forceinline__ __half2 operator+(const __half2 &h) { return h; } +__device__ __forceinline__ __half2 operator-(const __half2 &h) { return __hneg2(h); } + +__device__ __forceinline__ bool operator==(const __half2 &lh, const __half2 &rh) { return __hbeq2(lh, rh); } +__device__ __forceinline__ bool operator!=(const __half2 &lh, const __half2 &rh) { return __hbneu2(lh, rh); } +__device__ __forceinline__ bool operator>(const __half2 &lh, const __half2 &rh) { return __hbgt2(lh, rh); } +__device__ __forceinline__ bool operator<(const __half2 &lh, const __half2 &rh) { return __hblt2(lh, rh); } +__device__ __forceinline__ bool operator>=(const __half2 &lh, const __half2 &rh) { return __hbge2(lh, rh); } +__device__ __forceinline__ bool operator<=(const __half2 &lh, const __half2 &rh) { return __hble2(lh, rh); } + +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) */ +#endif /* defined(__CUDACC__) */ + +/* Restore warning for multiple assignment operators */ +#if defined(_MSC_VER) && _MSC_VER >= 1500 +#pragma warning( pop ) +#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */ + +/* Restore -Weffc++ warnings from here on */ +#if defined(__GNUC__) +#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) +#pragma GCC diagnostic pop +#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */ +#endif /* defined(__GNUC__) */ + +#undef __CUDA_HOSTDEVICE__ +#undef __CUDA_ALIGN__ + +#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */ +static inline unsigned short __internal_float2half(const float f, unsigned int &sign, unsigned int &remainder) +{ + unsigned int x; + unsigned int u; + unsigned int result; +#if defined(__CUDACC__) + (void)memcpy(&x, &f, sizeof(f)); +#else + (void)std::memcpy(&x, &f, sizeof(f)); +#endif + u = (x & 0x7fffffffU); + sign = ((x >> 16U) & 0x8000U); + // NaN/+Inf/-Inf + if (u >= 0x7f800000U) { + remainder = 0U; + result = ((u == 0x7f800000U) ? 
(sign | 0x7c00U) : 0x7fffU); + } else if (u > 0x477fefffU) { // Overflows + remainder = 0x80000000U; + result = (sign | 0x7bffU); + } else if (u >= 0x38800000U) { // Normal numbers + remainder = u << 19U; + u -= 0x38000000U; + result = (sign | (u >> 13U)); + } else if (u < 0x33000001U) { // +0/-0 + remainder = u; + result = sign; + } else { // Denormal numbers + const unsigned int exponent = u >> 23U; + const unsigned int shift = 0x7eU - exponent; + unsigned int mantissa = (u & 0x7fffffU); + mantissa |= 0x800000U; + remainder = mantissa << (32U - shift); + result = (sign | (mantissa >> shift)); + result &= 0x0000FFFFU; + } + return static_cast<unsigned short>(result); +} +#endif /* #if !defined(__CUDACC_RTC__) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ __half __double2half(const double a) +{ +#if defined(__CUDA_ARCH__) + __half val; + asm("{ cvt.rn.f16.f64 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "d"(a)); + return val; +#else + __half result; + /* + // Perform rounding to 11 bits of precision, convert value + // to float and call existing float to half conversion. + // By pre-rounding to 11 bits we avoid additional rounding + // in float to half conversion. + */ + unsigned long long int absa; + unsigned long long int ua; + #if defined(__CUDACC__) + (void)memcpy(&ua, &a, sizeof(a)); + #else + (void)std::memcpy(&ua, &a, sizeof(a)); + #endif + absa = (ua & 0x7fffffffffffffffULL); + if ((absa >= 0x40f0000000000000ULL) || (absa <= 0x3e60000000000000ULL)) + { + /* + // |a| >= 2^16 or NaN or |a| <= 2^(-25) + // double-rounding is not a problem + */ + result = __float2half(static_cast<float>(a)); + } + else + { + /* + // here 2^(-25) < |a| < 2^16 + // prepare shifter value such that a + shifter + // done in double precision performs round-to-nearest-even + // and (a + shifter) - shifter results in a rounded to + // 11 bits of precision. Shifter needs to have exponent of + // a plus 53 - 11 = 42 and a leading bit in mantissa to guard + // against negative values. + // So need to have |a| capped to avoid overflow in exponent. + // For inputs that are smaller than half precision minnorm + // we prepare fixed shifter exponent. 
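+ //
+ // Worked example for |a| >= 2^(-14): with |a| near 1.0 (exponent 0) the
+ // shifter becomes 1.5 * 2^42, so in the 53-bit significand of a + shifter
+ // only the top 11 bits of a remain representable; the rest is rounded to
+ // nearest-even, and subtracting the shifter again yields a rounded to the
+ // 11-bit precision of half.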
+ */ + unsigned long long shifterBits; + if (absa >= 0x3f10000000000000ULL) + { + /* + // Here if |a| >= 2^(-14) + // add 42 to exponent bits + */ + shifterBits = (ua & 0x7ff0000000000000ULL) + 0x02A0000000000000ULL; + } + else + { + /* + // 2^(-25) < |a| < 2^(-14), potentially results in denormal + // set exponent bits to 42 - 14 + bias + */ + shifterBits = 0x41B0000000000000ULL; + } + // set leading mantissa bit to protect against negative inputs + shifterBits |= 0x0008000000000000ULL; + double shifter; + #if defined(__CUDACC__) + (void)memcpy(&shifter, &shifterBits, sizeof(shifterBits)); + #else + (void)std::memcpy(&shifter, &shifterBits, sizeof(shifterBits)); + #endif + double aShiftRound = a + shifter; + + /* + // Prevent the compiler from optimizing away a + shifter - shifter + // by doing intermediate memcopy and harmless bitwize operation + */ + unsigned long long int aShiftRoundBits; + #if defined(__CUDACC__) + (void)memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound)); + #else + (void)std::memcpy(&aShiftRoundBits, &aShiftRound, sizeof(aShiftRound)); + #endif + + // the value is positive, so this operation doesn't change anything + aShiftRoundBits &= 0x7fffffffffffffffULL; + + #if defined(__CUDACC__) + (void)memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound)); + #else + (void)std::memcpy(&aShiftRound, &aShiftRoundBits, sizeof(aShiftRound)); + #endif + + result = __float2half(static_cast<float>(aShiftRound - shifter)); + } + + return result; +#endif +} + +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half(const float a) +{ + __half val; +#if defined(__CUDA_ARCH__) + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +#else + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +#endif + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rn(const float a) +{ + __half val; +#if defined(__CUDA_ARCH__) + asm("{ cvt.rn.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +#else + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) { + r.x++; + } + val = r; +#endif + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rz(const float a) +{ + __half val; +#if defined(__CUDA_ARCH__) + asm("{ cvt.rz.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +#else + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + val = r; +#endif + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_rd(const float a) +{ + __half val; +#if defined(__CUDA_ARCH__) + asm("{ cvt.rm.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +#else + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder != 0U) && (sign != 0U)) { + r.x++; + } + val = r; +#endif + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __float2half_ru(const float a) +{ + __half val; +#if defined(__CUDA_ARCH__) + asm("{ cvt.rp.f16.f32 %0, %1;}\n" : "=h"(__HALF_TO_US(val)) : "f"(a)); +#else + __half_raw r; + unsigned int sign = 0U; + unsigned int remainder = 0U; + r.x = __internal_float2half(a, sign, remainder); + if ((remainder != 0U) && (sign == 0U)) { + r.x++; + } + 
val = r; +#endif + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float2half2_rn(const float a) +{ + __half2 val; +#if defined(__CUDA_ARCH__) + asm("{.reg .f16 low;\n" + " cvt.rn.f16.f32 low, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a)); +#else + val = __half2(__float2half_rn(a), __float2half_rn(a)); +#endif + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __floats2half2_rn(const float a, const float b) +{ + __half2 val; +#if defined(__CUDA_ARCH__) +#if (__CUDA_ARCH__ >= 800) + asm("{ cvt.rn.f16x2.f32 %0, %2, %1; }\n" + : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b)); +#else + asm("{.reg .f16 low,high;\n" + " cvt.rn.f16.f32 low, %1;\n" + " cvt.rn.f16.f32 high, %2;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "f"(a), "f"(b)); +#endif +#else + val = __half2(__float2half_rn(a), __float2half_rn(b)); +#endif + return val; +} + +#ifndef __CUDACC_RTC__ /* no host functions in NVRTC mode */ +static inline float __internal_half2float(const unsigned short h) +{ + unsigned int sign = ((static_cast<unsigned int>(h) >> 15U) & 1U); + unsigned int exponent = ((static_cast<unsigned int>(h) >> 10U) & 0x1fU); + unsigned int mantissa = ((static_cast<unsigned int>(h) & 0x3ffU) << 13U); + float f; + if (exponent == 0x1fU) { /* NaN or Inf */ + /* discard sign of a NaN */ + sign = ((mantissa != 0U) ? (sign >> 1U) : sign); + mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U); + exponent = 0xffU; + } else if (exponent == 0U) { /* Denorm or Zero */ + if (mantissa != 0U) { + unsigned int msb; + exponent = 0x71U; + do { + msb = (mantissa & 0x400000U); + mantissa <<= 1U; /* normalize */ + --exponent; + } while (msb == 0U); + mantissa &= 0x7fffffU; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70U; + } + const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa); +#if defined(__CUDACC__) + (void)memcpy(&f, &u, sizeof(u)); +#else + (void)std::memcpy(&f, &u, sizeof(u)); +#endif + return f; +} +#endif /* !defined(__CUDACC_RTC__) */ + +__CUDA_HOSTDEVICE_FP16_DECL__ float __half2float(const __half a) +{ + float val; +#if defined(__CUDA_ARCH__) + asm("{ cvt.f32.f16 %0, %1;}\n" : "=f"(val) : "h"(__HALF_TO_CUS(a))); +#else + val = __internal_half2float(static_cast<__half_raw>(a).x); +#endif + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float __low2float(const __half2 a) +{ + float val; +#if defined(__CUDA_ARCH__) + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, low;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a))); +#else + val = __internal_half2float(static_cast<__half2_raw>(a).x); +#endif + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float __high2float(const __half2 a) +{ + float val; +#if defined(__CUDA_ARCH__) + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, high;}\n" : "=f"(val) : "r"(__HALF2_TO_CUI(a))); +#else + val = __internal_half2float(static_cast<__half2_raw>(a).y); +#endif + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ short int __half2short_rz(const __half h) +{ + short int i; +#if defined __CUDA_ARCH__ + asm("cvt.rzi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); +#else + const float f = __half2float(h); + const short int max_val = (short int)0x7fffU; + const short int min_val = (short int)0x8000U; + const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else 
if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast<short int>(f); + } +#endif + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned short int __half2ushort_rz(const __half h) +{ + unsigned short int i; +#if defined __CUDA_ARCH__ + asm("cvt.rzi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); +#else + const float f = __half2float(h); + const unsigned short int max_val = 0xffffU; + const unsigned short int min_val = 0U; + const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0U; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast<unsigned short int>(f); + } +#endif + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ int __half2int_rz(const __half h) +{ + int i; +#if defined __CUDA_ARCH__ + asm("cvt.rzi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); +#else + const float f = __half2float(h); + const int max_val = (int)0x7fffffffU; + const int min_val = (int)0x80000000U; + const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast<int>(f); + } +#endif + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned int __half2uint_rz(const __half h) +{ + unsigned int i; +#if defined __CUDA_ARCH__ + asm("cvt.rzi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); +#else + const float f = __half2float(h); + const unsigned int max_val = 0xffffffffU; + const unsigned int min_val = 0U; + const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0U; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast<unsigned int>(f); + } +#endif + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ long long int __half2ll_rz(const __half h) +{ + long long int i; +#if defined __CUDA_ARCH__ + asm("cvt.rzi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); +#else + const float f = __half2float(h); + const long long int max_val = (long long int)0x7fffffffffffffffULL; + const long long int min_val = (long long int)0x8000000000000000ULL; + const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = min_val; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast<long long int>(f); + } +#endif + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ unsigned long long int __half2ull_rz(const __half h) +{ + unsigned long long int i; +#if defined __CUDA_ARCH__ + asm("cvt.rzi.u64.f16 %0, %1;" : 
"=l"(i) : "h"(__HALF_TO_CUS(h))); +#else + const float f = __half2float(h); + const unsigned long long int max_val = 0xffffffffffffffffULL; + const unsigned long long int min_val = 0ULL; + const unsigned short bits = static_cast<unsigned short>(static_cast<__half_raw>(h).x << 1U); + // saturation fixup + if (bits > (unsigned short)0xF800U) { + // NaN + i = 0x8000000000000000ULL; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value, conversion is well-defined + i = static_cast<unsigned long long int>(f); + } +#endif + return i; +} + +/* Intrinsic functions only available to nvcc compilers */ +#if defined(__CUDACC__) + +/* CUDA vector-types compatible vector creation function (note returns __half2, not half2) */ +__VECTOR_FUNCTIONS_DECL__ __half2 make_half2(const __half x, const __half y) +{ + __half2 t; t.x = x; t.y = y; return t; +} +#undef __VECTOR_FUNCTIONS_DECL__ + + +/* Definitions of intrinsics */ +__CUDA_HOSTDEVICE_FP16_DECL__ __half2 __float22half2_rn(const float2 a) +{ + const __half2 val = __floats2half2_rn(a.x, a.y); + return val; +} +__CUDA_HOSTDEVICE_FP16_DECL__ float2 __half22float2(const __half2 a) +{ + float hi_float; + float lo_float; +#if defined(__CUDA_ARCH__) + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, low;}\n" : "=f"(lo_float) : "r"(__HALF2_TO_CUI(a))); + + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high},%1;\n" + " cvt.f32.f16 %0, high;}\n" : "=f"(hi_float) : "r"(__HALF2_TO_CUI(a))); +#else + lo_float = __internal_half2float(((__half2_raw)a).x); + hi_float = __internal_half2float(((__half2_raw)a).y); +#endif + return make_float2(lo_float, hi_float); +} +__CUDA_FP16_DECL__ int __half2int_rn(const __half h) +{ + int i; + asm("cvt.rni.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_rd(const __half h) +{ + int i; + asm("cvt.rmi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ int __half2int_ru(const __half h) +{ + int i; + asm("cvt.rpi.s32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __int2half_rn(const int i) +{ + __half h; +#if defined(__CUDA_ARCH__) + asm("cvt.rn.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +#else + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. 
+ const float f = static_cast<float>(i); + h = __float2half_rn(f); +#endif + return h; +} +__CUDA_FP16_DECL__ __half __int2half_rz(const int i) +{ + __half h; + asm("cvt.rz.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __int2half_rd(const int i) +{ + __half h; + asm("cvt.rm.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __int2half_ru(const int i) +{ + __half h; + asm("cvt.rp.f16.s32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} + +__CUDA_FP16_DECL__ short int __half2short_rn(const __half h) +{ + short int i; + asm("cvt.rni.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_rd(const __half h) +{ + short int i; + asm("cvt.rmi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ short int __half2short_ru(const __half h) +{ + short int i; + asm("cvt.rpi.s16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __short2half_rn(const short int i) +{ + __half h; +#if defined __CUDA_ARCH__ + asm("cvt.rn.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +#else + const float f = static_cast<float>(i); + h = __float2half_rn(f); +#endif + return h; +} +__CUDA_FP16_DECL__ __half __short2half_rz(const short int i) +{ + __half h; + asm("cvt.rz.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __short2half_rd(const short int i) +{ + __half h; + asm("cvt.rm.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __short2half_ru(const short int i) +{ + __half h; + asm("cvt.rp.f16.s16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned int __half2uint_rn(const __half h) +{ + unsigned int i; + asm("cvt.rni.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_rd(const __half h) +{ + unsigned int i; + asm("cvt.rmi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned int __half2uint_ru(const __half h) +{ + unsigned int i; + asm("cvt.rpi.u32.f16 %0, %1;" : "=r"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __uint2half_rn(const unsigned int i) +{ + __half h; +#if defined __CUDA_ARCH__ + asm("cvt.rn.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); +#else + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. 
+ const float f = static_cast<float>(i); + h = __float2half_rn(f); +#endif + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_rz(const unsigned int i) +{ + __half h; + asm("cvt.rz.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_rd(const unsigned int i) +{ + __half h; + asm("cvt.rm.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __uint2half_ru(const unsigned int i) +{ + __half h; + asm("cvt.rp.f16.u32 %0, %1;" : "=h"(__HALF_TO_US(h)) : "r"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rn(const __half h) +{ + unsigned short int i; + asm("cvt.rni.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_rd(const __half h) +{ + unsigned short int i; + asm("cvt.rmi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned short int __half2ushort_ru(const __half h) +{ + unsigned short int i; + asm("cvt.rpi.u16.f16 %0, %1;" : "=h"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ushort2half_rn(const unsigned short int i) +{ + __half h; +#if defined __CUDA_ARCH__ + asm("cvt.rn.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); +#else + const float f = static_cast<float>(i); + h = __float2half_rn(f); +#endif + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_rz(const unsigned short int i) +{ + __half h; + asm("cvt.rz.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_rd(const unsigned short int i) +{ + __half h; + asm("cvt.rm.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ushort2half_ru(const unsigned short int i) +{ + __half h; + asm("cvt.rp.f16.u16 %0, %1;" : "=h"(__HALF_TO_US(h)) : "h"(i)); + return h; +} + +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rn(const __half h) +{ + unsigned long long int i; + asm("cvt.rni.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_rd(const __half h) +{ + unsigned long long int i; + asm("cvt.rmi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ unsigned long long int __half2ull_ru(const __half h) +{ + unsigned long long int i; + asm("cvt.rpi.u64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ull2half_rn(const unsigned long long int i) +{ + __half h; +#if defined(__CUDA_ARCH__) + asm("cvt.rn.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +#else + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. 
+ const float f = static_cast<float>(i); + h = __float2half_rn(f); +#endif + return h; +} +__CUDA_FP16_DECL__ __half __ull2half_rz(const unsigned long long int i) +{ + __half h; + asm("cvt.rz.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ull2half_rd(const unsigned long long int i) +{ + __half h; + asm("cvt.rm.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ull2half_ru(const unsigned long long int i) +{ + __half h; + asm("cvt.rp.f16.u64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} + +__CUDA_FP16_DECL__ long long int __half2ll_rn(const __half h) +{ + long long int i; + asm("cvt.rni.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_rd(const __half h) +{ + long long int i; + asm("cvt.rmi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_FP16_DECL__ long long int __half2ll_ru(const __half h) +{ + long long int i; + asm("cvt.rpi.s64.f16 %0, %1;" : "=l"(i) : "h"(__HALF_TO_CUS(h))); + return i; +} +__CUDA_HOSTDEVICE_FP16_DECL__ __half __ll2half_rn(const long long int i) +{ + __half h; +#if defined(__CUDA_ARCH__) + asm("cvt.rn.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); +#else + // double-rounding is not a problem here: if integer + // has more than 24 bits, it is already too large to + // be represented in half precision, and result will + // be infinity. + const float f = static_cast<float>(i); + h = __float2half_rn(f); +#endif + return h; +} +__CUDA_FP16_DECL__ __half __ll2half_rz(const long long int i) +{ + __half h; + asm("cvt.rz.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ll2half_rd(const long long int i) +{ + __half h; + asm("cvt.rm.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} +__CUDA_FP16_DECL__ __half __ll2half_ru(const long long int i) +{ + __half h; + asm("cvt.rp.f16.s64 %0, %1;" : "=h"(__HALF_TO_US(h)) : "l"(i)); + return h; +} + +__CUDA_FP16_DECL__ __half htrunc(const __half h) +{ + __half r; + asm("cvt.rzi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hceil(const __half h) +{ + __half r; + asm("cvt.rpi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hfloor(const __half h) +{ + __half r; + asm("cvt.rmi.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} +__CUDA_FP16_DECL__ __half hrint(const __half h) +{ + __half r; + asm("cvt.rni.f16.f16 %0, %1;" : "=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(h))); + return r; +} + +__CUDA_FP16_DECL__ __half2 h2trunc(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rzi.f16.f16 low, low;\n" + " cvt.rzi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2ceil(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rpi.f16.f16 low, low;\n" + " cvt.rpi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2floor(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rmi.f16.f16 low, low;\n" + " cvt.rmi.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : 
"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2rint(const __half2 h) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " cvt.rni.f16.f16 low, low;\n" + " cvt.rni.f16.f16 high, high;\n" + " mov.b32 %0, {low,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(h))); + return val; +} +__CUDA_FP16_DECL__ __half2 __lows2half2(const __half2 a, const __half2 b) +{ + __half2 val; + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {alow,blow};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); + return val; +} +__CUDA_FP16_DECL__ __half2 __highs2half2(const __half2 a, const __half2 b) +{ + __half2 val; + asm("{.reg .f16 alow,ahigh,blow,bhigh;\n" + " mov.b32 {alow,ahigh}, %1;\n" + " mov.b32 {blow,bhigh}, %2;\n" + " mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(b))); + return val; +} +__CUDA_FP16_DECL__ __half __low2half(const __half2 a) +{ + __half ret; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, low;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); + return ret; +} +__CUDA_FP16_DECL__ int __hisinf(const __half a) +{ + int retval; + if (__HALF_TO_CUS(a) == 0xFC00U) { + retval = -1; + } else if (__HALF_TO_CUS(a) == 0x7C00U) { + retval = 1; + } else { + retval = 0; + } + return retval; +} +__CUDA_FP16_DECL__ __half2 __low2half2(const __half2 a) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {low,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 __high2half2(const __half2 a) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,high};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half __high2half(const __half2 a) +{ + __half ret; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b16 %0, high;}" : "=h"(__HALF_TO_US(ret)) : "r"(__HALF2_TO_CUI(a))); + return ret; +} +__CUDA_FP16_DECL__ __half2 __halves2half2(const __half a, const __half b) +{ + __half2 val; + asm("{ mov.b32 %0, {%1,%2};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); + return val; +} +__CUDA_FP16_DECL__ __half2 __half2half2(const __half a) +{ + __half2 val; + asm("{ mov.b32 %0, {%1,%1};}\n" + : "=r"(__HALF2_TO_UI(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 __lowhigh2highlow(const __half2 a) +{ + __half2 val; + asm("{.reg .f16 low,high;\n" + " mov.b32 {low,high}, %1;\n" + " mov.b32 %0, {high,low};}\n" : "=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ short int __half_as_short(const __half h) +{ + return static_cast<short int>(__HALF_TO_CUS(h)); +} +__CUDA_FP16_DECL__ unsigned short int __half_as_ushort(const __half h) +{ + return __HALF_TO_CUS(h); +} +__CUDA_FP16_DECL__ __half __short_as_half(const short int i) +{ + __half h; + __HALF_TO_US(h) = static_cast<unsigned short int>(i); + return h; +} +__CUDA_FP16_DECL__ __half __ushort_as_half(const unsigned short int i) +{ + __half h; + __HALF_TO_US(h) = i; + return h; +} + +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ 
__half __hmax(const __half a, const __half b) +{ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) + __BINARY_OP_HALF_MACRO(max) +#else + const float fa = __half2float(a); + const float fb = __half2float(b); + float fr; + asm("{max.f32 %0,%1,%2;\n}" + :"=f"(fr) : "f"(fa), "f"(fb)); + const __half hr = __float2half(fr); + return hr; +#endif +} +__CUDA_FP16_DECL__ __half __hmin(const __half a, const __half b) +{ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) + __BINARY_OP_HALF_MACRO(min) +#else + const float fa = __half2float(a); + const float fb = __half2float(b); + float fr; + asm("{min.f32 %0,%1,%2;\n}" + :"=f"(fr) : "f"(fa), "f"(fb)); + const __half hr = __float2half(fr); + return hr; +#endif +} + +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half2 __hmax2(const __half2 a, const __half2 b) +{ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) + __BINARY_OP_HALF2_MACRO(max) +#else + const float2 fa = __half22float2(a); + const float2 fb = __half22float2(b); + float2 fr; + asm("{max.f32 %0,%1,%2;\n}" + :"=f"(fr.x) : "f"(fa.x), "f"(fb.x)); + asm("{max.f32 %0,%1,%2;\n}" + :"=f"(fr.y) : "f"(fa.y), "f"(fb.y)); + const __half2 hr = __float22half2_rn(fr); + return hr; +#endif +} +__CUDA_FP16_DECL__ __half2 __hmin2(const __half2 a, const __half2 b) +{ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) + __BINARY_OP_HALF2_MACRO(min) +#else + const float2 fa = __half22float2(a); + const float2 fb = __half22float2(b); + float2 fr; + asm("{min.f32 %0,%1,%2;\n}" + :"=f"(fr.x) : "f"(fa.x), "f"(fb.x)); + asm("{min.f32 %0,%1,%2;\n}" + :"=f"(fr.y) : "f"(fa.y), "f"(fb.y)); + const __half2 hr = __float22half2_rn(fr); + return hr; +#endif +} + + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300) +/****************************************************************************** +* __half, __half2 warp shuffle * +******************************************************************************/ +#define __SHUFFLE_HALF2_MACRO(name) /* do */ {\ + __half2 r; \ + asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c)); \ + return r; \ +} /* while(0) */ + +#define __SHUFFLE_SYNC_HALF2_MACRO(name) /* do */ {\ + __half2 r; \ + asm volatile ("{" __CUDA_FP16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \ + :"=r"(__HALF2_TO_UI(r)): "r"(__HALF2_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \ + return r; \ +} /* while(0) */ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 + +__CUDA_FP16_DECL__ __half2 __shfl(const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.idx.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_up(const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U; + __SHUFFLE_HALF2_MACRO(shfl.up.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_down(const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.down.b32) +} 
+__CUDA_FP16_DECL__ __half2 __shfl_xor(const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_HALF2_MACRO(shfl.bfly.b32) +} + +#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */ + +__CUDA_FP16_DECL__ __half2 __shfl_sync(const unsigned mask, const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.idx.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_up_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.up.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_down_sync(const unsigned mask, const __half2 var, const unsigned int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.down.b32) +} +__CUDA_FP16_DECL__ __half2 __shfl_xor_sync(const unsigned mask, const __half2 var, const int delta, const int width) +{ + unsigned int warp_size; + asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size)); + const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU; + __SHUFFLE_SYNC_HALF2_MACRO(shfl.sync.bfly.b32) +} + +#undef __SHUFFLE_HALF2_MACRO +#undef __SHUFFLE_SYNC_HALF2_MACRO + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 + +__CUDA_FP16_DECL__ __half __shfl(const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up(const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_up(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down(const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_down(temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor(const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_xor(temp1, delta, width); + return __low2half(temp2); +} + +#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */ + +__CUDA_FP16_DECL__ __half __shfl_sync(const unsigned mask, const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_up_sync(const unsigned mask, const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_up_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_down_sync(const unsigned mask, const __half var, const unsigned int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + 
const __half2 temp2 = __shfl_down_sync(mask, temp1, delta, width); + return __low2half(temp2); +} +__CUDA_FP16_DECL__ __half __shfl_xor_sync(const unsigned mask, const __half var, const int delta, const int width) +{ + const __half2 temp1 = __halves2half2(var, var); + const __half2 temp2 = __shfl_xor_sync(mask, temp1, delta, width); + return __low2half(temp2); +} + +#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 300)*/ +/****************************************************************************** +* __half and __half2 __ldg,__ldcg,__ldca,__ldcs * +******************************************************************************/ + +#if defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320)) +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ +__CUDA_FP16_DECL__ __half2 __ldg(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.nc.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldg(const __half *const ptr) +{ + __half ret; + asm ("ld.global.nc.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcg(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cg.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcg(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cg.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldca(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.ca.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldca(const __half *const ptr) +{ + __half ret; + asm ("ld.global.ca.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcs(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cs.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcs(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cs.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr)); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldlu(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.lu.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldlu(const __half *const ptr) +{ + __half ret; + asm ("ld.global.lu.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half2 __ldcv(const __half2 *const ptr) +{ + __half2 ret; + asm ("ld.global.cv.b32 %0, [%1];" : "=r"(__HALF2_TO_UI(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ __half __ldcv(const __half *const ptr) +{ + __half ret; + asm ("ld.global.cv.b16 %0, [%1];" : "=h"(__HALF_TO_US(ret)) : __LDG_PTR(ptr) : "memory"); + return ret; +} +__CUDA_FP16_DECL__ void __stwb(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.wb.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwb(__half *const ptr, const __half value) +{ + asm ("st.global.wb.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcg(__half2 *const ptr, const __half2 value) +{ + asm 
("st.global.cg.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcg(__half *const ptr, const __half value) +{ + asm ("st.global.cg.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.cs.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stcs(__half *const ptr, const __half value) +{ + asm ("st.global.cs.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half2 *const ptr, const __half2 value) +{ + asm ("st.global.wt.b32 [%0], %1;" :: __LDG_PTR(ptr), "r"(__HALF2_TO_CUI(value)) : "memory"); +} +__CUDA_FP16_DECL__ void __stwt(__half *const ptr, const __half value) +{ + asm ("st.global.wt.b16 [%0], %1;" :: __LDG_PTR(ptr), "h"(__HALF_TO_CUS(value)) : "memory"); +} +#undef __LDG_PTR +#endif /*defined(__cplusplus) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 320))*/ +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530) +/****************************************************************************** +* __half2 comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + return val; \ +} /* while(0) */ +__CUDA_FP16_DECL__ __half2 __heq2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.eq) +} +__CUDA_FP16_DECL__ __half2 __hne2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ne) +} +__CUDA_FP16_DECL__ __half2 __hle2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.le) +} +__CUDA_FP16_DECL__ __half2 __hge2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ge) +} +__CUDA_FP16_DECL__ __half2 __hlt2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.lt) +} +__CUDA_FP16_DECL__ __half2 __hgt2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gt) +} +__CUDA_FP16_DECL__ __half2 __hequ2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.equ) +} +__CUDA_FP16_DECL__ __half2 __hneu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.neu) +} +__CUDA_FP16_DECL__ __half2 __hleu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.leu) +} +__CUDA_FP16_DECL__ __half2 __hgeu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.geu) +} +__CUDA_FP16_DECL__ __half2 __hltu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.ltu) +} +__CUDA_FP16_DECL__ __half2 __hgtu2(const __half2 a, const __half2 b) +{ + __COMPARISON_OP_HALF2_MACRO(set.gtu) +} +#undef __COMPARISON_OP_HALF2_MACRO +#define __BOOL_COMPARISON_OP_HALF2_MACRO(name) /* do */ {\ + __half2 val; \ + bool retval; \ + asm( "{ " __CUDA_FP16_STRINGIFY(name) ".f16x2.f16x2 %0,%1,%2;\n}" \ + :"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a)),"r"(__HALF2_TO_CUI(b))); \ + if (__HALF2_TO_CUI(val) == 0x3C003C00U) {\ + retval = true; \ + } else { \ + retval = false; \ + }\ + return retval;\ +} /* while(0) */ +__CUDA_FP16_DECL__ bool __hbeq2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.eq) +} +__CUDA_FP16_DECL__ bool __hbne2(const __half2 a, const __half2 b) +{ + 
__BOOL_COMPARISON_OP_HALF2_MACRO(set.ne) +} +__CUDA_FP16_DECL__ bool __hble2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.le) +} +__CUDA_FP16_DECL__ bool __hbge2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ge) +} +__CUDA_FP16_DECL__ bool __hblt2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.lt) +} +__CUDA_FP16_DECL__ bool __hbgt2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gt) +} +__CUDA_FP16_DECL__ bool __hbequ2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.equ) +} +__CUDA_FP16_DECL__ bool __hbneu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.neu) +} +__CUDA_FP16_DECL__ bool __hbleu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.leu) +} +__CUDA_FP16_DECL__ bool __hbgeu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.geu) +} +__CUDA_FP16_DECL__ bool __hbltu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.ltu) +} +__CUDA_FP16_DECL__ bool __hbgtu2(const __half2 a, const __half2 b) +{ + __BOOL_COMPARISON_OP_HALF2_MACRO(set.gtu) +} +#undef __BOOL_COMPARISON_OP_HALF2_MACRO +/****************************************************************************** +* __half comparison * +******************************************************************************/ +#define __COMPARISON_OP_HALF_MACRO(name) /* do */ {\ + unsigned short val; \ + asm( "{ .reg .pred __$temp3;\n" \ + " setp." __CUDA_FP16_STRINGIFY(name) ".f16 __$temp3, %1, %2;\n" \ + " selp.u16 %0, 1, 0, __$temp3;}" \ + : "=h"(val) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(b))); \ + return (val != 0U) ? true : false; \ +} /* while(0) */ +__CUDA_FP16_DECL__ bool __heq(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(eq) +} +__CUDA_FP16_DECL__ bool __hne(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ne) +} +__CUDA_FP16_DECL__ bool __hle(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(le) +} +__CUDA_FP16_DECL__ bool __hge(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ge) +} +__CUDA_FP16_DECL__ bool __hlt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(lt) +} +__CUDA_FP16_DECL__ bool __hgt(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gt) +} +__CUDA_FP16_DECL__ bool __hequ(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(equ) +} +__CUDA_FP16_DECL__ bool __hneu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(neu) +} +__CUDA_FP16_DECL__ bool __hleu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(leu) +} +__CUDA_FP16_DECL__ bool __hgeu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(geu) +} +__CUDA_FP16_DECL__ bool __hltu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(ltu) +} +__CUDA_FP16_DECL__ bool __hgtu(const __half a, const __half b) +{ + __COMPARISON_OP_HALF_MACRO(gtu) +} +#undef __COMPARISON_OP_HALF_MACRO +/****************************************************************************** +* __half2 arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half2 __hadd2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add) +} +__CUDA_FP16_DECL__ __half2 __hsub2(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub) +} +__CUDA_FP16_DECL__ __half2 __hmul2(const __half2 a, const __half2 b) +{ + 
__BINARY_OP_HALF2_MACRO(mul) +} +__CUDA_FP16_DECL__ __half2 __hadd2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add.sat) +} +__CUDA_FP16_DECL__ __half2 __hsub2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub.sat) +} +__CUDA_FP16_DECL__ __half2 __hmul2_sat(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul.sat) +} +__CUDA_FP16_DECL__ __half2 __hadd2_rn(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(add.rn) +} +__CUDA_FP16_DECL__ __half2 __hsub2_rn(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(sub.rn) +} +__CUDA_FP16_DECL__ __half2 __hmul2_rn(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(mul.rn) +} +__CUDA_FP16_DECL__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.sat) +} +__CUDA_FP16_DECL__ __half2 __h2div(const __half2 a, const __half2 b) { + __half ha = __low2half(a); + __half hb = __low2half(b); + + const __half v1 = __hdiv(ha, hb); + + ha = __high2half(a); + hb = __high2half(b); + + const __half v2 = __hdiv(ha, hb); + + return __halves2half2(v1, v2); +} +/****************************************************************************** +* __half arithmetic * +******************************************************************************/ +__CUDA_FP16_DECL__ __half __hadd(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add) +} +__CUDA_FP16_DECL__ __half __hsub(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub) +} +__CUDA_FP16_DECL__ __half __hmul(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul) +} +__CUDA_FP16_DECL__ __half __hadd_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add.sat) +} +__CUDA_FP16_DECL__ __half __hsub_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub.sat) +} +__CUDA_FP16_DECL__ __half __hmul_sat(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul.sat) +} +__CUDA_FP16_DECL__ __half __hadd_rn(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(add.rn) +} +__CUDA_FP16_DECL__ __half __hsub_rn(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(sub.rn) +} +__CUDA_FP16_DECL__ __half __hmul_rn(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(mul.rn) +} +__CUDA_FP16_DECL__ __half __hfma(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn) +} +__CUDA_FP16_DECL__ __half __hfma_sat(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.sat) +} +__CUDA_FP16_DECL__ __half __hdiv(const __half a, const __half b) { + __half v; + __half abs; + __half den; + __HALF_TO_US(den) = 0x008FU; + + float rcp; + const float fa = __half2float(a); + const float fb = __half2float(b); + + asm("{rcp.approx.ftz.f32 %0, %1;\n}" :"=f"(rcp) : "f"(fb)); + + float fv = rcp * fa; + + v = __float2half(fv); + __HALF_TO_US(abs) = static_cast<unsigned short>(static_cast<unsigned int>(__HALF_TO_CUS(v)) & 0x00007FFFU); + if (__hlt(abs, den) && (!(__HALF_TO_CUS(abs) == 0x0000U))) { + const float err = __fmaf_rn(-fb, fv, fa); + fv = __fmaf_rn(rcp, err, fv); + v = __float2half(fv); + } + return v; +} + +/****************************************************************************** +* __half2 functions * +******************************************************************************/ +#define __SPEC_CASE2(i,r, spc, ulp) \ + "{.reg.b32 spc, ulp, 
p;\n"\ + " mov.b32 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\ + " mov.b32 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16x2.f16x2 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.f16x2 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" +#define __SPEC_CASE(i,r, spc, ulp) \ + "{.reg.b16 spc, ulp, p;\n"\ + " mov.b16 spc," __CUDA_FP16_STRINGIFY(spc) ";\n"\ + " mov.b16 ulp," __CUDA_FP16_STRINGIFY(ulp) ";\n"\ + " set.eq.f16.f16 p," __CUDA_FP16_STRINGIFY(i) ", spc;\n"\ + " fma.rn.f16 " __CUDA_FP16_STRINGIFY(r) ",p,ulp," __CUDA_FP16_STRINGIFY(r) ";\n}\n" +#define __APPROX_FCAST(fun) /* do */ {\ + __half val;\ + asm("{.reg.b32 f; \n"\ + " .reg.b16 r; \n"\ + " mov.b16 r,%1; \n"\ + " cvt.f32.f16 f,r; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 f,f; \n"\ + " cvt.rn.f16.f32 r,f; \n"\ + " mov.b16 %0,r; \n"\ + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a)));\ + return val;\ +} /* while(0) */ +#define __APPROX_FCAST2(fun) /* do */ {\ + __half2 val;\ + asm("{.reg.b16 hl, hu; \n"\ + " .reg.b32 fl, fu; \n"\ + " mov.b32 {hl, hu}, %1; \n"\ + " cvt.f32.f16 fl, hl; \n"\ + " cvt.f32.f16 fu, hu; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fl, fl; \n"\ + " " __CUDA_FP16_STRINGIFY(fun) ".approx.ftz.f32 fu, fu; \n"\ + " cvt.rn.f16.f32 hl, fl; \n"\ + " cvt.rn.f16.f32 hu, fu; \n"\ + " mov.b32 %0, {hl, hu}; \n"\ + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); \ + return val;\ +} /* while(0) */ +static __device__ __forceinline__ float __float_simpl_sinf(float a); +static __device__ __forceinline__ float __float_simpl_cosf(float a); +__CUDA_FP16_DECL__ __half hsin(const __half a) { + const float sl = __float_simpl_sinf(__half2float(a)); + __half r = __float2half_rn(sl); + asm("{\n\t" + " .reg.b16 i,r,t; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " and.b16 t, r, 0x8000U; \n\t" + " abs.f16 r, r; \n\t" + " abs.f16 i, i; \n\t" + __SPEC_CASE(i, r, 0X32B3U, 0x0800U) + __SPEC_CASE(i, r, 0X5CB0U, 0x9000U) + " or.b16 r,r,t; \n\t" + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2sin(const __half2 a) { + const float sl = __float_simpl_sinf(__half2float(a.x)); + const float sh = __float_simpl_sinf(__half2float(a.y)); + __half2 r = __floats2half2_rn(sl, sh); + asm("{\n\t" + " .reg.b32 i,r,t; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " and.b32 t, r, 0x80008000U; \n\t" + " abs.f16x2 r, r; \n\t" + " abs.f16x2 i, i; \n\t" + __SPEC_CASE2(i, r, 0X32B332B3U, 0x08000800U) + __SPEC_CASE2(i, r, 0X5CB05CB0U, 0x90009000U) + " or.b32 r, r, t; \n\t" + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half hcos(const __half a) { + const float cl = __float_simpl_cosf(__half2float(a)); + __half r = __float2half_rn(cl); + asm("{\n\t" + " .reg.b16 i,r; \n\t" + " mov.b16 r, %0; \n\t" + " mov.b16 i, %1; \n\t" + " abs.f16 i, i; \n\t" + __SPEC_CASE(i, r, 0X2B7CU, 0x1000U) + " mov.b16 %0, r; \n" + "}\n" : "+h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 h2cos(const __half2 a) { + const float cl = __float_simpl_cosf(__half2float(a.x)); + const float ch = __float_simpl_cosf(__half2float(a.y)); + __half2 r = __floats2half2_rn(cl, ch); + asm("{\n\t" + " .reg.b32 i,r; \n\t" + " mov.b32 r, %0; \n\t" + " mov.b32 i, %1; \n\t" + " abs.f16x2 i, i; \n\t" + __SPEC_CASE2(i, r, 0X2B7C2B7CU, 0x10001000U) + " mov.b32 %0, r; \n" + "}\n" : "+r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return 
r; +} +static __device__ __forceinline__ float __internal_trig_reduction_kernel(const float a, unsigned int *const quadrant) +{ + const float ar = __fmaf_rn(a, 0.636619772F, 12582912.0F); + const unsigned q = __float_as_uint(ar); + const float j = __fsub_rn(ar, 12582912.0F); + float t = __fmaf_rn(j, -1.5707962512969971e+000F, a); + t = __fmaf_rn(j, -7.5497894158615964e-008F, t); + *quadrant = q; + return t; +} +static __device__ __forceinline__ float __internal_sin_cos_kernel(const float x, const unsigned int i) +{ + float z; + const float x2 = x*x; + float a8; + float a6; + float a4; + float a2; + float a1; + float a0; + + if ((i & 1U) != 0U) { + // cos + a8 = 2.44331571e-5F; + a6 = -1.38873163e-3F; + a4 = 4.16666457e-2F; + a2 = -5.00000000e-1F; + a1 = x2; + a0 = 1.0F; + } + else { + // sin + a8 = -1.95152959e-4F; + a6 = 8.33216087e-3F; + a4 = -1.66666546e-1F; + a2 = 0.0F; + a1 = x; + a0 = x; + } + + z = __fmaf_rn(a8, x2, a6); + z = __fmaf_rn(z, x2, a4); + z = __fmaf_rn(z, x2, a2); + z = __fmaf_rn(z, a1, a0); + + if ((i & 2U) != 0U) { + z = -z; + } + return z; +} +static __device__ __forceinline__ float __float_simpl_sinf(float a) +{ + float z; + unsigned i; + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, i); + return z; +} +static __device__ __forceinline__ float __float_simpl_cosf(float a) +{ + float z; + unsigned i; + a = __internal_trig_reduction_kernel(a, &i); + z = __internal_sin_cos_kernel(a, (i & 0x3U) + 1U); + return z; +} + +__CUDA_FP16_DECL__ __half hexp(const __half a) { + __half val; + asm("{.reg.b32 f, C, nZ; \n" + " .reg.b16 h,r; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " mov.b32 C, 0x3fb8aa3bU; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 f,f,C,nZ; \n" + " ex2.approx.ftz.f32 f,f; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X1F79U, 0x9400U) + __SPEC_CASE(h, r, 0X25CFU, 0x9400U) + __SPEC_CASE(h, r, 0XC13BU, 0x0400U) + __SPEC_CASE(h, r, 0XC1EFU, 0x0200U) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu,C,nZ; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x3fb8aa3bU; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 fl,fl,C,nZ; \n" + " fma.rn.f32 fu,fu,C,nZ; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X1F791F79U, 0x94009400U) + __SPEC_CASE2(h, r, 0X25CF25CFU, 0x94009400U) + __SPEC_CASE2(h, r, 0XC13BC13BU, 0x04000400U) + __SPEC_CASE2(h, r, 0XC1EFC1EFU, 0x02000200U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp2(const __half a) { + __half val; + asm("{.reg.b32 f, ULP; \n" + " .reg.b16 r; \n" + " mov.b16 r,%1; \n" + " cvt.f32.f16 f,r; \n" + " ex2.approx.ftz.f32 f,f; \n" + " mov.b32 ULP, 0x33800000U;\n" + " fma.rn.f32 f,f,ULP,f; \n" + " cvt.rn.f16.f32 r,f; \n" + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, ULP; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " mov.b32 ULP, 
0x33800000U;\n" + " fma.rn.f32 fl,fl,ULP,fl; \n" + " fma.rn.f32 fu,fu,ULP,fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 %0, {hl, hu}; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hexp10(const __half a) { + __half val; + asm("{.reg.b16 h,r; \n" + " .reg.b32 f, C, nZ; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " mov.b32 C, 0x40549A78U; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 f,f,C,nZ; \n" + " ex2.approx.ftz.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x34DEU, 0x9800U) + __SPEC_CASE(h, r, 0x9766U, 0x9000U) + __SPEC_CASE(h, r, 0x9972U, 0x1000U) + __SPEC_CASE(h, r, 0xA5C4U, 0x1000U) + __SPEC_CASE(h, r, 0xBF0AU, 0x8100U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2exp10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 h,r,fl,fu,C,nZ; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " mov.b32 C, 0x40549A78U; \n" + " mov.b32 nZ, 0x80000000U;\n" + " fma.rn.f32 fl,fl,C,nZ; \n" + " fma.rn.f32 fu,fu,C,nZ; \n" + " ex2.approx.ftz.f32 fl, fl; \n" + " ex2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x34DE34DEU, 0x98009800U) + __SPEC_CASE2(h, r, 0x97669766U, 0x90009000U) + __SPEC_CASE2(h, r, 0x99729972U, 0x10001000U) + __SPEC_CASE2(h, r, 0xA5C4A5C4U, 0x10001000U) + __SPEC_CASE2(h, r, 0xBF0ABF0AU, 0x81008100U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog2(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.ftz.f32 f, f; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(r, r, 0xA2E2U, 0x8080U) + __SPEC_CASE(r, r, 0xBF46U, 0x9400U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log2(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 fl, fu, r, p; \n" + " mov.b32 {hl, hu}, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(r, r, 0xA2E2A2E2U, 0x80808080U) + __SPEC_CASE2(r, r, 0xBF46BF46U, 0x94009400U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog(const __half a) { + __half val; + asm("{.reg.b32 f, C; \n" + " .reg.b16 r,h; \n" + " mov.b16 h,%1; \n" + " cvt.f32.f16 f,h; \n" + " lg2.approx.ftz.f32 f,f; \n" + " mov.b32 C, 0x3f317218U; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r,f; \n" + __SPEC_CASE(h, r, 0X160DU, 0x9C00U) + __SPEC_CASE(h, r, 0X3BFEU, 0x8010U) + __SPEC_CASE(h, r, 0X3C0BU, 0x8080U) + __SPEC_CASE(h, r, 0X6051U, 0x1C00U) + " mov.b16 %0,r; \n" + "}": "=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " mov.b32 C, 0x3f317218U; \n" + " 
mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0X160D160DU, 0x9C009C00U) + __SPEC_CASE2(h, r, 0X3BFE3BFEU, 0x80108010U) + __SPEC_CASE2(h, r, 0X3C0B3C0BU, 0x80808080U) + __SPEC_CASE2(h, r, 0X60516051U, 0x1C001C00U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +__CUDA_FP16_DECL__ __half hlog10(const __half a) { + __half val; + asm("{.reg.b16 h, r; \n" + " .reg.b32 f, C; \n" + " mov.b16 h, %1; \n" + " cvt.f32.f16 f, h; \n" + " lg2.approx.ftz.f32 f, f; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 f,f,C; \n" + " cvt.rn.f16.f32 r, f; \n" + __SPEC_CASE(h, r, 0x338FU, 0x1000U) + __SPEC_CASE(h, r, 0x33F8U, 0x9000U) + __SPEC_CASE(h, r, 0x57E1U, 0x9800U) + __SPEC_CASE(h, r, 0x719DU, 0x9C00U) + " mov.b16 %0, r; \n" + "}":"=h"(__HALF_TO_US(val)) : "h"(__HALF_TO_CUS(a))); + return val; +} +__CUDA_FP16_DECL__ __half2 h2log10(const __half2 a) { + __half2 val; + asm("{.reg.b16 hl, hu; \n" + " .reg.b32 r, fl, fu, C, h; \n" + " mov.b32 {hl, hu}, %1; \n" + " mov.b32 h, %1; \n" + " cvt.f32.f16 fl, hl; \n" + " cvt.f32.f16 fu, hu; \n" + " lg2.approx.ftz.f32 fl, fl; \n" + " lg2.approx.ftz.f32 fu, fu; \n" + " mov.b32 C, 0x3E9A209BU; \n" + " mul.f32 fl,fl,C; \n" + " mul.f32 fu,fu,C; \n" + " cvt.rn.f16.f32 hl, fl; \n" + " cvt.rn.f16.f32 hu, fu; \n" + " mov.b32 r, {hl, hu}; \n" + __SPEC_CASE2(h, r, 0x338F338FU, 0x10001000U) + __SPEC_CASE2(h, r, 0x33F833F8U, 0x90009000U) + __SPEC_CASE2(h, r, 0x57E157E1U, 0x98009800U) + __SPEC_CASE2(h, r, 0x719D719DU, 0x9C009C00U) + " mov.b32 %0, r; \n" + "}":"=r"(__HALF2_TO_UI(val)) : "r"(__HALF2_TO_CUI(a))); + return val; +} +#undef __SPEC_CASE2 +#undef __SPEC_CASE +__CUDA_FP16_DECL__ __half2 h2rcp(const __half2 a) { + __APPROX_FCAST2(rcp) +} +__CUDA_FP16_DECL__ __half hrcp(const __half a) { + __APPROX_FCAST(rcp) +} +__CUDA_FP16_DECL__ __half2 h2rsqrt(const __half2 a) { + __APPROX_FCAST2(rsqrt) +} +__CUDA_FP16_DECL__ __half hrsqrt(const __half a) { + __APPROX_FCAST(rsqrt) +} +__CUDA_FP16_DECL__ __half2 h2sqrt(const __half2 a) { + __APPROX_FCAST2(sqrt) +} +__CUDA_FP16_DECL__ __half hsqrt(const __half a) { + __APPROX_FCAST(sqrt) +} +#undef __APPROX_FCAST +#undef __APPROX_FCAST2 +__CUDA_FP16_DECL__ __half2 __hisnan2(const __half2 a) +{ + __half2 r; + asm("{set.nan.f16x2.f16x2 %0,%1,%2;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a)), "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ bool __hisnan(const __half a) +{ + __half r; + asm("{set.nan.f16.f16 %0,%1,%2;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a)), "h"(__HALF_TO_CUS(a))); + return __HALF_TO_CUS(r) != 0U; +} +__CUDA_FP16_DECL__ __half2 __hneg2(const __half2 a) +{ + __half2 r; + asm("{neg.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __hneg(const __half a) +{ + __half r; + asm("{neg.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} +__CUDA_FP16_DECL__ __half2 __habs2(const __half2 a) +{ + __half2 r; + asm("{abs.f16x2 %0,%1;\n}" + :"=r"(__HALF2_TO_UI(r)) : "r"(__HALF2_TO_CUI(a))); + return r; +} +__CUDA_FP16_DECL__ __half __habs(const __half a) +{ + __half r; + asm("{abs.f16 %0,%1;\n}" + :"=h"(__HALF_TO_US(r)) : "h"(__HALF_TO_CUS(a))); + return r; +} + +__CUDA_FP16_DECL__ __half2 __hcmadd(const __half2 a, const __half2 b, const __half2 c) +{ + // fast version of complex multiply-accumulate + // (a.re, a.im) * (b.re, b.im) + (c.re, c.im) + // acc.re 
= (c.re + a.re*b.re) - a.im*b.im + // acc.im = (c.im + a.re*b.im) + a.im*b.re + __half real_tmp = __hfma(a.x, b.x, c.x); + __half img_tmp = __hfma(a.x, b.y, c.y); + real_tmp = __hfma(__hneg(a.y), b.y, real_tmp); + img_tmp = __hfma(a.y, b.x, img_tmp); + return make_half2(real_tmp, img_tmp); +} + +#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 530)*/ + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800) +__CUDA_FP16_DECL__ __half __hmax_nan(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(max.NaN) +} +__CUDA_FP16_DECL__ __half __hmin_nan(const __half a, const __half b) +{ + __BINARY_OP_HALF_MACRO(min.NaN) +} +__CUDA_FP16_DECL__ __half __hfma_relu(const __half a, const __half b, const __half c) +{ + __TERNARY_OP_HALF_MACRO(fma.rn.relu) +} + +__CUDA_FP16_DECL__ __half2 __hmax2_nan(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(max.NaN) +} +__CUDA_FP16_DECL__ __half2 __hmin2_nan(const __half2 a, const __half2 b) +{ + __BINARY_OP_HALF2_MACRO(min.NaN) +} +__CUDA_FP16_DECL__ __half2 __hfma2_relu(const __half2 a, const __half2 b, const __half2 c) +{ + __TERNARY_OP_HALF2_MACRO(fma.rn.relu) +} +#endif /*!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)*/ + +/* Define __PTR for atomicAdd prototypes below, undef after done */ +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __PTR "l" +#else +#define __PTR "r" +#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 + +__CUDA_FP16_DECL__ __half2 atomicAdd(__half2 *const address, const __half2 val) { + __half2 r; + asm volatile ("{ atom.add.noftz.f16x2 %0,[%1],%2; }\n" + : "=r"(__HALF2_TO_UI(r)) : __PTR(address), "r"(__HALF2_TO_CUI(val)) + : "memory"); + return r; +} + +#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600*/ + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 + +__CUDA_FP16_DECL__ __half atomicAdd(__half *const address, const __half val) { + __half r; + asm volatile ("{ atom.add.noftz.f16 %0,[%1],%2; }\n" + : "=h"(__HALF_TO_US(r)) + : __PTR(address), "h"(__HALF_TO_CUS(val)) + : "memory"); + return r; +} + +#endif /*!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700*/ + +#undef __PTR + +#undef __CUDA_FP16_DECL__ +#endif /* defined(__CUDACC__) */ +#endif /* defined(__cplusplus) */ + +#undef __TERNARY_OP_HALF2_MACRO +#undef __TERNARY_OP_HALF_MACRO +#undef __BINARY_OP_HALF2_MACRO +#undef __BINARY_OP_HALF_MACRO + +#undef __CUDA_HOSTDEVICE_FP16_DECL__ +#undef __CUDA_FP16_DECL__ + +#undef __HALF_TO_US +#undef __HALF_TO_CUS +#undef __HALF2_TO_UI +#undef __HALF2_TO_CUI + +/* Define first-class types "half" and "half2", unless user specifies otherwise via "#define CUDA_NO_HALF" */ +/* C cannot ever have these types defined here, because __half and __half2 are C++ classes */ +#if defined(__cplusplus) && !defined(CUDA_NO_HALF) +typedef __half half; +typedef __half2 half2; +// for consistency with __nv_bfloat16 +typedef __half __nv_half; +typedef __half2 __nv_half2; +typedef __half_raw __nv_half_raw; +typedef __half2_raw __nv_half2_raw; +typedef __half nv_half; +typedef __half2 nv_half2; +#endif /* defined(__cplusplus) && !defined(CUDA_NO_HALF) */ + +#if defined(__CPP_VERSION_AT_LEAST_11_FP16) +#undef __CPP_VERSION_AT_LEAST_11_FP16 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP16) */ + +#endif /* end of include guard: __CUDA_FP16_HPP__ */ diff --git a/ext/cudart/include/cuda_fp8.h b/ext/cudart/include/cuda_fp8.h new file mode 100644 index 
0000000000000000000000000000000000000000..13a1d733f10f9dd90d98e280cef34be1472132fd --- /dev/null +++ b/ext/cudart/include/cuda_fp8.h @@ -0,0 +1,360 @@ +/* + * Copyright 2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#ifndef __CUDA_FP8_H__ +#define __CUDA_FP8_H__ + +/* Set up function decorations */ +#if defined(__CUDACC__) +#define __CUDA_FP8_DECL__ static __device__ __inline__ +#define __CUDA_HOSTDEVICE_FP8__ __host__ __device__ +#define __CUDA_HOSTDEVICE_FP8_DECL__ static __host__ __device__ __inline__ +#else /* !defined(__CUDACC__) */ +#if defined(__GNUC__) +#define __CUDA_HOSTDEVICE_FP8_DECL__ static __attribute__((unused)) +#else +#define __CUDA_HOSTDEVICE_FP8_DECL__ static +#endif /* defined(__GNUC__) */ +#define __CUDA_HOSTDEVICE_FP8__ +#endif /* defined(__CUDACC_) */ + +#if !defined(_MSC_VER) && __cplusplus >= 201103L +#define __CPP_VERSION_AT_LEAST_11_FP8 +#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L +#define __CPP_VERSION_AT_LEAST_11_FP8 +#endif + +/* bring in __half_raw data type */ +#include "cuda_fp16.h" +/* bring in __nv_bfloat16_raw data type */ +#include "cuda_bf16.h" +/* bring in float2, double4, etc vector types */ +#include "vector_types.h" + +/** + * \defgroup CUDA_MATH_INTRINSIC_FP8 FP8 Intrinsics + * This section describes fp8 intrinsic functions. + * To use these functions, include the header file \p cuda_fp8.h in your + * program. + */ + +/** + * \defgroup CUDA_MATH_FP8_MISC FP8 Conversion and Data Movement + * \ingroup CUDA_MATH_INTRINSIC_FP8 + * To use these functions, include the header file \p cuda_fp8.h in your + * program. + */ + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief 8-bit \p unsigned \p integer + * type abstraction used to for \p fp8 floating-point + * numbers storage. + */ +typedef unsigned char __nv_fp8_storage_t; + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief 16-bit \p unsigned \p integer + * type abstraction used to for storage of pairs of + * \p fp8 floating-point numbers. + */ +typedef unsigned short int __nv_fp8x2_storage_t; + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief 32-bit \p unsigned \p integer + * type abstraction used to for storage of tetrads of + * \p fp8 floating-point numbers. + */ +typedef unsigned int __nv_fp8x4_storage_t; + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief Enumerates the modes applicable when + * performing a narrowing conversion to \p fp8 destination types. + */ +typedef enum __nv_saturation_t { + /** + * Means no saturation to finite is performed when conversion + * results in rounding values outside the range of destination + * type. + * NOTE: for fp8 type of e4m3 kind, the results that are larger + * than the maximum representable finite number of the target + * format become NaN. + */ + __NV_NOSAT, + /** + * Means input larger than the maximum representable + * finite number MAXNORM of the target format round to the + * MAXNORM of the same sign as input. + */ + __NV_SATFINITE, +} __nv_saturation_t; + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief Enumerates the possible + * interpretations of the 8-bit values when referring to them as + * \p fp8 types. + */ +typedef enum __nv_fp8_interpretation_t { + __NV_E4M3, /**< Stands for \p fp8 numbers of \p e4m3 kind. */ + __NV_E5M2, /**< Stands for \p fp8 numbers of \p e5m2 kind. */ +} __nv_fp8_interpretation_t; + +/* Forward-declaration of C-style APIs */ + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief Converts input \p double precision \p x to \p fp8 type of the + * requested kind using round-to-nearest-even rounding and requested saturation + * mode. 
+ * + * \details Converts input \p x to \p fp8 type of the kind specified by + * \p fp8_interpretation parameter, + * using round-to-nearest-even rounding and + * saturation mode specified by \p saturate parameter. + * + * \returns + * - The \p __nv_fp8_storage_t value holds the result of conversion. + */ +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t +__nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation); + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief Converts input vector of two \p double precision numbers packed + * in \p double2 \p x into a vector of two values of \p fp8 type of + * the requested kind using round-to-nearest-even rounding and requested + * saturation mode. + * + * \details Converts input vector \p x to a vector of two \p fp8 values of the + * kind specified by \p fp8_interpretation parameter, using + * round-to-nearest-even rounding and saturation mode specified by \p saturate + * parameter. + * + * \returns + * - The \p __nv_fp8x2_storage_t value holds the result of conversion. + */ +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t +__nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation); + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief Converts input \p single precision \p x to \p fp8 type of the + * requested kind using round-to-nearest-even rounding and requested saturation + * mode. + * + * \details Converts input \p x to \p fp8 type of the kind specified by + * \p fp8_interpretation parameter, + * using round-to-nearest-even rounding and + * saturation mode specified by \p saturate parameter. + * + * \returns + * - The \p __nv_fp8_storage_t value holds the result of conversion. + */ +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t +__nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation); + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief Converts input vector of two \p single precision numbers packed + * in \p float2 \p x into a vector of two values of \p fp8 type of + * the requested kind using round-to-nearest-even rounding and requested + * saturation mode. + * + * \details Converts input vector \p x to a vector of two \p fp8 values of the + * kind specified by \p fp8_interpretation parameter, using + * round-to-nearest-even rounding and saturation mode specified by \p saturate + * parameter. + * + * \returns + * - The \p __nv_fp8x2_storage_t value holds the result of conversion. + */ +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t +__nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation); + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief Converts input \p half precision \p x to \p fp8 type of the requested + * kind using round-to-nearest-even rounding and requested saturation mode. + * + * \details Converts input \p x to \p fp8 type of the kind specified by + * \p fp8_interpretation parameter, + * using round-to-nearest-even rounding and + * saturation mode specified by \p saturate parameter. + * + * \returns + * - The \p __nv_fp8_storage_t value holds the result of conversion. 
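+ *
+ * A minimal usage sketch (illustrative only; the bit pattern below is an
+ * arbitrary example value):
+ * \code
+ * __half_raw h;
+ * h.x = 0x4248U; // 3.140625 in binary16
+ * __nv_fp8_storage_t v =
+ *     __nv_cvt_halfraw_to_fp8(h, __NV_SATFINITE, __NV_E4M3);
+ * \endcode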
+ */ +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t +__nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation); + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief Converts input vector of two \p half precision numbers packed + * in \p __half2_raw \p x into a vector of two values of \p fp8 type of + * the requested kind using round-to-nearest-even rounding and requested + * saturation mode. + * + * \details Converts input vector \p x to a vector of two \p fp8 values of the + * kind specified by \p fp8_interpretation parameter, using + * round-to-nearest-even rounding and saturation mode specified by \p saturate + * parameter. + * + * \returns + * - The \p __nv_fp8x2_storage_t value holds the result of conversion. + */ +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2( + const __half2_raw x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation); + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief Converts input \p nv_bfloat16 precision \p x to \p fp8 type of the + * requested kind using round-to-nearest-even rounding and requested saturation + * mode. + * + * \details Converts input \p x to \p fp8 type of the kind specified by + * \p fp8_interpretation parameter, + * using round-to-nearest-even rounding and + * saturation mode specified by \p saturate parameter. + * + * \returns + * - The \p __nv_fp8_storage_t value holds the result of conversion. + */ +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8( + const __nv_bfloat16_raw x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation); + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief Converts input vector of two \p nv_bfloat16 precision numbers packed + * in \p __nv_bfloat162_raw \p x into a vector of two values of \p fp8 type of + * the requested kind using round-to-nearest-even rounding and requested + * saturation mode. + * + * \details Converts input vector \p x to a vector of two \p fp8 values of the + * kind specified by \p fp8_interpretation parameter, using + * round-to-nearest-even rounding and saturation mode specified by \p saturate + * parameter. + * + * \returns + * - The \p __nv_fp8x2_storage_t value holds the result of conversion. + */ +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t +__nv_cvt_bfloat16raw2_to_fp8x2( + const __nv_bfloat162_raw x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation); + +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief Converts input \p fp8 \p x of the specified kind + * to \p half precision. + * + * \details Converts input \p x of \p fp8 type of the kind specified by + * \p fp8_interpretation parameter + * to \p half precision. + * + * \returns + * - The \p __half_raw value holds the result of conversion. + */ +__CUDA_HOSTDEVICE_FP8_DECL__ __half_raw +__nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x, + const __nv_fp8_interpretation_t fp8_interpretation); +/** + * \ingroup CUDA_MATH_FP8_MISC + * \brief Converts input vector of two \p fp8 values of the specified kind + * to a vector of two \p half precision values packed in \p __half2_raw + * structure. + * + * \details Converts input vector \p x of \p fp8 type of the kind specified by + * \p fp8_interpretation parameter + * to a vector of two \p half precision values and returns as \p __half2_raw + * structure. + * + * \returns + * - The \p __half2_raw value holds the result of conversion. 
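+ *
+ * A minimal usage sketch (illustrative only; the packed value is an
+ * arbitrary example):
+ * \code
+ * const __nv_fp8x2_storage_t packed = 0x3C40U; // two packed e5m2 bytes
+ * const __half2_raw h2 = __nv_cvt_fp8x2_to_halfraw2(packed, __NV_E5M2);
+ * // h2.x widens the low byte, h2.y the high byte
+ * \endcode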
+ */ +__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw +__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x, + const __nv_fp8_interpretation_t fp8_interpretation); + +#if defined(__cplusplus) + +#define __CUDA_FP8_TYPES_EXIST__ + +/* Forward-declaration of structures defined in "cuda_fp8.hpp" */ +struct __nv_fp8_e5m2; +struct __nv_fp8x2_e5m2; +struct __nv_fp8x4_e5m2; + +struct __nv_fp8_e4m3; +struct __nv_fp8x2_e4m3; +struct __nv_fp8x4_e4m3; + +#endif /* defined(__cplusplus) */ + +#include "cuda_fp8.hpp" + +#undef __CUDA_FP8_DECL__ +#undef __CUDA_HOSTDEVICE_FP8__ +#undef __CUDA_HOSTDEVICE_FP8_DECL__ + +#if defined(__CPP_VERSION_AT_LEAST_11_FP8) +#undef __CPP_VERSION_AT_LEAST_11_FP8 +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */ + +#endif /* end of include guard: __CUDA_FP8_H__ */ diff --git a/ext/cudart/include/cuda_fp8.hpp b/ext/cudart/include/cuda_fp8.hpp new file mode 100644 index 0000000000000000000000000000000000000000..9bfe2b7891c994b5432837f865716e29d3ae831b --- /dev/null +++ b/ext/cudart/include/cuda_fp8.hpp @@ -0,0 +1,1546 @@ +/* + * Copyright 2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. 
+ * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_FP8_HPP__) +#define __CUDA_FP8_HPP__ + +#if !defined(__CUDA_FP8_H__) +#error "Do not include this file directly. Instead, include cuda_fp8.h." +#endif + +/* C++ header for std::memcpy (used for type punning in host-side + * implementations). When compiling as a CUDA source file memcpy is provided + * implicitly. !defined(__CUDACC__) implies !defined(__CUDACC_RTC__). + */ +#if defined(__cplusplus) && !defined(__CUDACC__) +#include <cstring> +#elif !defined(__cplusplus) && !defined(__CUDACC__) +#include <string.h> +#endif /* defined(__cplusplus) && !defined(__CUDACC__) */ + +/* Set up structure-alignment attribute */ +#if !(defined __CUDA_ALIGN__) +#if defined(__CUDACC__) +#define __CUDA_ALIGN__(align) __align__(align) +#else +/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" + * is available) */ +#if __cplusplus >= 201103L +#define __CUDA_ALIGN__(n) \ + alignas(n) /* C++11 kindly gives us a keyword for this */ +#else /* !defined(__CPP_VERSION_AT_LEAST_11_FP8)*/ +#if defined(__GNUC__) +#define __CUDA_ALIGN__(n) __attribute__((aligned(n))) +#elif defined(_MSC_VER) +#define __CUDA_ALIGN__(n) __declspec(align(n)) +#else +#define __CUDA_ALIGN__(n) +#endif /* defined(__GNUC__) */ +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */ +#endif /* defined(__CUDACC__) */ +#endif /* !(defined __CUDA_ALIGN__) */ + +#if !(defined __CPP_VERSION_AT_LEAST_11_FP8) +/* need c++11 for explicit operators */ +#define __CUDA_NO_FP8_CONVERSION_OPERATORS__ +#endif + +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t +__nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation) { + unsigned char res; + unsigned long long int xbits; + +#if defined(__CUDACC__) || (!defined __cplusplus) + (void)memcpy(&xbits, &x, sizeof(x)); +#else + (void)std::memcpy(&xbits, &x, sizeof(x)); +#endif + unsigned char FP8_MAXNORM; + unsigned char FP8_MANTISSA_MASK; + unsigned short int FP8_EXP_BIAS; + unsigned long long int FP8_SIGNIFICAND_BITS; + const unsigned long long int DP_INF_BITS = 0x7FF0000000000000ULL; + unsigned long long int FP8_MINDENORM_O2; + unsigned long long int FP8_OVERFLOW_THRESHOLD; + unsigned long long int FP8_MINNORM; + + if (fp8_interpretation == __NV_E4M3) { + FP8_EXP_BIAS = 7U; + FP8_SIGNIFICAND_BITS = 4ULL; + FP8_MANTISSA_MASK = 0x7U; + FP8_MINDENORM_O2 = 0x3F50000000000000ULL; // mindenorm/2 = 2^-10 + FP8_OVERFLOW_THRESHOLD = + 0x407D000000000000ULL; // maxnorm + 1/2ulp = 0x1.Cp+8 + 0x1p+4 + FP8_MAXNORM = 0x7EU; + FP8_MINNORM = 0x3F90000000000000ULL; // minnorm = 2^-6 + } else { //__NV_E5M2 + FP8_EXP_BIAS = 15U; + FP8_SIGNIFICAND_BITS = 3ULL; + FP8_MANTISSA_MASK = 0x3U; + FP8_MINDENORM_O2 = 0x3EE0000000000000ULL; // mindenorm/2 = 2^-17 + FP8_OVERFLOW_THRESHOLD = + 0x40EE000000000000ULL - + 1ULL; // maxnorm + 1/2ulp = 0x1.Ep+15, and -1 to have common code + FP8_MAXNORM = 0x7BU; + FP8_MINNORM = 0x3F10000000000000ULL; // minnorm = 2^-14 + } + + // 1/2 LSB of the target format, positioned in double precision mantissa + // helpful in midpoints detection during round-to-nearest-even step + const unsigned long long int FP8_DP_HALF_ULP = + (unsigned long long int)1ULL << (53ULL - FP8_SIGNIFICAND_BITS - 1ULL); + // prepare sign bit in target format + unsigned char sign = 
(unsigned char)((xbits >> 63ULL) << 7U); + // prepare exponent field in target format + unsigned char exp = + (unsigned char)((((unsigned short int)(xbits >> 52ULL)) & 0x7FFU) - + 1023U + FP8_EXP_BIAS); + // round mantissa to target format width, rounding towards zero + unsigned char mantissa = + (unsigned char)(xbits >> (53ULL - FP8_SIGNIFICAND_BITS)) & + FP8_MANTISSA_MASK; + unsigned long long int absx = xbits & 0x7FFFFFFFFFFFFFFFULL; + + if (absx <= FP8_MINDENORM_O2) { + // zero or underflow + res = 0U; + } else if (absx > DP_INF_BITS) { + // NaN + if (fp8_interpretation == __NV_E4M3) { + res = 0x7FU; + } else { + // NaN --> QNaN + res = 0x7EU | mantissa; + } + } else if (absx > FP8_OVERFLOW_THRESHOLD) { + if (saturate == __NV_SATFINITE) { + res = FP8_MAXNORM; + } else { + // __NV_NOSAT + if (fp8_interpretation == __NV_E4M3) { + // no Inf in E4M3 + res = 0x7FU; // NaN + } else { + res = 0x7CU; // Inf in E5M2 + } + } + } else if (absx >= FP8_MINNORM) { + res = (unsigned char)((exp << (FP8_SIGNIFICAND_BITS - 1U)) | mantissa); + // rounded-off bits + unsigned long long int round = + xbits & ((FP8_DP_HALF_ULP << 1ULL) - 1ULL); + // round-to-nearest-even adjustment + if ((round > FP8_DP_HALF_ULP) || + ((round == FP8_DP_HALF_ULP) && (mantissa & 1U))) { + res = (unsigned char)(res + 1U); + } + } else // Denormal range + { + unsigned char shift = (unsigned char)(1U - exp); + // add implicit leading bit + mantissa |= (unsigned char)(1U << (FP8_SIGNIFICAND_BITS - 1U)); + // additional round-off due to denormalization + res = (unsigned char)(mantissa >> shift); + + // rounded-off bits, including implicit leading bit + unsigned long long int round = + (xbits | ((unsigned long long int)1ULL << (53ULL - 1ULL))) & + ((FP8_DP_HALF_ULP << (shift + 1ULL)) - 1ULL); + // round-to-nearest-even adjustment + if ((round > (FP8_DP_HALF_ULP << shift)) || + ((round == (FP8_DP_HALF_ULP << shift)) && (res & 1U))) { + res = (unsigned char)(res + 1U); + } + } + + res |= sign; + + return (__nv_fp8_storage_t)res; +} + +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t +__nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation) { + __nv_fp8x2_storage_t storage = (__nv_fp8x2_storage_t)__nv_cvt_double_to_fp8( + x.y, saturate, fp8_interpretation); + storage = (__nv_fp8x2_storage_t)(storage << 8U); + storage = (__nv_fp8x2_storage_t)(storage | + __nv_cvt_double_to_fp8( + x.x, saturate, fp8_interpretation)); + return storage; +} + +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t +__nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation) { + __nv_fp8_storage_t res = 0U; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + if (saturate == __NV_SATFINITE) { + __nv_fp8x2_storage_t storage; + if (fp8_interpretation == __NV_E5M2) { + asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n" + : "=h"(storage) + : "f"(x), "f"(0.0f)); + } else { + asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n" + : "=h"(storage) + : "f"(x), "f"(0.0f)); + } + res = (__nv_fp8_storage_t)storage; + } else +#endif + { + unsigned int xbits; +#if defined(__CUDACC__) || (!defined __cplusplus) + (void)memcpy(&xbits, &x, sizeof(x)); +#else + (void)std::memcpy(&xbits, &x, sizeof(x)); +#endif + + // isnan + if ((xbits & 0x7FFFFFFFU) > 0x7F800000U) { + // Canonical NaN + xbits = 0x7FFFFFFFU; + } + + float fx; +#if defined(__CUDACC__) || (!defined __cplusplus) + (void)memcpy(&fx, &xbits, sizeof(xbits)); +#else + 
(void)std::memcpy(&fx, &xbits, sizeof(xbits)); +#endif + + const double dx = (double)fx; + res = __nv_cvt_double_to_fp8(dx, saturate, fp8_interpretation); + } + return res; +} + +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t +__nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation) { + __nv_fp8x2_storage_t storage; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + if (saturate == __NV_SATFINITE) { + if (fp8_interpretation == __NV_E5M2) { + asm("{cvt.rn.satfinite.e5m2x2.f32 %0, %2, %1;}\n" + : "=h"(storage) + : "f"(x.x), "f"(x.y)); + } else { + asm("{cvt.rn.satfinite.e4m3x2.f32 %0, %2, %1;}\n" + : "=h"(storage) + : "f"(x.x), "f"(x.y)); + } + } else +#endif + { + storage = (__nv_fp8x2_storage_t)__nv_cvt_float_to_fp8( + x.y, saturate, fp8_interpretation); + storage = (__nv_fp8x2_storage_t)(storage << 8U); + storage = (__nv_fp8x2_storage_t)(storage | __nv_cvt_float_to_fp8( + x.x, saturate, + fp8_interpretation)); + } + return storage; +} + +__CUDA_HOSTDEVICE_FP8_DECL__ float +__internal_halfraw_to_float(const __half_raw x) { + float f; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) + asm("{cvt.f32.f16 %0, %1;}\n" : "=f"(f) : "h"(x.x)); +#else + const unsigned int ux = (unsigned int)x.x; + unsigned int sign = (ux >> 15U) & 1U; + unsigned int exponent = (ux >> 10U) & 0x1fU; + unsigned int mantissa = (ux & 0x3ffU) << 13U; + if (exponent == 0x1fU) { /* NaN or Inf */ + /* discard sign of a NaN */ + sign = ((mantissa != 0U) ? (sign >> 1U) : sign); + mantissa = ((mantissa != 0U) ? 0x7fffffU : 0U); + exponent = 0xffU; + } else if (exponent == 0U) { /* Denorm or Zero */ + if (mantissa != 0U) { + unsigned int msb; + exponent = 0x71U; + do { + msb = (mantissa & 0x400000U); + mantissa <<= 1U; /* normalize */ + --exponent; + } while (msb == 0U); + mantissa &= 0x7fffffU; /* 1.mantissa is implicit */ + } + } else { + exponent += 0x70U; + } + const unsigned int u = ((sign << 31U) | (exponent << 23U) | mantissa); +#if defined(__CUDACC__) || (!defined __cplusplus) + (void)memcpy(&f, &u, sizeof(u)); +#else + (void)std::memcpy(&f, &u, sizeof(u)); +#endif +#endif /* (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 530) */ + return f; +} + +__CUDA_HOSTDEVICE_FP8_DECL__ float2 +__internal_halfraw2_to_float2(const __half2_raw x) { + __half_raw raw; + float2 res; + raw.x = x.x; + res.x = __internal_halfraw_to_float(raw); + raw.x = x.y; + res.y = __internal_halfraw_to_float(raw); + return res; +} + +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t +__nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation) { + __nv_fp8_storage_t res = 0U; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + if (saturate == __NV_SATFINITE) { + unsigned int half2_storage = (unsigned int)(x.x); + __nv_fp8x2_storage_t tmp; + if (fp8_interpretation == __NV_E5M2) { + asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n" + : "=h"(tmp) + : "r"(half2_storage)); + } else { + asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n" + : "=h"(tmp) + : "r"(half2_storage)); + } + res = (__nv_fp8_storage_t)tmp; + } else +#endif + { + float fx = __internal_halfraw_to_float(x); + res = __nv_cvt_float_to_fp8(fx, saturate, fp8_interpretation); + } + return res; +} + +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2( + const __half2_raw x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation) { + __nv_fp8x2_storage_t tmp; +#if (defined __CUDA_ARCH__) 
&& (__CUDA_ARCH__ >= 900) + if (saturate == __NV_SATFINITE) { + unsigned int half2_storage; + (void)memcpy(&half2_storage, &x, sizeof(x)); + + if (fp8_interpretation == __NV_E5M2) { + asm("{cvt.rn.satfinite.e5m2x2.f16x2 %0, %1;}\n" + : "=h"(tmp) + : "r"(half2_storage)); + } else { + asm("{cvt.rn.satfinite.e4m3x2.f16x2 %0, %1;}\n" + : "=h"(tmp) + : "r"(half2_storage)); + } + } else +#endif + { + __half_raw raw; + raw.x = x.x; + __nv_fp8_storage_t lo = + __nv_cvt_halfraw_to_fp8(raw, saturate, fp8_interpretation); + raw.x = x.y; + __nv_fp8_storage_t hi = + __nv_cvt_halfraw_to_fp8(raw, saturate, fp8_interpretation); + tmp = hi; + tmp = (__nv_fp8x2_storage_t)(tmp << 8U); + tmp = (__nv_fp8x2_storage_t)(tmp | lo); + } + return tmp; +} + +__CUDA_HOSTDEVICE_FP8_DECL__ float +__internal_bf16raw_to_float(const __nv_bfloat16_raw x) { + const unsigned int ux = ((unsigned int)x.x) << 16U; + float fx; +#if defined(__CUDACC__) || (!defined __cplusplus) + (void)memcpy(&fx, &ux, sizeof(ux)); +#else + (void)std::memcpy(&fx, &ux, sizeof(ux)); +#endif + return fx; +} + +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat16_raw +__internal_float_to_bf16raw_rz(const float x) { + unsigned int ux; + __nv_bfloat16_raw r; +#if defined(__CUDACC__) || (!defined __cplusplus) + (void)memcpy(&ux, &x, sizeof(x)); +#else + (void)std::memcpy(&ux, &x, sizeof(x)); +#endif + r.x = (unsigned short int)(ux >> 16U); + return r; +} + +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8( + const __nv_bfloat16_raw x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation) { + const float fx = __internal_bf16raw_to_float(x); + const __nv_fp8_storage_t res = + __nv_cvt_float_to_fp8(fx, saturate, fp8_interpretation); + return res; +} + +__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t +__nv_cvt_bfloat16raw2_to_fp8x2( + const __nv_bfloat162_raw x, const __nv_saturation_t saturate, + const __nv_fp8_interpretation_t fp8_interpretation) { + __nv_bfloat16_raw raw; + raw.x = x.y; + __nv_fp8x2_storage_t storage = + (__nv_fp8x2_storage_t)__nv_cvt_bfloat16raw_to_fp8(raw, saturate, + fp8_interpretation); + storage = (__nv_fp8x2_storage_t)(storage << 8U); + raw.x = x.x; + storage = (__nv_fp8x2_storage_t)(storage | + __nv_cvt_bfloat16raw_to_fp8( + raw, saturate, fp8_interpretation)); + return storage; +} + +__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw +__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x, + const __nv_fp8_interpretation_t fp8_interpretation); +__CUDA_HOSTDEVICE_FP8_DECL__ __half_raw +__nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x, + const __nv_fp8_interpretation_t fp8_interpretation) { + __half_raw res; + res.x = 0U; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + res.x = + __nv_cvt_fp8x2_to_halfraw2((__nv_fp8x2_storage_t)x, fp8_interpretation) + .x; +#else + unsigned short int ur = (unsigned short int)x; + ur = (unsigned short int)(ur << 8U); + + if (fp8_interpretation == __NV_E5M2) { + if ((ur & 0x7FFFU) > 0x7C00U) { + /* If NaN, return canonical NaN */ + ur = 0x7FFFU; + } + } else { // __NV_E4M3 + unsigned short int sign = ur & 0x8000U; + unsigned short int exponent = + (unsigned short int)(((ur & 0x7800U) >> 1U) + 0x2000U); + unsigned short int mantissa = (ur & 0x0700U) >> 1U; + unsigned char absx = 0x7FU & (unsigned char)x; + + if (absx == 0x7FU) // NaN + { + ur = 0x7FFFU; // fp16 canonical NaN, discard sign + } else if (exponent == 0x2000U) { + // zero or denormal + if (mantissa != 0U) { + // normalize + mantissa = (unsigned short int)(mantissa << 1U); + while 
((mantissa & 0x0400U) == 0U) { + mantissa = (unsigned short int)(mantissa << 1U); + exponent = (unsigned short int)(exponent - 0x0400U); + } + // discard implicit leading bit + mantissa &= 0x03FFU; + } else { // Zero + exponent = 0U; + } + + ur = (sign | exponent) | mantissa; + } else { + ur = (sign | exponent) | mantissa; + } + } + res.x = ur; +#endif + return res; +} + +__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw +__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x, + const __nv_fp8_interpretation_t fp8_interpretation) { + __half2_raw res; +#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) + unsigned int half2_storage; + if (fp8_interpretation == __NV_E5M2) { + asm("{cvt.rn.f16x2.e5m2x2 %0, %1;}\n" : "=r"(half2_storage) : "h"(x)); + } else { + asm("{cvt.rn.f16x2.e4m3x2 %0, %1;}\n" : "=r"(half2_storage) : "h"(x)); + } + (void)memcpy(&res, &half2_storage, sizeof(half2_storage)); +#else + res.x = + __nv_cvt_fp8_to_halfraw((__nv_fp8_storage_t)x, fp8_interpretation).x; + res.y = __nv_cvt_fp8_to_halfraw((__nv_fp8_storage_t)(x >> 8U), + fp8_interpretation) + .x; +#endif + return res; +} + +/* All other definitions in this file are only visible to C++ compilers */ +#if defined(__cplusplus) + +/** + * \defgroup CUDA_MATH_FP8_E5M2_STRUCT C++ struct for handling fp8 data type of e5m2 kind. + * \ingroup CUDA_MATH_INTRINSIC_FP8 + */ + +/** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * \brief __nv_fp8_e5m2 datatype + * + * \details This structure implements the datatype for handling + * \p fp8 floating-point numbers of \p e5m2 kind: + * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits. + * + * The structure implements converting constructors and operators. + */ +struct __CUDA_ALIGN__(1) __nv_fp8_e5m2 { + public: + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Storage variable contains the \p fp8 floating-point data. + */ + __nv_fp8_storage_t __x; + + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Constructor by default. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP8) + __nv_fp8_e5m2() = default; +#else + __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2() {} +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */ + +#if !defined(__CUDA_NO_FP8_CONVERSIONS__) + + /* Construct from wider FP types */ + /* Note we do avoid constructor init-list because of special host/device + * compilation rules */ + + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Constructor from \p __half data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __half f) { + __x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f), + __NV_SATFINITE, __NV_E5M2); + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const __nv_bfloat16 f) { + __x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f), + __NV_SATFINITE, __NV_E5M2); + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Constructor from \p float data type, relies on \p __NV_SATFINITE behavior + * for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const float f) { + __x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E5M2); + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Constructor from \p double data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. 
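+     *
+     * A minimal usage sketch (illustrative only):
+     * \code
+     * __nv_fp8_e5m2 a(0.5);   // exactly representable in e5m2
+     * __nv_fp8_e5m2 b(1.0e9); // out of range: saturates to e5m2 MAXNORM
+     * \endcode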
+ */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const double f) { + __x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E5M2); + } + + /* Converts from integral */ + + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Constructor from \p unsigned \p short \p int data type, relies on \p + * __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ + __nv_fp8_e5m2(const unsigned short int val) { + __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x; + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Constructor from \p unsigned \p int data type, relies on \p + * __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const unsigned int val) { + __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x; + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Constructor from \p unsigned \p long \p long \p int data type, relies on + * \p __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ + __nv_fp8_e5m2(const unsigned long long int val) { + __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x; + } + + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Constructor from \p short \p int data type. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const short int val) { + __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x; + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior + * for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const int val) { + __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x; + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Constructor from \p long \p long \p int data type, relies on \p + * __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e5m2(const long long int val) { + __x = static_cast<__nv_fp8_e5m2>(static_cast<float>(val)).__x; + } + +#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) + /* Widening FP converts */ + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Conversion operator to \p __half data type. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const { + return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E5M2)); + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Conversion operator to \p float data type. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator float() const { + return __internal_halfraw_to_float( + __nv_cvt_fp8_to_halfraw(__x, __NV_E5M2)); + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Conversion operator to \p __nv_bfloat16 data type. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const { + return static_cast<__nv_bfloat16>( + __internal_float_to_bf16raw_rz(float(*this))); + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Conversion operator to \p double data type. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator double() const { + return static_cast<double>(float(*this)); + } + + /* Convert to integral */ + + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Conversion operator to \p unsigned \p char data type. + * Clamps negative and too large inputs to the output range. + * \p NaN inputs convert to \p zero. 
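+     *
+     * A minimal usage sketch (illustrative only):
+     * \code
+     * const unsigned char c =
+     *     static_cast<unsigned char>(__nv_fp8_e5m2(-2.0f)); // clamps to 0
+     * \endcode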
+ */ + explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const { + unsigned char i; + const float f = float(*this); + const unsigned char max_val = 0xFFU; + const unsigned char min_val = 0U; + const unsigned char bits = (*this).__x; + // saturation fixup + if ((bits & 0x7FU) > 0x7CU) { + // NaN + i = 0; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value + i = static_cast<unsigned char>(f); + } + return i; + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Conversion operator to \p unsigned \p short \p int data type. + * Clamps negative and too large inputs to the output range. + * \p NaN inputs convert to \p zero. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const { + return __half2ushort_rz(__half(*this)); + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Conversion operator to \p unsigned \p int data type. + * Clamps negative and too large inputs to the output range. + * \p NaN inputs convert to \p zero. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const { + return __half2uint_rz(__half(*this)); + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Conversion operator to \p unsigned \p long \p long \p int data type. + * Clamps negative and too large inputs to the output range. + * \p NaN inputs convert to \p 0x8000000000000000ULL. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const { + return __half2ull_rz(__half(*this)); + } + + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Conversion operator to \p signed \p char data type. + * Clamps too large inputs to the output range. + * \p NaN inputs convert to \p zero. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const { + signed char i; + const float f = float(*this); + const signed char max_val = (signed char)0x7FU; + const signed char min_val = (signed char)0x80U; + const unsigned char bits = (*this).__x; + // saturation fixup + if ((bits & 0x7FU) > 0x7CU) { + // NaN + i = 0; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value + i = static_cast<signed char>(f); + } + return i; + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Conversion operator to \p short \p int data type. + * Clamps too large inputs to the output range. + * \p NaN inputs convert to \p zero. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const { + return __half2short_rz(__half(*this)); + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Conversion operator to \p int data type. + * Clamps too large inputs to the output range. + * \p NaN inputs convert to \p zero. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator int() const { + return __half2int_rz(__half(*this)); + } + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Conversion operator to \p long \p long \p int data type. + * Clamps too large inputs to the output range. + * \p NaN inputs convert to \p 0x8000000000000000LL. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const { + return __half2ll_rz(__half(*this)); + } + + /** + * \ingroup CUDA_MATH_FP8_E5M2_STRUCT + * Conversion operator to \p bool data type. + * +0 and -0 inputs convert to \p false. + * Non-zero inputs convert to \p true. 
+ */ + explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const { + return (__x & 0x7FU) != 0U; + } +#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */ +#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */ +}; + +/** + * \defgroup CUDA_MATH_FP8X2_E5M2_STRUCT C++ struct for handling vector type of two fp8 values of e5m2 kind. + * \ingroup CUDA_MATH_INTRINSIC_FP8 + */ + +/** + * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT + * \brief __nv_fp8x2_e5m2 datatype + * + * \details This structure implements the datatype for handling two + * \p fp8 floating-point numbers of \p e5m2 kind each: + * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits. + * + * The structure implements converting constructors and operators. + */ +struct __CUDA_ALIGN__(2) __nv_fp8x2_e5m2 { + public: + /** + * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT + * Storage variable contains the vector of two \p fp8 floating-point data + * values. + */ + __nv_fp8x2_storage_t __x; + + /** + * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT + * Constructor by default. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP8) + __nv_fp8x2_e5m2() = default; +#else + __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2() {} +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */ + +#if !defined(__CUDA_NO_FP8_CONVERSIONS__) + + /* Construct from wider types */ + + /** + * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT + * Constructor from \p __half2 data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __half2 f) { + __x = __nv_cvt_halfraw2_to_fp8x2(static_cast<__half2_raw>(f), + __NV_SATFINITE, __NV_E5M2); + } + /** + * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT + * Constructor from \p __nv_bfloat162 data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const __nv_bfloat162 f) { + __x = __nv_cvt_bfloat16raw2_to_fp8x2(static_cast<__nv_bfloat162_raw>(f), + __NV_SATFINITE, __NV_E5M2); + } + /** + * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT + * Constructor from \p float2 data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const float2 f) { + __x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2); + } + /** + * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT + * Constructor from \p double2 data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e5m2(const double2 f) { + __x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E5M2); + } + +#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) + /* Widening converts */ + /** + * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT + * Conversion operator to \p __half2 data type. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const { + return static_cast<__half2>(__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2)); + } + /** + * \ingroup CUDA_MATH_FP8X2_E5M2_STRUCT + * Conversion operator to \p float2 data type. 
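+     *
+     * A minimal round-trip sketch (illustrative only):
+     * \code
+     * const float2 in = {1.0f, -0.25f};
+     * const __nv_fp8x2_e5m2 packed(in);               // narrow both lanes
+     * const float2 out = static_cast<float2>(packed); // widen back to float
+     * \endcode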
+ */ + explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const { + return __internal_halfraw2_to_float2( + __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E5M2)); + } +#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */ +#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */ +}; + +__CUDA_HOSTDEVICE_FP8_DECL__ unsigned int +__internal_pack_u16x2_to_u32(const unsigned short int src_lo, + const unsigned short int src_hi) { + unsigned int dst; +#if (defined __CUDACC__) && (defined __CUDA_ARCH__) + asm("{ mov.b32 %0, {%1,%2};}\n" : "=r"(dst) : "h"(src_lo), "h"(src_hi)); +#else + dst = (static_cast<unsigned int>(src_hi) << 16U) | + static_cast<unsigned int>(src_lo); +#endif + return dst; +} + +/** + * \defgroup CUDA_MATH_FP8X4_E5M2_STRUCT C++ struct for handling vector type of four fp8 values of e5m2 kind. + * \ingroup CUDA_MATH_INTRINSIC_FP8 + */ + +/** + * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT + * \brief __nv_fp8x4_e5m2 datatype + * + * \details This structure implements the datatype for handling four + * \p fp8 floating-point numbers of \p e5m2 kind each: + * with 1 sign, 5 exponent, 1 implicit and 2 explicit mantissa bits. + * + * The structure implements converting constructors and operators. + */ +struct __CUDA_ALIGN__(4) __nv_fp8x4_e5m2 { + public: + /** + * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT + * Storage variable contains the vector of four \p fp8 floating-point data + * values. + */ + __nv_fp8x4_storage_t __x; + + /** + * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT + * Constructor by default. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP8) + __nv_fp8x4_e5m2() = default; +#else + __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2() {} +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */ + +#if !defined(__CUDA_NO_FP8_CONVERSIONS__) + + /* Construct from wider types */ + + /** + * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT + * Constructor from a pair of \p __half2 data type values, + * relies on \p __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __half2 flo, + const __half2 fhi) { + const __nv_fp8x2_storage_t rlo = __nv_cvt_halfraw2_to_fp8x2( + static_cast<__half2_raw>(flo), __NV_SATFINITE, __NV_E5M2); + const __nv_fp8x2_storage_t rhi = __nv_cvt_halfraw2_to_fp8x2( + static_cast<__half2_raw>(fhi), __NV_SATFINITE, __NV_E5M2); + __x = __internal_pack_u16x2_to_u32(rlo, rhi); + } + /** + * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT + * Constructor from a pair of \p __nv_bfloat162 data type values, + * relies on \p __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const __nv_bfloat162 flo, + const __nv_bfloat162 fhi) { + const __nv_fp8x2_storage_t rlo = __nv_cvt_bfloat16raw2_to_fp8x2( + static_cast<__nv_bfloat162_raw>(flo), __NV_SATFINITE, __NV_E5M2); + const __nv_fp8x2_storage_t rhi = __nv_cvt_bfloat16raw2_to_fp8x2( + static_cast<__nv_bfloat162_raw>(fhi), __NV_SATFINITE, __NV_E5M2); + __x = __internal_pack_u16x2_to_u32(rlo, rhi); + } + /** + * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT + * Constructor from \p float4 vector data type, + * relies on \p __NV_SATFINITE behavior for out-of-range values. 
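+     *
+     * A minimal usage sketch (illustrative only):
+     * \code
+     * const float4 v = {1.0f, 2.0f, 3.0f, 4.0f};
+     * const __nv_fp8x4_e5m2 packed(v); // four e5m2 values in one 32-bit word
+     * \endcode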
+ */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const float4 f) { + const float2 flo = {f.x, f.y}; + const float2 fhi = {f.z, f.w}; + const __nv_fp8x2_storage_t rlo = + __nv_cvt_float2_to_fp8x2(flo, __NV_SATFINITE, __NV_E5M2); + const __nv_fp8x2_storage_t rhi = + __nv_cvt_float2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E5M2); + __x = __internal_pack_u16x2_to_u32(rlo, rhi); + } + /** + * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT + * Constructor from \p double4 vector data type, + * relies on \p __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e5m2(const double4 f) { + const double2 flo = {f.x, f.y}; + const double2 fhi = {f.z, f.w}; + const __nv_fp8x2_storage_t rlo = + __nv_cvt_double2_to_fp8x2(flo, __NV_SATFINITE, __NV_E5M2); + const __nv_fp8x2_storage_t rhi = + __nv_cvt_double2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E5M2); + __x = __internal_pack_u16x2_to_u32(rlo, rhi); + } + +#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) + /* Widening converts */ + + /** + * \ingroup CUDA_MATH_FP8X4_E5M2_STRUCT + * Conversion operator to \p float4 vector data type. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const { + const __nv_fp8x2_storage_t slo = static_cast<__nv_fp8x2_storage_t>(__x); + const __nv_fp8x2_storage_t shi = + static_cast<__nv_fp8x2_storage_t>(__x >> 16U); + float2 rlo = __internal_halfraw2_to_float2( + __nv_cvt_fp8x2_to_halfraw2(slo, __NV_E5M2)); + float2 rhi = __internal_halfraw2_to_float2( + __nv_cvt_fp8x2_to_halfraw2(shi, __NV_E5M2)); + float4 res = {rlo.x, rlo.y, rhi.x, rhi.y}; + return res; + } +#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */ +#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */ +}; + +/** + * \defgroup CUDA_MATH_FP8_E4M3_STRUCT C++ struct for handling fp8 data type of e4m3 kind. + * \ingroup CUDA_MATH_INTRINSIC_FP8 + */ + +/** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * \brief __nv_fp8_e4m3 datatype + * + * \details This structure implements the datatype for storing + * \p fp8 floating-point numbers of \p e4m3 kind: + * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits. + * The encoding doesn't support Infinity. + * NaNs are limited to 0x7F and 0xFF values. + * + * The structure implements converting constructors and operators. + */ +struct __CUDA_ALIGN__(1) __nv_fp8_e4m3 { + public: + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Storage variable contains the \p fp8 floating-point data. + */ + __nv_fp8_storage_t __x; + + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Constructor by default. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP8) + __nv_fp8_e4m3() = default; +#else + __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3() {} +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */ + +#if !defined(__CUDA_NO_FP8_CONVERSIONS__) + + /* Construct from wider FP types */ + /* Note we do avoid constructor init-list because of special host/device + * compilation rules */ + + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Constructor from \p __half data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __half f) { + __x = __nv_cvt_halfraw_to_fp8(static_cast<__half_raw>(f), + __NV_SATFINITE, __NV_E4M3); + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Constructor from \p __nv_bfloat16 data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. 
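+     *
+     * A minimal usage sketch (illustrative only; \p __float2bfloat16 comes
+     * from \p cuda_bf16.h, which \p cuda_fp8.h includes):
+     * \code
+     * const __nv_bfloat16 bf = __float2bfloat16(480.0f);
+     * const __nv_fp8_e4m3 a(bf); // above e4m3 MAXNORM (448): saturates
+     * \endcode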
+ */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const __nv_bfloat16 f) { + __x = __nv_cvt_bfloat16raw_to_fp8(static_cast<__nv_bfloat16_raw>(f), + __NV_SATFINITE, __NV_E4M3); + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Constructor from \p float data type, relies on \p __NV_SATFINITE behavior + * for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const float f) { + __x = __nv_cvt_float_to_fp8(f, __NV_SATFINITE, __NV_E4M3); + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Constructor from \p double data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const double f) { + __x = __nv_cvt_double_to_fp8(f, __NV_SATFINITE, __NV_E4M3); + } + + /* Converts from integral */ + + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Constructor from \p unsigned \p short \p int data type, relies on \p + * __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ + __nv_fp8_e4m3(const unsigned short int val) { + __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x; + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Constructor from \p unsigned \p int data type, relies on \p + * __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const unsigned int val) { + __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x; + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Constructor from \p unsigned \p long \p long \p int data type, relies on + * \p __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ + __nv_fp8_e4m3(const unsigned long long int val) { + __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x; + } + + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Constructor from \p short \p int data type, relies on \p + * __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const short int val) { + __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x; + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Constructor from \p int data type, relies on \p __NV_SATFINITE behavior + * for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const int val) { + __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x; + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Constructor from \p long \p long \p int data type, relies on \p + * __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8_e4m3(const long long int val) { + __x = static_cast<__nv_fp8_e4m3>(static_cast<float>(val)).__x; + } + +#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) + /* Widening FP converts */ + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Conversion operator to \p __half data type. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator __half() const { + return static_cast<__half>(__nv_cvt_fp8_to_halfraw(__x, __NV_E4M3)); + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Conversion operator to \p float data type. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator float() const { + return __internal_halfraw_to_float( + __nv_cvt_fp8_to_halfraw(__x, __NV_E4M3)); + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Conversion operator to \p __nv_bfloat16 data type. 
+ */ + explicit __CUDA_HOSTDEVICE_FP8__ operator __nv_bfloat16() const { + return static_cast<__nv_bfloat16>( + __internal_float_to_bf16raw_rz(float(*this))); + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Conversion operator to \p double data type. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator double() const { + return static_cast<double>(float(*this)); + } + + /* Convert to integral */ + + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Conversion operator to \p unsigned \p char data type. + * Clamps negative and too large inputs to the output range. + * \p NaN inputs convert to \p zero. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned char() const { + unsigned char i; + const float f = float(*this); + const unsigned char max_val = 0xFFU; + const unsigned char min_val = 0U; + const unsigned char bits = (*this).__x; + // saturation fixup + if ((bits & 0x7FU) == 0x7FU) { + // NaN + i = 0; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value + i = static_cast<unsigned char>(f); + } + return i; + } + + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Conversion operator to \p unsigned \p short \p int data type. + * Clamps negative inputs to zero. + * \p NaN inputs convert to \p zero. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned short int() const { + return __half2ushort_rz(__half(*this)); + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Conversion operator to \p unsigned \p int data type. + * Clamps negative inputs to zero. + * \p NaN inputs convert to \p zero. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned int() const { + return __half2uint_rz(__half(*this)); + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Conversion operator to \p unsigned \p long \p long \p int data type. + * Clamps negative inputs to zero. + * \p NaN inputs convert to \p 0x8000000000000000ULL. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator unsigned long long int() const { + return __half2ull_rz(__half(*this)); + } + + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Conversion operator to \p signed \p char data type. + * Clamps too large inputs to the output range. + * \p NaN inputs convert to \p zero. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator signed char() const { + signed char i; + const float f = float(*this); + const signed char max_val = (signed char)0x7FU; + const signed char min_val = (signed char)0x80U; + const unsigned char bits = (*this).__x; + // saturation fixup + if ((bits & 0x7FU) == 0x7FU) { + // NaN + i = 0; + } else if (f > static_cast<float>(max_val)) { + // saturate maximum + i = max_val; + } else if (f < static_cast<float>(min_val)) { + // saturate minimum + i = min_val; + } else { + // normal value + i = static_cast<signed char>(f); + } + return i; + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Conversion operator to \p short \p int data type. + * \p NaN inputs convert to \p zero. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator short int() const { + return __half2short_rz(__half(*this)); + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Conversion operator to \p int data type. + * \p NaN inputs convert to \p zero. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator int() const { + return __half2int_rz(__half(*this)); + } + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Conversion operator to \p long \p long \p int data type. + * \p NaN inputs convert to \p 0x8000000000000000LL. 
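The clamping behavior described for the narrowing integral operators can be exercised directly. A hedged host-side sketch (the signed/unsigned char paths above use plain float comparisons plus a NaN bit check, so no device code is needed):

#include <cuda_fp8.h>
#include <cstdio>

int main() {
    __nv_fp8_e4m3 neg(-3.5f);
    __nv_fp8_e4m3 big(448.0f);    // largest finite e4m3 value
    __nv_fp8_e4m3 nan_val;
    nan_val.__x = 0x7FU;          // e4m3 NaN encoding

    int u_neg = static_cast<unsigned char>(neg);      // 0   (negative clamps to the minimum)
    int u_big = static_cast<unsigned char>(big);      // 255 (clamps to the maximum)
    int u_nan = static_cast<unsigned char>(nan_val);  // 0   (NaN maps to zero)
    int s_big = static_cast<signed char>(big);        // 127

    std::printf("%d %d %d %d\n", u_neg, u_big, u_nan, s_big);
    return 0;
}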
+ */ + explicit __CUDA_HOSTDEVICE_FP8__ operator long long int() const { + return __half2ll_rz(__half(*this)); + } + + /** + * \ingroup CUDA_MATH_FP8_E4M3_STRUCT + * Conversion operator to \p bool data type. + * +0 and -0 inputs convert to \p false. + * Non-zero inputs convert to \p true. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator bool() const { + return (__x & 0x7FU) != 0U; + } +#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */ +#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */ +}; + +/** + * \defgroup CUDA_MATH_FP8X2_E4M3_STRUCT C++ struct for handling vector type of two fp8 values of e4m3 kind. + * \ingroup CUDA_MATH_INTRINSIC_FP8 + */ + +/** + * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT + * \brief __nv_fp8x2_e4m3 datatype + * + * \details This structure implements the datatype for storage + * and operations on the vector of two \p fp8 values of \p e4m3 kind each: + * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits. + * The encoding doesn't support Infinity. + * NaNs are limited to 0x7F and 0xFF values. + */ +struct __CUDA_ALIGN__(2) __nv_fp8x2_e4m3 { + public: + /** + * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT + * Storage variable contains the vector of two \p fp8 floating-point data + * values. + */ + __nv_fp8x2_storage_t __x; + + /** + * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT + * Constructor by default. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP8) + __nv_fp8x2_e4m3() = default; +#else + __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3() {} +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */ + +#if !defined(__CUDA_NO_FP8_CONVERSIONS__) + + /* Construct from wider types */ + + /** + * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT + * Constructor from \p __half2 data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __half2 f) { + __x = __nv_cvt_halfraw2_to_fp8x2(static_cast<__half2_raw>(f), + __NV_SATFINITE, __NV_E4M3); + } + /** + * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT + * Constructor from \p __nv_bfloat162 data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const __nv_bfloat162 f) { + __x = __nv_cvt_bfloat16raw2_to_fp8x2(static_cast<__nv_bfloat162_raw>(f), + __NV_SATFINITE, __NV_E4M3); + } + /** + * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT + * Constructor from \p float2 data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const float2 f) { + __x = __nv_cvt_float2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3); + } + /** + * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT + * Constructor from \p double2 data type, relies on \p __NV_SATFINITE + * behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x2_e4m3(const double2 f) { + __x = __nv_cvt_double2_to_fp8x2(f, __NV_SATFINITE, __NV_E4M3); + } + +#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) + /* Widening converts */ + /** + * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT + * Conversion operator to \p __half2 data type. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator __half2() const { + return static_cast<__half2>(__nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3)); + } + /** + * \ingroup CUDA_MATH_FP8X2_E4M3_STRUCT + * Conversion operator to \p float2 data type. 
+ */ + explicit __CUDA_HOSTDEVICE_FP8__ operator float2() const { + return __internal_halfraw2_to_float2( + __nv_cvt_fp8x2_to_halfraw2(__x, __NV_E4M3)); + } +#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */ +#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */ +}; + +/** + * \defgroup CUDA_MATH_FP8X4_E4M3_STRUCT C++ struct for handling vector type of four fp8 values of e4m3 kind. + * \ingroup CUDA_MATH_INTRINSIC_FP8 + */ + +/** + * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT + * \brief __nv_fp8x4_e4m3 datatype + * + * \details This structure implements the datatype for storage + * and operations on the vector of four \p fp8 values of \p e4m3 kind each: + * with 1 sign, 4 exponent, 1 implicit and 3 explicit mantissa bits. + * The encoding doesn't support Infinity. + * NaNs are limited to 0x7F and 0xFF values. + */ +struct __CUDA_ALIGN__(4) __nv_fp8x4_e4m3 { + public: + /** + * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT + * Storage variable contains the vector of four \p fp8 floating-point data + * values. + */ + __nv_fp8x4_storage_t __x; + + /** + * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT + * Constructor by default. + */ +#if defined(__CPP_VERSION_AT_LEAST_11_FP8) + __nv_fp8x4_e4m3() = default; +#else + __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3() {} +#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */ + +#if !defined(__CUDA_NO_FP8_CONVERSIONS__) + + /* Construct from wider types */ + + /** + * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT + * Constructor from a pair of \p __half2 data type values, + * relies on \p __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __half2 flo, + const __half2 fhi) { + const __nv_fp8x2_storage_t rlo = __nv_cvt_halfraw2_to_fp8x2( + static_cast<__half2_raw>(flo), __NV_SATFINITE, __NV_E4M3); + const __nv_fp8x2_storage_t rhi = __nv_cvt_halfraw2_to_fp8x2( + static_cast<__half2_raw>(fhi), __NV_SATFINITE, __NV_E4M3); + __x = __internal_pack_u16x2_to_u32(rlo, rhi); + } + /** + * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT + * Constructor from a pair of \p __nv_bfloat162 data type values, + * relies on \p __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const __nv_bfloat162 flo, + const __nv_bfloat162 fhi) { + const __nv_fp8x2_storage_t rlo = __nv_cvt_bfloat16raw2_to_fp8x2( + static_cast<__nv_bfloat162_raw>(flo), __NV_SATFINITE, __NV_E4M3); + const __nv_fp8x2_storage_t rhi = __nv_cvt_bfloat16raw2_to_fp8x2( + static_cast<__nv_bfloat162_raw>(fhi), __NV_SATFINITE, __NV_E4M3); + __x = __internal_pack_u16x2_to_u32(rlo, rhi); + } + /** + * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT + * Constructor from \p float4 vector data type, + * relies on \p __NV_SATFINITE behavior for out-of-range values. + */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const float4 f) { + const float2 flo = {f.x, f.y}; + const float2 fhi = {f.z, f.w}; + const __nv_fp8x2_storage_t rlo = + __nv_cvt_float2_to_fp8x2(flo, __NV_SATFINITE, __NV_E4M3); + const __nv_fp8x2_storage_t rhi = + __nv_cvt_float2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E4M3); + __x = __internal_pack_u16x2_to_u32(rlo, rhi); + } + /** + * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT + * Constructor from \p double4 vector data type, + * relies on \p __NV_SATFINITE behavior for out-of-range values. 
+ */ + explicit __CUDA_HOSTDEVICE_FP8__ __nv_fp8x4_e4m3(const double4 f) { + const double2 flo = {f.x, f.y}; + const double2 fhi = {f.z, f.w}; + const __nv_fp8x2_storage_t rlo = + __nv_cvt_double2_to_fp8x2(flo, __NV_SATFINITE, __NV_E4M3); + const __nv_fp8x2_storage_t rhi = + __nv_cvt_double2_to_fp8x2(fhi, __NV_SATFINITE, __NV_E4M3); + __x = __internal_pack_u16x2_to_u32(rlo, rhi); + } + +#if !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) + /* Widening converts */ + + /** + * \ingroup CUDA_MATH_FP8X4_E4M3_STRUCT + * Conversion operator to \p float4 vector data type. + */ + explicit __CUDA_HOSTDEVICE_FP8__ operator float4() const { + const __nv_fp8x2_storage_t slo = static_cast<__nv_fp8x2_storage_t>(__x); + const __nv_fp8x2_storage_t shi = + static_cast<__nv_fp8x2_storage_t>(__x >> 16U); + float2 rlo = __internal_halfraw2_to_float2( + __nv_cvt_fp8x2_to_halfraw2(slo, __NV_E4M3)); + float2 rhi = __internal_halfraw2_to_float2( + __nv_cvt_fp8x2_to_halfraw2(shi, __NV_E4M3)); + float4 res = {rlo.x, rlo.y, rhi.x, rhi.y}; + return res; + } +#endif /* !defined(__CUDA_NO_FP8_CONVERSION_OPERATORS__) */ +#endif /* !defined(__CUDA_NO_FP8_CONVERSIONS__) */ +}; + +#endif /* defined(__cplusplus) */ + +#endif /* end of include guard: __CUDA_FP8_HPP__ */ diff --git a/ext/cudart/include/cuda_gl_interop.h b/ext/cudart/include/cuda_gl_interop.h new file mode 100644 index 0000000000000000000000000000000000000000..68f6ac94f3b7912f42f01b775a102ac323427fe1 --- /dev/null +++ b/ext/cudart/include/cuda_gl_interop.h @@ -0,0 +1,508 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 
12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_GL_INTEROP_H__) +#define __CUDA_GL_INTEROP_H__ + +#include "cuda_runtime_api.h" + +#if defined(__APPLE__) + +#include <OpenGL/gl.h> + +#else /* __APPLE__ */ + +#if defined(__arm__) || defined(__aarch64__) +#ifndef GL_VERSION +#error Please include the appropriate gl headers before including cuda_gl_interop.h +#endif +#else +#include <GL/gl.h> +#endif + +#endif /* __APPLE__ */ + +/** \cond impl_private */ +#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif +/** \endcond impl_private */ + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +/** + * \addtogroup CUDART_OPENGL OpenGL Interoperability + * This section describes the OpenGL interoperability functions of the CUDA + * runtime application programming interface. Note that mapping of OpenGL + * resources is performed with the graphics API agnostic, resource mapping + * interface described in \ref CUDART_INTEROP "Graphics Interopability". + * + * @{ + */ + +/** + * CUDA devices corresponding to the current OpenGL context + */ +enum cudaGLDeviceList +{ + cudaGLDeviceListAll = 1, /**< The CUDA devices for all GPUs used by the current OpenGL context */ + cudaGLDeviceListCurrentFrame = 2, /**< The CUDA devices for the GPUs used by the current OpenGL context in its currently rendering frame */ + cudaGLDeviceListNextFrame = 3 /**< The CUDA devices for the GPUs to be used by the current OpenGL context in the next frame */ +}; + +/** + * \brief Gets the CUDA devices associated with the current OpenGL context + * + * Returns in \p *pCudaDeviceCount the number of CUDA-compatible devices + * corresponding to the current OpenGL context. Also returns in \p *pCudaDevices + * at most \p cudaDeviceCount of the CUDA-compatible devices corresponding to + * the current OpenGL context. If any of the GPUs being used by the current OpenGL + * context are not CUDA capable then the call will return ::cudaErrorNoDevice. + * + * \param pCudaDeviceCount - Returned number of CUDA devices corresponding to the + * current OpenGL context + * \param pCudaDevices - Returned CUDA devices corresponding to the current + * OpenGL context + * \param cudaDeviceCount - The size of the output device array \p pCudaDevices + * \param deviceList - The set of devices to return. This set may be + * ::cudaGLDeviceListAll for all devices, + * ::cudaGLDeviceListCurrentFrame for the devices used to + * render the current frame (in SLI), or + * ::cudaGLDeviceListNextFrame for the devices used to + * render the next frame (in SLI). + * + * \return + * ::cudaSuccess, + * ::cudaErrorNoDevice, + * ::cudaErrorInvalidGraphicsContext, + * ::cudaErrorUnknown + * + * \note This function is not supported on Mac OS X. 
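A brief usage sketch for the query declared just below (illustrative only; assumes an OpenGL context is current on the calling thread and the platform GL headers are set up as cuda_gl_interop.h requires):

#include <cuda_gl_interop.h>
#include <cstdio>

void print_gl_cuda_devices() {
    int devices[8];
    unsigned int count = 0;

    // Ask for every CUDA device backing the current OpenGL context.
    cudaError_t err = cudaGLGetDevices(&count, devices, 8, cudaGLDeviceListAll);
    if (err != cudaSuccess) {
        std::printf("cudaGLGetDevices failed: %s\n", cudaGetErrorString(err));
        return;
    }
    for (unsigned int i = 0; i < count && i < 8; ++i) {
        std::printf("GL context uses CUDA device %d\n", devices[i]);
    }
}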
+ * \notefnerr + * + * \sa + * ::cudaGraphicsUnregisterResource, + * ::cudaGraphicsMapResources, + * ::cudaGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsResourceGetMappedPointer, + * ::cuGLGetDevices + */ +extern __host__ cudaError_t CUDARTAPI cudaGLGetDevices(unsigned int *pCudaDeviceCount, int *pCudaDevices, unsigned int cudaDeviceCount, enum cudaGLDeviceList deviceList); + +/** + * \brief Register an OpenGL texture or renderbuffer object + * + * Registers the texture or renderbuffer object specified by \p image for access by CUDA. + * A handle to the registered object is returned as \p resource. + * + * \p target must match the type of the object, and must be one of ::GL_TEXTURE_2D, + * ::GL_TEXTURE_RECTANGLE, ::GL_TEXTURE_CUBE_MAP, ::GL_TEXTURE_3D, ::GL_TEXTURE_2D_ARRAY, + * or ::GL_RENDERBUFFER. + * + * The register flags \p flags specify the intended usage, as follows: + * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this + * resource will be used. It is therefore assumed that this resource will be + * read from and written to by CUDA. This is the default value. + * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA + * will not write to this resource. + * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that + * CUDA will not read from this resource and will write over the + * entire contents of the resource, so none of the data previously + * stored in the resource will be preserved. + * - ::cudaGraphicsRegisterFlagsSurfaceLoadStore: Specifies that CUDA will + * bind this resource to a surface reference. + * - ::cudaGraphicsRegisterFlagsTextureGather: Specifies that CUDA will perform + * texture gather operations on this resource. + * + * The following image formats are supported. For brevity's sake, the list is abbreviated. + * For ex., {GL_R, GL_RG} X {8, 16} would expand to the following 4 formats + * {GL_R8, GL_R16, GL_RG8, GL_RG16} : + * - GL_RED, GL_RG, GL_RGBA, GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY + * - {GL_R, GL_RG, GL_RGBA} X {8, 16, 16F, 32F, 8UI, 16UI, 32UI, 8I, 16I, 32I} + * - {GL_LUMINANCE, GL_ALPHA, GL_LUMINANCE_ALPHA, GL_INTENSITY} X + * {8, 16, 16F_ARB, 32F_ARB, 8UI_EXT, 16UI_EXT, 32UI_EXT, 8I_EXT, 16I_EXT, 32I_EXT} + * + * The following image classes are currently disallowed: + * - Textures with borders + * - Multisampled renderbuffers + * + * \param resource - Pointer to the returned object handle + * \param image - name of texture or renderbuffer object to be registered + * \param target - Identifies the type of object specified by \p image + * \param flags - Register flags + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsUnregisterResource, + * ::cudaGraphicsMapResources, + * ::cudaGraphicsSubResourceGetMappedArray, + * ::cuGraphicsGLRegisterImage + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterImage(struct cudaGraphicsResource **resource, GLuint image, GLenum target, unsigned int flags); + +/** + * \brief Registers an OpenGL buffer object + * + * Registers the buffer object specified by \p buffer for access by + * CUDA. A handle to the registered object is returned as \p + * resource. The register flags \p flags specify the intended usage, + * as follows: + * + * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this + * resource will be used. 
It is therefore assumed that this resource will be + * read from and written to by CUDA. This is the default value. + * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA + * will not write to this resource. + * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that + * CUDA will not read from this resource and will write over the + * entire contents of the resource, so none of the data previously + * stored in the resource will be preserved. + * + * \param resource - Pointer to the returned object handle + * \param buffer - name of buffer object to be registered + * \param flags - Register flags + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa + * ::cudaGraphicsUnregisterResource, + * ::cudaGraphicsMapResources, + * ::cudaGraphicsResourceGetMappedPointer, + * ::cuGraphicsGLRegisterBuffer + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsGLRegisterBuffer(struct cudaGraphicsResource **resource, GLuint buffer, unsigned int flags); + +#ifdef _WIN32 +#ifndef WGL_NV_gpu_affinity +typedef void* HGPUNV; +#endif + +/** + * \brief Gets the CUDA device associated with hGpu + * + * Returns the CUDA device associated with a hGpu, if applicable. + * + * \param device - Returns the device associated with hGpu, or -1 if hGpu is + * not a compute device. + * \param hGpu - Handle to a GPU, as queried via WGL_NV_gpu_affinity + * + * \return + * ::cudaSuccess + * \notefnerr + * + * \sa + * ::WGL_NV_gpu_affinity, + * ::cuWGLGetDevice + */ +extern __host__ cudaError_t CUDARTAPI cudaWGLGetDevice(int *device, HGPUNV hGpu); +#endif + +/** @} */ /* END CUDART_OPENGL */ + +/** + * \addtogroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED] + * This section describes deprecated OpenGL interoperability functionality. + * + * @{ + */ + +/** + * CUDA GL Map Flags + */ +enum cudaGLMapFlags +{ + cudaGLMapFlagsNone = 0, /**< Default; Assume resource can be read/written */ + cudaGLMapFlagsReadOnly = 1, /**< CUDA kernels will not write to this resource */ + cudaGLMapFlagsWriteDiscard = 2 /**< CUDA kernels will only write to and will not read from this resource */ +}; + +/** + * \brief Sets a CUDA device to use OpenGL interoperability + * + * \deprecated This function is deprecated as of CUDA 5.0. + * + * This function is deprecated and should no longer be used. It is + * no longer necessary to associate a CUDA device with an OpenGL + * context in order to achieve maximum interoperability performance. + * + * \param device - Device to use for OpenGL interoperability + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorSetOnActiveProcess + * \notefnerr + * + * \sa ::cudaGraphicsGLRegisterBuffer, ::cudaGraphicsGLRegisterImage + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetGLDevice(int device); + +/** + * \brief Registers a buffer object for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Registers the buffer object of ID \p bufObj for access by + * CUDA. This function must be called before CUDA can map the buffer + * object. The OpenGL context used to create the buffer, or another + * context from the same share group, must be bound to the current + * thread when this is called. 
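For contrast with the deprecated entry points documented from here on, a hedged sketch of the recommended flow built on cudaGraphicsGLRegisterBuffer (declared above) and the generic graphics-resource calls; `vbo`, the stream, and the kernel launch are placeholders, and error checking is omitted for brevity.

#include <cuda_gl_interop.h>

// `vbo` is assumed to be an existing OpenGL buffer object owned by the application.
void write_into_gl_buffer(GLuint vbo, cudaStream_t stream) {
    cudaGraphicsResource_t resource = nullptr;

    // Register once (typically at startup); CUDA will overwrite the whole buffer.
    cudaGraphicsGLRegisterBuffer(&resource, vbo, cudaGraphicsRegisterFlagsWriteDiscard);

    // Map for this frame and fetch a device pointer into the buffer's storage.
    void  *dev_ptr = nullptr;
    size_t size    = 0;
    cudaGraphicsMapResources(1, &resource, stream);
    cudaGraphicsResourceGetMappedPointer(&dev_ptr, &size, resource);

    // ... launch kernels that write to dev_ptr here ...

    // Unmap before OpenGL uses the buffer again; unregister at shutdown.
    cudaGraphicsUnmapResources(1, &resource, stream);
    cudaGraphicsUnregisterResource(resource);
}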
+ * + * \param bufObj - Buffer object ID to register + * + * \return + * ::cudaSuccess, + * ::cudaErrorInitializationError + * \notefnerr + * + * \sa ::cudaGraphicsGLRegisterBuffer + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLRegisterBufferObject(GLuint bufObj); + +/** + * \brief Maps a buffer object for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Maps the buffer object of ID \p bufObj into the address space of + * CUDA and returns in \p *devPtr the base pointer of the resulting + * mapping. The buffer must have previously been registered by + * calling ::cudaGLRegisterBufferObject(). While a buffer is mapped + * by CUDA, any OpenGL operation which references the buffer will + * result in undefined behavior. The OpenGL context used to create + * the buffer, or another context from the same share group, must be + * bound to the current thread when this is called. + * + * All streams in the current thread are synchronized with the current + * GL context. + * + * \param devPtr - Returned device pointer to CUDA object + * \param bufObj - Buffer object ID to map + * + * \return + * ::cudaSuccess, + * ::cudaErrorMapBufferObjectFailed + * \notefnerr + * + * \sa ::cudaGraphicsMapResources + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObject(void **devPtr, GLuint bufObj); + +/** + * \brief Unmaps a buffer object for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Unmaps the buffer object of ID \p bufObj for access by CUDA. When + * a buffer is unmapped, the base address returned by + * ::cudaGLMapBufferObject() is invalid and subsequent references to + * the address result in undefined behavior. The OpenGL context used + * to create the buffer, or another context from the same share group, + * must be bound to the current thread when this is called. + * + * All streams in the current thread are synchronized with the current + * GL context. + * + * \param bufObj - Buffer object to unmap + * + * \return + * ::cudaSuccess, + * ::cudaErrorUnmapBufferObjectFailed + * \notefnerr + * + * \sa ::cudaGraphicsUnmapResources + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObject(GLuint bufObj); + +/** + * \brief Unregisters a buffer object for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Unregisters the buffer object of ID \p bufObj for access by CUDA + * and releases any CUDA resources associated with the buffer. Once a + * buffer is unregistered, it may no longer be mapped by CUDA. The GL + * context used to create the buffer, or another context from the + * same share group, must be bound to the current thread when this is + * called. + * + * \param bufObj - Buffer object to unregister + * + * \return + * ::cudaSuccess + * \notefnerr + * + * \sa ::cudaGraphicsUnregisterResource + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnregisterBufferObject(GLuint bufObj); + +/** + * \brief Set usage flags for mapping an OpenGL buffer + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Set flags for mapping the OpenGL buffer \p bufObj + * + * Changes to flags will take effect the next time \p bufObj is mapped. + * The \p flags argument may be any of the following: + * + * - ::cudaGLMapFlagsNone: Specifies no hints about how this buffer will + * be used. It is therefore assumed that this buffer will be read from and + * written to by CUDA kernels. This is the default value. 
+ * - ::cudaGLMapFlagsReadOnly: Specifies that CUDA kernels which access this + * buffer will not write to the buffer. + * - ::cudaGLMapFlagsWriteDiscard: Specifies that CUDA kernels which access + * this buffer will not read from the buffer and will write over the + * entire contents of the buffer, so none of the data previously stored in + * the buffer will be preserved. + * + * If \p bufObj has not been registered for use with CUDA, then + * ::cudaErrorInvalidResourceHandle is returned. If \p bufObj is presently + * mapped for access by CUDA, then ::cudaErrorUnknown is returned. + * + * \param bufObj - Registered buffer object to set flags for + * \param flags - Parameters for buffer mapping + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * + * \sa ::cudaGraphicsResourceSetMapFlags + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLSetBufferObjectMapFlags(GLuint bufObj, unsigned int flags); + +/** + * \brief Maps a buffer object for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Maps the buffer object of ID \p bufObj into the address space of + * CUDA and returns in \p *devPtr the base pointer of the resulting + * mapping. The buffer must have previously been registered by + * calling ::cudaGLRegisterBufferObject(). While a buffer is mapped + * by CUDA, any OpenGL operation which references the buffer will + * result in undefined behavior. The OpenGL context used to create + * the buffer, or another context from the same share group, must be + * bound to the current thread when this is called. + * + * Stream /p stream is synchronized with the current GL context. + * + * \param devPtr - Returned device pointer to CUDA object + * \param bufObj - Buffer object ID to map + * \param stream - Stream to synchronize + * + * \return + * ::cudaSuccess, + * ::cudaErrorMapBufferObjectFailed + * \notefnerr + * + * \sa ::cudaGraphicsMapResources + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLMapBufferObjectAsync(void **devPtr, GLuint bufObj, cudaStream_t stream); + +/** + * \brief Unmaps a buffer object for access by CUDA + * + * \deprecated This function is deprecated as of CUDA 3.0. + * + * Unmaps the buffer object of ID \p bufObj for access by CUDA. When + * a buffer is unmapped, the base address returned by + * ::cudaGLMapBufferObject() is invalid and subsequent references to + * the address result in undefined behavior. The OpenGL context used + * to create the buffer, or another context from the same share group, + * must be bound to the current thread when this is called. + * + * Stream /p stream is synchronized with the current GL context. 
+ * + * \param bufObj - Buffer object to unmap + * \param stream - Stream to synchronize + * + * \return + * ::cudaSuccess, + * ::cudaErrorUnmapBufferObjectFailed + * \notefnerr + * + * \sa ::cudaGraphicsUnmapResources + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGLUnmapBufferObjectAsync(GLuint bufObj, cudaStream_t stream); + +/** @} */ /* END CUDART_OPENGL_DEPRECATED */ + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#undef __CUDA_DEPRECATED + +#endif /* __CUDA_GL_INTEROP_H__ */ + diff --git a/ext/cudart/include/cuda_occupancy.h b/ext/cudart/include/cuda_occupancy.h new file mode 100644 index 0000000000000000000000000000000000000000..ffe55709f8ccdebf7341180f043006b68c08e104 --- /dev/null +++ b/ext/cudart/include/cuda_occupancy.h @@ -0,0 +1,1958 @@ +/* + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +/** + * CUDA Occupancy Calculator + * + * NAME + * + * cudaOccMaxActiveBlocksPerMultiprocessor, + * cudaOccMaxPotentialOccupancyBlockSize, + * cudaOccMaxPotentialOccupancyBlockSizeVariableSMem + * cudaOccAvailableDynamicSMemPerBlock + * + * DESCRIPTION + * + * The CUDA occupancy calculator provides a standalone, programmatical + * interface to compute the occupancy of a function on a device. It can also + * provide occupancy-oriented launch configuration suggestions. + * + * The function and device are defined by the user through + * cudaOccFuncAttributes, cudaOccDeviceProp, and cudaOccDeviceState + * structures. All APIs require all 3 of them. + * + * See the structure definition for more details about the device / function + * descriptors. + * + * See each API's prototype for API usage. + * + * COMPATIBILITY + * + * The occupancy calculator will be updated on each major CUDA toolkit + * release. It does not provide forward compatibility, i.e. new hardwares + * released after this implementation's release will not be supported. + * + * NOTE + * + * If there is access to CUDA runtime, and the sole intent is to calculate + * occupancy related values on one of the accessible CUDA devices, using CUDA + * runtime's occupancy calculation APIs is recommended. + * + */ + +#ifndef __cuda_occupancy_h__ +#define __cuda_occupancy_h__ + +#include <stddef.h> +#include <limits.h> +#include <string.h> + + +// __OCC_INLINE will be undefined at the end of this header +// +#ifdef __CUDACC__ +#define __OCC_INLINE inline __host__ __device__ +#elif defined _MSC_VER +#define __OCC_INLINE __inline +#else // GNUCC assumed +#define __OCC_INLINE inline +#endif + +enum cudaOccError_enum { + CUDA_OCC_SUCCESS = 0, // no error encountered + CUDA_OCC_ERROR_INVALID_INPUT = 1, // input parameter is invalid + CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2, // requested device is not supported in + // current implementation or device is + // invalid +}; +typedef enum cudaOccError_enum cudaOccError; + +typedef struct cudaOccResult cudaOccResult; +typedef struct cudaOccDeviceProp cudaOccDeviceProp; +typedef struct cudaOccFuncAttributes cudaOccFuncAttributes; +typedef struct cudaOccDeviceState cudaOccDeviceState; + +/** + * The CUDA occupancy calculator computes the occupancy of the function + * described by attributes with the given block size (blockSize), static device + * properties (properties), dynamic device states (states) and per-block dynamic + * shared memory allocation (dynamicSMemSize) in bytes, and output it through + * result along with other useful information. The occupancy is computed in + * terms of the maximum number of active blocks per multiprocessor. The user can + * then convert it to other metrics, such as number of active warps. + * + * RETURN VALUE + * + * The occupancy and related information is returned through result. + * + * If result->activeBlocksPerMultiprocessor is 0, then the given parameter + * combination cannot run on the device. + * + * ERRORS + * + * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid. 
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in + * current implementation or device is invalid + */ +static __OCC_INLINE +cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor( + cudaOccResult *result, // out + const cudaOccDeviceProp *properties, // in + const cudaOccFuncAttributes *attributes, // in + const cudaOccDeviceState *state, // in + int blockSize, // in + size_t dynamicSmemSize); // in + +/** + * The CUDA launch configurator C API suggests a grid / block size pair (in + * minGridSize and blockSize) that achieves the best potential occupancy + * (i.e. maximum number of active warps with the smallest number of blocks) for + * the given function described by attributes, on a device described by + * properties with settings in state. + * + * If per-block dynamic shared memory allocation is not needed, the user should + * leave both blockSizeToDynamicSMemSize and dynamicSMemSize as 0. + * + * If per-block dynamic shared memory allocation is needed, then if the dynamic + * shared memory size is constant regardless of block size, the size should be + * passed through dynamicSMemSize, and blockSizeToDynamicSMemSize should be + * NULL. + * + * Otherwise, if the per-block dynamic shared memory size varies with different + * block sizes, the user needs to provide a pointer to an unary function through + * blockSizeToDynamicSMemSize that computes the dynamic shared memory needed by + * a block of the function for any given block size. dynamicSMemSize is + * ignored. An example signature is: + * + * // Take block size, returns dynamic shared memory needed + * size_t blockToSmem(int blockSize); + * + * RETURN VALUE + * + * The suggested block size and the minimum number of blocks needed to achieve + * the maximum occupancy are returned through blockSize and minGridSize. + * + * If *blockSize is 0, then the given combination cannot run on the device. + * + * ERRORS + * + * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid. + * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in + * current implementation or device is invalid + * + */ +static __OCC_INLINE +cudaOccError cudaOccMaxPotentialOccupancyBlockSize( + int *minGridSize, // out + int *blockSize, // out + const cudaOccDeviceProp *properties, // in + const cudaOccFuncAttributes *attributes, // in + const cudaOccDeviceState *state, // in + size_t (*blockSizeToDynamicSMemSize)(int), // in + size_t dynamicSMemSize); // in + +/** + * The CUDA launch configurator C++ API suggests a grid / block size pair (in + * minGridSize and blockSize) that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number of blocks) + * for the given function described by attributes, on a device described by + * properties with settings in state. + * + * If per-block dynamic shared memory allocation is 0 or constant regardless of + * block size, the user can use cudaOccMaxPotentialOccupancyBlockSize to + * configure the launch. A constant dynamic shared memory allocation size in + * bytes can be passed through dynamicSMemSize. + * + * Otherwise, if the per-block dynamic shared memory size varies with different + * block sizes, the user needs to use + * cudaOccMaxPotentialOccupancyBlockSizeVariableSmem instead, and provide a + * functor / pointer to an unary function (blockSizeToDynamicSMemSize) that + * computes the dynamic shared memory needed by func for any given block + * size. 
An example signature is: + * + * // Take block size, returns per-block dynamic shared memory needed + * size_t blockToSmem(int blockSize); + * + * RETURN VALUE + * + * The suggested block size and the minimum number of blocks needed to achieve + * the maximum occupancy are returned through blockSize and minGridSize. + * + * If *blockSize is 0, then the given combination cannot run on the device. + * + * ERRORS + * + * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid. + * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in + * current implementation or device is invalid + * + */ + +#if defined(__cplusplus) +namespace { + +__OCC_INLINE +cudaOccError cudaOccMaxPotentialOccupancyBlockSize( + int *minGridSize, // out + int *blockSize, // out + const cudaOccDeviceProp *properties, // in + const cudaOccFuncAttributes *attributes, // in + const cudaOccDeviceState *state, // in + size_t dynamicSMemSize = 0); // in + +template <typename UnaryFunction> +__OCC_INLINE +cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem( + int *minGridSize, // out + int *blockSize, // out + const cudaOccDeviceProp *properties, // in + const cudaOccFuncAttributes *attributes, // in + const cudaOccDeviceState *state, // in + UnaryFunction blockSizeToDynamicSMemSize); // in + +} // namespace anonymous +#endif // defined(__cplusplus) + +/** + * + * The CUDA dynamic shared memory calculator computes the maximum size of + * per-block dynamic shared memory if we want to place numBlocks blocks + * on an SM. + * + * RETURN VALUE + * + * Returns in *dynamicSmemSize the maximum size of dynamic shared memory to allow + * numBlocks blocks per SM. + * + * ERRORS + * + * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid. + * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in + * current implementation or device is invalid + * + */ +static __OCC_INLINE +cudaOccError cudaOccAvailableDynamicSMemPerBlock( + size_t *dynamicSmemSize, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + int numBlocks, + int blockSize); + +/** + * Data structures + * + * These structures are subject to change for future architecture and CUDA + * releases. C users should initialize the structure as {0}. + * + */ + +/** + * Device descriptor + * + * This structure describes a device. + */ +struct cudaOccDeviceProp { + int computeMajor; // Compute capability major version + int computeMinor; // Compute capability minor + // version. None supported minor version + // may cause error + int maxThreadsPerBlock; // Maximum number of threads per block + int maxThreadsPerMultiprocessor; // Maximum number of threads per SM + // i.e. (Max. number of warps) x (warp + // size) + int regsPerBlock; // Maximum number of registers per block + int regsPerMultiprocessor; // Maximum number of registers per SM + int warpSize; // Warp size + size_t sharedMemPerBlock; // Maximum shared memory size per block + size_t sharedMemPerMultiprocessor; // Maximum shared memory size per SM + int numSms; // Number of SMs available + size_t sharedMemPerBlockOptin; // Maximum optin shared memory size per block + size_t reservedSharedMemPerBlock; // Shared memory per block reserved by driver + +#ifdef __cplusplus + // This structure can be converted from a cudaDeviceProp structure for users + // that use this header in their CUDA applications. 
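Pulling the pieces together, a minimal sketch (illustrative; assumes nvcc, access to the CUDA runtime, and a placeholder kernel `my_kernel`) fills the descriptors from the runtime structures via the converting constructors in this header and queries occupancy for a 256-thread block:

#include <cuda_runtime.h>
#include <cuda_occupancy.h>
#include <cstdio>

__global__ void my_kernel() {}   // placeholder kernel

int main() {
    cudaDeviceProp prop;
    cudaFuncAttributes attr;
    cudaGetDeviceProperties(&prop, 0);
    cudaFuncGetAttributes(&attr, my_kernel);

    // The converting constructors translate the runtime structures.
    cudaOccDeviceProp     occProp = prop;
    cudaOccFuncAttributes occAttr = attr;
    cudaOccDeviceState    occState;          // default cache / carveout settings

    cudaOccResult result;
    cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor(
        &result, &occProp, &occAttr, &occState,
        /*blockSize=*/256, /*dynamicSmemSize=*/0);

    if (status == CUDA_OCC_SUCCESS) {
        std::printf("active blocks per SM: %d\n",
                    result.activeBlocksPerMultiprocessor);
    }
    return 0;
}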
+ // + // If the application have access to the CUDA Runtime API, the application + // can obtain the device properties of a CUDA device through + // cudaGetDeviceProperties, and initialize a cudaOccDeviceProp with the + // cudaDeviceProp structure. + // + // Example: + /* + { + cudaDeviceProp prop; + + cudaGetDeviceProperties(&prop, ...); + + cudaOccDeviceProp occProp = prop; + + ... + + cudaOccMaxPotentialOccupancyBlockSize(..., &occProp, ...); + } + */ + // + template<typename DeviceProp> + __OCC_INLINE + cudaOccDeviceProp(const DeviceProp &props) + : computeMajor (props.major), + computeMinor (props.minor), + maxThreadsPerBlock (props.maxThreadsPerBlock), + maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor), + regsPerBlock (props.regsPerBlock), + regsPerMultiprocessor (props.regsPerMultiprocessor), + warpSize (props.warpSize), + sharedMemPerBlock (props.sharedMemPerBlock), + sharedMemPerMultiprocessor (props.sharedMemPerMultiprocessor), + numSms (props.multiProcessorCount), + sharedMemPerBlockOptin (props.sharedMemPerBlockOptin), + reservedSharedMemPerBlock (props.reservedSharedMemPerBlock) + {} + + __OCC_INLINE + cudaOccDeviceProp() + : computeMajor (0), + computeMinor (0), + maxThreadsPerBlock (0), + maxThreadsPerMultiprocessor (0), + regsPerBlock (0), + regsPerMultiprocessor (0), + warpSize (0), + sharedMemPerBlock (0), + sharedMemPerMultiprocessor (0), + numSms (0), + sharedMemPerBlockOptin (0), + reservedSharedMemPerBlock (0) + {} +#endif // __cplusplus +}; + +/** + * Partitioned global caching option + */ +typedef enum cudaOccPartitionedGCConfig_enum { + PARTITIONED_GC_OFF, // Disable partitioned global caching + PARTITIONED_GC_ON, // Prefer partitioned global caching + PARTITIONED_GC_ON_STRICT // Force partitioned global caching +} cudaOccPartitionedGCConfig; + +/** + * Per function opt in maximum dynamic shared memory limit + */ +typedef enum cudaOccFuncShmemConfig_enum { + FUNC_SHMEM_LIMIT_DEFAULT, // Default shmem limit + FUNC_SHMEM_LIMIT_OPTIN, // Use the optin shmem limit +} cudaOccFuncShmemConfig; + +/** + * Function descriptor + * + * This structure describes a CUDA function. + */ +struct cudaOccFuncAttributes { + int maxThreadsPerBlock; // Maximum block size the function can work with. If + // unlimited, use INT_MAX or any value greater than + // or equal to maxThreadsPerBlock of the device + int numRegs; // Number of registers used. When the function is + // launched on device, the register count may change + // due to internal tools requirements. + size_t sharedSizeBytes; // Number of static shared memory used + + cudaOccPartitionedGCConfig partitionedGCConfig; + // Partitioned global caching is required to enable + // caching on certain chips, such as sm_52 + // devices. Partitioned global caching can be + // automatically disabled if the occupancy + // requirement of the launch cannot support caching. + // + // To override this behavior with caching on and + // calculate occupancy strictly according to the + // preference, set partitionedGCConfig to + // PARTITIONED_GC_ON_STRICT. This is especially + // useful for experimenting and finding launch + // configurations (MaxPotentialOccupancyBlockSize) + // that allow global caching to take effect. + // + // This flag only affects the occupancy calculation. 
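The opt-in limit documented just below (shmemLimitConfig together with maxDynamicSharedSizeBytes) mirrors the per-kernel attribute an application would set through the runtime. A hedged sketch using the runtime-API counterpart of the cuFuncSetAttribute call mentioned below, with `my_kernel` as a placeholder and 96 KB as an assumed request on a device that supports it:

#include <cuda_runtime.h>

__global__ void my_kernel() {}   // placeholder kernel

void opt_into_large_dynamic_smem() {
    // Raise the per-launch dynamic shared memory cap for this kernel; the
    // occupancy calculator sees the same limit through FUNC_SHMEM_LIMIT_OPTIN
    // and cudaOccFuncAttributes::maxDynamicSharedSizeBytes.
    cudaFuncSetAttribute(my_kernel,
                         cudaFuncAttributeMaxDynamicSharedMemorySize,
                         96 * 1024);

    // Subsequent launches may then request up to 96 KB:
    // my_kernel<<<grid, block, 96 * 1024>>>();
}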
+ + cudaOccFuncShmemConfig shmemLimitConfig; + // Certain chips like sm_70 allow a user to opt into + // a higher per block limit of dynamic shared memory + // This optin is performed on a per function basis + // using the cuFuncSetAttribute function + + size_t maxDynamicSharedSizeBytes; + // User set limit on maximum dynamic shared memory + // usable by the kernel + // This limit is set using the cuFuncSetAttribute + // function. + + int numBlockBarriers; // Number of block barriers used (default to 1) +#ifdef __cplusplus + // This structure can be converted from a cudaFuncAttributes structure for + // users that use this header in their CUDA applications. + // + // If the application have access to the CUDA Runtime API, the application + // can obtain the function attributes of a CUDA kernel function through + // cudaFuncGetAttributes, and initialize a cudaOccFuncAttributes with the + // cudaFuncAttributes structure. + // + // Example: + /* + __global__ void foo() {...} + + ... + + { + cudaFuncAttributes attr; + + cudaFuncGetAttributes(&attr, foo); + + cudaOccFuncAttributes occAttr = attr; + + ... + + cudaOccMaxPotentialOccupancyBlockSize(..., &occAttr, ...); + } + */ + // + template<typename FuncAttributes> + __OCC_INLINE + cudaOccFuncAttributes(const FuncAttributes &attr) + : maxThreadsPerBlock (attr.maxThreadsPerBlock), + numRegs (attr.numRegs), + sharedSizeBytes (attr.sharedSizeBytes), + partitionedGCConfig (PARTITIONED_GC_OFF), + shmemLimitConfig (FUNC_SHMEM_LIMIT_OPTIN), + maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes), + numBlockBarriers (1) + {} + + __OCC_INLINE + cudaOccFuncAttributes() + : maxThreadsPerBlock (0), + numRegs (0), + sharedSizeBytes (0), + partitionedGCConfig (PARTITIONED_GC_OFF), + shmemLimitConfig (FUNC_SHMEM_LIMIT_DEFAULT), + maxDynamicSharedSizeBytes (0), + numBlockBarriers (0) + {} +#endif +}; + +typedef enum cudaOccCacheConfig_enum { + CACHE_PREFER_NONE = 0x00, // no preference for shared memory or L1 (default) + CACHE_PREFER_SHARED = 0x01, // prefer larger shared memory and smaller L1 cache + CACHE_PREFER_L1 = 0x02, // prefer larger L1 cache and smaller shared memory + CACHE_PREFER_EQUAL = 0x03 // prefer equal sized L1 cache and shared memory +} cudaOccCacheConfig; + +typedef enum cudaOccCarveoutConfig_enum { + SHAREDMEM_CARVEOUT_DEFAULT = -1, // no preference for shared memory or L1 (default) + SHAREDMEM_CARVEOUT_MAX_SHARED = 100, // prefer maximum available shared memory, minimum L1 cache + SHAREDMEM_CARVEOUT_MAX_L1 = 0, // prefer maximum available L1 cache, minimum shared memory + SHAREDMEM_CARVEOUT_HALF = 50 // prefer half of maximum available shared memory, with the rest as L1 cache +} cudaOccCarveoutConfig; + +/** + * Device state descriptor + * + * This structure describes device settings that affect occupancy calculation. + */ +struct cudaOccDeviceState +{ + // Cache / shared memory split preference. Deprecated on Volta + cudaOccCacheConfig cacheConfig; + // Shared memory / L1 split preference. 
Supported on only Volta + int carveoutConfig; + +#ifdef __cplusplus + __OCC_INLINE + cudaOccDeviceState() + : cacheConfig (CACHE_PREFER_NONE), + carveoutConfig (SHAREDMEM_CARVEOUT_DEFAULT) + {} +#endif +}; + +typedef enum cudaOccLimitingFactor_enum { + // Occupancy limited due to: + OCC_LIMIT_WARPS = 0x01, // - warps available + OCC_LIMIT_REGISTERS = 0x02, // - registers available + OCC_LIMIT_SHARED_MEMORY = 0x04, // - shared memory available + OCC_LIMIT_BLOCKS = 0x08, // - blocks available + OCC_LIMIT_BARRIERS = 0x10 // - barrier available +} cudaOccLimitingFactor; + +/** + * Occupancy output + * + * This structure contains occupancy calculator's output. + */ +struct cudaOccResult { + int activeBlocksPerMultiprocessor; // Occupancy + unsigned int limitingFactors; // Factors that limited occupancy. A bit + // field that counts the limiting + // factors, see cudaOccLimitingFactor + int blockLimitRegs; // Occupancy due to register + // usage, INT_MAX if the kernel does not + // use any register. + int blockLimitSharedMem; // Occupancy due to shared memory + // usage, INT_MAX if the kernel does not + // use shared memory. + int blockLimitWarps; // Occupancy due to block size limit + int blockLimitBlocks; // Occupancy due to maximum number of blocks + // managable per SM + int blockLimitBarriers; // Occupancy due to block barrier usage + int allocatedRegistersPerBlock; // Actual number of registers allocated per + // block + size_t allocatedSharedMemPerBlock; // Actual size of shared memory allocated + // per block + cudaOccPartitionedGCConfig partitionedGCConfig; + // Report if partitioned global caching + // is actually enabled. +}; + +/** + * Partitioned global caching support + * + * See cudaOccPartitionedGlobalCachingModeSupport + */ +typedef enum cudaOccPartitionedGCSupport_enum { + PARTITIONED_GC_NOT_SUPPORTED, // Partitioned global caching is not supported + PARTITIONED_GC_SUPPORTED, // Partitioned global caching is supported +} cudaOccPartitionedGCSupport; + +/** + * Implementation + */ + +/** + * Max compute capability supported + */ +#define __CUDA_OCC_MAJOR__ 9 +#define __CUDA_OCC_MINOR__ 0 + +////////////////////////////////////////// +// Mathematical Helper Functions // +////////////////////////////////////////// + +static __OCC_INLINE int __occMin(int lhs, int rhs) +{ + return rhs < lhs ? 
rhs : lhs; +} + +static __OCC_INLINE int __occDivideRoundUp(int x, int y) +{ + return (x + (y - 1)) / y; +} + +static __OCC_INLINE int __occRoundUp(int x, int y) +{ + return y * __occDivideRoundUp(x, y); +} + +////////////////////////////////////////// +// Architectural Properties // +////////////////////////////////////////// + +/** + * Granularity of shared memory allocation + */ +static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties) +{ + int value; + + switch(properties->computeMajor) { + case 3: + case 5: + case 6: + case 7: + value = 256; + break; + case 8: + case 9: + value = 128; + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + *limit = value; + + return CUDA_OCC_SUCCESS; +} + +/** + * Maximum number of registers per thread + */ +static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties) +{ + int value; + + switch(properties->computeMajor) { + case 3: + case 5: + case 6: + value = 255; + break; + case 7: + case 8: + case 9: + value = 256; + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + *limit = value; + + return CUDA_OCC_SUCCESS; +} + +/** + * Granularity of register allocation + */ +static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties) +{ + int value; + + switch(properties->computeMajor) { + case 3: + case 5: + case 6: + case 7: + case 8: + case 9: + value = 256; + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + *limit = value; + + return CUDA_OCC_SUCCESS; +} + +/** + * Number of sub-partitions + */ +static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties) +{ + int value; + + switch(properties->computeMajor) { + case 3: + case 5: + case 7: + case 8: + case 9: + value = 4; + break; + case 6: + value = properties->computeMinor ? 4 : 2; + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + *limit = value; + + return CUDA_OCC_SUCCESS; +} + + +/** + * Maximum number of blocks that can run simultaneously on a multiprocessor + */ +static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties) +{ + int value; + + switch(properties->computeMajor) { + case 3: + value = 16; + break; + case 5: + case 6: + value = 32; + break; + case 7: { + int isTuring = properties->computeMinor == 5; + value = (isTuring) ? 16 : 32; + break; + } + case 8: + if (properties->computeMinor == 0) { + value = 32; + } + else if (properties->computeMinor == 9) { + value = 24; + } + else { + value = 16; + } + break; + case 9: + value = 32; + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + *limit = value; + + return CUDA_OCC_SUCCESS; +} + +/** + * Align up shared memory based on compute major configurations + */ +static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties) +{ + // Volta and Turing have shared L1 cache / shared memory, and support cache + // configuration to trade one for the other. These values are needed to + // map carveout config ratio to the next available architecture size + size_t size = *shMemSize; + + switch (properties->computeMajor) { + case 7: { + // Turing supports 32KB and 64KB shared mem. 
+ int isTuring = properties->computeMinor == 5; + if (isTuring) { + if (size <= 32 * 1024) { + *shMemSize = 32 * 1024; + } + else if (size <= 64 * 1024) { + *shMemSize = 64 * 1024; + } + else { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + } + // Volta supports 0KB, 8KB, 16KB, 32KB, 64KB, and 96KB shared mem. + else { + if (size == 0) { + *shMemSize = 0; + } + else if (size <= 8 * 1024) { + *shMemSize = 8 * 1024; + } + else if (size <= 16 * 1024) { + *shMemSize = 16 * 1024; + } + else if (size <= 32 * 1024) { + *shMemSize = 32 * 1024; + } + else if (size <= 64 * 1024) { + *shMemSize = 64 * 1024; + } + else if (size <= 96 * 1024) { + *shMemSize = 96 * 1024; + } + else { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + } + break; + } + case 8: + if (properties->computeMinor == 0 || properties->computeMinor == 7) { + if (size == 0) { + *shMemSize = 0; + } + else if (size <= 8 * 1024) { + *shMemSize = 8 * 1024; + } + else if (size <= 16 * 1024) { + *shMemSize = 16 * 1024; + } + else if (size <= 32 * 1024) { + *shMemSize = 32 * 1024; + } + else if (size <= 64 * 1024) { + *shMemSize = 64 * 1024; + } + else if (size <= 100 * 1024) { + *shMemSize = 100 * 1024; + } + else if (size <= 132 * 1024) { + *shMemSize = 132 * 1024; + } + else if (size <= 164 * 1024) { + *shMemSize = 164 * 1024; + } + else { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + } + else { + if (size == 0) { + *shMemSize = 0; + } + else if (size <= 8 * 1024) { + *shMemSize = 8 * 1024; + } + else if (size <= 16 * 1024) { + *shMemSize = 16 * 1024; + } + else if (size <= 32 * 1024) { + *shMemSize = 32 * 1024; + } + else if (size <= 64 * 1024) { + *shMemSize = 64 * 1024; + } + else if (size <= 100 * 1024) { + *shMemSize = 100 * 1024; + } + else { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + } + break; + case 9: { + if (size == 0) { + *shMemSize = 0; + } + else if (size <= 8 * 1024) { + *shMemSize = 8 * 1024; + } + else if (size <= 16 * 1024) { + *shMemSize = 16 * 1024; + } + else if (size <= 32 * 1024) { + *shMemSize = 32 * 1024; + } + else if (size <= 64 * 1024) { + *shMemSize = 64 * 1024; + } + else if (size <= 100 * 1024) { + *shMemSize = 100 * 1024; + } + else if (size <= 132 * 1024) { + *shMemSize = 132 * 1024; + } + else if (size <= 164 * 1024) { + *shMemSize = 164 * 1024; + } + else if (size <= 196 * 1024) { + *shMemSize = 196 * 1024; + } + else if (size <= 228 * 1024) { + *shMemSize = 228 * 1024; + } + else { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + break; + } + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + return CUDA_OCC_SUCCESS; +} + +/** + * Shared memory based on the new carveoutConfig API introduced with Volta + */ +static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + size_t preferenceShmemSize; + + // CUDA 9.0 introduces a new API to set shared memory - L1 configuration on supported + // devices. This preference will take precedence over the older cacheConfig setting. + // Map cacheConfig to its effective preference value. 
+ int effectivePreference = state->carveoutConfig; + if ((effectivePreference < SHAREDMEM_CARVEOUT_DEFAULT) || (effectivePreference > SHAREDMEM_CARVEOUT_MAX_SHARED)) { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + + if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) { + switch (state->cacheConfig) + { + case CACHE_PREFER_L1: + effectivePreference = SHAREDMEM_CARVEOUT_MAX_L1; + break; + case CACHE_PREFER_SHARED: + effectivePreference = SHAREDMEM_CARVEOUT_MAX_SHARED; + break; + case CACHE_PREFER_EQUAL: + effectivePreference = SHAREDMEM_CARVEOUT_HALF; + break; + default: + effectivePreference = SHAREDMEM_CARVEOUT_DEFAULT; + break; + } + } + + if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) { + preferenceShmemSize = properties->sharedMemPerMultiprocessor; + } + else { + preferenceShmemSize = (size_t) (effectivePreference * properties->sharedMemPerMultiprocessor) / 100; + } + + status = cudaOccAlignUpShmemSizeVoltaPlus(&preferenceShmemSize, properties); + *limit = preferenceShmemSize; + return status; +} + +/** + * Shared memory based on the cacheConfig + */ +static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state) +{ + size_t bytes = 0; + size_t sharedMemPerMultiprocessorHigh = properties->sharedMemPerMultiprocessor; + cudaOccCacheConfig cacheConfig = state->cacheConfig; + + // Kepler has shared L1 cache / shared memory, and support cache + // configuration to trade one for the other. These values are needed to + // calculate the correct shared memory size for user requested cache + // configuration. + // + size_t minCacheSize = 16384; + size_t maxCacheSize = 49152; + size_t cacheAndSharedTotal = sharedMemPerMultiprocessorHigh + minCacheSize; + size_t sharedMemPerMultiprocessorLow = cacheAndSharedTotal - maxCacheSize; + + switch (properties->computeMajor) { + case 3: + // Kepler supports 16KB, 32KB, or 48KB partitions for L1. The rest + // is shared memory. + // + switch (cacheConfig) { + default : + case CACHE_PREFER_NONE: + case CACHE_PREFER_SHARED: + bytes = sharedMemPerMultiprocessorHigh; + break; + case CACHE_PREFER_L1: + bytes = sharedMemPerMultiprocessorLow; + break; + case CACHE_PREFER_EQUAL: + // Equal is the mid-point between high and low. It should be + // equivalent to low + 16KB. + // + bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2; + break; + } + break; + case 5: + case 6: + // Maxwell and Pascal have dedicated shared memory. + // + bytes = sharedMemPerMultiprocessorHigh; + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + *limit = bytes; + + return CUDA_OCC_SUCCESS; +} + +/** + * Shared memory based on config requested by User + */ +static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state) +{ + // Volta introduces a new API that allows for shared memory carveout preference. Because it is a shared memory preference, + // it is handled separately from the cache config preference. 
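+    // [Editorial annotation, illustrative example; not part of the upstream NVIDIA
+    // header] For the pre-Volta path taken below: on a Kepler SM with
+    // sharedMemPerMultiprocessor = 48 KB, cudaOccSMemPreference returns 16 KB for
+    // CACHE_PREFER_L1, 32 KB for CACHE_PREFER_EQUAL, and the full 48 KB for
+    // CACHE_PREFER_SHARED.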
+ if (properties->computeMajor >= 7) { + return cudaOccSMemPreferenceVoltaPlus(limit, properties, state); + } + return cudaOccSMemPreference(limit, properties, state); +} + +/** + * Return the per block shared memory limit based on function config + */ +static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta) +{ + switch (properties->computeMajor) { + case 2: + case 3: + case 4: + case 5: + case 6: + *limit = properties->sharedMemPerBlock; + break; + case 7: + case 8: + case 9: + switch (shmemLimitConfig) { + default: + case FUNC_SHMEM_LIMIT_DEFAULT: + *limit = properties->sharedMemPerBlock; + break; + case FUNC_SHMEM_LIMIT_OPTIN: + if (smemPerCta > properties->sharedMemPerBlock) { + *limit = properties->sharedMemPerBlockOptin; + } + else { + *limit = properties->sharedMemPerBlock; + } + break; + } + break; + default: + return CUDA_OCC_ERROR_UNKNOWN_DEVICE; + } + + // Starting Ampere, CUDA driver reserves additional shared memory per block + if (properties->computeMajor >= 8) { + *limit += properties->reservedSharedMemPerBlock; + } + + return CUDA_OCC_SUCCESS; +} + +/** + * Partitioned global caching mode support + */ +static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties) +{ + *limit = PARTITIONED_GC_NOT_SUPPORTED; + + if ((properties->computeMajor == 5 && (properties->computeMinor == 2 || properties->computeMinor == 3)) || + properties->computeMajor == 6) { + *limit = PARTITIONED_GC_SUPPORTED; + } + + if (properties->computeMajor == 6 && properties->computeMinor == 0) { + *limit = PARTITIONED_GC_NOT_SUPPORTED; + } + + return CUDA_OCC_SUCCESS; +} + +/////////////////////////////////////////////// +// User Input Sanity // +/////////////////////////////////////////////// + +static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties) +{ + // Verify device properties + // + // Each of these limits must be a positive number. + // + // Compute capacity is checked during the occupancy calculation + // + if (properties->maxThreadsPerBlock <= 0 || + properties->maxThreadsPerMultiprocessor <= 0 || + properties->regsPerBlock <= 0 || + properties->regsPerMultiprocessor <= 0 || + properties->warpSize <= 0 || + properties->sharedMemPerBlock <= 0 || + properties->sharedMemPerMultiprocessor <= 0 || + properties->numSms <= 0) { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + + return CUDA_OCC_SUCCESS; +} + +static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes) +{ + // Verify function attributes + // + if (attributes->maxThreadsPerBlock <= 0 || + attributes->numRegs < 0) { // Compiler may choose not to use + // any register (empty kernels, + // etc.) 
+ return CUDA_OCC_ERROR_INVALID_INPUT; + } + + return CUDA_OCC_SUCCESS; +} + +static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state) +{ + (void)state; // silence unused-variable warning + // Placeholder + // + + return CUDA_OCC_SUCCESS; +} + +static __OCC_INLINE cudaOccError cudaOccInputCheck( + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + + status = cudaOccDevicePropCheck(properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + status = cudaOccFuncAttributesCheck(attributes); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + status = cudaOccDeviceStateCheck(state); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + return status; +} + +/////////////////////////////////////////////// +// Occupancy calculation Functions // +/////////////////////////////////////////////// + +static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected( + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes) +{ + cudaOccPartitionedGCSupport gcSupport; + cudaOccPartitionedGCConfig gcConfig; + + cudaOccPartitionedGlobalCachingModeSupport(&gcSupport, properties); + + gcConfig = attributes->partitionedGCConfig; + + if (gcSupport == PARTITIONED_GC_NOT_SUPPORTED) { + gcConfig = PARTITIONED_GC_OFF; + } + + return gcConfig; +} + +// Warp limit +// +static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit( + int *limit, + cudaOccPartitionedGCConfig gcConfig, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + int blockSize) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + int maxWarpsPerSm; + int warpsAllocatedPerCTA; + int maxBlocks; + (void)attributes; // silence unused-variable warning + + if (blockSize > properties->maxThreadsPerBlock) { + maxBlocks = 0; + } + else { + maxWarpsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize; + warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize); + maxBlocks = 0; + + if (gcConfig != PARTITIONED_GC_OFF) { + int maxBlocksPerSmPartition; + int maxWarpsPerSmPartition; + + // If partitioned global caching is on, then a CTA can only use a SM + // partition (a half SM), and thus a half of the warp slots + // available per SM + // + maxWarpsPerSmPartition = maxWarpsPerSm / 2; + maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA; + maxBlocks = maxBlocksPerSmPartition * 2; + } + // On hardware that supports partitioned global caching, each half SM is + // guaranteed to support at least 32 warps (maximum number of warps of a + // CTA), so caching will not cause 0 occupancy due to insufficient warp + // allocation slots. 
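+        // [Editorial annotation, illustrative example; not part of the upstream
+        // NVIDIA header] With maxThreadsPerMultiprocessor = 2048 and warpSize = 32,
+        // maxWarpsPerSm = 64. A 256-thread block allocates 8 warps, so with
+        // partitioned global caching on, each half-SM fits 32 / 8 = 4 blocks and
+        // maxBlocks = 4 * 2 = 8.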
+ // + else { + maxBlocks = maxWarpsPerSm / warpsAllocatedPerCTA; + } + } + + *limit = maxBlocks; + + return status; +} + +// Shared memory limit +// +static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit( + int *limit, + cudaOccResult *result, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + int blockSize, + size_t dynamicSmemSize) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + int allocationGranularity; + size_t userSmemPreference = 0; + size_t totalSmemUsagePerCTA; + size_t maxSmemUsagePerCTA; + size_t smemAllocatedPerCTA; + size_t staticSmemSize; + size_t sharedMemPerMultiprocessor; + size_t smemLimitPerCTA; + int maxBlocks; + int dynamicSmemSizeExceeded = 0; + int totalSmemSizeExceeded = 0; + (void)blockSize; // silence unused-variable warning + + status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // Obtain the user preferred shared memory size. This setting is ignored if + // user requests more shared memory than preferred. + // + status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock; + totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize; + smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity); + + maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes; + + dynamicSmemSizeExceeded = 0; + totalSmemSizeExceeded = 0; + + // Obtain the user set maximum dynamic size if it exists + // If so, the current launch dynamic shared memory must not + // exceed the set limit + if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT && + dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) { + dynamicSmemSizeExceeded = 1; + } + + status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + if (smemAllocatedPerCTA > smemLimitPerCTA) { + totalSmemSizeExceeded = 1; + } + + if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) { + maxBlocks = 0; + } + else { + // User requested shared memory limit is used as long as it is greater + // than the total shared memory used per CTA, i.e. as long as at least + // one CTA can be launched. + if (userSmemPreference >= smemAllocatedPerCTA) { + sharedMemPerMultiprocessor = userSmemPreference; + } + else { + // On Volta+, user requested shared memory will limit occupancy + // if it's less than shared memory per CTA. Otherwise, the + // maximum shared memory limit is used. 
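+            // [Editorial annotation, illustrative example; not part of the upstream
+            // NVIDIA header] If the carveout preference resolves to 32 KB but a CTA
+            // needs 48 KB of shared memory, the 48 KB figure is aligned up to the next
+            // size supported by the architecture and used as the effective per-SM limit.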
+ if (properties->computeMajor >= 7) { + sharedMemPerMultiprocessor = smemAllocatedPerCTA; + status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + } + else { + sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor; + } + } + + if (smemAllocatedPerCTA > 0) { + maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA); + } + else { + maxBlocks = INT_MAX; + } + } + + result->allocatedSharedMemPerBlock = smemAllocatedPerCTA; + + *limit = maxBlocks; + + return status; +} + +static __OCC_INLINE +cudaOccError cudaOccMaxBlocksPerSMRegsLimit( + int *limit, + cudaOccPartitionedGCConfig *gcConfig, + cudaOccResult *result, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + int blockSize) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + int allocationGranularity; + int warpsAllocatedPerCTA; + int regsAllocatedPerCTA; + int regsAssumedPerCTA; + int regsPerWarp; + int regsAllocatedPerWarp; + int numSubPartitions; + int numRegsPerSubPartition; + int numWarpsPerSubPartition; + int numWarpsPerSM; + int maxBlocks; + int maxRegsPerThread; + + status = cudaOccRegAllocationGranularity( + &allocationGranularity, + properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + status = cudaOccRegAllocationMaxPerThread( + &maxRegsPerThread, + properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize); + + // GPUs of compute capability 2.x and higher allocate registers to warps + // + // Number of regs per warp is regs per thread x warp size, rounded up to + // register allocation granularity + // + regsPerWarp = attributes->numRegs * properties->warpSize; + regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity); + regsAllocatedPerCTA = regsAllocatedPerWarp * warpsAllocatedPerCTA; + + // Hardware verifies if a launch fits the per-CTA register limit. For + // historical reasons, the verification logic assumes register + // allocations are made to all partitions simultaneously. Therefore, to + // simulate the hardware check, the warp allocation needs to be rounded + // up to the number of partitions. + // + regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions); + + if (properties->regsPerBlock < regsAssumedPerCTA || // Hardware check + properties->regsPerBlock < regsAllocatedPerCTA || // Software check + attributes->numRegs > maxRegsPerThread) { // Per thread limit check + maxBlocks = 0; + } + else { + if (regsAllocatedPerWarp > 0) { + // Registers are allocated in each sub-partition. The max number + // of warps that can fit on an SM is equal to the max number of + // warps per sub-partition x number of sub-partitions. 
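+            // [Editorial annotation, illustrative example; not part of the upstream
+            // NVIDIA header] With regsPerMultiprocessor = 65536, 4 sub-partitions,
+            // warpSize = 32 and 40 registers per thread: regsAllocatedPerWarp =
+            // 40 * 32 = 1280 (already a multiple of the 256-register granularity),
+            // numRegsPerSubPartition = 16384, numWarpsPerSubPartition = 16384 / 1280 = 12,
+            // so up to 12 * 4 = 48 warps per SM with partitioned global caching off;
+            // a 256-thread block (8 warps) then gives maxBlocks = 48 / 8 = 6.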
+ // + numRegsPerSubPartition = properties->regsPerMultiprocessor / numSubPartitions; + numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp; + + maxBlocks = 0; + + if (*gcConfig != PARTITIONED_GC_OFF) { + int numSubPartitionsPerSmPartition; + int numWarpsPerSmPartition; + int maxBlocksPerSmPartition; + + // If partitioned global caching is on, then a CTA can only + // use a half SM, and thus a half of the registers available + // per SM + // + numSubPartitionsPerSmPartition = numSubPartitions / 2; + numWarpsPerSmPartition = numWarpsPerSubPartition * numSubPartitionsPerSmPartition; + maxBlocksPerSmPartition = numWarpsPerSmPartition / warpsAllocatedPerCTA; + maxBlocks = maxBlocksPerSmPartition * 2; + } + + // Try again if partitioned global caching is not enabled, or if + // the CTA cannot fit on the SM with caching on (maxBlocks == 0). In the latter + // case, the device will automatically turn off caching, except + // if the user forces enablement via PARTITIONED_GC_ON_STRICT to calculate + // occupancy and launch configuration. + // + if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) { + // In case *gcConfig was PARTITIONED_GC_ON flip it OFF since + // this is what it will be if we spread CTA across partitions. + // + *gcConfig = PARTITIONED_GC_OFF; + numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions; + maxBlocks = numWarpsPerSM / warpsAllocatedPerCTA; + } + } + else { + maxBlocks = INT_MAX; + } + } + + + result->allocatedRegistersPerBlock = regsAllocatedPerCTA; + + *limit = maxBlocks; + + return status; +} + +// Barrier limit +// +static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit( + int *limit, + int ctaLimitBlocks, + const cudaOccFuncAttributes *attributes) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + int numBarriersAvailable = ctaLimitBlocks * 2; + int numBarriersUsed = attributes->numBlockBarriers; + int maxBlocks = INT_MAX; + + if (numBarriersUsed) { + maxBlocks = numBarriersAvailable / numBarriersUsed; + } + + *limit = maxBlocks; + + return status; +} + +/////////////////////////////////// +// API Implementations // +/////////////////////////////////// + +static __OCC_INLINE +cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor( + cudaOccResult *result, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + int blockSize, + size_t dynamicSmemSize) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + int ctaLimitWarps = 0; + int ctaLimitBlocks = 0; + int ctaLimitSMem = 0; + int ctaLimitRegs = 0; + int ctaLimitBars = 0; + int ctaLimit = 0; + unsigned int limitingFactors = 0; + + cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF; + + if (!result || !properties || !attributes || !state || blockSize <= 0) { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + + /////////////////////////// + // Check user input + /////////////////////////// + + status = cudaOccInputCheck(properties, attributes, state); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + /////////////////////////// + // Initialization + /////////////////////////// + + gcConfig = cudaOccPartitionedGCExpected(properties, attributes); + + /////////////////////////// + // Compute occupancy + /////////////////////////// + + // Limits due to registers/SM + // Also compute if partitioned global caching has to be turned off + // + status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // 
SMs on GP100 (6.0) have 2 subpartitions, while those on GP10x have 4. + // As a result, an SM on GP100 may be able to run more CTAs than the one on GP10x. + // For forward compatibility within Pascal family, if a function cannot run on GP10x (maxBlock == 0), + // we do not let it run on any Pascal processor, even though it may be able to run on GP100. + // Therefore, we check the occupancy on GP10x when it can run on GP100 + // + if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) { + cudaOccDeviceProp propertiesGP10x; + cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig; + int ctaLimitRegsGP10x = 0; + + // Set up properties for GP10x + memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x)); + propertiesGP10x.computeMinor = 1; + + status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + if (ctaLimitRegsGP10x == 0) { + ctaLimitRegs = 0; + } + } + + // Limits due to warps/SM + // + status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // Limits due to blocks/SM + // + status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // Limits due to shared memory/SM + // + status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + /////////////////////////// + // Overall occupancy + /////////////////////////// + + // Overall limit is min() of limits due to above reasons + // + ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks))); + + // Determine occupancy limiting factors + // + if (ctaLimit == ctaLimitWarps) { + limitingFactors |= OCC_LIMIT_WARPS; + } + if (ctaLimit == ctaLimitRegs) { + limitingFactors |= OCC_LIMIT_REGISTERS; + } + if (ctaLimit == ctaLimitSMem) { + limitingFactors |= OCC_LIMIT_SHARED_MEMORY; + } + if (ctaLimit == ctaLimitBlocks) { + limitingFactors |= OCC_LIMIT_BLOCKS; + } + + // For Hopper onwards compute the limits to occupancy based on block barrier count + // + if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) { + // Limits due to barrier/SM + // + status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, attributes); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // Recompute overall limit based on barrier/SM + // + ctaLimit = __occMin(ctaLimitBars, ctaLimit); + + // Determine if this is occupancy limiting factor + // + if (ctaLimit == ctaLimitBars) { + limitingFactors |= OCC_LIMIT_BARRIERS; + } + } + else { + ctaLimitBars = INT_MAX; + } + + // Fill in the return values + // + result->limitingFactors = limitingFactors; + + result->blockLimitRegs = ctaLimitRegs; + result->blockLimitSharedMem = ctaLimitSMem; + result->blockLimitWarps = ctaLimitWarps; + result->blockLimitBlocks = ctaLimitBlocks; + result->blockLimitBarriers = ctaLimitBars; + result->partitionedGCConfig = gcConfig; + + // Final occupancy + result->activeBlocksPerMultiprocessor = ctaLimit; + + return CUDA_OCC_SUCCESS; +} + +static __OCC_INLINE +cudaOccError cudaOccAvailableDynamicSMemPerBlock( + size_t *bytesAvailable, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + int 
numBlocks, + int blockSize) +{ + int allocationGranularity; + size_t smemLimitPerBlock; + size_t smemAvailableForDynamic; + size_t userSmemPreference = 0; + size_t sharedMemPerMultiprocessor; + cudaOccResult result; + cudaOccError status = CUDA_OCC_SUCCESS; + + if (numBlocks <= 0) + return CUDA_OCC_ERROR_INVALID_INPUT; + + // First compute occupancy of potential kernel launch. + // + status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + // Check if occupancy is achievable given user requested number of blocks. + // + if (result.activeBlocksPerMultiprocessor < numBlocks) { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + + status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // Return the per block shared memory limit based on function config. + // + status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + // If there is only a single block needed per SM, then the user preference can be ignored and the fully SW + // limit is allowed to be used as shared memory otherwise if more than one block is needed, then the user + // preference sets the total limit of available shared memory. + // + cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state); + if (numBlocks == 1) { + sharedMemPerMultiprocessor = smemLimitPerBlock; + } + else { + if (!userSmemPreference) { + userSmemPreference = 1 ; + status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + } + sharedMemPerMultiprocessor = userSmemPreference; + } + + // Compute total shared memory available per SM + // + smemAvailableForDynamic = sharedMemPerMultiprocessor / numBlocks; + smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity; + + // Cap shared memory + // + if (smemAvailableForDynamic > smemLimitPerBlock) { + smemAvailableForDynamic = smemLimitPerBlock; + } + + // Now compute dynamic shared memory size + smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes; + + // Cap computed dynamic SM by user requested limit specified via cuFuncSetAttribute() + // + if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes) + smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes; + + *bytesAvailable = smemAvailableForDynamic; + return CUDA_OCC_SUCCESS; +} + +static __OCC_INLINE +cudaOccError cudaOccMaxPotentialOccupancyBlockSize( + int *minGridSize, + int *blockSize, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + size_t (*blockSizeToDynamicSMemSize)(int), + size_t dynamicSMemSize) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + cudaOccResult result; + + // Limits + int occupancyLimit; + int granularity; + int blockSizeLimit; + + // Recorded maximum + int maxBlockSize = 0; + int numBlocks = 0; + int maxOccupancy = 0; + + // Temporary + int blockSizeToTryAligned; + int blockSizeToTry; + int blockSizeLimitAligned; + int occupancyInBlocks; + int occupancyInThreads; + + /////////////////////////// + // Check user input + /////////////////////////// + + if (!minGridSize || !blockSize || !properties || !attributes || !state) { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + + status = 
cudaOccInputCheck(properties, attributes, state); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + ///////////////////////////////////////////////////////////////////////////////// + // Try each block size, and pick the block size with maximum occupancy + ///////////////////////////////////////////////////////////////////////////////// + + occupancyLimit = properties->maxThreadsPerMultiprocessor; + granularity = properties->warpSize; + + blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock); + blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity); + + for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { + blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned); + + // Ignore dynamicSMemSize if the user provides a mapping + // + if (blockSizeToDynamicSMemSize) { + dynamicSMemSize = (*blockSizeToDynamicSMemSize)(blockSizeToTry); + } + + status = cudaOccMaxActiveBlocksPerMultiprocessor( + &result, + properties, + attributes, + state, + blockSizeToTry, + dynamicSMemSize); + + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + occupancyInBlocks = result.activeBlocksPerMultiprocessor; + occupancyInThreads = blockSizeToTry * occupancyInBlocks; + + if (occupancyInThreads > maxOccupancy) { + maxBlockSize = blockSizeToTry; + numBlocks = occupancyInBlocks; + maxOccupancy = occupancyInThreads; + } + + // Early out if we have reached the maximum + // + if (occupancyLimit == maxOccupancy) { + break; + } + } + + /////////////////////////// + // Return best available + /////////////////////////// + + // Suggested min grid size to achieve a full machine launch + // + *minGridSize = numBlocks * properties->numSms; + *blockSize = maxBlockSize; + + return status; +} + + +#if defined(__cplusplus) + +namespace { + +__OCC_INLINE +cudaOccError cudaOccMaxPotentialOccupancyBlockSize( + int *minGridSize, + int *blockSize, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + size_t dynamicSMemSize) +{ + return cudaOccMaxPotentialOccupancyBlockSize( + minGridSize, + blockSize, + properties, + attributes, + state, + NULL, + dynamicSMemSize); +} + +template <typename UnaryFunction> +__OCC_INLINE +cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem( + int *minGridSize, + int *blockSize, + const cudaOccDeviceProp *properties, + const cudaOccFuncAttributes *attributes, + const cudaOccDeviceState *state, + UnaryFunction blockSizeToDynamicSMemSize) +{ + cudaOccError status = CUDA_OCC_SUCCESS; + cudaOccResult result; + + // Limits + int occupancyLimit; + int granularity; + int blockSizeLimit; + + // Recorded maximum + int maxBlockSize = 0; + int numBlocks = 0; + int maxOccupancy = 0; + + // Temporary + int blockSizeToTryAligned; + int blockSizeToTry; + int blockSizeLimitAligned; + int occupancyInBlocks; + int occupancyInThreads; + size_t dynamicSMemSize; + + /////////////////////////// + // Check user input + /////////////////////////// + + if (!minGridSize || !blockSize || !properties || !attributes || !state) { + return CUDA_OCC_ERROR_INVALID_INPUT; + } + + status = cudaOccInputCheck(properties, attributes, state); + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + ///////////////////////////////////////////////////////////////////////////////// + // Try each block size, and pick the block size with maximum occupancy + 
///////////////////////////////////////////////////////////////////////////////// + + occupancyLimit = properties->maxThreadsPerMultiprocessor; + granularity = properties->warpSize; + blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock); + blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity); + + for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { + blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned); + + dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry); + + status = cudaOccMaxActiveBlocksPerMultiprocessor( + &result, + properties, + attributes, + state, + blockSizeToTry, + dynamicSMemSize); + + if (status != CUDA_OCC_SUCCESS) { + return status; + } + + occupancyInBlocks = result.activeBlocksPerMultiprocessor; + + occupancyInThreads = blockSizeToTry * occupancyInBlocks; + + if (occupancyInThreads > maxOccupancy) { + maxBlockSize = blockSizeToTry; + numBlocks = occupancyInBlocks; + maxOccupancy = occupancyInThreads; + } + + // Early out if we have reached the maximum + // + if (occupancyLimit == maxOccupancy) { + break; + } + } + + /////////////////////////// + // Return best available + /////////////////////////// + + // Suggested min grid size to achieve a full machine launch + // + *minGridSize = numBlocks * properties->numSms; + *blockSize = maxBlockSize; + + return status; +} + +} // namespace anonymous + +#endif /*__cplusplus */ + +#undef __OCC_INLINE + +#endif /*__cuda_occupancy_h__*/ diff --git a/ext/cudart/include/cuda_pipeline.h b/ext/cudart/include/cuda_pipeline.h new file mode 100644 index 0000000000000000000000000000000000000000..46bc89e4499576f1ae58848cd8684ba3e32420cf --- /dev/null +++ b/ext/cudart/include/cuda_pipeline.h @@ -0,0 +1,224 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. 
+ * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CUDA_PIPELINE_H_ +# define _CUDA_PIPELINE_H_ + +# include "cuda_pipeline_primitives.h" + +# if !defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER) +# error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \ + -std=c++11 compiler option. +# endif + +# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER) +# include "cuda_awbarrier.h" +# endif + +// Integration with libcu++'s cuda::barrier<cuda::thread_scope_block>. + +# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER) +# if defined(_LIBCUDACXX_CUDA_ABI_VERSION) +# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION _LIBCUDACXX_CUDA_ABI_VERSION +# else +# define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION 4 +# endif + +# define _LIBCUDACXX_PIPELINE_CONCAT(X, Y) X ## Y +# define _LIBCUDACXX_PIPELINE_CONCAT2(X, Y) _LIBCUDACXX_PIPELINE_CONCAT(X, Y) +# define _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE _LIBCUDACXX_PIPELINE_CONCAT2(__, _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION) + +namespace cuda { inline namespace _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE { + struct __block_scope_barrier_base; +}} + +# endif + +_CUDA_PIPELINE_BEGIN_NAMESPACE + +template<size_t N, typename T> +_CUDA_PIPELINE_QUALIFIER +auto segment(T* ptr) -> T(*)[N]; + +class pipeline { +public: + pipeline(const pipeline&) = delete; + pipeline(pipeline&&) = delete; + pipeline& operator=(const pipeline&) = delete; + pipeline& operator=(pipeline&&) = delete; + + _CUDA_PIPELINE_QUALIFIER pipeline(); + _CUDA_PIPELINE_QUALIFIER size_t commit(); + _CUDA_PIPELINE_QUALIFIER void commit_and_wait(); + _CUDA_PIPELINE_QUALIFIER void wait(size_t batch); + template<unsigned N> + _CUDA_PIPELINE_QUALIFIER void wait_prior(); + +# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER) + _CUDA_PIPELINE_QUALIFIER void arrive_on(awbarrier& barrier); + _CUDA_PIPELINE_QUALIFIER void arrive_on(cuda::__block_scope_barrier_base& barrier); +# endif + +private: + size_t current_batch; +}; + +template<class T> +_CUDA_PIPELINE_QUALIFIER +void memcpy_async(T& dst, const T& src, pipeline& pipe); + +template<class T, size_t DstN, size_t SrcN> +_CUDA_PIPELINE_QUALIFIER +void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe); + +template<size_t N, typename T> +_CUDA_PIPELINE_QUALIFIER +auto segment(T* ptr) -> T(*)[N] +{ + return (T(*)[N])ptr; +} + +_CUDA_PIPELINE_QUALIFIER +pipeline::pipeline() + : current_batch(0) +{ +} + +_CUDA_PIPELINE_QUALIFIER +size_t pipeline::commit() +{ + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit(); + return this->current_batch++; +} + +_CUDA_PIPELINE_QUALIFIER +void pipeline::commit_and_wait() +{ + (void)pipeline::commit(); + pipeline::wait_prior<0>(); +} + +_CUDA_PIPELINE_QUALIFIER +void pipeline::wait(size_t batch) +{ + const size_t 
prior = this->current_batch > batch ? this->current_batch - batch : 0; + + switch (prior) { + case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); break; + case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); break; + case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); break; + case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); break; + case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); break; + case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); break; + case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); break; + case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); break; + default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); break; + } +} + +template<unsigned N> +_CUDA_PIPELINE_QUALIFIER +void pipeline::wait_prior() +{ + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<N>(); +} + +# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER) +_CUDA_PIPELINE_QUALIFIER +void pipeline::arrive_on(awbarrier& barrier) +{ + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(&barrier.barrier); +} + +_CUDA_PIPELINE_QUALIFIER +void pipeline::arrive_on(cuda::__block_scope_barrier_base & barrier) +{ + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(reinterpret_cast<uint64_t *>(&barrier)); +} +# endif + +template<class T> +_CUDA_PIPELINE_QUALIFIER +void memcpy_async(T& dst, const T& src, pipeline& pipe) +{ + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&src) & (alignof(T) - 1))); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&dst) & (alignof(T) - 1))); + + if (__is_trivially_copyable(T)) { + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_relaxed<sizeof(T), alignof(T)>( + reinterpret_cast<void*>(&dst), reinterpret_cast<const void*>(&src)); + } else { + dst = src; + } +} + +template<class T, size_t DstN, size_t SrcN> +_CUDA_PIPELINE_QUALIFIER +void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe) +{ + constexpr size_t dst_size = sizeof(*dst); + constexpr size_t src_size = sizeof(*src); + static_assert(dst_size == 4 || dst_size == 8 || dst_size == 16, "Unsupported copy size."); + static_assert(src_size <= dst_size, "Source size must be less than or equal to destination size."); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (dst_size - 1))); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (dst_size - 1))); + + if (__is_trivially_copyable(T)) { + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_strict<sizeof(*dst), sizeof(*src)>( + reinterpret_cast<void*>(*dst), reinterpret_cast<const void*>(*src)); + } else { + for (size_t i = 0; i < DstN; ++i) { + (*dst)[i] = (i < SrcN) ? (*src)[i] : T(); + } + } +} + +_CUDA_PIPELINE_END_NAMESPACE + +#endif /* !_CUDA_PIPELINE_H_ */ diff --git a/ext/cudart/include/cuda_pipeline_helpers.h b/ext/cudart/include/cuda_pipeline_helpers.h new file mode 100644 index 0000000000000000000000000000000000000000..488264076c2b1bd850a14a85871dc22b7f6d36ce --- /dev/null +++ b/ext/cudart/include/cuda_pipeline_helpers.h @@ -0,0 +1,373 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#ifndef _CUDA_PIPELINE_HELPERS_H_ +# define _CUDA_PIPELINE_HELPERS_H_ + +# define _CUDA_PIPELINE_NAMESPACE nvcuda::experimental +# define _CUDA_PIPELINE_BEGIN_NAMESPACE namespace nvcuda { namespace experimental { +# define _CUDA_PIPELINE_END_NAMESPACE } } + +# define _CUDA_PIPELINE_INTERNAL_NAMESPACE _CUDA_PIPELINE_NAMESPACE::__pipeline_internal +# define _CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE _CUDA_PIPELINE_BEGIN_NAMESPACE namespace __pipeline_internal { +# define _CUDA_PIPELINE_END_INTERNAL_NAMESPACE } _CUDA_PIPELINE_END_NAMESPACE + +# if !defined(_CUDA_PIPELINE_QUALIFIER) +# define _CUDA_PIPELINE_QUALIFIER inline __device__ +# endif +# if !defined(_CUDA_PIPELINE_STATIC_QUALIFIER) +# define _CUDA_PIPELINE_STATIC_QUALIFIER static inline __device__ +# endif + +# if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) +# define _CUDA_PIPELINE_ARCH_700_OR_LATER +# endif + +# if (__CUDA_ARCH__ >= 800) +# define _CUDA_PIPELINE_HAS_ASYNC_COPY 1 +# else +# define _CUDA_PIPELINE_HAS_ASYNC_COPY 0 +# endif + +# if !defined(_CUDA_PIPELINE_MAX_STAGES) +# define _CUDA_PIPELINE_MAX_STAGES 8 +# endif + +# if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900))) +# define _CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER +# endif + +# if !defined(_CUDA_PIPELINE_DEBUG) +# if defined(__CUDACC_DEBUG__) +# define _CUDA_PIPELINE_DEBUG 1 +# else +# define _CUDA_PIPELINE_DEBUG 0 +# endif +# endif + +# if defined(_CUDA_PIPELINE_DEBUG) && (_CUDA_PIPELINE_DEBUG == 1) && !defined(NDEBUG) +# if !defined(__CUDACC_RTC__) +# include <cassert> +# endif +# define _CUDA_PIPELINE_ASSERT(x) assert((x)); +# define _CUDA_PIPELINE_ABORT() assert(0); +# else +# define _CUDA_PIPELINE_ASSERT(x) +# define _CUDA_PIPELINE_ABORT() __trap(); +# endif + +# if defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER) +# define _CUDA_PIPELINE_STATIC_ASSERT(c, m) static_assert(c, m) +# else +# define _CUDA_PIPELINE_STATIC_ASSERT(c, m) +# endif + +# if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__) +# define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "r" +# else +# define _CUDA_PIPELINE_ASM_PTR_CONSTRAINT "l" +# endif + +# if defined(__CUDACC_RTC__) +typedef unsigned int uint32_t; +typedef unsigned long long uint64_t; +typedef uint64_t uintptr_t; +# else +# include <stdint.h> +# endif + +_CUDA_PIPELINE_BEGIN_INTERNAL_NAMESPACE + +_CUDA_PIPELINE_STATIC_ASSERT(sizeof(short) == 2, "Size mismatch for type 'short'"); +_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int) == 4, "Size mismatch for type 'int'"); +_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int2) == 8, "Size mismatch for type 'int2'"); +_CUDA_PIPELINE_STATIC_ASSERT(sizeof(int4) == 16, "Size mismatch for type 'int4'"); + +extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *); + +template<size_t CopySize, size_t SourceSize> +_CUDA_PIPELINE_QUALIFIER +void pipeline_memcpy_sync(void* __restrict__ dst, const void* __restrict__ src) +{ + _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size."); + _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size"); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1))); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1))); + + char* const d = reinterpret_cast<char*>(dst); + const char* const s = reinterpret_cast<const char*>(src); + + size_t copy_step_size; + if (SourceSize == 0) { + copy_step_size = CopySize; + } else if (SourceSize == 2 || SourceSize == 4 || SourceSize == 8 || SourceSize 
== 16) { + copy_step_size = SourceSize; + } else { + copy_step_size = 1; + } + + for (size_t i = 0; i < CopySize; i += copy_step_size) { + const bool copy_source = SourceSize && (i < SourceSize); + + switch (copy_step_size) { + case 1: + d[i] = copy_source ? s[i] : char(); + break; + case 2: + *reinterpret_cast<short*>(d + i) = copy_source ? *reinterpret_cast<const short*>(s + i) : short(); + break; + case 4: + *reinterpret_cast<int*>(d + i) = copy_source ? *reinterpret_cast<const int*>(s + i) : int(); + break; + case 8: + *reinterpret_cast<int2*>(d + i) = copy_source ? *reinterpret_cast<const int2*>(s + i) : int2(); + break; + case 16: + *reinterpret_cast<int4*>(d + i) = copy_source ? *reinterpret_cast<const int4*>(s + i) : int4(); + break; + } + } +} + +template<bool UseHwAsyncCopy> +struct ImplementationChooser; + +template<> +struct ImplementationChooser<true> { + template<size_t CopySize, size_t SourceSize> + struct CpAsyncChooser { + _CUDA_PIPELINE_STATIC_QUALIFIER + void cp_async(void* __restrict__ dst, const void* __restrict__ src) + { + asm volatile ("cp.async.ca.shared.global [%0], [%1], %2, %3;" + : + : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(CopySize), + "n"(SourceSize) + : "memory"); + } + }; + + template<size_t SourceSize> + struct CpAsyncChooser<16, SourceSize> { + _CUDA_PIPELINE_STATIC_QUALIFIER + void cp_async(void* __restrict__ dst, const void* __restrict__ src) + { + asm volatile ("cp.async.cg.shared.global [%0], [%1], %2, %3;" + : + : "r"(__nvvm_get_smem_pointer(dst)), _CUDA_PIPELINE_ASM_PTR_CONSTRAINT(src), "n"(16), "n"(SourceSize) + : "memory"); + } + }; + + template<size_t CopySize, size_t SourceSize> + _CUDA_PIPELINE_STATIC_QUALIFIER + void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src) + { + _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size."); + _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size"); + _CUDA_PIPELINE_ASSERT(__isShared(dst)); + _CUDA_PIPELINE_ASSERT(__isGlobal(src)); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1))); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1))); + + CpAsyncChooser<CopySize, SourceSize>::cp_async(dst, src); + } + + _CUDA_PIPELINE_STATIC_QUALIFIER + void pipeline_commit() + { + asm volatile ("cp.async.commit_group;"); + } + + template<unsigned N> + _CUDA_PIPELINE_STATIC_QUALIFIER + void pipeline_wait_prior() + { + asm volatile ("cp.async.wait_group %0;" + : + : "n"(N < _CUDA_PIPELINE_MAX_STAGES ? 
N : _CUDA_PIPELINE_MAX_STAGES)); + } + + _CUDA_PIPELINE_STATIC_QUALIFIER + void pipeline_arrive_on(uint64_t* barrier) + { + _CUDA_PIPELINE_ASSERT(__isShared(barrier)); + + asm volatile ("cp.async.mbarrier.arrive.shared.b64 [%0];" + : + : "r"(__nvvm_get_smem_pointer(barrier))); + } +}; + +template<> +struct ImplementationChooser<false> { + template<size_t CopySize, size_t SourceSize> + _CUDA_PIPELINE_STATIC_QUALIFIER + void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src) + { + _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size."); + _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size"); + _CUDA_PIPELINE_ASSERT(__isShared(dst)); + _CUDA_PIPELINE_ASSERT(__isGlobal(src)); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1))); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1))); + + pipeline_memcpy_sync<CopySize, SourceSize>(dst, src); + } + + _CUDA_PIPELINE_STATIC_QUALIFIER + void pipeline_commit() + { + } + + template<unsigned N> + _CUDA_PIPELINE_STATIC_QUALIFIER + void pipeline_wait_prior() + { + } + + _CUDA_PIPELINE_STATIC_QUALIFIER + void pipeline_arrive_on(uint64_t* barrier) + { + } +}; + +template<size_t CopySize, size_t SourceSize> +_CUDA_PIPELINE_QUALIFIER +void pipeline_memcpy_async(void* __restrict__ dst, const void* __restrict__ src) +{ + _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size."); + _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size"); + _CUDA_PIPELINE_ASSERT(__isShared(dst)); + _CUDA_PIPELINE_ASSERT(__isGlobal(src)); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1))); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1))); + + ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_memcpy_async<CopySize, SourceSize>(dst, src); +} + +_CUDA_PIPELINE_QUALIFIER +void pipeline_commit() +{ + ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_commit(); +} + +template<unsigned N> +_CUDA_PIPELINE_QUALIFIER +void pipeline_wait_prior() +{ + ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_wait_prior<N>(); +} + +_CUDA_PIPELINE_QUALIFIER +void pipeline_arrive_on(uint64_t* barrier) +{ + ImplementationChooser<_CUDA_PIPELINE_HAS_ASYNC_COPY>::pipeline_arrive_on(barrier); +} + +template<size_t CopySize, size_t SourceSize> +_CUDA_PIPELINE_QUALIFIER +void pipeline_copy_strict(void* __restrict__ dst, const void* __restrict__ src) +{ + _CUDA_PIPELINE_STATIC_ASSERT(CopySize == 4 || CopySize == 8 || CopySize == 16, "Unsupported copy size."); + _CUDA_PIPELINE_STATIC_ASSERT(SourceSize <= CopySize, "Source size must be less than or equal to copy size."); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (CopySize - 1))); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (CopySize - 1))); + + if (__isGlobal(src) && __isShared(dst)) { + pipeline_memcpy_async<CopySize, SourceSize>(dst, src); + } else { + pipeline_memcpy_sync<CopySize, SourceSize>(dst, src); + } +} + +template<size_t CopySize, size_t Align> +_CUDA_PIPELINE_QUALIFIER +void pipeline_copy_relaxed(void* __restrict__ dst, const void* __restrict__ src) +{ + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (Align - 1))); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (Align - 1))); + + const char* s = reinterpret_cast<const 
char*>(src); + char* d = reinterpret_cast<char*>(dst); + size_t remaining = CopySize; + + while (remaining) { + if ((Align >= 16) && (remaining >= 16)) { + pipeline_copy_strict<16, 16>(dst, src); + d += 16; + s += 16; + remaining -= 16; + } else if ((Align >= 8) && (remaining >= 8)) { + pipeline_copy_strict<8, 8>(dst, src); + d += 8; + s += 8; + remaining -= 8; + } else if ((Align >= 4) && (remaining >= 4)) { + pipeline_copy_strict<4, 4>(dst, src); + d += 4; + s += 4; + remaining -= 4; + } else if ((Align >= 2) && (remaining >= 2)) { + *reinterpret_cast<short*>(d) = *reinterpret_cast<const short*>(s); + d += 2; + s += 2; + remaining -= 2; + } else { + *d = *s; + d += 1; + s += 1; + remaining -= 1; + } + } +} + +_CUDA_PIPELINE_END_INTERNAL_NAMESPACE + +#endif /* !_CUDA_PIPELINE_HELPERS_H_ */ diff --git a/ext/cudart/include/cuda_pipeline_primitives.h b/ext/cudart/include/cuda_pipeline_primitives.h new file mode 100644 index 0000000000000000000000000000000000000000..eaba0cfb5ac9184bec5e837d2ec2f9db11d873ae --- /dev/null +++ b/ext/cudart/include/cuda_pipeline_primitives.h @@ -0,0 +1,148 @@ +/* + * Copyright 1993-2019 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. 
+ * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef _CUDA_PIPELINE_PRIMITIVES_H_ +# define _CUDA_PIPELINE_PRIMITIVES_H_ + +# include "cuda_pipeline_helpers.h" + +_CUDA_PIPELINE_STATIC_QUALIFIER +void __pipeline_memcpy_async(void* __restrict__ dst_shared, const void* __restrict__ src_global, size_t size_and_align, + size_t zfill = 0) +{ + _CUDA_PIPELINE_ASSERT(size_and_align == 4 || size_and_align == 8 || size_and_align == 16); + _CUDA_PIPELINE_ASSERT(zfill <= size_and_align); + _CUDA_PIPELINE_ASSERT(__isShared(dst_shared)); + _CUDA_PIPELINE_ASSERT(__isGlobal(src_global)); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst_shared) & (size_and_align - 1))); + _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src_global) & (size_and_align - 1))); + + switch (size_and_align) { + case 16: + switch (zfill) { + case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 16>(dst_shared, src_global); return; + case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 15>(dst_shared, src_global); return; + case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 14>(dst_shared, src_global); return; + case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 13>(dst_shared, src_global); return; + case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 12>(dst_shared, src_global); return; + case 5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 11>(dst_shared, src_global); return; + case 6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 10>(dst_shared, src_global); return; + case 7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 9>(dst_shared, src_global); return; + case 8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 8>(dst_shared, src_global); return; + case 9: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 7>(dst_shared, src_global); return; + case 10: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 6>(dst_shared, src_global); return; + case 11: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 5>(dst_shared, src_global); return; + case 12: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 4>(dst_shared, src_global); return; + case 13: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 3>(dst_shared, src_global); return; + case 14: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 2>(dst_shared, src_global); return; + case 15: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 1>(dst_shared, src_global); return; + case 16: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 0>(dst_shared, src_global); return; + default: _CUDA_PIPELINE_ABORT(); return; + } + case 8: + switch (zfill) { + case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 8>(dst_shared, src_global); return; + case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 7>(dst_shared, src_global); return; + case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 6>(dst_shared, src_global); return; + case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 5>(dst_shared, src_global); return; + case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 4>(dst_shared, src_global); return; + case 5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 3>(dst_shared, 
src_global); return; + case 6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 2>(dst_shared, src_global); return; + case 7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 1>(dst_shared, src_global); return; + case 8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 0>(dst_shared, src_global); return; + default: _CUDA_PIPELINE_ABORT(); return; + } + case 4: + switch (zfill) { + case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 4>(dst_shared, src_global); return; + case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 3>(dst_shared, src_global); return; + case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 2>(dst_shared, src_global); return; + case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 1>(dst_shared, src_global); return; + case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 0>(dst_shared, src_global); return; + default: _CUDA_PIPELINE_ABORT(); return; + } + default: + _CUDA_PIPELINE_ABORT(); + return; + } +} + +_CUDA_PIPELINE_STATIC_QUALIFIER +void __pipeline_commit() +{ + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit(); +} + +_CUDA_PIPELINE_STATIC_QUALIFIER +void __pipeline_wait_prior(size_t prior) +{ + switch (prior) { + case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); return; + case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); return; + case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); return; + case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); return; + case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); return; + case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); return; + case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); return; + case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); return; + default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); return; + } +} + +# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER) +# include "cuda_awbarrier_primitives.h" + +_CUDA_PIPELINE_STATIC_QUALIFIER +void __pipeline_arrive_on(__mbarrier_t* barrier) +{ + _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(barrier); +} +# endif + +#endif /* !_CUDA_PIPELINE_PRIMITIVES_H_ */ diff --git a/ext/cudart/include/cuda_runtime.h b/ext/cudart/include/cuda_runtime.h new file mode 100644 index 0000000000000000000000000000000000000000..8d297bb6c4140f2056618cedd9b34bdc42cd6367 --- /dev/null +++ b/ext/cudart/include/cuda_runtime.h @@ -0,0 +1,2725 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. 
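+ *
+ * [Editor's note - illustrative device-side sketch for the __pipeline_memcpy_async /
+ * __pipeline_commit / __pipeline_wait_prior primitives declared in
+ * cuda_pipeline_primitives.h above; it is not part of the upstream NVIDIA header.
+ * The kernel name and launch geometry are hypothetical; launch with blockDim.x == 256.]
+ * \code
+ * #include <cuda_pipeline_primitives.h>
+ *
+ * // Each thread stages one float from global into shared memory with the
+ * // asynchronous copy primitives, then operates on the staged value.
+ * __global__ void scale_via_pipeline(const float* __restrict__ in,
+ *                                    float* __restrict__ out, int n)
+ * {
+ *     __shared__ float tile[256];
+ *     int idx = blockIdx.x * blockDim.x + threadIdx.x;
+ *     if (idx < n) {
+ *         // 4-byte, 4-byte-aligned copy from global to shared, no zero-fill.
+ *         __pipeline_memcpy_async(&tile[threadIdx.x], &in[idx], sizeof(float));
+ *         __pipeline_commit();       // close the current batch of async copies
+ *         __pipeline_wait_prior(0);  // wait until all outstanding batches complete
+ *         out[idx] = 2.0f * tile[threadIdx.x];
+ *     }
+ * }
+ * \endcode
+ *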
+ * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_RUNTIME_H__) +#define __CUDA_RUNTIME_H__ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__ +#endif + +#if !defined(__CUDACC_RTC__) +#if defined(__GNUC__) +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) +#pragma GCC diagnostic push +#endif +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))) +#pragma GCC diagnostic ignored "-Wunused-function" +#endif +#elif defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable: 4820) +#endif +#endif + +#ifdef __QNX__ +#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 7) +typedef unsigned size_t; +#endif +#endif +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "crt/host_config.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "builtin_types.h" +#include "library_types.h" +#if !defined(__CUDACC_RTC__) +#define EXCLUDE_FROM_RTC +#include "channel_descriptor.h" +#include "cuda_runtime_api.h" +#include "driver_functions.h" +#undef EXCLUDE_FROM_RTC +#endif /* !__CUDACC_RTC__ */ +#include "crt/host_defines.h" +#include "vector_functions.h" + +#if defined(__CUDACC__) + +#if defined(__CUDACC_RTC__) +#include "nvrtc_device_runtime.h" +#include "crt/device_functions.h" +#include "crt/common_functions.h" +#include "cuda_surface_types.h" +#include "cuda_texture_types.h" +#include "device_launch_parameters.h" + 
+#else /* !__CUDACC_RTC__ */ +#define EXCLUDE_FROM_RTC +#include "crt/common_functions.h" +#include "cuda_surface_types.h" +#include "cuda_texture_types.h" +#include "crt/device_functions.h" +#include "device_launch_parameters.h" + +#if defined(__CUDACC_EXTENDED_LAMBDA__) +#include <functional> +#include <utility> +struct __device_builtin__ __nv_lambda_preheader_injection { }; +#endif /* defined(__CUDACC_EXTENDED_LAMBDA__) */ + +#undef EXCLUDE_FROM_RTC +#endif /* __CUDACC_RTC__ */ + +#endif /* __CUDACC__ */ + +/** \cond impl_private */ +#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif +/** \endcond impl_private */ + +#if defined(__cplusplus) && !defined(__CUDACC_RTC__) + +#if __cplusplus >= 201103 +#include <utility> +#endif + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +/** + * \addtogroup CUDART_HIGHLEVEL + * @{ + */ + +/** + *\brief Launches a device function + * + * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y + * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x × + * \p blockDim.y × \p blockDim.z) threads. + * + * If the kernel has N parameters the \p args should point to array of N pointers. + * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, point to the region + * of memory from which the actual parameter will be copied. + * + * \p sharedMem sets the amount of dynamic shared memory that will be available to + * each thread block. + * + * \p stream specifies a stream the invocation is associated to. + * + * \param func - Device function symbol + * \param gridDim - Grid dimentions + * \param blockDim - Block dimentions + * \param args - Arguments + * \param sharedMem - Shared memory (defaults to 0) + * \param stream - Stream identifier (defaults to NULL) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed, + * ::cudaErrorInvalidPtx, + * ::cudaErrorUnsupportedPtxVersion, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound, + * ::cudaErrorJitCompilationDisabled + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)" + */ +template<class T> +static __inline__ __host__ cudaError_t cudaLaunchKernel( + const T *func, + dim3 gridDim, + dim3 blockDim, + void **args, + size_t sharedMem = 0, + cudaStream_t stream = 0 +) +{ + return ::cudaLaunchKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); +} + + +#if __cplusplus >= 201103 || defined(__DOXYGEN_ONLY__) +/** + * \brief Launches a CUDA function with launch-time configuration + * + * Invokes the kernel \p func on \p config->gridDim (\p config->gridDim.x + * × \p config->gridDim.y × \p config->gridDim.z) grid of blocks. + * Each block contains \p config->blockDim (\p config->blockDim.x × + * \p config->blockDim.y × \p config->blockDim.z) threads. 
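+ *
+ * [Editor's note - illustrative sketch, not part of the upstream NVIDIA header.
+ * It shows one way to call the C++ cudaLaunchKernel overload defined above;
+ * `saxpy` and `launch_saxpy` are hypothetical names and error handling is minimal.]
+ * \code
+ * __global__ void saxpy(float a, const float* x, float* y, int n)
+ * {
+ *     int i = blockIdx.x * blockDim.x + threadIdx.x;
+ *     if (i < n) y[i] = a * x[i] + y[i];
+ * }
+ *
+ * // Each entry of `args` points at the storage of one kernel parameter,
+ * // in declaration order.
+ * cudaError_t launch_saxpy(float a, const float* d_x, float* d_y, int n)
+ * {
+ *     void* args[] = { &a, &d_x, &d_y, &n };
+ *     dim3 block(256);
+ *     dim3 grid((n + block.x - 1) / block.x);
+ *     return cudaLaunchKernel(saxpy, grid, block, args);   // sharedMem = 0, default stream
+ * }
+ * \endcode
+ *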
+ * + * \p config->dynamicSmemBytes sets the amount of dynamic shared memory that + * will be available to each thread block. + * + * \p config->stream specifies a stream the invocation is associated to. + * + * Configuration beyond grid and block dimensions, dynamic shared memory size, + * and stream can be provided with the following two fields of \p config: + * + * \p config->attrs is an array of \p config->numAttrs contiguous + * ::cudaLaunchAttribute elements. The value of this pointer is not considered + * if \p config->numAttrs is zero. However, in that case, it is recommended to + * set the pointer to NULL. + * \p config->numAttrs is the number of attributes populating the first + * \p config->numAttrs positions of the \p config->attrs array. + * + * The kernel arguments should be passed as arguments to this function via the + * \p args parameter pack. + * + * The C API version of this function, \p cudaLaunchKernelExC, is also available + * for pre-C++11 compilers and for use cases where the ability to pass kernel + * parameters via void* array is preferable. + * + * \param config - Launch configuration + * \param func - Kernel to launch + * \param args - Parameter pack of kernel parameters + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed, + * ::cudaErrorInvalidPtx, + * ::cudaErrorUnsupportedPtxVersion, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound, + * ::cudaErrorJitCompilationDisabled + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * \ref ::cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args) "cudaLaunchKernelEx (C API)", + * ::cuLaunchKernelEx + */ +template<typename... ExpTypes, typename... ActTypes> +static __inline__ __host__ cudaError_t cudaLaunchKernelEx( + const cudaLaunchConfig_t *config, + void (*kernel)(ExpTypes...), + ActTypes &&... args +) +{ + return [&](ExpTypes... coercedArgs){ + void *pArgs[] = { &coercedArgs... }; + return ::cudaLaunchKernelExC(config, (const void *)kernel, pArgs); + }(std::forward<ActTypes>(args)...); +} +# endif + +/** + *\brief Launches a device function + * + * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y + * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x × + * \p blockDim.y × \p blockDim.z) threads. + * + * The device on which this kernel is invoked must have a non-zero value for + * the device attribute ::cudaDevAttrCooperativeLaunch. + * + * The total number of blocks launched cannot exceed the maximum number of blocks per + * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. + * + * The kernel cannot make use of CUDA dynamic parallelism. + * + * If the kernel has N parameters the \p args should point to array of N pointers. + * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, point to the region + * of memory from which the actual parameter will be copied. + * + * \p sharedMem sets the amount of dynamic shared memory that will be available to + * each thread block. + * + * \p stream specifies a stream the invocation is associated to. 
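+ *
+ * [Editor's note - illustrative sketch, not part of the upstream NVIDIA header.
+ * It shows the variadic cudaLaunchKernelEx overload defined above with a
+ * zero-initialized ::cudaLaunchConfig_t and no extra launch attributes;
+ * `scale` and `launch_scale` are hypothetical names.]
+ * \code
+ * __global__ void scale(float* data, float factor, int n)
+ * {
+ *     int i = blockIdx.x * blockDim.x + threadIdx.x;
+ *     if (i < n) data[i] *= factor;
+ * }
+ *
+ * cudaError_t launch_scale(float* d_data, float factor, int n, cudaStream_t stream)
+ * {
+ *     cudaLaunchConfig_t config = {};
+ *     config.gridDim  = dim3((n + 255) / 256);
+ *     config.blockDim = dim3(256);
+ *     config.dynamicSmemBytes = 0;
+ *     config.stream   = stream;
+ *     config.attrs    = nullptr;   // no launch attributes
+ *     config.numAttrs = 0;
+ *     // Kernel parameters are passed directly; the template packs them for the C API.
+ *     return cudaLaunchKernelEx(&config, scale, d_data, factor, n);
+ * }
+ * \endcode
+ *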
+ * + * \param func - Device function symbol + * \param gridDim - Grid dimentions + * \param blockDim - Block dimentions + * \param args - Arguments + * \param sharedMem - Shared memory (defaults to 0) + * \param stream - Stream identifier (defaults to NULL) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C API)" + */ +template<class T> +static __inline__ __host__ cudaError_t cudaLaunchCooperativeKernel( + const T *func, + dim3 gridDim, + dim3 blockDim, + void **args, + size_t sharedMem = 0, + cudaStream_t stream = 0 +) +{ + return ::cudaLaunchCooperativeKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream); +} + +/** + * \brief \hl Creates an event object with the specified flags + * + * Creates an event object with the specified flags. Valid flags include: + * - ::cudaEventDefault: Default event creation flag. + * - ::cudaEventBlockingSync: Specifies that event should use blocking + * synchronization. A host thread that uses ::cudaEventSynchronize() to wait + * on an event created with this flag will block until the event actually + * completes. + * - ::cudaEventDisableTiming: Specifies that the created event does not need + * to record timing data. Events created with this flag specified and + * the ::cudaEventBlockingSync flag not specified will provide the best + * performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery(). + * + * \param event - Newly created event + * \param flags - Flags for new event + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorLaunchFailure, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery, + * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cudaStreamWaitEvent + */ +static __inline__ __host__ cudaError_t cudaEventCreate( + cudaEvent_t *event, + unsigned int flags +) +{ + return ::cudaEventCreateWithFlags(event, flags); +} + +/** + * \brief \hl Allocates page-locked memory on the host + * + * Allocates \p size bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it + * can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * pinned memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to allocate staging areas for data exchange between host + * and device. + * + * The \p flags parameter enables different options to be specified that affect + * the allocation, as follows. + * - ::cudaHostAllocDefault: This flag's value is defined to be 0. 
+ * - ::cudaHostAllocPortable: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space. + * The device pointer to the memory may be obtained by calling + * ::cudaHostGetDevicePointer(). + * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC). + * WC memory can be transferred across the PCI Express bus more quickly on some + * system configurations, but cannot be read efficiently by most CPUs. WC + * memory is a good option for buffers that will be written by the CPU and read + * by the device via mapped pinned memory or host->device transfers. + * + * All of these flags are orthogonal to one another: a developer may allocate + * memory that is portable, mapped and/or write-combined with no restrictions. + * + * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost + * flag in order for the ::cudaHostAllocMapped flag to have any effect. + * + * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices + * that do not support mapped pinned memory. The failure is deferred to + * ::cudaHostGetDevicePointer() because the memory may be mapped into other + * CUDA contexts via the ::cudaHostAllocPortable flag. + * + * Memory allocated by this function must be freed with ::cudaFreeHost(). + * + * \param ptr - Device pointer to allocated memory + * \param size - Requested allocation size in bytes + * \param flags - Requested properties of allocated memory + * + * \return + * ::cudaSuccess, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaSetDeviceFlags, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc + */ +static __inline__ __host__ cudaError_t cudaMallocHost( + void **ptr, + size_t size, + unsigned int flags +) +{ + return ::cudaHostAlloc(ptr, size, flags); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaHostAlloc( + T **ptr, + size_t size, + unsigned int flags +) +{ + return ::cudaHostAlloc((void**)(void*)ptr, size, flags); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaHostGetDevicePointer( + T **pDevice, + void *pHost, + unsigned int flags +) +{ + return ::cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags); +} + +/** + * \brief Allocates memory that will be automatically managed by the Unified Memory system + * + * Allocates \p size bytes of managed memory on the device and returns in + * \p *devPtr a pointer to the allocated memory. If the device doesn't support + * allocating managed memory, ::cudaErrorNotSupported is returned. Support + * for managed memory can be queried using the device attribute + * ::cudaDevAttrManagedMemory. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p size + * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer + * is valid on the CPU and on all GPUs in the system that support managed memory. + * All accesses to this pointer must obey the Unified Memory programming model. + * + * \p flags specifies the default stream association for this allocation. + * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The + * default value for \p flags is ::cudaMemAttachGlobal. + * If ::cudaMemAttachGlobal is specified, then this memory is accessible from + * any stream on any device. 
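+ *
+ * [Editor's note - illustrative sketch, not part of the upstream NVIDIA header.
+ * It stages host data through page-locked memory allocated with the cudaHostAlloc
+ * overload above so the host-to-device copy can run asynchronously;
+ * `upload_via_pinned` is a hypothetical helper.]
+ * \code
+ * #include <cstring>   // std::memcpy
+ *
+ * cudaError_t upload_via_pinned(const float* src, float* d_dst, size_t count,
+ *                               cudaStream_t stream)
+ * {
+ *     float* h_staging = nullptr;
+ *     cudaError_t err = cudaHostAlloc(&h_staging, count * sizeof(float),
+ *                                     cudaHostAllocPortable);
+ *     if (err != cudaSuccess) return err;
+ *
+ *     std::memcpy(h_staging, src, count * sizeof(float));
+ *     err = cudaMemcpyAsync(d_dst, h_staging, count * sizeof(float),
+ *                           cudaMemcpyHostToDevice, stream);
+ *
+ *     cudaStreamSynchronize(stream);   // staging buffer must outlive the async copy
+ *     cudaFreeHost(h_staging);
+ *     return err;
+ * }
+ * \endcode
+ *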
If ::cudaMemAttachHost is specified, then the + * allocation should not be accessed from devices that have a zero value for the + * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to + * ::cudaStreamAttachMemAsync will be required to enable access on such devices. + * + * If the association is later changed via ::cudaStreamAttachMemAsync to + * a single stream, the default association, as specifed during ::cudaMallocManaged, + * is restored when that stream is destroyed. For __managed__ variables, the + * default association is always ::cudaMemAttachGlobal. Note that destroying a + * stream is an asynchronous operation, and as a result, the change to default + * association won't happen until all work in the stream has completed. + * + * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree. + * + * Device memory oversubscription is possible for GPUs that have a non-zero value for the + * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on + * such GPUs may be evicted from device memory to host memory at any time by the Unified + * Memory driver in order to make room for other allocations. + * + * In a multi-GPU system where all GPUs have a non-zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this + * API returns and instead may be populated on access. In such systems, managed memory can + * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to + * maintain data locality and prevent excessive page faults to the extent possible. The application + * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application + * can also explicitly migrate memory to a desired processor's memory via + * ::cudaMemPrefetchAsync. + * + * In a multi-GPU system where all of the GPUs have a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support + * with each other, the physical storage for managed memory is created on the GPU which is active + * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced + * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate + * memory among such GPUs. + * + * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and + * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess + * is zero for at least one of those GPUs, the location chosen for physical storage of managed + * memory is system-dependent. + * - On Linux, the location chosen will be device memory as long as the current set of active + * contexts are on devices that either have peer-to-peer support with each other or have a + * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. + * If there is an active context on a GPU that does not have a non-zero value for that device + * attribute and it does not have peer-to-peer support with the other devices that have active + * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. + * Note that this means that managed memory that is located in device memory is migrated to + * host memory if a new context is created on a GPU that doesn't have a non-zero value for + * the device attribute and does not support peer-to-peer with at least one of the other devices + * that has an active context. 
This in turn implies that context creation may fail if there is + * insufficient host memory to migrate all managed allocations. + * - On Windows, the physical storage is always created in 'zero-copy' or host memory. + * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these + * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to + * restrict CUDA to only use those GPUs that have peer-to-peer support. + * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero + * value to force the driver to always use device memory for physical storage. + * When this environment variable is set to a non-zero value, all devices used in + * that process that support managed memory have to be peer-to-peer compatible + * with each other. The error ::cudaErrorInvalidDevice will be returned if a device + * that supports managed memory is used and it is not peer-to-peer compatible with + * any of the other managed memory supporting devices that were previously used in + * that process, even if ::cudaDeviceReset has been called on those devices. These + * environment variables are described in the CUDA programming guide under the + * "CUDA environment variables" section. + * - On ARM, managed memory is not available on discrete gpu with Drive PX-2. + * + * \param devPtr - Pointer to allocated device memory + * \param size - Requested allocation size in bytes + * \param flags - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal) + * + * \return + * ::cudaSuccess, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, + * ::cudaMalloc3D, ::cudaMalloc3DArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync + */ +template<class T> +static __inline__ __host__ cudaError_t cudaMallocManaged( + T **devPtr, + size_t size, + unsigned int flags = cudaMemAttachGlobal +) +{ + return ::cudaMallocManaged((void**)(void*)devPtr, size, flags); +} + +/** + * \brief Attach memory to a stream asynchronously + * + * Enqueues an operation in \p stream to specify stream association of + * \p length bytes of memory starting from \p devPtr. This function is a + * stream-ordered operation, meaning that it is dependent on, and will + * only take effect when, previous work in stream has completed. Any + * previous association is automatically replaced. + * + * \p devPtr must point to an one of the following types of memories: + * - managed memory declared using the __managed__ keyword or allocated with + * ::cudaMallocManaged. + * - a valid host-accessible region of system-allocated pageable memory. This + * type of memory may only be specified if the device associated with the + * stream reports a non-zero value for the device attribute + * ::cudaDevAttrPageableMemoryAccess. + * + * For managed allocations, \p length must be either zero or the entire + * allocation's size. Both indicate that the entire allocation's stream + * association is being changed. Currently, it is not possible to change stream + * association for a portion of a managed allocation. + * + * For pageable allocations, \p length must be non-zero. 
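+ *
+ * [Editor's note - illustrative sketch, not part of the upstream NVIDIA header.
+ * It shows the basic cudaMallocManaged round trip documented above: the host
+ * initializes the managed buffer, a kernel updates it, and the host reads it back
+ * after synchronizing; `increment_all` and `demo_managed` are hypothetical names.]
+ * \code
+ * __global__ void increment_all(float* data, int n)
+ * {
+ *     int i = blockIdx.x * blockDim.x + threadIdx.x;
+ *     if (i < n) data[i] += 1.0f;
+ * }
+ *
+ * cudaError_t demo_managed(int n)
+ * {
+ *     float* data = nullptr;
+ *     cudaError_t err = cudaMallocManaged(&data, n * sizeof(float)); // cudaMemAttachGlobal by default
+ *     if (err != cudaSuccess) return err;
+ *
+ *     for (int i = 0; i < n; ++i) data[i] = 0.0f;   // host touches the managed buffer directly
+ *
+ *     increment_all<<<(n + 255) / 256, 256>>>(data, n);
+ *     cudaDeviceSynchronize();                      // required before the host reads again
+ *
+ *     err = cudaGetLastError();
+ *     cudaFree(data);
+ *     return err;
+ * }
+ * \endcode
+ *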
+ * + * The stream association is specified using \p flags which must be + * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle. + * The default value for \p flags is ::cudaMemAttachSingle + * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed + * by any stream on any device. + * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee + * that it won't access the memory on the device from any stream on a device that + * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. + * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with + * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess, + * the program makes a guarantee that it will only access the memory on the device + * from \p stream. It is illegal to attach singly to the NULL stream, because the + * NULL stream is a virtual global stream and not a specific stream. An error will + * be returned in this case. + * + * When memory is associated with a single stream, the Unified Memory system will + * allow CPU access to this memory region so long as all operations in \p stream + * have completed, regardless of whether other streams are active. In effect, + * this constrains exclusive ownership of the managed memory region by + * an active GPU to per-stream activity instead of whole-GPU activity. + * + * Accessing memory on the device from streams that are not associated with + * it will produce undefined results. No error checking is performed by the + * Unified Memory system to ensure that kernels launched into other streams + * do not access this region. + * + * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync + * via events, synchronization or other means to ensure legal access to memory + * at all times. Data visibility and coherency will be changed appropriately + * for all kernels which follow a stream-association change. + * + * If \p stream is destroyed while data is associated with it, the association is + * removed and the association reverts to the default visibility of the allocation + * as specified at ::cudaMallocManaged. For __managed__ variables, the default + * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an + * asynchronous operation, and as a result, the change to default association won't + * happen until all work in the stream has completed. 
+ * + * \param stream - Stream in which to enqueue the attach operation + * \param devPtr - Pointer to memory (must be a pointer to managed memory or + * to a valid host-accessible region of system-allocated + * memory) + * \param length - Length of memory (defaults to zero) + * \param flags - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle) + * + * \return + * ::cudaSuccess, + * ::cudaErrorNotReady, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged + */ +template<class T> +static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync( + cudaStream_t stream, + T *devPtr, + size_t length = 0, + unsigned int flags = cudaMemAttachSingle +) +{ + return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaMalloc( + T **devPtr, + size_t size +) +{ + return ::cudaMalloc((void**)(void*)devPtr, size); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaMallocHost( + T **ptr, + size_t size, + unsigned int flags = 0 +) +{ + return cudaMallocHost((void**)(void*)ptr, size, flags); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaMallocPitch( + T **devPtr, + size_t *pitch, + size_t width, + size_t height +) +{ + return ::cudaMallocPitch((void**)(void*)devPtr, pitch, width, height); +} + +/** + * \brief Allocate from a pool + * + * This is an alternate spelling for cudaMallocFromPoolAsync + * made available through operator overloading. + * + * \sa ::cudaMallocFromPoolAsync, + * \ref ::cudaMallocAsync(void** ptr, size_t size, cudaStream_t hStream) "cudaMallocAsync (C API)" + */ +static __inline__ __host__ cudaError_t cudaMallocAsync( + void **ptr, + size_t size, + cudaMemPool_t memPool, + cudaStream_t stream +) +{ + return ::cudaMallocFromPoolAsync(ptr, size, memPool, stream); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaMallocAsync( + T **ptr, + size_t size, + cudaMemPool_t memPool, + cudaStream_t stream +) +{ + return ::cudaMallocFromPoolAsync((void**)(void*)ptr, size, memPool, stream); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaMallocAsync( + T **ptr, + size_t size, + cudaStream_t stream +) +{ + return ::cudaMallocAsync((void**)(void*)ptr, size, stream); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaMallocFromPoolAsync( + T **ptr, + size_t size, + cudaMemPool_t memPool, + cudaStream_t stream +) +{ + return ::cudaMallocFromPoolAsync((void**)(void*)ptr, size, memPool, stream); +} + +#if defined(__CUDACC__) + +/** + * \brief \hl Copies data to the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p src + * to the memory area \p offset bytes from the start of symbol + * \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice. 
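+ *
+ * [Editor's note - illustrative sketch, not part of the upstream NVIDIA header.
+ * It shows the stream-ordered cudaMallocAsync overload defined above for a
+ * temporary device buffer, paired with cudaFreeAsync from the same stream-ordered
+ * allocator family; `stream_ordered_temp` is a hypothetical helper.]
+ * \code
+ * cudaError_t stream_ordered_temp(float* d_out, int n, cudaStream_t stream)
+ * {
+ *     float* d_tmp = nullptr;
+ *     // Allocation is ordered with the other work submitted to `stream`.
+ *     cudaError_t err = cudaMallocAsync(&d_tmp, n * sizeof(float), stream);
+ *     if (err != cudaSuccess) return err;
+ *
+ *     cudaMemsetAsync(d_tmp, 0, n * sizeof(float), stream);
+ *     cudaMemcpyAsync(d_out, d_tmp, n * sizeof(float),
+ *                     cudaMemcpyDeviceToDevice, stream);
+ *
+ *     cudaFreeAsync(d_tmp, stream);   // released once prior work in the stream completes
+ *     return cudaStreamSynchronize(stream);
+ * }
+ * \endcode
+ *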
+ * + * \param symbol - Device symbol reference + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync + */ +template<class T> +static __inline__ __host__ cudaError_t cudaMemcpyToSymbol( + const T &symbol, + const void *src, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyHostToDevice +) +{ + return ::cudaMemcpyToSymbol((const void*)&symbol, src, count, offset, kind); +} + +/** + * \brief \hl Copies data to the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p src + * to the memory area \p offset bytes from the start of symbol + * \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice. + * + * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If + * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy + * may overlap with operations in other streams. + * + * \param symbol - Device symbol reference + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_async + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyFromSymbolAsync + */ +template<class T> +static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync( + const T &symbol, + const void *src, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyHostToDevice, + cudaStream_t stream = 0 +) +{ + return ::cudaMemcpyToSymbolAsync((const void*)&symbol, src, count, offset, kind, stream); +} + +/** + * \brief \hl Copies data from the given symbol on the device + * + * Copies \p count bytes from the memory area \p offset bytes + * from the start of symbol \p symbol to the memory area pointed to by \p dst. + * The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice. 
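+ *
+ * [Editor's note - illustrative sketch, not part of the upstream NVIDIA header.
+ * It shows the template overloads above with a __constant__ symbol; `kGain`
+ * and `set_gain` are hypothetical names.]
+ * \code
+ * __constant__ float kGain;   // constant-memory symbol read by kernels
+ *
+ * cudaError_t set_gain(float value)
+ * {
+ *     // The overload takes the symbol by reference; kind defaults to cudaMemcpyHostToDevice.
+ *     return cudaMemcpyToSymbol(kGain, &value, sizeof(float));
+ * }
+ * \endcode
+ *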
+ * + * \param dst - Destination memory address + * \param symbol - Device symbol reference + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync + */ +template<class T> +static __inline__ __host__ cudaError_t cudaMemcpyFromSymbol( + void *dst, + const T &symbol, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost +) +{ + return ::cudaMemcpyFromSymbol(dst, (const void*)&symbol, count, offset, kind); +} + +/** + * \brief \hl Copies data from the given symbol on the device + * + * Copies \p count bytes from the memory area \p offset bytes + * from the start of symbol \p symbol to the memory area pointed to by \p dst. + * The memory areas may not overlap. \p symbol is a variable that resides in + * global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice. + * + * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally be + * associated to a stream by passing a non-zero \p stream argument. If \p kind + * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap + * with operations in other streams. + * + * \param dst - Destination memory address + * \param symbol - Device symbol reference + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_async + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync + */ +template<class T> +static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync( + void *dst, + const T &symbol, + size_t count, + size_t offset = 0, + enum cudaMemcpyKind kind = cudaMemcpyDeviceToHost, + cudaStream_t stream = 0 +) +{ + return ::cudaMemcpyFromSymbolAsync(dst, (const void*)&symbol, count, offset, kind, stream); +} + +/** + * \brief Creates a memcpy node to copy to a symbol on the device and adds it to a graph + * + * Creates a new memcpy node to copy to \p symbol and adds it to \p graph with + * \p numDependencies dependencies specified via \p pDependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. 
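+ *
+ * [Editor's note - illustrative sketch, not part of the upstream NVIDIA header.
+ * It reads a __device__ counter back to the host with the cudaMemcpyFromSymbol
+ * overload above; `kHitCount` and `read_hit_count` are hypothetical names.]
+ * \code
+ * __device__ unsigned long long kHitCount;   // e.g. updated by kernels via atomicAdd()
+ *
+ * cudaError_t read_hit_count(unsigned long long* out)
+ * {
+ *     // kind defaults to cudaMemcpyDeviceToHost.
+ *     return cudaMemcpyFromSymbol(out, kHitCount, sizeof(unsigned long long));
+ * }
+ * \endcode
+ *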
+ * A handle to the new node will be returned in \p pGraphNode. + * + * When the graph is launched, the node will copy \p count bytes from the memory area + * pointed to by \p src to the memory area pointed to by \p offset bytes from the start + * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of + * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault + * is only allowed on systems that support unified virtual addressing. + * + * Memcpy nodes have some additional restrictions with regards to managed memory, if the + * system contains at least one device which has a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess. + * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param symbol - Device symbol address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpyToSymbol, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemcpyNodeFromSymbol, + * ::cudaGraphMemcpyNodeGetParams, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsToSymbol, + * ::cudaGraphMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemsetNode + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGraphAddMemcpyNodeToSymbol( + cudaGraphNode_t *pGraphNode, + cudaGraph_t graph, + const cudaGraphNode_t *pDependencies, + size_t numDependencies, + const T &symbol, + const void* src, + size_t count, + size_t offset, + enum cudaMemcpyKind kind) +{ + return ::cudaGraphAddMemcpyNodeToSymbol(pGraphNode, graph, pDependencies, numDependencies, (const void*)&symbol, src, count, offset, kind); +} + +/** + * \brief Creates a memcpy node to copy from a symbol on the device and adds it to a graph + * + * Creates a new memcpy node to copy from \p symbol and adds it to \p graph with + * \p numDependencies dependencies specified via \p pDependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p pGraphNode. + * + * When the graph is launched, the node will copy \p count bytes from the memory area + * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area + * pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable + * that resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer + * is inferred from the pointer values. 
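+ *
+ * [Editor's note - illustrative sketch, not part of the upstream NVIDIA header.
+ * It builds a one-node graph with the cudaGraphAddMemcpyNodeToSymbol overload
+ * above and instantiates it; `kGain`, `build_gain_graph` and `h_gain` are
+ * hypothetical, and `h_gain` must stay valid for the lifetime of the graph.]
+ * \code
+ * __constant__ float kGain;
+ *
+ * cudaError_t build_gain_graph(cudaGraphExec_t* exec_out, const float* h_gain)
+ * {
+ *     cudaGraph_t graph = nullptr;
+ *     cudaGraphNode_t copyNode = nullptr;
+ *
+ *     cudaError_t err = cudaGraphCreate(&graph, 0);
+ *     if (err != cudaSuccess) return err;
+ *
+ *     // Root node (no dependencies): copy one float from host memory into kGain.
+ *     err = cudaGraphAddMemcpyNodeToSymbol(&copyNode, graph, nullptr, 0,
+ *                                          kGain, h_gain, sizeof(float), 0,
+ *                                          cudaMemcpyHostToDevice);
+ *     if (err == cudaSuccess)
+ *         err = cudaGraphInstantiate(exec_out, graph, nullptr, nullptr, 0);
+ *
+ *     cudaGraphDestroy(graph);   // the executable graph keeps its own copy of the topology
+ *     return err;
+ * }
+ * \endcode
+ *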
However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * Memcpy nodes have some additional restrictions with regards to managed memory, if the + * system contains at least one device which has a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess. + * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param dst - Destination memory address + * \param symbol - Device symbol address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpyFromSymbol, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemcpyNodeToSymbol, + * ::cudaGraphMemcpyNodeGetParams, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphMemcpyNodeSetParamsToSymbol, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemsetNode + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGraphAddMemcpyNodeFromSymbol( + cudaGraphNode_t* pGraphNode, + cudaGraph_t graph, + const cudaGraphNode_t* pDependencies, + size_t numDependencies, + void* dst, + const T &symbol, + size_t count, + size_t offset, + enum cudaMemcpyKind kind) +{ + return ::cudaGraphAddMemcpyNodeFromSymbol(pGraphNode, graph, pDependencies, numDependencies, dst, (const void*)&symbol, count, offset, kind); +} + +/** + * \brief Sets a memcpy node's parameters to copy to a symbol on the device + * + * Sets the parameters of memcpy node \p node to the copy described by the provided parameters. + * + * When the graph is launched, the node will copy \p count bytes from the memory area + * pointed to by \p src to the memory area pointed to by \p offset bytes from the start + * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of + * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault + * is only allowed on systems that support unified virtual addressing. 
+ * + * \param node - Node to set the parameters for + * \param symbol - Device symbol address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpyToSymbol, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphMemcpyNodeGetParams + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGraphMemcpyNodeSetParamsToSymbol( + cudaGraphNode_t node, + const T &symbol, + const void* src, + size_t count, + size_t offset, + enum cudaMemcpyKind kind) +{ + return ::cudaGraphMemcpyNodeSetParamsToSymbol(node, (const void*)&symbol, src, count, offset, kind); +} + +/** + * \brief Sets a memcpy node's parameters to copy from a symbol on the device + * + * Sets the parameters of memcpy node \p node to the copy described by the provided parameters. + * + * When the graph is launched, the node will copy \p count bytes from the memory area + * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area + * pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable + * that resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer + * is inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * \param node - Node to set the parameters for + * \param dst - Destination memory address + * \param symbol - Device symbol address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpyFromSymbol, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsToSymbol, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphMemcpyNodeGetParams + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGraphMemcpyNodeSetParamsFromSymbol( + cudaGraphNode_t node, + void* dst, + const T &symbol, + size_t count, + size_t offset, + enum cudaMemcpyKind kind) +{ + return ::cudaGraphMemcpyNodeSetParamsFromSymbol(node, dst, (const void*)&symbol, count, offset, kind); +} + +/** + * \brief Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the device + * + * Updates the work represented by \p node in \p hGraphExec as though \p node had + * contained the given params at instantiation. \p node must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from \p node are ignored. + * + * \p src and \p symbol must be allocated from the same contexts as the original source and + * destination memory. The instantiation-time memory operands must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p node is also + * not modified by this call. 
+ * + * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or + * the original memory operands are multidimensional. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param node - Memcpy node from the graph which was used to instantiate graphExec + * \param symbol - Device symbol address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemcpyNodeToSymbol, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsToSymbol, + * ::cudaGraphInstantiate, + * ::cudaGraphExecMemcpyNodeSetParams, + * ::cudaGraphExecMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphExecKernelNodeSetParams, + * ::cudaGraphExecMemsetNodeSetParams, + * ::cudaGraphExecHostNodeSetParams + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGraphExecMemcpyNodeSetParamsToSymbol( + cudaGraphExec_t hGraphExec, + cudaGraphNode_t node, + const T &symbol, + const void* src, + size_t count, + size_t offset, + enum cudaMemcpyKind kind) +{ + return ::cudaGraphExecMemcpyNodeSetParamsToSymbol(hGraphExec, node, (const void*)&symbol, src, count, offset, kind); +} + +/** + * \brief Sets the parameters for a memcpy node in the given graphExec to copy from a symbol on the device + * + * Updates the work represented by \p node in \p hGraphExec as though \p node had + * contained the given params at instantiation. \p node must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from \p node are ignored. + * + * \p symbol and \p dst must be allocated from the same contexts as the original source and + * destination memory. The instantiation-time memory operands must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p node is also + * not modified by this call. + * + * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or + * the original memory operands are multidimensional. 
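+ *
+ * [Editor's note - illustrative sketch, not part of the upstream NVIDIA header.
+ * It continues the graph sketch above: `copyNode` is the node handle returned by
+ * cudaGraphAddMemcpyNodeToSymbol and `kGain` the same __constant__ symbol; the
+ * instantiated graph is retargeted at a new host source and relaunched.]
+ * \code
+ * cudaError_t retarget_and_launch(cudaGraphExec_t exec, cudaGraphNode_t copyNode,
+ *                                 const float* h_newGain, cudaStream_t stream)
+ * {
+ *     cudaError_t err = cudaGraphExecMemcpyNodeSetParamsToSymbol(
+ *         exec, copyNode, kGain, h_newGain, sizeof(float), 0, cudaMemcpyHostToDevice);
+ *     if (err != cudaSuccess) return err;
+ *     return cudaGraphLaunch(exec, stream);   // future launches use the new source address
+ * }
+ * \endcode
+ *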
+ * + * \param hGraphExec - The executable graph in which to set the specified node + * \param node - Memcpy node from the graph which was used to instantiate graphExec + * \param dst - Destination memory address + * \param symbol - Device symbol address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemcpyNodeFromSymbol, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphInstantiate, + * ::cudaGraphExecMemcpyNodeSetParams, + * ::cudaGraphExecMemcpyNodeSetParamsToSymbol, + * ::cudaGraphExecKernelNodeSetParams, + * ::cudaGraphExecMemsetNodeSetParams, + * ::cudaGraphExecHostNodeSetParams + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGraphExecMemcpyNodeSetParamsFromSymbol( + cudaGraphExec_t hGraphExec, + cudaGraphNode_t node, + void* dst, + const T &symbol, + size_t count, + size_t offset, + enum cudaMemcpyKind kind) +{ + return ::cudaGraphExecMemcpyNodeSetParamsFromSymbol(hGraphExec, node, dst, (const void*)&symbol, count, offset, kind); +} + +#if __cplusplus >= 201103 + +/** + * \brief Creates a user object by wrapping a C++ object + * + * TODO detail + * + * \param object_out - Location to return the user object handle + * \param objectToWrap - This becomes the \ptr argument to ::cudaUserObjectCreate. A + * lambda will be passed for the \p destroy argument, which calls + * delete on this object pointer. + * \param initialRefcount - The initial refcount to create the object with, typically 1. The + * initial references are owned by the calling thread. + * \param flags - Currently it is required to pass cudaUserObjectNoDestructorSync, + * which is the only defined flag. This indicates that the destroy + * callback cannot be waited on by any CUDA API. Users requiring + * synchronization of the callback should signal its completion + * manually. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * + * \sa + * ::cudaUserObjectCreate + */ +template<class T> +static __inline__ __host__ cudaError_t cudaUserObjectCreate( + cudaUserObject_t *object_out, + T *objectToWrap, + unsigned int initialRefcount, + unsigned int flags) +{ + return ::cudaUserObjectCreate( + object_out, + objectToWrap, + [](void *vpObj) { delete reinterpret_cast<T *>(vpObj); }, + initialRefcount, + flags); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaUserObjectCreate( + cudaUserObject_t *object_out, + T *objectToWrap, + unsigned int initialRefcount, + cudaUserObjectFlags flags) +{ + return cudaUserObjectCreate(object_out, objectToWrap, initialRefcount, (unsigned int)flags); +} + +#endif + +/** + * \brief \hl Finds the address associated with a CUDA symbol + * + * Returns in \p *devPtr the address of symbol \p symbol on the device. + * \p symbol can either be a variable that resides in global or constant memory space. + * If \p symbol cannot be found, or if \p symbol is not declared + * in the global or constant memory space, \p *devPtr is unchanged and the error + * ::cudaErrorInvalidSymbol is returned. 
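+ *
+ * [Editor's note - illustrative sketch, not part of the upstream NVIDIA header.
+ * It wraps a hypothetical host-side resource (`UploadScratch`) in a user object so
+ * its lifetime is tied to a graph; cudaGraphRetainUserObject and
+ * cudaGraphUserObjectMove come from the same user-object API family.]
+ * \code
+ * #include <vector>
+ *
+ * struct UploadScratch { std::vector<float> staging; };
+ *
+ * cudaError_t tie_scratch_to_graph(cudaGraph_t graph, UploadScratch* scratch)
+ * {
+ *     cudaUserObject_t obj = nullptr;
+ *     // The template overload installs a destroy callback that calls `delete scratch`.
+ *     cudaError_t err = cudaUserObjectCreate(&obj, scratch, 1,
+ *                                            cudaUserObjectNoDestructorSync);
+ *     if (err != cudaSuccess) return err;
+ *
+ *     // Move our single reference into the graph; the scratch object now lives
+ *     // at least as long as the graph does.
+ *     return cudaGraphRetainUserObject(graph, obj, 1, cudaGraphUserObjectMove);
+ * }
+ * \endcode
+ *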
+ * + * \param devPtr - Return device pointer associated with symbol + * \param symbol - Device symbol reference + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)", + * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)" + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGetSymbolAddress( + void **devPtr, + const T &symbol +) +{ + return ::cudaGetSymbolAddress(devPtr, (const void*)&symbol); +} + +/** + * \brief \hl Finds the size of the object associated with a CUDA symbol + * + * Returns in \p *size the size of symbol \p symbol. \p symbol must be a + * variable that resides in global or constant memory space. + * If \p symbol cannot be found, or if \p symbol is not declared + * in global or constant memory space, \p *size is unchanged and the error + * ::cudaErrorInvalidSymbol is returned. + * + * \param size - Size of object associated with symbol + * \param symbol - Device symbol reference + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)", + * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)" + */ +template<class T> +static __inline__ __host__ cudaError_t cudaGetSymbolSize( + size_t *size, + const T &symbol +) +{ + return ::cudaGetSymbolSize(size, (const void*)&symbol); +} + +/** + * \brief \hl Binds a memory area to a texture + * + * Binds \p size bytes of the memory area pointed to by \p devPtr to texture + * reference \p tex. \p desc describes how the memory is interpreted when + * fetching values from the texture. The \p offset parameter is an optional + * byte offset as with the low-level + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()" + * function. Any memory previously bound to \p tex is unbound. 
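+ *
+ * [Editor's note - illustrative sketch, not part of the upstream NVIDIA header.
+ * It combines the cudaGetSymbolAddress and cudaGetSymbolSize overloads above to
+ * clear a __device__ array through its raw pointer; `kLut` and `zero_lut` are
+ * hypothetical names.]
+ * \code
+ * __device__ float kLut[256];   // device-side lookup table
+ *
+ * cudaError_t zero_lut(cudaStream_t stream)
+ * {
+ *     void*  d_ptr = nullptr;
+ *     size_t bytes = 0;
+ *     cudaError_t err = cudaGetSymbolAddress(&d_ptr, kLut);     // raw device pointer of the symbol
+ *     if (err == cudaSuccess) err = cudaGetSymbolSize(&bytes, kLut);
+ *     if (err == cudaSuccess) err = cudaMemsetAsync(d_ptr, 0, bytes, stream);
+ *     return err;
+ * }
+ * \endcode
+ *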
+ * + * \param offset - Offset in bytes + * \param tex - Texture to bind + * \param devPtr - Memory area on device + * \param desc - Channel format + * \param size - Size of the memory area pointed to by devPtr + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template<class T, int dim, enum cudaTextureReadMode readMode> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture( + size_t *offset, + const struct texture<T, dim, readMode> &tex, + const void *devPtr, + const struct cudaChannelFormatDesc &desc, + size_t size = UINT_MAX +) +{ + return ::cudaBindTexture(offset, &tex, devPtr, &desc, size); +} + +/** + * \brief \hl Binds a memory area to a texture + * + * Binds \p size bytes of the memory area pointed to by \p devPtr to texture + * reference \p tex. The channel descriptor is inherited from the texture + * reference type. The \p offset parameter is an optional byte offset as with + * the low-level + * ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) + * function. Any memory previously bound to \p tex is unbound. 
+ * + * \param offset - Offset in bytes + * \param tex - Texture to bind + * \param devPtr - Memory area on device + * \param size - Size of the memory area pointed to by devPtr + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template<class T, int dim, enum cudaTextureReadMode readMode> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture( + size_t *offset, + const struct texture<T, dim, readMode> &tex, + const void *devPtr, + size_t size = UINT_MAX +) +{ + return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size); +} + +/** + * \brief \hl Binds a 2D memory area to a texture + * + * Binds the 2D memory area pointed to by \p devPtr to the + * texture reference \p tex. The size of the area is constrained by + * \p width in texel units, \p height in texel units, and \p pitch in byte + * units. \p desc describes how the memory is interpreted when fetching values + * from the texture. Any memory previously bound to \p tex is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D()" + * returns in \p *offset a byte offset that + * must be applied to texture fetches in order to read from the desired memory. + * This offset must be divided by the texel size and passed to kernels that + * read from the texture so they can be applied to the ::tex2D() function. + * If the device memory pointer was returned from ::cudaMalloc(), the offset is + * guaranteed to be 0 and NULL may be passed as the \p offset parameter. 
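+ *
+ * For illustration, a minimal sketch (assumes a global 2D texture reference
+ * \p texRef and dimensions \p width x \p height):
+ * \code
+ * texture<float, 2, cudaReadModeElementType> texRef;   // assumed global texture reference
+ *
+ * void *devPtr;
+ * size_t pitch;
+ * cudaMallocPitch(&devPtr, &pitch, width * sizeof(float), height);
+ * cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ * size_t offset;
+ * cudaBindTexture2D(&offset, texRef, devPtr, desc, width, height, pitch);
+ * \endcode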
+ * + * \param offset - Offset in bytes + * \param tex - Texture reference to bind + * \param devPtr - 2D memory area on device + * \param desc - Channel format + * \param width - Width in texel units + * \param height - Height in texel units + * \param pitch - Pitch in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template<class T, int dim, enum cudaTextureReadMode readMode> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture2D( + size_t *offset, + const struct texture<T, dim, readMode> &tex, + const void *devPtr, + const struct cudaChannelFormatDesc &desc, + size_t width, + size_t height, + size_t pitch +) +{ + return ::cudaBindTexture2D(offset, &tex, devPtr, &desc, width, height, pitch); +} + +/** + * \brief \hl Binds a 2D memory area to a texture + * + * Binds the 2D memory area pointed to by \p devPtr to the + * texture reference \p tex. The size of the area is constrained by + * \p width in texel units, \p height in texel units, and \p pitch in byte + * units. The channel descriptor is inherited from the texture reference + * type. Any memory previously bound to \p tex is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D()" + * returns in \p *offset a byte offset that + * must be applied to texture fetches in order to read from the desired memory. + * This offset must be divided by the texel size and passed to kernels that + * read from the texture so they can be applied to the ::tex2D() function. + * If the device memory pointer was returned from ::cudaMalloc(), the offset is + * guaranteed to be 0 and NULL may be passed as the \p offset parameter. 
+ * + * \param offset - Offset in bytes + * \param tex - Texture reference to bind + * \param devPtr - 2D memory area on device + * \param width - Width in texel units + * \param height - Height in texel units + * \param pitch - Pitch in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template<class T, int dim, enum cudaTextureReadMode readMode> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture2D( + size_t *offset, + const struct texture<T, dim, readMode> &tex, + const void *devPtr, + size_t width, + size_t height, + size_t pitch +) +{ + return ::cudaBindTexture2D(offset, &tex, devPtr, &tex.channelDesc, width, height, pitch); +} + +/** + * \brief \hl Binds an array to a texture + * + * Binds the CUDA array \p array to the texture reference \p tex. + * \p desc describes how the memory is interpreted when fetching values from + * the texture. Any CUDA array previously bound to \p tex is unbound. 
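+ *
+ * For illustration, a minimal sketch (assumes a global 2D texture reference
+ * \p texRef and dimensions \p width x \p height):
+ * \code
+ * texture<float, 2, cudaReadModeElementType> texRef;   // assumed global texture reference
+ *
+ * cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ * cudaArray_t cuArray;
+ * cudaMallocArray(&cuArray, &desc, width, height);
+ * cudaBindTextureToArray(texRef, cuArray, desc);
+ * \endcode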
+ * + * \param tex - Texture to bind + * \param array - Memory array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template<class T, int dim, enum cudaTextureReadMode readMode> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToArray( + const struct texture<T, dim, readMode> &tex, + cudaArray_const_t array, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindTextureToArray(&tex, array, &desc); +} + +/** + * \brief \hl Binds an array to a texture + * + * Binds the CUDA array \p array to the texture reference \p tex. + * The channel descriptor is inherited from the CUDA array. Any CUDA array + * previously bound to \p tex is unbound. 
+ * + * \param tex - Texture to bind + * \param array - Memory array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template<class T, int dim, enum cudaTextureReadMode readMode> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToArray( + const struct texture<T, dim, readMode> &tex, + cudaArray_const_t array +) +{ + struct cudaChannelFormatDesc desc; + cudaError_t err = ::cudaGetChannelDesc(&desc, array); + + return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : err; +} + +/** + * \brief \hl Binds a mipmapped array to a texture + * + * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex. + * \p desc describes how the memory is interpreted when fetching values from + * the texture. Any CUDA mipmapped array previously bound to \p tex is unbound. 
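+ *
+ * For illustration, a minimal sketch (assumes a global 2D texture reference
+ * \p texRef, dimensions \p width x \p height, and \p numLevels mip levels):
+ * \code
+ * texture<float, 2, cudaReadModeElementType> texRef;   // assumed global texture reference
+ *
+ * cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ * cudaMipmappedArray_t mipArray;
+ * cudaExtent extent = make_cudaExtent(width, height, 0);
+ * cudaMallocMipmappedArray(&mipArray, &desc, extent, numLevels);
+ * cudaBindTextureToMipmappedArray(texRef, mipArray, desc);
+ * \endcode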
+ * + * \param tex - Texture to bind + * \param mipmappedArray - Memory mipmapped array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template<class T, int dim, enum cudaTextureReadMode readMode> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray( + const struct texture<T, dim, readMode> &tex, + cudaMipmappedArray_const_t mipmappedArray, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindTextureToMipmappedArray(&tex, mipmappedArray, &desc); +} + +/** + * \brief \hl Binds a mipmapped array to a texture + * + * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex. + * The channel descriptor is inherited from the CUDA array. Any CUDA mipmapped array + * previously bound to \p tex is unbound. 
+ * + * \param tex - Texture to bind + * \param mipmappedArray - Memory mipmapped array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template<class T, int dim, enum cudaTextureReadMode readMode> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray( + const struct texture<T, dim, readMode> &tex, + cudaMipmappedArray_const_t mipmappedArray +) +{ + struct cudaChannelFormatDesc desc; + cudaArray_t levelArray; + cudaError_t err = ::cudaGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0); + + if (err != cudaSuccess) { + return err; + } + err = ::cudaGetChannelDesc(&desc, levelArray); + + return err == cudaSuccess ? cudaBindTextureToMipmappedArray(tex, mipmappedArray, desc) : err; +} + +/** + * \brief \hl Unbinds a texture + * + * Unbinds the texture bound to \p tex. If \p texref is not currently bound, no operation is performed. 
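+ *
+ * For illustration (assumes a global texture reference \p texRef that was
+ * previously bound with ::cudaBindTexture):
+ * \code
+ * cudaUnbindTexture(texRef);
+ * \endcode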
+ * + * \param tex - Texture to unbind + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +template<class T, int dim, enum cudaTextureReadMode readMode> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaUnbindTexture( + const struct texture<T, dim, readMode> &tex +) +{ + return ::cudaUnbindTexture(&tex); +} + +/** + * \brief \hl Get the alignment offset of a texture + * + * Returns in \p *offset the offset that was returned when texture reference + * \p tex was bound. 
+ * + * \param offset - Offset of texture reference in bytes + * \param tex - Texture to get offset of + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidTexture, + * ::cudaErrorInvalidTextureBinding + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)", + * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" + */ +template<class T, int dim, enum cudaTextureReadMode readMode> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset( + size_t *offset, + const struct texture<T, dim, readMode> &tex +) +{ + return ::cudaGetTextureAlignmentOffset(offset, &tex); +} + +/** + * \brief \hl Sets the preferred cache configuration for a device function + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p cacheConfig the preferred cache configuration + * for the function specified via \p func. This is only a preference. The + * runtime will use the requested configuration if possible, but it is free to + * choose a different configuration if required to execute \p func. + * + * \p func must be a pointer to a function that executes on the device. + * The parameter specified by \p func must be declared as a \p __global__ + * function. If the specified function does not exist, + * then ::cudaErrorInvalidDeviceFunction is returned. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. 
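+ *
+ * For illustration, a minimal sketch (hypothetical kernel \p myKernel; the
+ * supported configurations are listed below):
+ * \code
+ * __global__ void myKernel(float *data);   // assumed kernel
+ *
+ * cudaFuncSetCacheConfig(myKernel, cudaFuncCachePreferShared);
+ * \endcode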
+ * + * The supported cache configurations are: + * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) + * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache + * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory + * + * \param func - device function pointer + * \param cacheConfig - Requested cache configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction + * \notefnerr + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost, + * ::cudaThreadGetCacheConfig, + * ::cudaThreadSetCacheConfig + */ +template<class T> +static __inline__ __host__ cudaError_t cudaFuncSetCacheConfig( + T *func, + enum cudaFuncCache cacheConfig +) +{ + return ::cudaFuncSetCacheConfig((const void*)func, cacheConfig); +} + +template<class T> +static __inline__ __host__ cudaError_t cudaFuncSetSharedMemConfig( + T *func, + enum cudaSharedMemConfig config +) +{ + return ::cudaFuncSetSharedMemConfig((const void*)func, config); +} + +#endif // __CUDACC__ + +/** + * \brief Returns occupancy for a device function + * + * Returns in \p *numBlocks the maximum number of active blocks per + * streaming multiprocessor for the device function. + * + * \param numBlocks - Returned occupancy + * \param func - Kernel function for which occupancy is calulated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock + */ +template<class T> +static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor( + int *numBlocks, + T func, + int blockSize, + size_t dynamicSMemSize) +{ + return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault); +} + +/** + * \brief Returns occupancy for a device function with the specified flags + * + * Returns in \p *numBlocks the maximum number of active blocks per + * streaming multiprocessor for the device function. + * + * The \p flags parameter controls how special cases are handled. Valid flags include: + * + * - ::cudaOccupancyDefault: keeps the default behavior as + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * + * - ::cudaOccupancyDisableCachingOverride: suppresses the default behavior + * on platform where global caching affects occupancy. On such platforms, if caching + * is enabled, but per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching is disabled. 
+ * Setting this flag makes the occupancy calculator to return 0 in such cases. + * More information can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * \param numBlocks - Returned occupancy + * \param func - Kernel function for which occupancy is calulated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock + */ +template<class T> +static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int *numBlocks, + T func, + int blockSize, + size_t dynamicSMemSize, + unsigned int flags) +{ + return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, flags); +} + +/** + * Helper functor for cudaOccupancyMaxPotentialBlockSize + */ +class __cudaOccupancyB2DHelper { + size_t n; +public: + inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {} + inline __host__ CUDART_DEVICE size_t operator()(int) + { + return n; + } +}; + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * The \p flags parameter controls how special cases are handled. Valid flags include: + * + * - ::cudaOccupancyDefault: keeps the default behavior as + * ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * + * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior + * on platform where global caching affects occupancy. On such platforms, if caching + * is enabled, but per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching is disabled. + * Setting this flag makes the occupancy calculator to return 0 in such cases. + * More information can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. 
+ * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock + */ + +template<typename UnaryFunction, class T> +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags( + int *minGridSize, + int *blockSize, + T func, + UnaryFunction blockSizeToDynamicSMemSize, + int blockSizeLimit = 0, + unsigned int flags = 0) +{ + cudaError_t status; + + // Device and function properties + int device; + struct cudaFuncAttributes attr; + + // Limits + int maxThreadsPerMultiProcessor; + int warpSize; + int devMaxThreadsPerBlock; + int multiProcessorCount; + int funcMaxThreadsPerBlock; + int occupancyLimit; + int granularity; + + // Recorded maximum + int maxBlockSize = 0; + int numBlocks = 0; + int maxOccupancy = 0; + + // Temporary + int blockSizeToTryAligned; + int blockSizeToTry; + int blockSizeLimitAligned; + int occupancyInBlocks; + int occupancyInThreads; + size_t dynamicSMemSize; + + /////////////////////////// + // Check user input + /////////////////////////// + + if (!minGridSize || !blockSize || !func) { + return cudaErrorInvalidValue; + } + + ////////////////////////////////////////////// + // Obtain device and function properties + ////////////////////////////////////////////// + + status = ::cudaGetDevice(&device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &maxThreadsPerMultiProcessor, + cudaDevAttrMaxThreadsPerMultiProcessor, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &warpSize, + cudaDevAttrWarpSize, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &devMaxThreadsPerBlock, + cudaDevAttrMaxThreadsPerBlock, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaDeviceGetAttribute( + &multiProcessorCount, + cudaDevAttrMultiProcessorCount, + device); + if (status != cudaSuccess) { + return status; + } + + status = cudaFuncGetAttributes(&attr, func); + if (status != cudaSuccess) { + return status; + } + + funcMaxThreadsPerBlock = attr.maxThreadsPerBlock; + + ///////////////////////////////////////////////////////////////////////////////// + // Try each block size, and pick the block size with maximum occupancy + ///////////////////////////////////////////////////////////////////////////////// + + occupancyLimit = maxThreadsPerMultiProcessor; + granularity = warpSize; + + if (blockSizeLimit == 0) { + blockSizeLimit = devMaxThreadsPerBlock; + } + + if (devMaxThreadsPerBlock < blockSizeLimit) { + blockSizeLimit = devMaxThreadsPerBlock; + } + + if (funcMaxThreadsPerBlock < blockSizeLimit) { + blockSizeLimit = funcMaxThreadsPerBlock; + } + + blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity; + + for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) { + // This is needed for the first iteration, because + // blockSizeLimitAligned 
could be greater than blockSizeLimit + // + if (blockSizeLimit < blockSizeToTryAligned) { + blockSizeToTry = blockSizeLimit; + } else { + blockSizeToTry = blockSizeToTryAligned; + } + + dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry); + + status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + &occupancyInBlocks, + func, + blockSizeToTry, + dynamicSMemSize, + flags); + + if (status != cudaSuccess) { + return status; + } + + occupancyInThreads = blockSizeToTry * occupancyInBlocks; + + if (occupancyInThreads > maxOccupancy) { + maxBlockSize = blockSizeToTry; + numBlocks = occupancyInBlocks; + maxOccupancy = occupancyInThreads; + } + + // Early out if we have reached the maximum + // + if (occupancyLimit == maxOccupancy) { + break; + } + } + + /////////////////////////// + // Return best available + /////////////////////////// + + // Suggested min grid size to achieve a full machine launch + // + *minGridSize = numBlocks * multiProcessorCount; + *blockSize = maxBlockSize; + + return status; +} + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock + */ + +template<typename UnaryFunction, class T> +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem( + int *minGridSize, + int *blockSize, + T func, + UnaryFunction blockSizeToDynamicSMemSize, + int blockSizeLimit = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, blockSizeLimit, cudaOccupancyDefault); +} + +/** + * \brief Returns grid and block size that achieves maximum potential occupancy for a device function + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the + * amount of per-block dynamic shared memory changes with different + * block sizes. 
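+ *
+ * For illustration, a minimal sketch (hypothetical kernel \p myKernel and
+ * element count \p n):
+ * \code
+ * __global__ void myKernel(float *data, int n);   // assumed kernel
+ *
+ * int minGridSize = 0;
+ * int blockSize = 0;
+ * cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, myKernel);
+ *
+ * // Round the grid up so every element is covered, then launch.
+ * int gridSize = (n + blockSize - 1) / blockSize;
+ * myKernel<<<gridSize, blockSize>>>(data, n);
+ * \endcode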
+ * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock + */ +template<class T> +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize( + int *minGridSize, + int *blockSize, + T func, + size_t dynamicSMemSize = 0, + int blockSizeLimit = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault); +} + +/** + * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM. + * + * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. + * + * \param dynamicSmemSize - Returned maximum dynamic shared memory + * \param func - Kernel function for which occupancy is calculated + * \param numBlocks - Number of blocks to fit on SM + * \param blockSize - Size of the block + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + */ +template<class T> +static __inline__ __host__ cudaError_t cudaOccupancyAvailableDynamicSMemPerBlock( + size_t *dynamicSmemSize, + T func, + int numBlocks, + int blockSize) +{ + return ::cudaOccupancyAvailableDynamicSMemPerBlock(dynamicSmemSize, (const void*)func, numBlocks, blockSize); +} + +/** + * \brief Returns grid and block size that achived maximum potential occupancy for a device function with the specified flags + * + * Returns in \p *minGridSize and \p *blocksize a suggested grid / + * block size pair that achieves the best potential occupancy + * (i.e. the maximum number of active warps with the smallest number + * of blocks). + * + * The \p flags parameter controls how special cases are handle. Valid flags include: + * + * - ::cudaOccupancyDefault: keeps the default behavior as + * ::cudaOccupancyMaxPotentialBlockSize + * + * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior + * on platform where global caching affects occupancy. On such platforms, if caching + * is enabled, but per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching is disabled. 
+ * Setting this flag makes the occupancy calculator to return 0 in such cases. + * More information can be found about this feature in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the + * amount of per-block dynamic shared memory changes with different + * block sizes. + * + * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy + * \param blockSize - Returned block size + * \param func - Device function symbol + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param blockSizeLimit - The maximum block size \p func is designed to work with. 0 means no limit. + * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxPotentialBlockSize + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem + * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags + * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock + */ +template<class T> +static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeWithFlags( + int *minGridSize, + int *blockSize, + T func, + size_t dynamicSMemSize = 0, + int blockSizeLimit = 0, + unsigned int flags = 0) +{ + return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, flags); +} + +/** + * \brief Given the kernel function (\p func) and launch configuration + * (\p config), return the maximum cluster size in \p *clusterSize. + * + * The cluster dimensions in \p config are ignored. If func has a required + * cluster size set (see ::cudaFuncGetAttributes),\p *clusterSize will reflect + * the required cluster size. + * + * By default this function will always return a value that's portable on + * future hardware. A higher value may be returned if the kernel function + * allows non-portable cluster sizes. + * + * This function will respect the compile time launch bounds. + * + * \param clusterSize - Returned maximum cluster size that can be launched + * for the given kernel function and launch configuration + * \param func - Kernel function for which maximum cluster + * size is calculated + * \param config - Launch configuration for the given kernel function + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaFuncGetAttributes + */ +template<class T> +static __inline__ __host__ cudaError_t cudaOccupancyMaxPotentialClusterSize( + int *clusterSize, + T *func, + const cudaLaunchConfig_t *config) +{ + return ::cudaOccupancyMaxPotentialClusterSize(clusterSize, (const void*)func, config); +} + +/** + * \brief Given the kernel function (\p func) and launch configuration + * (\p config), return the maximum number of clusters that could co-exist + * on the target device in \p *numClusters. + * + * If the function has required cluster size already set (see + * ::cudaFuncGetAttributes), the cluster size from config must either be + * unspecified or match the required size. 
+ * Without required sizes, the cluster size must be specified in config, + * else the function will return an error. + * + * Note that various attributes of the kernel function may affect occupancy + * calculation. Runtime environment may affect how the hardware schedules + * the clusters, so the calculated occupancy is not guaranteed to be achievable. + * + * \param numClusters - Returned maximum number of clusters that + * could co-exist on the target device + * \param func - Kernel function for which maximum number + * of clusters are calculated + * \param config - Launch configuration for the given kernel function + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidClusterSize, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaFuncGetAttributes + */ +template<class T> +static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveClusters( + int *numClusters, + T *func, + const cudaLaunchConfig_t *config) +{ + return ::cudaOccupancyMaxActiveClusters(numClusters, (const void*)func, config); +} + +#if defined __CUDACC__ + +/** + * \brief \hl Find out attributes for a given function + * + * This function obtains the attributes of a function specified via \p entry. + * The parameter \p entry must be a pointer to a function that executes + * on the device. The parameter specified by \p entry must be declared as a \p __global__ + * function. The fetched attributes are placed in \p attr. If the specified + * function does not exist, then ::cudaErrorInvalidDeviceFunction is returned. + * + * Note that some function attributes such as + * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock" + * may vary based on the device that is currently being used. + * + * \param attr - Return pointer to function's attributes + * \param entry - Function to get attributes of + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction + * \notefnerr + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost + */ +template<class T> +static __inline__ __host__ cudaError_t cudaFuncGetAttributes( + struct cudaFuncAttributes *attr, + T *entry +) +{ + return ::cudaFuncGetAttributes(attr, (const void*)entry); +} + +/** + * \brief \hl Set attributes for a given function + * + * This function sets the attributes of a function specified via \p entry. + * The parameter \p entry must be a pointer to a function that executes + * on the device. The parameter specified by \p entry must be declared as a \p __global__ + * function. The enumeration defined by \p attr is set to the value defined by \p value. + * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned. + * If the specified attribute cannot be written, or if the value is incorrect, + * then ::cudaErrorInvalidValue is returned. + * + * Valid values for \p attr are: + * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - The requested maximum size in bytes of dynamically-allocated shared memory. 
The sum of this value and the function attribute ::sharedSizeBytes + * cannot exceed the device attribute ::cudaDevAttrMaxSharedMemoryPerBlockOptin. The maximal size of requestable dynamic shared memory may differ by GPU architecture. + * - ::cudaFuncAttributePreferredSharedMemoryCarveout - On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total shared memory. See ::cudaDevAttrMaxSharedMemoryPerMultiprocessor. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. + * - ::cudaFuncAttributeRequiredClusterWidth: The required cluster width in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return cudaErrorNotPermitted. + * - ::cudaFuncAttributeRequiredClusterHeight: The required cluster height in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return cudaErrorNotPermitted. + * - ::cudaFuncAttributeRequiredClusterDepth: The required cluster depth in + * blocks. The width, height, and depth values must either all be 0 or all be + * positive. The validity of the cluster dimensions is checked at launch time. + * If the value is set during compile time, it cannot be set at runtime. + * Setting it at runtime will return cudaErrorNotPermitted. + * - ::cudaFuncAttributeClusterSchedulingPolicyPreference: The block + * scheduling policy of a function. The value type is cudaClusterSchedulingPolicy. + * + * \param entry - Function to get attributes of + * \param attr - Attribute to set + * \param value - Value to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * ::cudaSetDoubleForDevice, + * ::cudaSetDoubleForHost + */ +template<class T> +static __inline__ __host__ cudaError_t cudaFuncSetAttribute( + T *entry, + enum cudaFuncAttribute attr, + int value +) +{ + return ::cudaFuncSetAttribute((const void*)entry, attr, value); +} + +/** + * \brief \hl Binds an array to a surface + * + * Binds the CUDA array \p array to the surface reference \p surf. + * \p desc describes how the memory is interpreted when dealing with + * the surface. Any CUDA array previously bound to \p surf is unbound. 
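+ *
+ * For illustration, a minimal sketch (assumes a global surface reference
+ * \p surfRef and dimensions \p width x \p height):
+ * \code
+ * surface<void, 2> surfRef;   // assumed global surface reference
+ *
+ * cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ * cudaArray_t cuArray;
+ * cudaMallocArray(&cuArray, &desc, width, height, cudaArraySurfaceLoadStore);
+ * cudaBindSurfaceToArray(surfRef, cuArray, desc);
+ * \endcode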
+ * + * \param surf - Surface to bind + * \param array - Memory array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSurface + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", + * \ref ::cudaBindSurfaceToArray(const struct surface<T, dim>&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)" + */ +template<class T, int dim> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindSurfaceToArray( + const struct surface<T, dim> &surf, + cudaArray_const_t array, + const struct cudaChannelFormatDesc &desc +) +{ + return ::cudaBindSurfaceToArray(&surf, array, &desc); +} + +/** + * \brief \hl Binds an array to a surface + * + * Binds the CUDA array \p array to the surface reference \p surf. + * The channel descriptor is inherited from the CUDA array. Any CUDA array + * previously bound to \p surf is unbound. + * + * \param surf - Surface to bind + * \param array - Memory array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSurface + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", + * \ref ::cudaBindSurfaceToArray(const struct surface<T, dim>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)" + */ +template<class T, int dim> +static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindSurfaceToArray( + const struct surface<T, dim> &surf, + cudaArray_const_t array +) +{ + struct cudaChannelFormatDesc desc; + cudaError_t err = ::cudaGetChannelDesc(&desc, array); + + return err == cudaSuccess ? cudaBindSurfaceToArray(surf, array, desc) : err; +} + +#endif /* __CUDACC__ */ + +/** @} */ /* END CUDART_HIGHLEVEL */ + +#endif /* __cplusplus && !__CUDACC_RTC__ */ + +#if !defined(__CUDACC_RTC__) +#if defined(__GNUC__) +#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))) +#pragma GCC diagnostic pop +#endif +#elif defined(_MSC_VER) +#pragma warning(pop) +#endif +#endif + +#undef __CUDA_DEPRECATED + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__ +#endif + +#endif /* !__CUDA_RUNTIME_H__ */ diff --git a/ext/cudart/include/cuda_runtime_api.h b/ext/cudart/include/cuda_runtime_api.h new file mode 100644 index 0000000000000000000000000000000000000000..9e5501ac5586aa77e843c30b4825cc08c1aa4abe --- /dev/null +++ b/ext/cudart/include/cuda_runtime_api.h @@ -0,0 +1,13380 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + + + +#if !defined(__CUDA_RUNTIME_API_H__) +#define __CUDA_RUNTIME_API_H__ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_API_H__ +#endif + +/** + * \latexonly + * \page sync_async API synchronization behavior + * + * \section memcpy_sync_async_behavior Memcpy + * The API provides memcpy/memset functions in both synchronous and asynchronous forms, + * the latter having an \e "Async" suffix. This is a misnomer as each function + * may exhibit synchronous or asynchronous behavior depending on the arguments + * passed to the function. In the reference documentation, each memcpy function is + * categorized as \e synchronous or \e asynchronous, corresponding to the definitions + * below. + * + * \subsection MemcpySynchronousBehavior Synchronous + * + * <ol> + * <li> For transfers from pageable host memory to device memory, a stream sync is performed + * before the copy is initiated. The function will return once the pageable + * buffer has been copied to the staging memory for DMA transfer to device memory, + * but the DMA to final destination may not have completed. + * + * <li> For transfers from pinned host memory to device memory, the function is synchronous + * with respect to the host. + * + * <li> For transfers from device to either pageable or pinned host memory, the function returns + * only once the copy has completed. 
+ * + * <li> For transfers from device memory to device memory, no host-side synchronization is + * performed. + * + * <li> For transfers from any host memory to any host memory, the function is fully + * synchronous with respect to the host. + * </ol> + * + * \subsection MemcpyAsynchronousBehavior Asynchronous + * + * <ol> + * <li> For transfers from device memory to pageable host memory, the function + * will return only once the copy has completed. + * + * <li> For transfers from any host memory to any host memory, the function is fully + * synchronous with respect to the host. + * + * <li> For all other transfers, the function is fully asynchronous. If pageable + * memory must first be staged to pinned memory, this will be handled + * asynchronously with a worker thread. + * </ol> + * + * \section memset_sync_async_behavior Memset + * The cudaMemset functions are asynchronous with respect to the host + * except when the target memory is pinned host memory. The \e Async + * versions are always asynchronous with respect to the host. + * + * \section kernel_launch_details Kernel Launches + * Kernel launches are asynchronous with respect to the host. Details of + * concurrent kernel execution and data transfers can be found in the CUDA + * Programmers Guide. + * + * \endlatexonly + */ + +/** + * There are two levels for the runtime API. + * + * The C API (<i>cuda_runtime_api.h</i>) is + * a C-style interface that does not require compiling with \p nvcc. + * + * The \ref CUDART_HIGHLEVEL "C++ API" (<i>cuda_runtime.h</i>) is a + * C++-style interface built on top of the C API. It wraps some of the + * C API routines, using overloading, references and default arguments. + * These wrappers can be used from C++ code and can be compiled with any C++ + * compiler. The C++ API also has some CUDA-specific wrappers that wrap + * C API routines that deal with symbols, textures, and device functions. + * These wrappers require the use of \p nvcc because they depend on code being + * generated by the compiler. For example, the execution configuration syntax + * to invoke kernels is only available in source code compiled with \p nvcc. 
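To make the pageable-versus-pinned distinction above concrete, a minimal host-side sketch using only C API calls (illustrative function name, single device assumed, error handling omitted):

#include <cuda_runtime_api.h>

void copy_sketch(void)
{
    enum { N = 1 << 20 };
    void *dev = NULL, *pinned = NULL;
    cudaStream_t stream;

    cudaMalloc(&dev, N);
    cudaHostAlloc(&pinned, N, cudaHostAllocDefault);   /* pinned host buffer */
    cudaStreamCreate(&stream);

    /* Pinned source: the copy is issued asynchronously and may overlap
     * with host work until the stream is synchronized. A pageable source
     * would first be staged to pinned memory by the runtime. */
    cudaMemcpyAsync(dev, pinned, N, cudaMemcpyHostToDevice, stream);
    cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    cudaFreeHost(pinned);
    cudaFree(dev);
}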
+ */ + +/** CUDA Runtime API Version */ +#define CUDART_VERSION 11080 + +#if defined(__CUDA_API_VER_MAJOR__) && defined(__CUDA_API_VER_MINOR__) +# define __CUDART_API_VERSION ((__CUDA_API_VER_MAJOR__ * 1000) + (__CUDA_API_VER_MINOR__ * 10)) +#else +# define __CUDART_API_VERSION CUDART_VERSION +#endif + +#ifndef __DOXYGEN_ONLY__ +#include "crt/host_defines.h" +#endif +#include "builtin_types.h" + +#include "cuda_device_runtime_api.h" + +#if defined(CUDA_API_PER_THREAD_DEFAULT_STREAM) || defined(__CUDA_API_VERSION_INTERNAL) + #define __CUDART_API_PER_THREAD_DEFAULT_STREAM + #define __CUDART_API_PTDS(api) api ## _ptds + #define __CUDART_API_PTSZ(api) api ## _ptsz +#else + #define __CUDART_API_PTDS(api) api + #define __CUDART_API_PTSZ(api) api +#endif + +#define cudaSignalExternalSemaphoresAsync __CUDART_API_PTSZ(cudaSignalExternalSemaphoresAsync_v2) +#define cudaWaitExternalSemaphoresAsync __CUDART_API_PTSZ(cudaWaitExternalSemaphoresAsync_v2) + +#if defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM) + #define cudaMemcpy __CUDART_API_PTDS(cudaMemcpy) + #define cudaMemcpyToSymbol __CUDART_API_PTDS(cudaMemcpyToSymbol) + #define cudaMemcpyFromSymbol __CUDART_API_PTDS(cudaMemcpyFromSymbol) + #define cudaMemcpy2D __CUDART_API_PTDS(cudaMemcpy2D) + #define cudaMemcpyToArray __CUDART_API_PTDS(cudaMemcpyToArray) + #define cudaMemcpy2DToArray __CUDART_API_PTDS(cudaMemcpy2DToArray) + #define cudaMemcpyFromArray __CUDART_API_PTDS(cudaMemcpyFromArray) + #define cudaMemcpy2DFromArray __CUDART_API_PTDS(cudaMemcpy2DFromArray) + #define cudaMemcpyArrayToArray __CUDART_API_PTDS(cudaMemcpyArrayToArray) + #define cudaMemcpy2DArrayToArray __CUDART_API_PTDS(cudaMemcpy2DArrayToArray) + #define cudaMemcpy3D __CUDART_API_PTDS(cudaMemcpy3D) + #define cudaMemcpy3DPeer __CUDART_API_PTDS(cudaMemcpy3DPeer) + #define cudaMemset __CUDART_API_PTDS(cudaMemset) + #define cudaMemset2D __CUDART_API_PTDS(cudaMemset2D) + #define cudaMemset3D __CUDART_API_PTDS(cudaMemset3D) + #define cudaGraphUpload __CUDART_API_PTSZ(cudaGraphUpload) + #define cudaGraphLaunch __CUDART_API_PTSZ(cudaGraphLaunch) + #define cudaStreamBeginCapture __CUDART_API_PTSZ(cudaStreamBeginCapture) + #define cudaStreamEndCapture __CUDART_API_PTSZ(cudaStreamEndCapture) + #define cudaStreamGetCaptureInfo __CUDART_API_PTSZ(cudaStreamGetCaptureInfo) + #define cudaStreamGetCaptureInfo_v2 __CUDART_API_PTSZ(cudaStreamGetCaptureInfo_v2) + #define cudaStreamIsCapturing __CUDART_API_PTSZ(cudaStreamIsCapturing) + #define cudaMemcpyAsync __CUDART_API_PTSZ(cudaMemcpyAsync) + #define cudaMemcpyToSymbolAsync __CUDART_API_PTSZ(cudaMemcpyToSymbolAsync) + #define cudaMemcpyFromSymbolAsync __CUDART_API_PTSZ(cudaMemcpyFromSymbolAsync) + #define cudaMemcpy2DAsync __CUDART_API_PTSZ(cudaMemcpy2DAsync) + #define cudaMemcpyToArrayAsync __CUDART_API_PTSZ(cudaMemcpyToArrayAsync) + #define cudaMemcpy2DToArrayAsync __CUDART_API_PTSZ(cudaMemcpy2DToArrayAsync) + #define cudaMemcpyFromArrayAsync __CUDART_API_PTSZ(cudaMemcpyFromArrayAsync) + #define cudaMemcpy2DFromArrayAsync __CUDART_API_PTSZ(cudaMemcpy2DFromArrayAsync) + #define cudaMemcpy3DAsync __CUDART_API_PTSZ(cudaMemcpy3DAsync) + #define cudaMemcpy3DPeerAsync __CUDART_API_PTSZ(cudaMemcpy3DPeerAsync) + #define cudaMemsetAsync __CUDART_API_PTSZ(cudaMemsetAsync) + #define cudaMemset2DAsync __CUDART_API_PTSZ(cudaMemset2DAsync) + #define cudaMemset3DAsync __CUDART_API_PTSZ(cudaMemset3DAsync) + #define cudaStreamQuery __CUDART_API_PTSZ(cudaStreamQuery) + #define cudaStreamGetFlags __CUDART_API_PTSZ(cudaStreamGetFlags) + #define 
cudaStreamGetPriority __CUDART_API_PTSZ(cudaStreamGetPriority) + #define cudaEventRecord __CUDART_API_PTSZ(cudaEventRecord) + #define cudaEventRecordWithFlags __CUDART_API_PTSZ(cudaEventRecordWithFlags) + #define cudaStreamWaitEvent __CUDART_API_PTSZ(cudaStreamWaitEvent) + #define cudaStreamAddCallback __CUDART_API_PTSZ(cudaStreamAddCallback) + #define cudaStreamAttachMemAsync __CUDART_API_PTSZ(cudaStreamAttachMemAsync) + #define cudaStreamSynchronize __CUDART_API_PTSZ(cudaStreamSynchronize) + #define cudaLaunchKernel __CUDART_API_PTSZ(cudaLaunchKernel) + #define cudaLaunchKernelExC __CUDART_API_PTSZ(cudaLaunchKernelExC) + #define cudaLaunchHostFunc __CUDART_API_PTSZ(cudaLaunchHostFunc) + #define cudaMemPrefetchAsync __CUDART_API_PTSZ(cudaMemPrefetchAsync) + #define cudaLaunchCooperativeKernel __CUDART_API_PTSZ(cudaLaunchCooperativeKernel) + #define cudaStreamCopyAttributes __CUDART_API_PTSZ(cudaStreamCopyAttributes) + #define cudaStreamGetAttribute __CUDART_API_PTSZ(cudaStreamGetAttribute) + #define cudaStreamSetAttribute __CUDART_API_PTSZ(cudaStreamSetAttribute) + #define cudaMallocAsync __CUDART_API_PTSZ(cudaMallocAsync) + #define cudaFreeAsync __CUDART_API_PTSZ(cudaFreeAsync) + #define cudaMallocFromPoolAsync __CUDART_API_PTSZ(cudaMallocFromPoolAsync) + #define cudaGetDriverEntryPoint __CUDART_API_PTSZ(cudaGetDriverEntryPoint) +#endif + +/** \cond impl_private */ +#if !defined(__dv) + +#if defined(__cplusplus) + +#define __dv(v) \ + = v + +#else /* __cplusplus */ + +#define __dv(v) + +#endif /* __cplusplus */ + +#endif /* !__dv */ +/** \endcond impl_private */ + +#if (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 350)) /** Visible to SM>=3.5 and "__host__ __device__" only **/ + +#define CUDART_DEVICE __device__ + +#else + +#define CUDART_DEVICE + +#endif /** CUDART_DEVICE */ + +#if !defined(__CUDACC_RTC__) +#define EXCLUDE_FROM_RTC + +/** \cond impl_private */ +#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif +/** \endcond impl_private */ + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +/** + * \defgroup CUDART_DEVICE Device Management + * + * ___MANBRIEF___ device management functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the device management functions of the CUDA runtime + * application programming interface. + * + * @{ + */ + +/** + * \brief Destroy all allocations and reset all state on the current device + * in the current process. + * + * Explicitly destroys and cleans up all resources associated with the current + * device in the current process. It is the caller's responsibility to ensure + * that the resources are not accessed or passed in subsequent API calls and + * doing so will result in undefined behavior. These resources include CUDA types + * such as ::cudaStream_t, ::cudaEvent_t, ::cudaArray_t, ::cudaMipmappedArray_t, + * ::cudaTextureObject_t, ::cudaSurfaceObject_t, ::textureReference, ::surfaceReference, + * ::cudaExternalMemory_t, ::cudaExternalSemaphore_t and ::cudaGraphicsResource_t. + * Any subsequent API call to this device will reinitialize the device. + * + * Note that this function will reset the device immediately. 
It is the caller's + * responsibility to ensure that the device is not being accessed by any + * other host threads from the process when this function is called. + * + * \return + * ::cudaSuccess + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceSynchronize + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceReset(void); + +/** + * \brief Wait for compute device to finish + * + * Blocks until the device has completed all preceding requested tasks. + * ::cudaDeviceSynchronize() returns an error if one of the preceding tasks + * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for + * this device, the host thread will block until the device has finished + * its work. + * + * \return + * ::cudaSuccess + * \note_device_sync_deprecated + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaDeviceReset, + * ::cuCtxSynchronize + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceSynchronize(void); + +/** + * \brief Set resource limits + * + * Setting \p limit to \p value is a request by the application to update + * the current limit maintained by the device. The driver is free to + * modify the requested value to meet h/w requirements (this could be + * clamping to minimum or maximum values, rounding up to nearest element + * size, etc). The application can use ::cudaDeviceGetLimit() to find out + * exactly what the limit has been set to. + * + * Setting each ::cudaLimit has its own specific restrictions, so each is + * discussed here. + * + * - ::cudaLimitStackSize controls the stack size in bytes of each GPU thread. + * + * - ::cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO + * used by the ::printf() device system call. Setting + * ::cudaLimitPrintfFifoSize must not be performed after launching any kernel + * that uses the ::printf() device system call - in such case + * ::cudaErrorInvalidValue will be returned. + * + * - ::cudaLimitMallocHeapSize controls the size in bytes of the heap used by + * the ::malloc() and ::free() device system calls. Setting + * ::cudaLimitMallocHeapSize must not be performed after launching any kernel + * that uses the ::malloc() or ::free() device system calls - in such case + * ::cudaErrorInvalidValue will be returned. + * + * - ::cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a + * grid at which a thread can safely call ::cudaDeviceSynchronize(). Setting + * this limit must be performed before any launch of a kernel that uses the + * device runtime and calls ::cudaDeviceSynchronize() above the default sync + * depth, two levels of grids. Calls to ::cudaDeviceSynchronize() will fail + * with error code ::cudaErrorSyncDepthExceeded if the limitation is + * violated. This limit can be set smaller than the default or up the maximum + * launch depth of 24. When setting this limit, keep in mind that additional + * levels of sync depth require the runtime to reserve large amounts of + * device memory which can no longer be used for user allocations. If these + * reservations of device memory fail, ::cudaDeviceSetLimit will return + * ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value. + * This limit is only applicable to devices of compute capability 3.5 and + * higher. Attempting to set this limit on devices of compute capability less + * than 3.5 will result in the error ::cudaErrorUnsupportedLimit being + * returned. 
+ * + * - ::cudaLimitDevRuntimePendingLaunchCount controls the maximum number of + * outstanding device runtime launches that can be made from the current + * device. A grid is outstanding from the point of launch up until the grid + * is known to have been completed. Device runtime launches which violate + * this limitation fail and return ::cudaErrorLaunchPendingCountExceeded when + * ::cudaGetLastError() is called after launch. If more pending launches than + * the default (2048 launches) are needed for a module using the device + * runtime, this limit can be increased. Keep in mind that being able to + * sustain additional pending launches will require the runtime to reserve + * larger amounts of device memory upfront which can no longer be used for + * allocations. If these reservations fail, ::cudaDeviceSetLimit will return + * ::cudaErrorMemoryAllocation, and the limit can be reset to a lower value. + * This limit is only applicable to devices of compute capability 3.5 and + * higher. Attempting to set this limit on devices of compute capability less + * than 3.5 will result in the error ::cudaErrorUnsupportedLimit being + * returned. + * + * - ::cudaLimitMaxL2FetchGranularity controls the L2 cache fetch granularity. + * Values can range from 0B to 128B. This is purely a performance hint and + * it can be ignored or clamped depending on the platform. + * + * - ::cudaLimitPersistingL2CacheSize controls size in bytes available + * for persisting L2 cache. This is purely a performance hint and it + * can be ignored or clamped depending on the platform. + * + * \param limit - Limit to set + * \param value - Size of limit + * + * \return + * ::cudaSuccess, + * ::cudaErrorUnsupportedLimit, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaDeviceGetLimit, + * ::cuCtxSetLimit + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceSetLimit(enum cudaLimit limit, size_t value); + +/** + * \brief Returns resource limits + * + * Returns in \p *pValue the current size of \p limit. The following ::cudaLimit values are supported. + * - ::cudaLimitStackSize is the stack size in bytes of each GPU thread. + * - ::cudaLimitPrintfFifoSize is the size in bytes of the shared FIFO used by the + * ::printf() device system call. + * - ::cudaLimitMallocHeapSize is the size in bytes of the heap used by the + * ::malloc() and ::free() device system calls. + * - ::cudaLimitDevRuntimeSyncDepth is the maximum grid depth at which a + * thread can issue the device runtime call ::cudaDeviceSynchronize() + * to wait on child grid launches to complete. + * - ::cudaLimitDevRuntimePendingLaunchCount is the maximum number of outstanding + * device runtime launches. + * - ::cudaLimitMaxL2FetchGranularity is the L2 cache fetch granularity. + * - ::cudaLimitPersistingL2CacheSize is the persisting L2 cache size in bytes. + * + * \param limit - Limit to query + * \param pValue - Returned size of the limit + * + * \return + * ::cudaSuccess, + * ::cudaErrorUnsupportedLimit, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaDeviceSetLimit, + * ::cuCtxGetLimit + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetLimit(size_t *pValue, enum cudaLimit limit); + +/** + * \brief Returns the maximum number of elements allocatable in a 1D linear texture for a given element size.
+ * + * Returns in \p maxWidthInElements the maximum number of elements allocatable in a 1D linear texture + * for given format descriptor \p fmtDesc. + * + * \param maxWidthInElements - Returns maximum number of texture elements allocatable for given \p fmtDesc. + * \param fmtDesc - Texture format description. + * + * \return + * ::cudaSuccess, + * ::cudaErrorUnsupportedLimit, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cuDeviceGetMaxTexture1DLinear, + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, const struct cudaChannelFormatDesc *fmtDesc, int device); +#endif + +/** + * \brief Returns the preferred cache configuration for the current device. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this returns through \p pCacheConfig the preferred cache + * configuration for the current device. This is only a preference. The + * runtime will use the requested configuration if possible, but it is free to + * choose a different configuration if required to execute functions. + * + * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices + * where the size of the L1 cache and shared memory are fixed. + * + * The supported cache configurations are: + * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) + * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache + * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory + * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory + * + * \param pCacheConfig - Returned cache configuration + * + * \return + * ::cudaSuccess + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceSetCacheConfig, + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * ::cuCtxGetCacheConfig + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetCacheConfig(enum cudaFuncCache *pCacheConfig); + +/** + * \brief Returns numerical values that correspond to the least and + * greatest stream priorities. + * + * Returns in \p *leastPriority and \p *greatestPriority the numerical values that correspond + * to the least and greatest stream priorities respectively. Stream priorities + * follow a convention where lower numbers imply greater priorities. The range of + * meaningful stream priorities is given by [\p *greatestPriority, \p *leastPriority]. + * If the user attempts to create a stream with a priority value that is + * outside the meaningful range as specified by this API, the priority is + * automatically clamped down or up to either \p *leastPriority or \p *greatestPriority + * respectively. See ::cudaStreamCreateWithPriority for details on creating a + * priority stream. + * A NULL may be passed in for \p *leastPriority or \p *greatestPriority if the value + * is not desired. + * + * This function will return '0' in both \p *leastPriority and \p *greatestPriority if + * the current context's device does not support stream priorities + * (see ::cudaDeviceGetAttribute).
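A minimal sketch of the query-then-create pattern this describes (illustrative helper name, error handling omitted):

#include <cuda_runtime_api.h>

void create_high_priority_stream(cudaStream_t *out)
{
    int least = 0, greatest = 0;

    /* Both values come back as 0 when the device has no priority support. */
    cudaDeviceGetStreamPriorityRange(&least, &greatest);

    /* Lower numbers mean higher priority, so 'greatest' is the smallest number. */
    cudaStreamCreateWithPriority(out, cudaStreamNonBlocking, greatest);
}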
+ * + * \param leastPriority - Pointer to an int in which the numerical value for least + * stream priority is returned + * \param greatestPriority - Pointer to an int in which the numerical value for greatest + * stream priority is returned + * + * \return + * ::cudaSuccess + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreateWithPriority, + * ::cudaStreamGetPriority, + * ::cuCtxGetStreamPriorityRange + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetStreamPriorityRange(int *leastPriority, int *greatestPriority); + +/** + * \brief Sets the preferred cache configuration for the current device. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p cacheConfig the preferred cache + * configuration for the current device. This is only a preference. The + * runtime will use the requested configuration if possible, but it is free to + * choose a different configuration if required to execute the function. Any + * function preference set via + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)" + * or + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)" + * will be preferred over this device-wide setting. Setting the device-wide + * cache configuration to ::cudaFuncCachePreferNone will cause subsequent + * kernel launches to prefer to not change the cache configuration unless + * required to launch the kernel. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * The supported cache configurations are: + * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) + * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache + * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory + * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory + * + * \param cacheConfig - Requested cache configuration + * + * \return + * ::cudaSuccess + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceGetCacheConfig, + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * ::cuCtxSetCacheConfig + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceSetCacheConfig(enum cudaFuncCache cacheConfig); + +/** + * \brief Returns the shared memory configuration for the current device. + * + * This function will return in \p pConfig the current size of shared memory banks + * on the current device. On devices with configurable shared memory banks, + * ::cudaDeviceSetSharedMemConfig can be used to change this setting, so that all + * subsequent kernel launches will by default use the new bank size. When + * ::cudaDeviceGetSharedMemConfig is called on devices without configurable shared + * memory, it will return the fixed bank size of the hardware. + * + * The returned bank configurations can be either: + * - ::cudaSharedMemBankSizeFourByte - shared memory bank width is four bytes. + * - ::cudaSharedMemBankSizeEightByte - shared memory bank width is eight bytes. 
+ * + * \param pConfig - Returned cache configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceSetCacheConfig, + * ::cudaDeviceGetCacheConfig, + * ::cudaDeviceSetSharedMemConfig, + * ::cudaFuncSetCacheConfig, + * ::cuCtxGetSharedMemConfig + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetSharedMemConfig(enum cudaSharedMemConfig *pConfig); + +/** + * \brief Sets the shared memory configuration for the current device. + * + * On devices with configurable shared memory banks, this function will set + * the shared memory bank size which is used for all subsequent kernel launches. + * Any per-function setting of shared memory set via ::cudaFuncSetSharedMemConfig + * will override the device wide setting. + * + * Changing the shared memory configuration between launches may introduce + * a device side synchronization point. + * + * Changing the shared memory bank size will not increase shared memory usage + * or affect occupancy of kernels, but may have major effects on performance. + * Larger bank sizes will allow for greater potential bandwidth to shared memory, + * but will change what kinds of accesses to shared memory will result in bank + * conflicts. + * + * This function will do nothing on devices with fixed shared memory bank size. + * + * The supported bank configurations are: + * - ::cudaSharedMemBankSizeDefault: set bank width the device default (currently, + * four bytes) + * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be four bytes + * natively. + * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight + * bytes natively. + * + * \param config - Requested cache configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceSetCacheConfig, + * ::cudaDeviceGetCacheConfig, + * ::cudaDeviceGetSharedMemConfig, + * ::cudaFuncSetCacheConfig, + * ::cuCtxSetSharedMemConfig + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceSetSharedMemConfig(enum cudaSharedMemConfig config); + +/** + * \brief Returns a handle to a compute device + * + * Returns in \p *device a device ordinal given a PCI bus ID string. + * + * \param device - Returned device ordinal + * + * \param pciBusId - String in one of the following forms: + * [domain]:[bus]:[device].[function] + * [domain]:[bus]:[device] + * [bus]:[device].[function] + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaDeviceGetPCIBusId, + * ::cuDeviceGetByPCIBusId + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceGetByPCIBusId(int *device, const char *pciBusId); + +/** + * \brief Returns a PCI Bus Id string for the device + * + * Returns an ASCII string identifying the device \p dev in the NULL-terminated + * string pointed to by \p pciBusId. \p len specifies the maximum length of the + * string that may be returned. + * + * \param pciBusId - Returned identifier string for the device in the following format + * [domain]:[bus]:[device].[function] + * where \p domain, \p bus, \p device, and \p function are all hexadecimal values. + * pciBusId should be large enough to store 13 characters including the NULL-terminator. 
+ * + * \param len - Maximum length of string to store in \p pciBusId + * + * \param device - Device to get identifier string for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaDeviceGetByPCIBusId, + * ::cuDeviceGetPCIBusId + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceGetPCIBusId(char *pciBusId, int len, int device); + +/** + * \brief Gets an interprocess handle for a previously allocated event + * + * Takes as input a previously allocated event. This event must have been + * created with the ::cudaEventInterprocess and ::cudaEventDisableTiming + * flags set. This opaque handle may be copied into other processes and + * opened with ::cudaIpcOpenEventHandle to allow efficient hardware + * synchronization between GPU work in different processes. + * + * After the event has been opened in the importing process, + * ::cudaEventRecord, ::cudaEventSynchronize, ::cudaStreamWaitEvent and + * ::cudaEventQuery may be used in either process. Performing operations + * on the imported event after the exported event has been freed + * with ::cudaEventDestroy will result in undefined behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux operating systems. IPC functionality is not supported + * on Tegra platforms. + * + * \param handle - Pointer to a user allocated cudaIpcEventHandle + * in which to return the opaque event handle + * \param event - Event allocated with ::cudaEventInterprocess and + * ::cudaEventDisableTiming flags. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorMapBufferObjectFailed, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaEventCreate, + * ::cudaEventDestroy, + * ::cudaEventSynchronize, + * ::cudaEventQuery, + * ::cudaStreamWaitEvent, + * ::cudaIpcOpenEventHandle, + * ::cudaIpcGetMemHandle, + * ::cudaIpcOpenMemHandle, + * ::cudaIpcCloseMemHandle, + * ::cuIpcGetEventHandle + */ +extern __host__ cudaError_t CUDARTAPI cudaIpcGetEventHandle(cudaIpcEventHandle_t *handle, cudaEvent_t event); + +/** + * \brief Opens an interprocess event handle for use in the current process + * + * Opens an interprocess event handle exported from another process with + * ::cudaIpcGetEventHandle. This function returns a ::cudaEvent_t that behaves like + * a locally created event with the ::cudaEventDisableTiming flag specified. + * This event must be freed with ::cudaEventDestroy. + * + * Performing operations on the imported event after the exported event has + * been freed with ::cudaEventDestroy will result in undefined behavior. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux operating systems. IPC functionality is not supported + * on Tegra platforms.
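A sketch of the exporting side of this event IPC flow (illustrative helper name; how the handle bytes reach the other process is application-defined; error handling omitted):

#include <cuda_runtime_api.h>

void export_ipc_event(cudaIpcEventHandle_t *handle_out)
{
    cudaEvent_t event;

    /* Both flags are required for an event that will cross process boundaries. */
    cudaEventCreateWithFlags(&event, cudaEventDisableTiming | cudaEventInterprocess);
    cudaIpcGetEventHandle(handle_out, event);

    /* The importing process passes the received handle to
     * cudaIpcOpenEventHandle() and can then wait on or query the event. */
}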
+ * + * \param event - Returns the imported event + * \param handle - Interprocess handle to open + * + * \returns + * ::cudaSuccess, + * ::cudaErrorMapBufferObjectFailed, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue, + * ::cudaErrorDeviceUninitialized + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaEventCreate, + * ::cudaEventDestroy, + * ::cudaEventSynchronize, + * ::cudaEventQuery, + * ::cudaStreamWaitEvent, + * ::cudaIpcGetEventHandle, + * ::cudaIpcGetMemHandle, + * ::cudaIpcOpenMemHandle, + * ::cudaIpcCloseMemHandle, + * ::cuIpcOpenEventHandle + */ +extern __host__ cudaError_t CUDARTAPI cudaIpcOpenEventHandle(cudaEvent_t *event, cudaIpcEventHandle_t handle); + + +/** + * \brief Gets an interprocess memory handle for an existing device memory + * allocation + * + * Takes a pointer to the base of an existing device memory allocation created + * with ::cudaMalloc and exports it for use in another process. This is a + * lightweight operation and may be called multiple times on an allocation + * without adverse effects. + * + * If a region of memory is freed with ::cudaFree and a subsequent call + * to ::cudaMalloc returns memory with the same device address, + * ::cudaIpcGetMemHandle will return a unique handle for the + * new memory. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux operating systems. IPC functionality is not supported + * on Tegra platforms. + * + * \param handle - Pointer to user allocated ::cudaIpcMemHandle to return + * the handle in. + * \param devPtr - Base pointer to previously allocated device memory + * + * \returns + * ::cudaSuccess, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorMapBufferObjectFailed, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMalloc, + * ::cudaFree, + * ::cudaIpcGetEventHandle, + * ::cudaIpcOpenEventHandle, + * ::cudaIpcOpenMemHandle, + * ::cudaIpcCloseMemHandle, + * ::cuIpcGetMemHandle + */ +extern __host__ cudaError_t CUDARTAPI cudaIpcGetMemHandle(cudaIpcMemHandle_t *handle, void *devPtr); + +/** + * \brief Opens an interprocess memory handle exported from another process + * and returns a device pointer usable in the local process. + * + * Maps memory exported from another process with ::cudaIpcGetMemHandle into + * the current device address space. For contexts on different devices + * ::cudaIpcOpenMemHandle can attempt to enable peer access between the + * devices as if the user called ::cudaDeviceEnablePeerAccess. This behavior is + * controlled by the ::cudaIpcMemLazyEnablePeerAccess flag. + * ::cudaDeviceCanAccessPeer can determine if a mapping is possible. + * + * ::cudaIpcOpenMemHandle can open handles to devices that may not be visible + * in the process calling the API. + * + * Contexts that may open ::cudaIpcMemHandles are restricted in the following way. + * ::cudaIpcMemHandles from each device in a given process may only be opened + * by one context per device per other process. + * + * If the memory handle has already been opened by the current context, the + * reference count on the handle is incremented by 1 and the existing device pointer + * is returned. + * + * Memory returned from ::cudaIpcOpenMemHandle must be freed with + * ::cudaIpcCloseMemHandle. + * + * Calling ::cudaFree on an exported memory region before calling + * ::cudaIpcCloseMemHandle in the importing context will result in undefined + * behavior. 
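A sketch of the importing side of this memory IPC flow, assuming the handle was produced by cudaIpcGetMemHandle in another process and transported out of band (illustrative helper name, error handling omitted):

#include <cuda_runtime_api.h>

void use_imported_buffer(cudaIpcMemHandle_t handle)
{
    void *dev_ptr = NULL;

    cudaIpcOpenMemHandle(&dev_ptr, handle, cudaIpcMemLazyEnablePeerAccess);

    /* dev_ptr is now usable as a device pointer in this process ... */

    /* Unmap with cudaIpcCloseMemHandle, never cudaFree, in the importer. */
    cudaIpcCloseMemHandle(dev_ptr);
}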
+ * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux operating systems. IPC functionality is not supported + * on Tegra platforms. + * + * \param devPtr - Returned device pointer + * \param handle - ::cudaIpcMemHandle to open + * \param flags - Flags for this operation. Must be specified as ::cudaIpcMemLazyEnablePeerAccess + * + * \returns + * ::cudaSuccess, + * ::cudaErrorMapBufferObjectFailed, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorDeviceUninitialized, + * ::cudaErrorTooManyPeers, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \note No guarantees are made about the address returned in \p *devPtr. + * In particular, multiple processes may not receive the same address for the same \p handle. + * + * \sa + * ::cudaMalloc, + * ::cudaFree, + * ::cudaIpcGetEventHandle, + * ::cudaIpcOpenEventHandle, + * ::cudaIpcGetMemHandle, + * ::cudaIpcCloseMemHandle, + * ::cudaDeviceEnablePeerAccess, + * ::cudaDeviceCanAccessPeer, + * ::cuIpcOpenMemHandle + */ +extern __host__ cudaError_t CUDARTAPI cudaIpcOpenMemHandle(void **devPtr, cudaIpcMemHandle_t handle, unsigned int flags); + +/** + * \brief Attempts to close memory mapped with cudaIpcOpenMemHandle + * + * Decrements the reference count of the memory returned by ::cudaIpcOpenMemHandle by 1. + * When the reference count reaches 0, this API unmaps the memory. The original allocation + * in the exporting process as well as imported mappings in other processes + * will be unaffected. + * + * Any resources used to enable peer access will be freed if this is the + * last mapping using them. + * + * IPC functionality is restricted to devices with support for unified + * addressing on Linux operating systems. IPC functionality is not supported + * on Tegra platforms. + * + * \param devPtr - Device pointer returned by ::cudaIpcOpenMemHandle + * + * \returns + * ::cudaSuccess, + * ::cudaErrorMapBufferObjectFailed, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMalloc, + * ::cudaFree, + * ::cudaIpcGetEventHandle, + * ::cudaIpcOpenEventHandle, + * ::cudaIpcGetMemHandle, + * ::cudaIpcOpenMemHandle, + * ::cuIpcCloseMemHandle + */ +extern __host__ cudaError_t CUDARTAPI cudaIpcCloseMemHandle(void *devPtr); + +/** + * \brief Blocks until remote writes are visible to the specified scope + * + * Blocks until remote writes to the target context via mappings created + * through GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see + * https://docs.nvidia.com/cuda/gpudirect-rdma for more information), are + * visible to the specified scope. + * + * If the scope equals or lies within the scope indicated by + * ::cudaDevAttrGPUDirectRDMAWritesOrdering, the call will be a no-op and + * can be safely omitted for performance. This can be determined by + * comparing the numerical values between the two enums, with smaller + * scopes having smaller values. + * + * Users may query support for this API via ::cudaDevAttrGPUDirectRDMAFlushWritesOptions.
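A sketch of that capability check followed by a flush, under the assumption that the enum values named here match the corresponding driver_types.h definitions (illustrative helper name, only meaningful on a GPUDirect RDMA setup, error handling omitted):

#include <cuda_runtime_api.h>

void flush_remote_writes(int device)
{
    int options = 0;

    cudaDeviceGetAttribute(&options, cudaDevAttrGPUDirectRDMAFlushWritesOptions, device);
    if (options & cudaFlushGPUDirectRDMAWritesOptionHost) {
        /* Make remote writes visible to work running on the current device. */
        cudaDeviceFlushGPUDirectRDMAWrites(cudaFlushGPUDirectRDMAWritesTargetCurrentDevice,
                                           cudaFlushGPUDirectRDMAWritesToOwner);
    }
}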
+ * + * \param target - The target of the operation, see cudaFlushGPUDirectRDMAWritesTarget + * \param scope - The scope of the operation, see cudaFlushGPUDirectRDMAWritesScope + * + * \return + * ::cudaSuccess, + * ::cudaErrorNotSupported, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cuFlushGPUDirectRDMAWrites + */ +#if __CUDART_API_VERSION >= 11030 +extern __host__ cudaError_t CUDARTAPI cudaDeviceFlushGPUDirectRDMAWrites(enum cudaFlushGPUDirectRDMAWritesTarget target, enum cudaFlushGPUDirectRDMAWritesScope scope); +#endif + +/** @} */ /* END CUDART_DEVICE */ + +/** + * \defgroup CUDART_THREAD_DEPRECATED Thread Management [DEPRECATED] + * + * ___MANBRIEF___ deprecated thread management functions of the CUDA runtime + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes deprecated thread management functions of the CUDA runtime + * application programming interface. + * + * @{ + */ + +/** + * \brief Exit and clean up from CUDA launches + * + * \deprecated + * + * Note that this function is deprecated because its name does not + * reflect its behavior. Its functionality is identical to the + * non-deprecated function ::cudaDeviceReset(), which should be used + * instead. + * + * Explicitly destroys and cleans up all resources associated with the current + * device in the current process. Any subsequent API call to this device will + * reinitialize the device. + * + * Note that this function will reset the device immediately. It is the caller's + * responsibility to ensure that the device is not being accessed by any + * other host threads from the process when this function is called. + * + * \return + * ::cudaSuccess + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceReset + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadExit(void); + +/** + * \brief Wait for compute device to finish + * + * \deprecated + * + * Note that this function is deprecated because its name does not + * reflect its behavior. Its functionality is similar to the + * non-deprecated function ::cudaDeviceSynchronize(), which should be used + * instead. + * + * Blocks until the device has completed all preceding requested tasks. + * ::cudaThreadSynchronize() returns an error if one of the preceding tasks + * has failed. If the ::cudaDeviceScheduleBlockingSync flag was set for + * this device, the host thread will block until the device has finished + * its work. + * + * \return + * ::cudaSuccess + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceSynchronize + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSynchronize(void); + +/** + * \brief Set resource limits + * + * \deprecated + * + * Note that this function is deprecated because its name does not + * reflect its behavior. Its functionality is identical to the + * non-deprecated function ::cudaDeviceSetLimit(), which should be used + * instead. + * + * Setting \p limit to \p value is a request by the application to update + * the current limit maintained by the device. The driver is free to + * modify the requested value to meet h/w requirements (this could be + * clamping to minimum or maximum values, rounding up to nearest element + * size, etc). The application can use ::cudaThreadGetLimit() to find out + * exactly what the limit has been set to. + * + * Setting each ::cudaLimit has its own specific restrictions, so each is + * discussed here. + * + * - ::cudaLimitStackSize controls the stack size of each GPU thread.
+ * + * - ::cudaLimitPrintfFifoSize controls the size of the shared FIFO + * used by the ::printf() device system call. + * Setting ::cudaLimitPrintfFifoSize must be performed before + * launching any kernel that uses the ::printf() device + * system call, otherwise ::cudaErrorInvalidValue will be returned. + * + * - ::cudaLimitMallocHeapSize controls the size of the heap used + * by the ::malloc() and ::free() device system calls. Setting + * ::cudaLimitMallocHeapSize must be performed before launching + * any kernel that uses the ::malloc() or ::free() device system calls, + * otherwise ::cudaErrorInvalidValue will be returned. + * + * \param limit - Limit to set + * \param value - Size in bytes of limit + * + * \return + * ::cudaSuccess, + * ::cudaErrorUnsupportedLimit, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceSetLimit + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSetLimit(enum cudaLimit limit, size_t value); + +/** + * \brief Returns resource limits + * + * \deprecated + * + * Note that this function is deprecated because its name does not + * reflect its behavior. Its functionality is identical to the + * non-deprecated function ::cudaDeviceGetLimit(), which should be used + * instead. + * + * Returns in \p *pValue the current size of \p limit. The supported + * ::cudaLimit values are: + * - ::cudaLimitStackSize: stack size of each GPU thread; + * - ::cudaLimitPrintfFifoSize: size of the shared FIFO used by the + * ::printf() device system call. + * - ::cudaLimitMallocHeapSize: size of the heap used by the + * ::malloc() and ::free() device system calls; + * + * \param limit - Limit to query + * \param pValue - Returned size in bytes of limit + * + * \return + * ::cudaSuccess, + * ::cudaErrorUnsupportedLimit, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceGetLimit + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadGetLimit(size_t *pValue, enum cudaLimit limit); + +/** + * \brief Returns the preferred cache configuration for the current device. + * + * \deprecated + * + * Note that this function is deprecated because its name does not + * reflect its behavior. Its functionality is identical to the + * non-deprecated function ::cudaDeviceGetCacheConfig(), which should be + * used instead. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this returns through \p pCacheConfig the preferred cache + * configuration for the current device. This is only a preference. The + * runtime will use the requested configuration if possible, but it is free to + * choose a different configuration if required to execute functions. + * + * This will return a \p pCacheConfig of ::cudaFuncCachePreferNone on devices + * where the size of the L1 cache and shared memory are fixed. 
+ * + * The supported cache configurations are: + * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) + * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache + * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory + * + * \param pCacheConfig - Returned cache configuration + * + * \return + * ::cudaSuccess + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceGetCacheConfig + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadGetCacheConfig(enum cudaFuncCache *pCacheConfig); + +/** + * \brief Sets the preferred cache configuration for the current device. + * + * \deprecated + * + * Note that this function is deprecated because its name does not + * reflect its behavior. Its functionality is identical to the + * non-deprecated function ::cudaDeviceSetCacheConfig(), which should be + * used instead. + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p cacheConfig the preferred cache + * configuration for the current device. This is only a preference. The + * runtime will use the requested configuration if possible, but it is free to + * choose a different configuration if required to execute the function. Any + * function preference set via + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)" + * or + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)" + * will be preferred over this device-wide setting. Setting the device-wide + * cache configuration to ::cudaFuncCachePreferNone will cause subsequent + * kernel launches to prefer to not change the cache configuration unless + * required to launch the kernel. + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * The supported cache configurations are: + * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) + * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache + * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory + * + * \param cacheConfig - Requested cache configuration + * + * \return + * ::cudaSuccess + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceSetCacheConfig + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaThreadSetCacheConfig(enum cudaFuncCache cacheConfig); + +/** @} */ /* END CUDART_THREAD_DEPRECATED */ + +/** + * \defgroup CUDART_ERROR Error Handling + * + * ___MANBRIEF___ error handling functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the error handling functions of the CUDA runtime + * application programming interface. + * + * @{ + */ + +/** + * \brief Returns the last error from a runtime call + * + * Returns the last error that has been produced by any of the runtime calls + * in the same host thread and resets it to ::cudaSuccess. 
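In practice this call is typically paired with a synchronize right after a kernel launch, along the lines of this sketch (illustrative helper name; the launch itself is outside the sketch):

#include <stdio.h>
#include <cuda_runtime_api.h>

int check_last_launch(const char *what)
{
    cudaError_t err = cudaGetLastError();   /* launch-time errors; clears the state */

    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();      /* errors from kernel execution */

    if (err != cudaSuccess) {
        fprintf(stderr, "%s: %s\n", what, cudaGetErrorString(err));
        return -1;
    }
    return 0;
}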
+ * + * \return + * ::cudaSuccess, + * ::cudaErrorMissingConfiguration, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorInitializationError, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorUnmapBufferObjectFailed, + * ::cudaErrorInvalidDevicePointer, + * ::cudaErrorInvalidTexture, + * ::cudaErrorInvalidTextureBinding, + * ::cudaErrorInvalidChannelDescriptor, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorInvalidFilterSetting, + * ::cudaErrorInvalidNormSetting, + * ::cudaErrorUnknown, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorInsufficientDriver, + * ::cudaErrorNoDevice, + * ::cudaErrorSetOnActiveProcess, + * ::cudaErrorStartupFailure, + * ::cudaErrorInvalidPtx, + * ::cudaErrorUnsupportedPtxVersion, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound, + * ::cudaErrorJitCompilationDisabled + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaPeekAtLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetLastError(void); + +/** + * \brief Returns the last error from a runtime call + * + * Returns the last error that has been produced by any of the runtime calls + * in the same host thread. Note that this call does not reset the error to + * ::cudaSuccess like ::cudaGetLastError(). + * + * \return + * ::cudaSuccess, + * ::cudaErrorMissingConfiguration, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorInitializationError, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorUnmapBufferObjectFailed, + * ::cudaErrorInvalidDevicePointer, + * ::cudaErrorInvalidTexture, + * ::cudaErrorInvalidTextureBinding, + * ::cudaErrorInvalidChannelDescriptor, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorInvalidFilterSetting, + * ::cudaErrorInvalidNormSetting, + * ::cudaErrorUnknown, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorInsufficientDriver, + * ::cudaErrorNoDevice, + * ::cudaErrorSetOnActiveProcess, + * ::cudaErrorStartupFailure, + * ::cudaErrorInvalidPtx, + * ::cudaErrorUnsupportedPtxVersion, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound, + * ::cudaErrorJitCompilationDisabled + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaGetLastError, ::cudaGetErrorName, ::cudaGetErrorString, ::cudaError + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaPeekAtLastError(void); + +/** + * \brief Returns the string representation of an error code enum name + * + * Returns a string containing the name of an error code in the enum. If the error + * code is not recognized, "unrecognized error code" is returned. 
+ * + * \param error - Error code to convert to string + * + * \return + * \p char* pointer to a NULL-terminated string + * + * \sa ::cudaGetErrorString, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError, + * ::cuGetErrorName + */ +extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorName(cudaError_t error); + +/** + * \brief Returns the description string for an error code + * + * Returns the description string for an error code. If the error + * code is not recognized, "unrecognized error code" is returned. + * + * \param error - Error code to convert to string + * + * \return + * \p char* pointer to a NULL-terminated string + * + * \sa ::cudaGetErrorName, ::cudaGetLastError, ::cudaPeekAtLastError, ::cudaError, + * ::cuGetErrorString + */ +extern __host__ __cudart_builtin__ const char* CUDARTAPI cudaGetErrorString(cudaError_t error); +/** @} */ /* END CUDART_ERROR */ + +/** + * \addtogroup CUDART_DEVICE + * + * @{ + */ + +/** + * \brief Returns the number of compute-capable devices + * + * Returns in \p *count the number of devices with compute capability greater + * or equal to 2.0 that are available for execution. + * + * \param count - Returns the number of devices with compute capability + * greater or equal to 2.0 + * + * \return + * ::cudaSuccess + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaGetDevice, ::cudaSetDevice, ::cudaGetDeviceProperties, + * ::cudaChooseDevice, + * ::cuDeviceGetCount + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceCount(int *count); + +/** + * \brief Returns information about the compute-device + * + * Returns in \p *prop the properties of device \p dev. The ::cudaDeviceProp + * structure is defined as: + * \code + struct cudaDeviceProp { + char name[256]; + cudaUUID_t uuid; + size_t totalGlobalMem; + size_t sharedMemPerBlock; + int regsPerBlock; + int warpSize; + size_t memPitch; + int maxThreadsPerBlock; + int maxThreadsDim[3]; + int maxGridSize[3]; + int clockRate; + size_t totalConstMem; + int major; + int minor; + size_t textureAlignment; + size_t texturePitchAlignment; + int deviceOverlap; + int multiProcessorCount; + int kernelExecTimeoutEnabled; + int integrated; + int canMapHostMemory; + int computeMode; + int maxTexture1D; + int maxTexture1DMipmap; + int maxTexture1DLinear; + int maxTexture2D[2]; + int maxTexture2DMipmap[2]; + int maxTexture2DLinear[3]; + int maxTexture2DGather[2]; + int maxTexture3D[3]; + int maxTexture3DAlt[3]; + int maxTextureCubemap; + int maxTexture1DLayered[2]; + int maxTexture2DLayered[3]; + int maxTextureCubemapLayered[2]; + int maxSurface1D; + int maxSurface2D[2]; + int maxSurface3D[3]; + int maxSurface1DLayered[2]; + int maxSurface2DLayered[3]; + int maxSurfaceCubemap; + int maxSurfaceCubemapLayered[2]; + size_t surfaceAlignment; + int concurrentKernels; + int ECCEnabled; + int pciBusID; + int pciDeviceID; + int pciDomainID; + int tccDriver; + int asyncEngineCount; + int unifiedAddressing; + int memoryClockRate; + int memoryBusWidth; + int l2CacheSize; + int persistingL2CacheMaxSize; + int maxThreadsPerMultiProcessor; + int streamPrioritiesSupported; + int globalL1CacheSupported; + int localL1CacheSupported; + size_t sharedMemPerMultiprocessor; + int regsPerMultiprocessor; + int managedMemory; + int isMultiGpuBoard; + int multiGpuBoardGroupID; + int singleToDoublePrecisionPerfRatio; + int pageableMemoryAccess; + int concurrentManagedAccess; + int computePreemptionSupported; + int canUseHostPointerForRegisteredMem; + int cooperativeLaunch; + int 
cooperativeMultiDeviceLaunch; + int pageableMemoryAccessUsesHostPageTables; + int directManagedMemAccessFromHost; + int accessPolicyMaxWindowSize; + } + \endcode + * where: + * - \ref ::cudaDeviceProp::name "name[256]" is an ASCII string identifying + * the device. + * - \ref ::cudaDeviceProp::uuid "uuid" is a 16-byte unique identifier. + * - \ref ::cudaDeviceProp::totalGlobalMem "totalGlobalMem" is the total + * amount of global memory available on the device in bytes. + * - \ref ::cudaDeviceProp::sharedMemPerBlock "sharedMemPerBlock" is the + * maximum amount of shared memory available to a thread block in bytes. + * - \ref ::cudaDeviceProp::regsPerBlock "regsPerBlock" is the maximum number + * of 32-bit registers available to a thread block. + * - \ref ::cudaDeviceProp::warpSize "warpSize" is the warp size in threads. + * - \ref ::cudaDeviceProp::memPitch "memPitch" is the maximum pitch in + * bytes allowed by the memory copy functions that involve memory regions + * allocated through ::cudaMallocPitch(). + * - \ref ::cudaDeviceProp::maxThreadsPerBlock "maxThreadsPerBlock" is the + * maximum number of threads per block. + * - \ref ::cudaDeviceProp::maxThreadsDim "maxThreadsDim[3]" contains the + * maximum size of each dimension of a block. + * - \ref ::cudaDeviceProp::maxGridSize "maxGridSize[3]" contains the + * maximum size of each dimension of a grid. + * - \ref ::cudaDeviceProp::clockRate "clockRate" is the clock frequency in + * kilohertz. + * - \ref ::cudaDeviceProp::totalConstMem "totalConstMem" is the total amount + * of constant memory available on the device in bytes. + * - \ref ::cudaDeviceProp::major "major", + * \ref ::cudaDeviceProp::minor "minor" are the major and minor revision + * numbers defining the device's compute capability. + * - \ref ::cudaDeviceProp::textureAlignment "textureAlignment" is the + * alignment requirement; texture base addresses that are aligned to + * \ref ::cudaDeviceProp::textureAlignment "textureAlignment" bytes do not + * need an offset applied to texture fetches. + * - \ref ::cudaDeviceProp::texturePitchAlignment "texturePitchAlignment" is the + * pitch alignment requirement for 2D texture references that are bound to + * pitched memory. + * - \ref ::cudaDeviceProp::deviceOverlap "deviceOverlap" is 1 if the device + * can concurrently copy memory between host and device while executing a + * kernel, or 0 if not. Deprecated, use instead asyncEngineCount. + * - \ref ::cudaDeviceProp::multiProcessorCount "multiProcessorCount" is the + * number of multiprocessors on the device. + * - \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled" + * is 1 if there is a run time limit for kernels executed on the device, or + * 0 if not. + * - \ref ::cudaDeviceProp::integrated "integrated" is 1 if the device is an + * integrated (motherboard) GPU and 0 if it is a discrete (card) component. + * - \ref ::cudaDeviceProp::canMapHostMemory "canMapHostMemory" is 1 if the + * device can map host memory into the CUDA address space for use with + * ::cudaHostAlloc()/::cudaHostGetDevicePointer(), or 0 if not. + * - \ref ::cudaDeviceProp::computeMode "computeMode" is the compute mode + * that the device is currently in. Available modes are as follows: + * - cudaComputeModeDefault: Default mode - Device is not restricted and + * multiple threads can use ::cudaSetDevice() with this device. + * - cudaComputeModeProhibited: Compute-prohibited mode - No threads can use + * ::cudaSetDevice() with this device. 
+ * - cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many + * threads in one process will be able to use ::cudaSetDevice() with this device. + * <br> When an occupied exclusive mode device is chosen with ::cudaSetDevice, + * all subsequent non-device management runtime functions will return + * ::cudaErrorDevicesUnavailable. + * - \ref ::cudaDeviceProp::maxTexture1D "maxTexture1D" is the maximum 1D + * texture size. + * - \ref ::cudaDeviceProp::maxTexture1DMipmap "maxTexture1DMipmap" is the maximum + * 1D mipmapped texture texture size. + * - \ref ::cudaDeviceProp::maxTexture1DLinear "maxTexture1DLinear" is the maximum + * 1D texture size for textures bound to linear memory. + * - \ref ::cudaDeviceProp::maxTexture2D "maxTexture2D[2]" contains the maximum + * 2D texture dimensions. + * - \ref ::cudaDeviceProp::maxTexture2DMipmap "maxTexture2DMipmap[2]" contains the + * maximum 2D mipmapped texture dimensions. + * - \ref ::cudaDeviceProp::maxTexture2DLinear "maxTexture2DLinear[3]" contains the + * maximum 2D texture dimensions for 2D textures bound to pitch linear memory. + * - \ref ::cudaDeviceProp::maxTexture2DGather "maxTexture2DGather[2]" contains the + * maximum 2D texture dimensions if texture gather operations have to be performed. + * - \ref ::cudaDeviceProp::maxTexture3D "maxTexture3D[3]" contains the maximum + * 3D texture dimensions. + * - \ref ::cudaDeviceProp::maxTexture3DAlt "maxTexture3DAlt[3]" + * contains the maximum alternate 3D texture dimensions. + * - \ref ::cudaDeviceProp::maxTextureCubemap "maxTextureCubemap" is the + * maximum cubemap texture width or height. + * - \ref ::cudaDeviceProp::maxTexture1DLayered "maxTexture1DLayered[2]" contains + * the maximum 1D layered texture dimensions. + * - \ref ::cudaDeviceProp::maxTexture2DLayered "maxTexture2DLayered[3]" contains + * the maximum 2D layered texture dimensions. + * - \ref ::cudaDeviceProp::maxTextureCubemapLayered "maxTextureCubemapLayered[2]" + * contains the maximum cubemap layered texture dimensions. + * - \ref ::cudaDeviceProp::maxSurface1D "maxSurface1D" is the maximum 1D + * surface size. + * - \ref ::cudaDeviceProp::maxSurface2D "maxSurface2D[2]" contains the maximum + * 2D surface dimensions. + * - \ref ::cudaDeviceProp::maxSurface3D "maxSurface3D[3]" contains the maximum + * 3D surface dimensions. + * - \ref ::cudaDeviceProp::maxSurface1DLayered "maxSurface1DLayered[2]" contains + * the maximum 1D layered surface dimensions. + * - \ref ::cudaDeviceProp::maxSurface2DLayered "maxSurface2DLayered[3]" contains + * the maximum 2D layered surface dimensions. + * - \ref ::cudaDeviceProp::maxSurfaceCubemap "maxSurfaceCubemap" is the maximum + * cubemap surface width or height. + * - \ref ::cudaDeviceProp::maxSurfaceCubemapLayered "maxSurfaceCubemapLayered[2]" + * contains the maximum cubemap layered surface dimensions. + * - \ref ::cudaDeviceProp::surfaceAlignment "surfaceAlignment" specifies the + * alignment requirements for surfaces. + * - \ref ::cudaDeviceProp::concurrentKernels "concurrentKernels" is 1 if the + * device supports executing multiple kernels within the same context + * simultaneously, or 0 if not. It is not guaranteed that multiple kernels + * will be resident on the device concurrently so this feature should not be + * relied upon for correctness. + * - \ref ::cudaDeviceProp::ECCEnabled "ECCEnabled" is 1 if the device has ECC + * support turned on, or 0 if not. + * - \ref ::cudaDeviceProp::pciBusID "pciBusID" is the PCI bus identifier of + * the device. 
+ * - \ref ::cudaDeviceProp::pciDeviceID "pciDeviceID" is the PCI device + * (sometimes called slot) identifier of the device. + * - \ref ::cudaDeviceProp::pciDomainID "pciDomainID" is the PCI domain identifier + * of the device. + * - \ref ::cudaDeviceProp::tccDriver "tccDriver" is 1 if the device is using a + * TCC driver or 0 if not. + * - \ref ::cudaDeviceProp::asyncEngineCount "asyncEngineCount" is 1 when the + * device can concurrently copy memory between host and device while executing + * a kernel. It is 2 when the device can concurrently copy memory between host + * and device in both directions and execute a kernel at the same time. It is + * 0 if neither of these is supported. + * - \ref ::cudaDeviceProp::unifiedAddressing "unifiedAddressing" is 1 if the device + * shares a unified address space with the host and 0 otherwise. + * - \ref ::cudaDeviceProp::memoryClockRate "memoryClockRate" is the peak memory + * clock frequency in kilohertz. + * - \ref ::cudaDeviceProp::memoryBusWidth "memoryBusWidth" is the memory bus width + * in bits. + * - \ref ::cudaDeviceProp::l2CacheSize "l2CacheSize" is L2 cache size in bytes. + * - \ref ::cudaDeviceProp::persistingL2CacheMaxSize "persistingL2CacheMaxSize" is L2 cache's maximum persisting lines size in bytes. + * - \ref ::cudaDeviceProp::maxThreadsPerMultiProcessor "maxThreadsPerMultiProcessor" + * is the number of maximum resident threads per multiprocessor. + * - \ref ::cudaDeviceProp::streamPrioritiesSupported "streamPrioritiesSupported" + * is 1 if the device supports stream priorities, or 0 if it is not supported. + * - \ref ::cudaDeviceProp::globalL1CacheSupported "globalL1CacheSupported" + * is 1 if the device supports caching of globals in L1 cache, or 0 if it is not supported. + * - \ref ::cudaDeviceProp::localL1CacheSupported "localL1CacheSupported" + * is 1 if the device supports caching of locals in L1 cache, or 0 if it is not supported. + * - \ref ::cudaDeviceProp::sharedMemPerMultiprocessor "sharedMemPerMultiprocessor" is the + * maximum amount of shared memory available to a multiprocessor in bytes; this amount is + * shared by all thread blocks simultaneously resident on a multiprocessor. + * - \ref ::cudaDeviceProp::regsPerMultiprocessor "regsPerMultiprocessor" is the maximum number + * of 32-bit registers available to a multiprocessor; this number is shared + * by all thread blocks simultaneously resident on a multiprocessor. + * - \ref ::cudaDeviceProp::managedMemory "managedMemory" + * is 1 if the device supports allocating managed memory on this system, or 0 if it is not supported. + * - \ref ::cudaDeviceProp::isMultiGpuBoard "isMultiGpuBoard" + * is 1 if the device is on a multi-GPU board (e.g. Gemini cards), and 0 if not; + * - \ref ::cudaDeviceProp::multiGpuBoardGroupID "multiGpuBoardGroupID" is a unique identifier + * for a group of devices associated with the same board. + * Devices on the same multi-GPU board will share the same identifier. + * - \ref ::cudaDeviceProp::singleToDoublePrecisionPerfRatio "singleToDoublePrecisionPerfRatio" + * is the ratio of single precision performance (in floating-point operations per second) + * to double precision performance. + * - \ref ::cudaDeviceProp::pageableMemoryAccess "pageableMemoryAccess" is 1 if the device supports + * coherently accessing pageable memory without calling cudaHostRegister on it, and 0 otherwise. 
+ * - \ref ::cudaDeviceProp::concurrentManagedAccess "concurrentManagedAccess" is 1 if the device can + * coherently access managed memory concurrently with the CPU, and 0 otherwise. + * - \ref ::cudaDeviceProp::computePreemptionSupported "computePreemptionSupported" is 1 if the device + * supports Compute Preemption, and 0 otherwise. + * - \ref ::cudaDeviceProp::canUseHostPointerForRegisteredMem "canUseHostPointerForRegisteredMem" is 1 if + * the device can access host registered memory at the same virtual address as the CPU, and 0 otherwise. + * - \ref ::cudaDeviceProp::cooperativeLaunch "cooperativeLaunch" is 1 if the device supports launching + * cooperative kernels via ::cudaLaunchCooperativeKernel, and 0 otherwise. + * - \ref ::cudaDeviceProp::cooperativeMultiDeviceLaunch "cooperativeMultiDeviceLaunch" is 1 if the device + * supports launching cooperative kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise. + * - \ref ::cudaDeviceProp::pageableMemoryAccessUsesHostPageTables "pageableMemoryAccessUsesHostPageTables" is 1 if the device accesses + * pageable memory via the host's page tables, and 0 otherwise. + * - \ref ::cudaDeviceProp::directManagedMemAccessFromHost "directManagedMemAccessFromHost" is 1 if the host can directly access managed + * memory on the device without migration, and 0 otherwise. + * - \ref ::cudaDeviceProp::maxBlocksPerMultiProcessor "maxBlocksPerMultiProcessor" is the maximum number of thread blocks + * that can reside on a multiprocessor. + * - \ref ::cudaDeviceProp::accessPolicyMaxWindowSize "accessPolicyMaxWindowSize" is + * the maximum value of ::cudaAccessPolicyWindow::num_bytes. + * + * \param prop - Properties for the specified device + * \param device - Device number to get properties for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice, + * ::cudaDeviceGetAttribute, + * ::cuDeviceGetAttribute, + * ::cuDeviceGetName + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device); + +/** + * \brief Returns information about the device + * + * Returns in \p *value the integer value of the attribute \p attr on device + * \p device. 
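+ *
+ * As an informal illustration (device 0 stands in for any valid device),
+ * individual limits can be queried without retrieving a full ::cudaDeviceProp
+ * structure: \code
+     int smCount = 0, maxThreadsPerSM = 0;
+     cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, 0);
+     cudaDeviceGetAttribute(&maxThreadsPerSM, cudaDevAttrMaxThreadsPerMultiProcessor, 0);
+ * \endcode
+ *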
The supported attributes are: + * - ::cudaDevAttrMaxThreadsPerBlock: Maximum number of threads per block + * - ::cudaDevAttrMaxBlockDimX: Maximum x-dimension of a block + * - ::cudaDevAttrMaxBlockDimY: Maximum y-dimension of a block + * - ::cudaDevAttrMaxBlockDimZ: Maximum z-dimension of a block + * - ::cudaDevAttrMaxGridDimX: Maximum x-dimension of a grid + * - ::cudaDevAttrMaxGridDimY: Maximum y-dimension of a grid + * - ::cudaDevAttrMaxGridDimZ: Maximum z-dimension of a grid + * - ::cudaDevAttrMaxSharedMemoryPerBlock: Maximum amount of shared memory + * available to a thread block in bytes + * - ::cudaDevAttrTotalConstantMemory: Memory available on device for + * __constant__ variables in a CUDA C kernel in bytes + * - ::cudaDevAttrWarpSize: Warp size in threads + * - ::cudaDevAttrMaxPitch: Maximum pitch in bytes allowed by the memory copy + * functions that involve memory regions allocated through ::cudaMallocPitch() + * - ::cudaDevAttrMaxTexture1DWidth: Maximum 1D texture width + * - ::cudaDevAttrMaxTexture1DLinearWidth: Maximum width for a 1D texture bound + * to linear memory + * - ::cudaDevAttrMaxTexture1DMipmappedWidth: Maximum mipmapped 1D texture width + * - ::cudaDevAttrMaxTexture2DWidth: Maximum 2D texture width + * - ::cudaDevAttrMaxTexture2DHeight: Maximum 2D texture height + * - ::cudaDevAttrMaxTexture2DLinearWidth: Maximum width for a 2D texture + * bound to linear memory + * - ::cudaDevAttrMaxTexture2DLinearHeight: Maximum height for a 2D texture + * bound to linear memory + * - ::cudaDevAttrMaxTexture2DLinearPitch: Maximum pitch in bytes for a 2D + * texture bound to linear memory + * - ::cudaDevAttrMaxTexture2DMipmappedWidth: Maximum mipmapped 2D texture + * width + * - ::cudaDevAttrMaxTexture2DMipmappedHeight: Maximum mipmapped 2D texture + * height + * - ::cudaDevAttrMaxTexture3DWidth: Maximum 3D texture width + * - ::cudaDevAttrMaxTexture3DHeight: Maximum 3D texture height + * - ::cudaDevAttrMaxTexture3DDepth: Maximum 3D texture depth + * - ::cudaDevAttrMaxTexture3DWidthAlt: Alternate maximum 3D texture width, + * 0 if no alternate maximum 3D texture size is supported + * - ::cudaDevAttrMaxTexture3DHeightAlt: Alternate maximum 3D texture height, + * 0 if no alternate maximum 3D texture size is supported + * - ::cudaDevAttrMaxTexture3DDepthAlt: Alternate maximum 3D texture depth, + * 0 if no alternate maximum 3D texture size is supported + * - ::cudaDevAttrMaxTextureCubemapWidth: Maximum cubemap texture width or + * height + * - ::cudaDevAttrMaxTexture1DLayeredWidth: Maximum 1D layered texture width + * - ::cudaDevAttrMaxTexture1DLayeredLayers: Maximum layers in a 1D layered + * texture + * - ::cudaDevAttrMaxTexture2DLayeredWidth: Maximum 2D layered texture width + * - ::cudaDevAttrMaxTexture2DLayeredHeight: Maximum 2D layered texture height + * - ::cudaDevAttrMaxTexture2DLayeredLayers: Maximum layers in a 2D layered + * texture + * - ::cudaDevAttrMaxTextureCubemapLayeredWidth: Maximum cubemap layered + * texture width or height + * - ::cudaDevAttrMaxTextureCubemapLayeredLayers: Maximum layers in a cubemap + * layered texture + * - ::cudaDevAttrMaxSurface1DWidth: Maximum 1D surface width + * - ::cudaDevAttrMaxSurface2DWidth: Maximum 2D surface width + * - ::cudaDevAttrMaxSurface2DHeight: Maximum 2D surface height + * - ::cudaDevAttrMaxSurface3DWidth: Maximum 3D surface width + * - ::cudaDevAttrMaxSurface3DHeight: Maximum 3D surface height + * - ::cudaDevAttrMaxSurface3DDepth: Maximum 3D surface depth + * - ::cudaDevAttrMaxSurface1DLayeredWidth: Maximum 1D layered 
surface width + * - ::cudaDevAttrMaxSurface1DLayeredLayers: Maximum layers in a 1D layered + * surface + * - ::cudaDevAttrMaxSurface2DLayeredWidth: Maximum 2D layered surface width + * - ::cudaDevAttrMaxSurface2DLayeredHeight: Maximum 2D layered surface height + * - ::cudaDevAttrMaxSurface2DLayeredLayers: Maximum layers in a 2D layered + * surface + * - ::cudaDevAttrMaxSurfaceCubemapWidth: Maximum cubemap surface width + * - ::cudaDevAttrMaxSurfaceCubemapLayeredWidth: Maximum cubemap layered + * surface width + * - ::cudaDevAttrMaxSurfaceCubemapLayeredLayers: Maximum layers in a cubemap + * layered surface + * - ::cudaDevAttrMaxRegistersPerBlock: Maximum number of 32-bit registers + * available to a thread block + * - ::cudaDevAttrClockRate: Peak clock frequency in kilohertz + * - ::cudaDevAttrTextureAlignment: Alignment requirement; texture base + * addresses aligned to ::textureAlign bytes do not need an offset applied + * to texture fetches + * - ::cudaDevAttrTexturePitchAlignment: Pitch alignment requirement for 2D + * texture references bound to pitched memory + * - ::cudaDevAttrGpuOverlap: 1 if the device can concurrently copy memory + * between host and device while executing a kernel, or 0 if not + * - ::cudaDevAttrMultiProcessorCount: Number of multiprocessors on the device + * - ::cudaDevAttrKernelExecTimeout: 1 if there is a run time limit for kernels + * executed on the device, or 0 if not + * - ::cudaDevAttrIntegrated: 1 if the device is integrated with the memory + * subsystem, or 0 if not + * - ::cudaDevAttrCanMapHostMemory: 1 if the device can map host memory into + * the CUDA address space, or 0 if not + * - ::cudaDevAttrComputeMode: Compute mode is the compute mode that the device + * is currently in. Available modes are as follows: + * - ::cudaComputeModeDefault: Default mode - Device is not restricted and + * multiple threads can use ::cudaSetDevice() with this device. + * - ::cudaComputeModeProhibited: Compute-prohibited mode - No threads can use + * ::cudaSetDevice() with this device. + * - ::cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many + * threads in one process will be able to use ::cudaSetDevice() with this + * device. + * - ::cudaDevAttrConcurrentKernels: 1 if the device supports executing + * multiple kernels within the same context simultaneously, or 0 if + * not. It is not guaranteed that multiple kernels will be resident on the + * device concurrently so this feature should not be relied upon for + * correctness. + * - ::cudaDevAttrEccEnabled: 1 if error correction is enabled on the device, + * 0 if error correction is disabled or not supported by the device + * - ::cudaDevAttrPciBusId: PCI bus identifier of the device + * - ::cudaDevAttrPciDeviceId: PCI device (also known as slot) identifier of + * the device + * - ::cudaDevAttrTccDriver: 1 if the device is using a TCC driver. TCC is only + * available on Tesla hardware running Windows Vista or later. + * - ::cudaDevAttrMemoryClockRate: Peak memory clock frequency in kilohertz + * - ::cudaDevAttrGlobalMemoryBusWidth: Global memory bus width in bits + * - ::cudaDevAttrL2CacheSize: Size of L2 cache in bytes. 0 if the device + * doesn't have L2 cache. 
+ * - ::cudaDevAttrMaxThreadsPerMultiProcessor: Maximum resident threads per + * multiprocessor + * - ::cudaDevAttrUnifiedAddressing: 1 if the device shares a unified address + * space with the host, or 0 if not + * - ::cudaDevAttrComputeCapabilityMajor: Major compute capability version + * number + * - ::cudaDevAttrComputeCapabilityMinor: Minor compute capability version + * number + * - ::cudaDevAttrStreamPrioritiesSupported: 1 if the device supports stream + * priorities, or 0 if not + * - ::cudaDevAttrGlobalL1CacheSupported: 1 if device supports caching globals + * in L1 cache, 0 if not + * - ::cudaDevAttrLocalL1CacheSupported: 1 if device supports caching locals + * in L1 cache, 0 if not + * - ::cudaDevAttrMaxSharedMemoryPerMultiprocessor: Maximum amount of shared memory + * available to a multiprocessor in bytes; this amount is shared by all + * thread blocks simultaneously resident on a multiprocessor + * - ::cudaDevAttrMaxRegistersPerMultiprocessor: Maximum number of 32-bit registers + * available to a multiprocessor; this number is shared by all thread blocks + * simultaneously resident on a multiprocessor + * - ::cudaDevAttrManagedMemory: 1 if device supports allocating + * managed memory, 0 if not + * - ::cudaDevAttrIsMultiGpuBoard: 1 if device is on a multi-GPU board, 0 if not + * - ::cudaDevAttrMultiGpuBoardGroupID: Unique identifier for a group of devices on the + * same multi-GPU board + * - ::cudaDevAttrHostNativeAtomicSupported: 1 if the link between the device and the + * host supports native atomic operations + * - ::cudaDevAttrSingleToDoublePrecisionPerfRatio: Ratio of single precision performance + * (in floating-point operations per second) to double precision performance + * - ::cudaDevAttrPageableMemoryAccess: 1 if the device supports coherently accessing + * pageable memory without calling cudaHostRegister on it, and 0 otherwise + * - ::cudaDevAttrConcurrentManagedAccess: 1 if the device can coherently access managed + * memory concurrently with the CPU, and 0 otherwise + * - ::cudaDevAttrComputePreemptionSupported: 1 if the device supports + * Compute Preemption, 0 if not + * - ::cudaDevAttrCanUseHostPointerForRegisteredMem: 1 if the device can access host + * registered memory at the same virtual address as the CPU, and 0 otherwise + * - ::cudaDevAttrCooperativeLaunch: 1 if the device supports launching cooperative kernels + * via ::cudaLaunchCooperativeKernel, and 0 otherwise + * - ::cudaDevAttrCooperativeMultiDeviceLaunch: 1 if the device supports launching cooperative + * kernels via ::cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise + * - ::cudaDevAttrCanFlushRemoteWrites: 1 if the device supports flushing of outstanding + * remote writes, and 0 otherwise + * - ::cudaDevAttrHostRegisterSupported: 1 if the device supports host memory registration + * via ::cudaHostRegister, and 0 otherwise + * - ::cudaDevAttrPageableMemoryAccessUsesHostPageTables: 1 if the device accesses pageable memory via the + * host's page tables, and 0 otherwise + * - ::cudaDevAttrDirectManagedMemAccessFromHost: 1 if the host can directly access managed memory on the device + * without migration, and 0 otherwise + * - ::cudaDevAttrMaxSharedMemoryPerBlockOptin: Maximum per block shared memory size on the device. 
This value can + * be opted into when using ::cudaFuncSetAttribute + * - ::cudaDevAttrMaxBlocksPerMultiprocessor: Maximum number of thread blocks that can reside on a multiprocessor + * - ::cudaDevAttrMaxPersistingL2CacheSize: Maximum L2 persisting lines capacity setting in bytes + * - ::cudaDevAttrMaxAccessPolicyWindowSize: Maximum value of cudaAccessPolicyWindow::num_bytes + * - ::cudaDevAttrReservedSharedMemoryPerBlock: Shared memory reserved by CUDA driver per block in bytes + * - ::cudaDevAttrSparseCudaArraySupported: 1 if the device supports sparse CUDA arrays and sparse CUDA mipmapped arrays. + * - ::cudaDevAttrHostRegisterReadOnlySupported: Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly + * to register memory that must be mapped as read-only to the GPU + * - ::cudaDevAttrMemoryPoolsSupported: 1 if the device supports using the cudaMallocAsync and cudaMemPool family of APIs, and 0 otherwise + * - ::cudaDevAttrGPUDirectRDMASupported: 1 if the device supports GPUDirect RDMA APIs, and 0 otherwise + * - ::cudaDevAttrGPUDirectRDMAFlushWritesOptions: bitmask to be interpreted according to the ::cudaFlushGPUDirectRDMAWritesOptions enum + * - ::cudaDevAttrGPUDirectRDMAWritesOrdering: see the ::cudaGPUDirectRDMAWritesOrdering enum for numerical values + * - ::cudaDevAttrMemoryPoolSupportedHandleTypes: Bitmask of handle types supported with mempool based IPC + * - ::cudaDevAttrDeferredMappingCudaArraySupported : 1 if the device supports deferred mapping CUDA arrays and CUDA mipmapped arrays. + * + * \param value - Returned device attribute value + * \param attr - Device attribute to query + * \param device - Device number to query + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, ::cudaChooseDevice, + * ::cudaGetDeviceProperties, + * ::cuDeviceGetAttribute + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device); + +/** + * \brief Returns the default mempool of a device + * + * The default mempool of a device contains device memory from that device. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue + * ::cudaErrorNotSupported + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cuDeviceGetDefaultMemPool, ::cudaMallocAsync, ::cudaMemPoolTrimTo, ::cudaMemPoolGetAttribute, ::cudaDeviceSetMemPool, ::cudaMemPoolSetAttribute, ::cudaMemPoolSetAccess + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceGetDefaultMemPool(cudaMemPool_t *memPool, int device); + + +/** + * \brief Sets the current memory pool of a device + * + * The memory pool must be local to the specified device. + * Unless a mempool is specified in the ::cudaMallocAsync call, + * ::cudaMallocAsync allocates from the current mempool of the provided stream's device. + * By default, a device's current memory pool is its default memory pool. + * + * \note Use ::cudaMallocFromPoolAsync to specify asynchronous allocations from a device different + * than the one the stream runs on. 
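+ *
+ * An illustrative sketch (not normative; \p bytes and \p stream are
+ * placeholders, and \p stream is assumed to belong to device 0) of creating an
+ * explicit pool and making it the device's current pool: \code
+     cudaMemPoolProps props = {0};
+     props.allocType     = cudaMemAllocationTypePinned;
+     props.location.type = cudaMemLocationTypeDevice;
+     props.location.id   = 0;
+     cudaMemPool_t pool;
+     cudaMemPoolCreate(&pool, &props);
+     cudaDeviceSetMemPool(0, pool);        // cudaMallocAsync on device-0 streams now uses 'pool'
+     void *p;
+     cudaMallocAsync(&p, bytes, stream);   // 'bytes' and 'stream' are placeholders
+     cudaFreeAsync(p, stream);
+ * \endcode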
+ * + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * ::cudaErrorInvalidDevice + * ::cudaErrorNotSupported + * \notefnerr + * \note_callback + * + * \sa ::cuDeviceSetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaMemPoolCreate, ::cudaMemPoolDestroy, ::cudaMallocFromPoolAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceSetMemPool(int device, cudaMemPool_t memPool); + +/** + * \brief Gets the current mempool for a device + * + * Returns the last pool provided to ::cudaDeviceSetMemPool for this device + * or the device's default memory pool if ::cudaDeviceSetMemPool has never been called. + * By default the current mempool is the default mempool for a device, + * otherwise the returned pool must have been set with ::cuDeviceSetMemPool or ::cudaDeviceSetMemPool. + * + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * ::cudaErrorNotSupported + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cuDeviceGetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceSetMemPool + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceGetMemPool(cudaMemPool_t *memPool, int device); + +/** + * \brief Return NvSciSync attributes that this device can support. + * + * Returns in \p nvSciSyncAttrList, the properties of NvSciSync that + * this CUDA device, \p dev can support. The returned \p nvSciSyncAttrList + * can be used to create an NvSciSync that matches this device's capabilities. + * + * If NvSciSyncAttrKey_RequiredPerm field in \p nvSciSyncAttrList is + * already set this API will return ::cudaErrorInvalidValue. + * + * The applications should set \p nvSciSyncAttrList to a valid + * NvSciSyncAttrList failing which this API will return + * ::cudaErrorInvalidHandle. + * + * The \p flags controls how applications intends to use + * the NvSciSync created from the \p nvSciSyncAttrList. The valid flags are: + * - ::cudaNvSciSyncAttrSignal, specifies that the applications intends to + * signal an NvSciSync on this CUDA device. + * - ::cudaNvSciSyncAttrWait, specifies that the applications intends to + * wait on an NvSciSync on this CUDA device. + * + * At least one of these flags must be set, failing which the API + * returns ::cudaErrorInvalidValue. Both the flags are orthogonal + * to one another: a developer may set both these flags that allows to + * set both wait and signal specific attributes in the same \p nvSciSyncAttrList. + * + * \param nvSciSyncAttrList - Return NvSciSync attributes supported. + * \param device - Valid Cuda Device to get NvSciSync attributes for. + * \param flags - flags describing NvSciSync usage. + * + * \return + * + * ::cudaSuccess, + * ::cudaErrorDeviceUninitialized, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidHandle, + * ::cudaErrorInvalidDevice, + * ::cudaErrorNotSupported, + * ::cudaErrorMemoryAllocation + * + * \sa + * ::cudaImportExternalSemaphore, + * ::cudaDestroyExternalSemaphore, + * ::cudaSignalExternalSemaphoresAsync, + * ::cudaWaitExternalSemaphoresAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, int device, int flags); + +/** + * \brief Queries attributes of the link between two devices. + * + * Returns in \p *value the value of the requested attribute \p attrib of the + * link between \p srcDevice and \p dstDevice. The supported attributes are: + * - ::cudaDevP2PAttrPerformanceRank: A relative value indicating the + * performance of the link between two devices. 
Lower value means better + * performance (0 being the value used for most performant link). + * - ::cudaDevP2PAttrAccessSupported: 1 if peer access is enabled. + * - ::cudaDevP2PAttrNativeAtomicSupported: 1 if native atomic operations over + * the link are supported. + * - ::cudaDevP2PAttrCudaArrayAccessSupported: 1 if accessing CUDA arrays over + * the link is supported. + * + * Returns ::cudaErrorInvalidDevice if \p srcDevice or \p dstDevice are not valid + * or if they represent the same device. + * + * Returns ::cudaErrorInvalidValue if \p attrib is not valid or if \p value is + * a null pointer. + * + * \param value - Returned value of the requested attribute + * \param attrib - The requested attribute of the link between \p srcDevice and \p dstDevice. + * \param srcDevice - The source device of the target link. + * \param dstDevice - The destination device of the target link. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaCtxEnablePeerAccess, + * ::cudaCtxDisablePeerAccess, + * ::cudaCtxCanAccessPeer, + * ::cuDeviceGetP2PAttribute + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaDeviceGetP2PAttribute(int *value, enum cudaDeviceP2PAttr attr, int srcDevice, int dstDevice); + +/** + * \brief Select compute-device which best matches criteria + * + * Returns in \p *device the device which has properties that best match + * \p *prop. + * + * \param device - Device with best match + * \param prop - Desired device properties + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, + * ::cudaGetDeviceProperties + */ +extern __host__ cudaError_t CUDARTAPI cudaChooseDevice(int *device, const struct cudaDeviceProp *prop); + +/** + * \brief Set device to be used for GPU executions + * + * Sets \p device as the current device for the calling host thread. + * Valid device id's are 0 to (::cudaGetDeviceCount() - 1). + * + * Any device memory subsequently allocated from this host thread + * using ::cudaMalloc(), ::cudaMallocPitch() or ::cudaMallocArray() + * will be physically resident on \p device. Any host memory allocated + * from this host thread using ::cudaMallocHost() or ::cudaHostAlloc() + * or ::cudaHostRegister() will have its lifetime associated with + * \p device. Any streams or events created from this host thread will + * be associated with \p device. Any kernels launched from this host + * thread using the <<<>>> operator or ::cudaLaunchKernel() will be executed + * on \p device. + * + * This call may be made from any host thread, to any device, and at + * any time. This function will do no synchronization with the previous + * or new device, and should be considered a very low overhead call. + * If the current context bound to the calling thread is not the primary context, + * this call will bind the primary context to the calling thread and all the + * subsequent memory allocations, stream and event creations, and kernel launches + * will be associated with the primary context. This function will not initialize + * the context until a runtime API requiring the context (such as ::cudaMalloc()) + * is used. This function will not return an error if the device is in + * ::cudaComputeModeExclusiveProcess and is occupied by another process or + * if the device is in ::cudaComputeModeProhibited. 
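+ *
+ * As a usage sketch (illustrative only; \c myKernel and its launch
+ * configuration are placeholders), a single host thread can drive every
+ * device in the system: \code
+     int count = 0;
+     cudaGetDeviceCount(&count);
+     for (int dev = 0; dev < count; ++dev) {
+         cudaSetDevice(dev);                 // make 'dev' current for this thread
+         cudaStream_t s;
+         cudaStreamCreate(&s);               // stream is associated with 'dev'
+         myKernel<<<128, 256, 0, s>>>();     // placeholder kernel, launched on 'dev'
+         cudaStreamDestroy(s);
+     }
+ * \endcode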
+ * + * \param device - Device on which the active host thread should execute the + * device code. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties, + * ::cudaChooseDevice, + * ::cuCtxSetCurrent + */ +extern __host__ cudaError_t CUDARTAPI cudaSetDevice(int device); + +/** + * \brief Returns which device is currently being used + * + * Returns in \p *device the current device for the calling host thread. + * + * \param device - Returns the device on which the active host thread + * executes the device code. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties, + * ::cudaChooseDevice, + * ::cuCtxGetCurrent + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaGetDevice(int *device); + +/** + * \brief Set a list of devices that can be used for CUDA + * + * Sets a list of devices for CUDA execution in priority order using + * \p device_arr. The parameter \p len specifies the number of elements in the + * list. CUDA will try devices from the list sequentially until it finds one + * that works. If this function is not called, or if it is called with a \p len + * of 0, then CUDA will go back to its default behavior of trying devices + * sequentially from a default list containing all of the available CUDA + * devices in the system. If a specified device ID in the list does not exist, + * this function will return ::cudaErrorInvalidDevice. If \p len is not 0 and + * \p device_arr is NULL or if \p len exceeds the number of devices in + * the system, then ::cudaErrorInvalidValue is returned. + * + * \param device_arr - List of devices to try + * \param len - Number of devices in specified list + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaGetDeviceCount, ::cudaSetDevice, ::cudaGetDeviceProperties, + * ::cudaSetDeviceFlags, + * ::cudaChooseDevice + */ +extern __host__ cudaError_t CUDARTAPI cudaSetValidDevices(int *device_arr, int len); + +/** + * \brief Sets flags to be used for device executions + * + * Records \p flags as the flags for the current device. If the current device + * has been set and that device has already been initialized, the previous flags + * are overwritten. If the current device has not been initialized, it is + * initialized with the provided flags. If no device has been made current to + * the calling thread, a default device is selected and initialized with the + * provided flags. + * + * The two LSBs of the \p flags parameter can be used to control how the CPU + * thread interacts with the OS scheduler when waiting for results from the + * device. + * + * - ::cudaDeviceScheduleAuto: The default value if the \p flags parameter is + * zero, uses a heuristic based on the number of active CUDA contexts in the + * process \p C and the number of logical processors in the system \p P. If + * \p C \> \p P, then CUDA will yield to other OS threads when waiting for the + * device, otherwise CUDA will not yield while waiting for results and + * actively spin on the processor. Additionally, on Tegra devices, + * ::cudaDeviceScheduleAuto uses a heuristic based on the power profile of + * the platform and may choose ::cudaDeviceScheduleBlockingSync for low-powered + * devices. 
+ * - ::cudaDeviceScheduleSpin: Instruct CUDA to actively spin when waiting for + * results from the device. This can decrease latency when waiting for the + * device, but may lower the performance of CPU threads if they are performing + * work in parallel with the CUDA thread. + * - ::cudaDeviceScheduleYield: Instruct CUDA to yield its thread when waiting + * for results from the device. This can increase latency when waiting for the + * device, but can increase the performance of CPU threads performing work in + * parallel with the device. + * - ::cudaDeviceScheduleBlockingSync: Instruct CUDA to block the CPU thread + * on a synchronization primitive when waiting for the device to finish work. + * - ::cudaDeviceBlockingSync: Instruct CUDA to block the CPU thread on a + * synchronization primitive when waiting for the device to finish work. <br> + * \ref deprecated "Deprecated:" This flag was deprecated as of CUDA 4.0 and + * replaced with ::cudaDeviceScheduleBlockingSync. + * - ::cudaDeviceMapHost: This flag enables allocating pinned + * host memory that is accessible to the device. It is implicit for the + * runtime but may be absent if a context is created using the driver API. + * If this flag is not set, ::cudaHostGetDevicePointer() will always return + * a failure code. + * - ::cudaDeviceLmemResizeToMax: Instruct CUDA to not reduce local memory + * after resizing local memory for a kernel. This can prevent thrashing by + * local memory allocations when launching many kernels with high local + * memory usage at the cost of potentially increased memory usage. <br> + * \ref deprecated "Deprecated:" This flag is deprecated and the behavior enabled + * by this flag is now the default and cannot be disabled. + * + * \param flags - Parameters for device operation + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaGetDeviceFlags, ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaGetDeviceProperties, + * ::cudaSetDevice, ::cudaSetValidDevices, + * ::cudaChooseDevice, + * ::cuDevicePrimaryCtxSetFlags + */ +extern __host__ cudaError_t CUDARTAPI cudaSetDeviceFlags( unsigned int flags ); + +/** + * \brief Gets the flags for the current device + * + * + * Returns in \p flags the flags for the current device. If there is a current + * device for the calling thread, the flags for the device are returned. If + * there is no current device, the flags for the first device are returned, + * which may be the default flags. Compare to the behavior of + * ::cudaSetDeviceFlags. + * + * Typically, the flags returned should match the behavior that will be seen + * if the calling thread uses a device after this call, without any change to + * the flags or current device inbetween by this or another thread. Note that + * if the device is not initialized, it is possible for another thread to + * change the flags for the current device before it is initialized. + * Additionally, when using exclusive mode, if this thread has not requested a + * specific device, it may use a device other than the first device, contrary + * to the assumption made by this function. + * + * If a context has been created via the driver API and is current to the + * calling thread, the flags for that context are always returned. + * + * Flags returned by this function may specifically include ::cudaDeviceMapHost + * even though it is not accepted by ::cudaSetDeviceFlags because it is + * implicit in runtime API flags. 
The reason for this is that the current + * context may have been created via the driver API in which case the flag is + * not implicit and may be unset. + * + * \param flags - Pointer to store the device flags + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaGetDevice, ::cudaGetDeviceProperties, + * ::cudaSetDevice, ::cudaSetDeviceFlags, + * ::cuCtxGetFlags, + * ::cuDevicePrimaryCtxGetState + */ +extern __host__ cudaError_t CUDARTAPI cudaGetDeviceFlags( unsigned int *flags ); +/** @} */ /* END CUDART_DEVICE */ + +/** + * \defgroup CUDART_STREAM Stream Management + * + * ___MANBRIEF___ stream management functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the stream management functions of the CUDA runtime + * application programming interface. + * + * @{ + */ + +/** + * \brief Create an asynchronous stream + * + * Creates a new asynchronous stream. + * + * \param pStream - Pointer to new stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreateWithPriority, + * ::cudaStreamCreateWithFlags, + * ::cudaStreamGetPriority, + * ::cudaStreamGetFlags, + * ::cudaStreamQuery, + * ::cudaStreamSynchronize, + * ::cudaStreamWaitEvent, + * ::cudaStreamAddCallback, + * ::cudaStreamDestroy, + * ::cuStreamCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaStreamCreate(cudaStream_t *pStream); + +/** + * \brief Create an asynchronous stream + * + * Creates a new asynchronous stream. The \p flags argument determines the + * behaviors of the stream. Valid values for \p flags are + * - ::cudaStreamDefault: Default stream creation flag. + * - ::cudaStreamNonBlocking: Specifies that work running in the created + * stream may run concurrently with work in stream 0 (the NULL stream), and that + * the created stream should perform no implicit synchronization with stream 0. + * + * \param pStream - Pointer to new stream identifier + * \param flags - Parameters for stream creation + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreate, + * ::cudaStreamCreateWithPriority, + * ::cudaStreamGetFlags, + * ::cudaStreamQuery, + * ::cudaStreamSynchronize, + * ::cudaStreamWaitEvent, + * ::cudaStreamAddCallback, + * ::cudaStreamDestroy, + * ::cuStreamCreate + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags); + +/** + * \brief Create an asynchronous stream with the specified priority + * + * Creates a stream with the specified priority and returns a handle in \p pStream. + * This API alters the scheduler priority of work in the stream. Work in a higher + * priority stream may preempt work already executing in a low priority stream. + * + * \p priority follows a convention where lower numbers represent higher priorities. + * '0' represents default priority. The range of meaningful numerical priorities can + * be queried using ::cudaDeviceGetStreamPriorityRange. If the specified priority is + * outside the numerical range returned by ::cudaDeviceGetStreamPriorityRange, + * it will automatically be clamped to the lowest or the highest number in the range. + * + * \param pStream - Pointer to new stream identifier + * \param flags - Flags for stream creation. 
See ::cudaStreamCreateWithFlags for a list of valid flags that can be passed + * \param priority - Priority of the stream. Lower numbers represent higher priorities. + * See ::cudaDeviceGetStreamPriorityRange for more information about + * the meaningful stream priorities that can be passed. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \note Stream priorities are supported only on GPUs + * with compute capability 3.5 or higher. + * + * \note In the current implementation, only compute kernels launched in + * priority streams are affected by the stream's priority. Stream priorities have + * no effect on host-to-device and device-to-host memory operations. + * + * \sa ::cudaStreamCreate, + * ::cudaStreamCreateWithFlags, + * ::cudaDeviceGetStreamPriorityRange, + * ::cudaStreamGetPriority, + * ::cudaStreamQuery, + * ::cudaStreamWaitEvent, + * ::cudaStreamAddCallback, + * ::cudaStreamSynchronize, + * ::cudaStreamDestroy, + * ::cuStreamCreateWithPriority + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, int priority); + +/** + * \brief Query the priority of a stream + * + * Query the priority of a stream. The priority is returned in in \p priority. + * Note that if the stream was created with a priority outside the meaningful + * numerical range returned by ::cudaDeviceGetStreamPriorityRange, + * this function returns the clamped priority. + * See ::cudaStreamCreateWithPriority for details about priority clamping. + * + * \param hStream - Handle to the stream to be queried + * \param priority - Pointer to a signed integer in which the stream's priority is returned + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreateWithPriority, + * ::cudaDeviceGetStreamPriorityRange, + * ::cudaStreamGetFlags, + * ::cuStreamGetPriority + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority); + +/** + * \brief Query the flags of a stream + * + * Query the flags of a stream. The flags are returned in \p flags. + * See ::cudaStreamCreateWithFlags for a list of valid flags. + * + * \param hStream - Handle to the stream to be queried + * \param flags - Pointer to an unsigned integer in which the stream's flags are returned + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreateWithPriority, + * ::cudaStreamCreateWithFlags, + * ::cudaStreamGetPriority, + * ::cuStreamGetFlags + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags); + +/** + * \brief Resets all persisting lines in cache to normal status. + * + * Resets all persisting lines in cache to normal status. + * Takes effect on function return. + * + * \return + * ::cudaSuccess, + * \notefnerr + * + * \sa + * ::cudaAccessPolicyWindow + */ +extern __host__ cudaError_t CUDARTAPI cudaCtxResetPersistingL2Cache(void); + +/** + * \brief Copies attributes from source stream to destination stream. + * + * Copies attributes from source stream \p src to destination stream \p dst. + * Both streams must have the same context. 
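+ *
+ * An illustrative sketch (\p devPtr and \p numBytes are placeholders for an
+ * existing device allocation) of configuring an access policy window on one
+ * stream and copying it to another: \code
+     cudaStream_t src, dst;
+     cudaStreamCreate(&src);
+     cudaStreamCreate(&dst);
+     cudaStreamAttrValue v;
+     v.accessPolicyWindow.base_ptr  = devPtr;       // placeholder device pointer
+     v.accessPolicyWindow.num_bytes = numBytes;     // <= accessPolicyMaxWindowSize
+     v.accessPolicyWindow.hitRatio  = 0.6f;
+     v.accessPolicyWindow.hitProp   = cudaAccessPropertyPersisting;
+     v.accessPolicyWindow.missProp  = cudaAccessPropertyStreaming;
+     cudaStreamSetAttribute(src, cudaStreamAttributeAccessPolicyWindow, &v);
+     cudaStreamCopyAttributes(dst, src);            // 'dst' now carries the same attributes
+ * \endcode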
+ * + * \param[out] dst Destination stream + * \param[in] src Source stream + * For attributes see ::cudaStreamAttrID + * + * \return + * ::cudaSuccess, + * ::cudaErrorNotSupported + * \notefnerr + * + * \sa + * ::cudaAccessPolicyWindow + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamCopyAttributes(cudaStream_t dst, cudaStream_t src); + + /** + * \brief Queries stream attribute. + * + * Queries attribute \p attr from \p hStream and stores it in corresponding + * member of \p value_out. + * + * \param[in] hStream + * \param[in] attr + * \param[out] value_out + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * + * \sa + * ::cudaAccessPolicyWindow + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetAttribute( + cudaStream_t hStream, cudaStreamAttrID attr, + cudaStreamAttrValue *value_out); + + /** + * \brief Sets stream attribute. + * + * Sets attribute \p attr on \p hStream from corresponding attribute of + * \p value. The updated attribute will be applied to subsequent work + * submitted to the stream. It will not affect previously submitted work. + * + * \param[out] hStream + * \param[in] attr + * \param[in] value + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * + * \sa + * ::cudaAccessPolicyWindow + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamSetAttribute( + cudaStream_t hStream, cudaStreamAttrID attr, + const cudaStreamAttrValue *value); + + /** + * \brief Destroys and cleans up an asynchronous stream + * + * Destroys and cleans up the asynchronous stream specified by \p stream. + * + * In case the device is still doing work in the stream \p stream + * when ::cudaStreamDestroy() is called, the function will return immediately + * and the resources associated with \p stream will be released automatically + * once the device has completed all work in \p stream. + * + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * \note_destroy_ub + * + * \sa ::cudaStreamCreate, + * ::cudaStreamCreateWithFlags, + * ::cudaStreamQuery, + * ::cudaStreamWaitEvent, + * ::cudaStreamSynchronize, + * ::cudaStreamAddCallback, + * ::cuStreamDestroy + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamDestroy(cudaStream_t stream); + +/** + * \brief Make a compute stream wait on an event + * + * Makes all future work submitted to \p stream wait for all work captured in + * \p event. See ::cudaEventRecord() for details on what is captured by an event. + * The synchronization will be performed efficiently on the device when applicable. + * \p event may be from a different device than \p stream. + * + * flags include: + * - ::cudaEventWaitDefault: Default event creation flag. + * - ::cudaEventWaitExternal: Event is captured in the graph as an external + * event node when performing stream capture. 
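+ *
+ * An illustrative sketch (the kernels, launch configurations and \p buf are
+ * placeholders) of ordering work in one stream behind work in another: \code
+     cudaStream_t producer, consumer;
+     cudaEvent_t ready;
+     cudaStreamCreate(&producer);
+     cudaStreamCreate(&consumer);
+     cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
+     produce<<<64, 256, 0, producer>>>(buf);        // placeholder kernel
+     cudaEventRecord(ready, producer);
+     cudaStreamWaitEvent(consumer, ready, 0);       // consumer waits for 'ready'
+     consume<<<64, 256, 0, consumer>>>(buf);        // placeholder kernel
+ * \endcode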
+ * + * \param stream - Stream to wait + * \param event - Event to wait on + * \param flags - Parameters for the operation(See above) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, + * ::cuStreamWaitEvent + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags __dv(0)); + +/** + * Type of stream callback functions. + * \param stream The stream as passed to ::cudaStreamAddCallback, may be NULL. + * \param status ::cudaSuccess or any persistent error on the stream. + * \param userData User parameter provided at registration. + */ +typedef void (CUDART_CB *cudaStreamCallback_t)(cudaStream_t stream, cudaError_t status, void *userData); + +/** + * \brief Add a callback to a compute stream + * + * \note This function is slated for eventual deprecation and removal. If + * you do not require the callback to execute in case of a device error, + * consider using ::cudaLaunchHostFunc. Additionally, this function is not + * supported with ::cudaStreamBeginCapture and ::cudaStreamEndCapture, unlike + * ::cudaLaunchHostFunc. + * + * Adds a callback to be called on the host after all currently enqueued + * items in the stream have completed. For each + * cudaStreamAddCallback call, a callback will be executed exactly once. + * The callback will block later work in the stream until it is finished. + * + * The callback may be passed ::cudaSuccess or an error code. In the event + * of a device error, all subsequently executed callbacks will receive an + * appropriate ::cudaError_t. + * + * Callbacks must not make any CUDA API calls. Attempting to use CUDA APIs + * may result in ::cudaErrorNotPermitted. Callbacks must not perform any + * synchronization that may depend on outstanding device work or other callbacks + * that are not mandated to run earlier. Callbacks without a mandated order + * (in independent streams) execute in undefined order and may be serialized. + * + * For the purposes of Unified Memory, callback execution makes a number of + * guarantees: + * <ul> + * <li>The callback stream is considered idle for the duration of the + * callback. Thus, for example, a callback may always use memory attached + * to the callback stream.</li> + * <li>The start of execution of a callback has the same effect as + * synchronizing an event recorded in the same stream immediately prior to + * the callback. It thus synchronizes streams which have been "joined" + * prior to the callback.</li> + * <li>Adding device work to any stream does not have the effect of making + * the stream active until all preceding callbacks have executed. Thus, for + * example, a callback might use global attached memory even if work has + * been added to another stream, if it has been properly ordered with an + * event.</li> + * <li>Completion of a callback does not cause a stream to become + * active except as described above. The callback stream will remain idle + * if no device work follows the callback, and will remain idle across + * consecutive callbacks without device work in between. 
Thus, for example, + * stream synchronization can be done by signaling from a callback at the + * end of the stream.</li> + * </ul> + * + * \param stream - Stream to add callback to + * \param callback - The function to call once preceding stream operations are complete + * \param userData - User specified data to be passed to the callback function + * \param flags - Reserved for future use, must be 0 + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorInvalidValue, + * ::cudaErrorNotSupported + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamSynchronize, ::cudaStreamWaitEvent, ::cudaStreamDestroy, ::cudaMallocManaged, ::cudaStreamAttachMemAsync, + * ::cudaLaunchHostFunc, ::cuStreamAddCallback + */ +extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream, + cudaStreamCallback_t callback, void *userData, unsigned int flags); + +/** + * \brief Waits for stream tasks to complete + * + * Blocks until \p stream has completed all operations. If the + * ::cudaDeviceScheduleBlockingSync flag was set for this device, + * the host thread will block until the stream is finished with + * all of its tasks. + * + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamQuery, ::cudaStreamWaitEvent, ::cudaStreamAddCallback, ::cudaStreamDestroy, + * ::cuStreamSynchronize + */ +extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream); + +/** + * \brief Queries an asynchronous stream for completion status + * + * Returns ::cudaSuccess if all operations in \p stream have + * completed, or ::cudaErrorNotReady if not. + * + * For the purposes of Unified Memory, a return value of ::cudaSuccess + * is equivalent to having called ::cudaStreamSynchronize(). + * + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorNotReady, + * ::cudaErrorInvalidResourceHandle + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, + * ::cuStreamQuery + */ +extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream); + +/** + * \brief Attach memory to a stream asynchronously + * + * Enqueues an operation in \p stream to specify stream association of + * \p length bytes of memory starting from \p devPtr. This function is a + * stream-ordered operation, meaning that it is dependent on, and will + * only take effect when, previous work in stream has completed. Any + * previous association is automatically replaced. + * + * \p devPtr must point to an one of the following types of memories: + * - managed memory declared using the __managed__ keyword or allocated with + * ::cudaMallocManaged. + * - a valid host-accessible region of system-allocated pageable memory. This + * type of memory may only be specified if the device associated with the + * stream reports a non-zero value for the device attribute + * ::cudaDevAttrPageableMemoryAccess. + * + * For managed allocations, \p length must be either zero or the entire + * allocation's size. Both indicate that the entire allocation's stream + * association is being changed. 
Currently, it is not possible to change stream + * association for a portion of a managed allocation. + * + * For pageable allocations, \p length must be non-zero. + * + * The stream association is specified using \p flags which must be + * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle. + * The default value for \p flags is ::cudaMemAttachSingle + * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed + * by any stream on any device. + * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee + * that it won't access the memory on the device from any stream on a device that + * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. + * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with + * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess, + * the program makes a guarantee that it will only access the memory on the device + * from \p stream. It is illegal to attach singly to the NULL stream, because the + * NULL stream is a virtual global stream and not a specific stream. An error will + * be returned in this case. + * + * When memory is associated with a single stream, the Unified Memory system will + * allow CPU access to this memory region so long as all operations in \p stream + * have completed, regardless of whether other streams are active. In effect, + * this constrains exclusive ownership of the managed memory region by + * an active GPU to per-stream activity instead of whole-GPU activity. + * + * Accessing memory on the device from streams that are not associated with + * it will produce undefined results. No error checking is performed by the + * Unified Memory system to ensure that kernels launched into other streams + * do not access this region. + * + * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync + * via events, synchronization or other means to ensure legal access to memory + * at all times. Data visibility and coherency will be changed appropriately + * for all kernels which follow a stream-association change. + * + * If \p stream is destroyed while data is associated with it, the association is + * removed and the association reverts to the default visibility of the allocation + * as specified at ::cudaMallocManaged. For __managed__ variables, the default + * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an + * asynchronous operation, and as a result, the change to default association won't + * happen until all work in the stream has completed. 
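+ *
+ * An illustrative sketch (\c work is a placeholder kernel; a device reporting
+ * 0 for ::cudaDevAttrConcurrentManagedAccess is assumed, which is where
+ * single-stream attachment matters): \code
+     cudaStream_t s;
+     int *data;
+     cudaStreamCreate(&s);
+     cudaMallocManaged(&data, 4096);
+     cudaStreamAttachMemAsync(s, data, 0, cudaMemAttachSingle);  // length 0 = whole allocation
+     work<<<1, 256, 0, s>>>(data);       // placeholder kernel in stream 's'
+     cudaStreamSynchronize(s);           // after this the CPU may safely touch 'data'
+     data[0] += 1;
+ * \endcode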
+ * + * \param stream - Stream in which to enqueue the attach operation + * \param devPtr - Pointer to memory (must be a pointer to managed memory or + * to a valid host-accessible region of system-allocated + * memory) + * \param length - Length of memory (defaults to zero) + * \param flags - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle) + * + * \return + * ::cudaSuccess, + * ::cudaErrorNotReady, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged, + * ::cuStreamAttachMemAsync + */ +#if defined(__cplusplus) +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags = cudaMemAttachSingle); +#else +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length __dv(0), unsigned int flags); +#endif + +/** + * \brief Begins graph capture on a stream + * + * Begin graph capture on \p stream. When a stream is in capture mode, all operations + * pushed into the stream will not be executed, but will instead be captured into + * a graph, which will be returned via ::cudaStreamEndCapture. Capture may not be initiated + * if \p stream is ::cudaStreamLegacy. Capture must be ended on the same stream in which + * it was initiated, and it may only be initiated if the stream is not already in capture + * mode. The capture mode may be queried via ::cudaStreamIsCapturing. A unique id + * representing the capture sequence may be queried via ::cudaStreamGetCaptureInfo. + * + * If \p mode is not ::cudaStreamCaptureModeRelaxed, ::cudaStreamEndCapture must be + * called on this stream from the same thread. + * + * \note Kernels captured using this API must not use texture and surface references. + * Reading or writing through any texture or surface reference is undefined + * behavior. This restriction does not apply to texture and surface objects. + * + * \param stream - Stream in which to initiate capture + * \param mode - Controls the interaction of this capture sequence with other API + * calls that are potentially unsafe. For more details see + * ::cudaThreadExchangeStreamCaptureMode. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * + * \sa + * ::cudaStreamCreate, + * ::cudaStreamIsCapturing, + * ::cudaStreamEndCapture, + * ::cudaThreadExchangeStreamCaptureMode + */ +extern __host__ cudaError_t CUDARTAPI cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode); + +/** + * \brief Swaps the stream capture interaction mode for a thread + * + * Sets the calling thread's stream capture interaction mode to the value contained + * in \p *mode, and overwrites \p *mode with the previous mode for the thread. To + * facilitate deterministic behavior across function or module boundaries, callers + * are encouraged to use this API in a push-pop fashion: \code + cudaStreamCaptureMode mode = desiredMode; + cudaThreadExchangeStreamCaptureMode(&mode); + ... + cudaThreadExchangeStreamCaptureMode(&mode); // restore previous mode + * \endcode + * + * During stream capture (see ::cudaStreamBeginCapture), some actions, such as a call + * to ::cudaMalloc, may be unsafe. 
In the case of ::cudaMalloc, the operation is + * not enqueued asynchronously to a stream, and is not observed by stream capture. + * Therefore, if the sequence of operations captured via ::cudaStreamBeginCapture + * depended on the allocation being replayed whenever the graph is launched, the + * captured graph would be invalid. + * + * Therefore, stream capture places restrictions on API calls that can be made within + * or concurrently to a ::cudaStreamBeginCapture-::cudaStreamEndCapture sequence. This + * behavior can be controlled via this API and flags to ::cudaStreamBeginCapture. + * + * A thread's mode is one of the following: + * - \p cudaStreamCaptureModeGlobal: This is the default mode. If the local thread has + * an ongoing capture sequence that was not initiated with + * \p cudaStreamCaptureModeRelaxed at \p cuStreamBeginCapture, or if any other thread + * has a concurrent capture sequence initiated with \p cudaStreamCaptureModeGlobal, + * this thread is prohibited from potentially unsafe API calls. + * - \p cudaStreamCaptureModeThreadLocal: If the local thread has an ongoing capture + * sequence not initiated with \p cudaStreamCaptureModeRelaxed, it is prohibited + * from potentially unsafe API calls. Concurrent capture sequences in other threads + * are ignored. + * - \p cudaStreamCaptureModeRelaxed: The local thread is not prohibited from potentially + * unsafe API calls. Note that the thread is still prohibited from API calls which + * necessarily conflict with stream capture, for example, attempting ::cudaEventQuery + * on an event that was last recorded inside a capture sequence. + * + * \param mode - Pointer to mode value to swap with the current mode + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * + * \sa + * ::cudaStreamBeginCapture + */ +extern __host__ cudaError_t CUDARTAPI cudaThreadExchangeStreamCaptureMode(enum cudaStreamCaptureMode *mode); + +/** + * \brief Ends capture on a stream, returning the captured graph + * + * End capture on \p stream, returning the captured graph via \p pGraph. + * Capture must have been initiated on \p stream via a call to ::cudaStreamBeginCapture. + * If capture was invalidated, due to a violation of the rules of stream capture, then + * a NULL graph will be returned. + * + * If the \p mode argument to ::cudaStreamBeginCapture was not + * ::cudaStreamCaptureModeRelaxed, this call must be from the same thread as + * ::cudaStreamBeginCapture. + * + * \param stream - Stream to query + * \param pGraph - The captured graph + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorStreamCaptureWrongThread + * \notefnerr + * + * \sa + * ::cudaStreamCreate, + * ::cudaStreamBeginCapture, + * ::cudaStreamIsCapturing + */ +extern __host__ cudaError_t CUDARTAPI cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph); + +/** + * \brief Returns a stream's capture status + * + * Return the capture status of \p stream via \p pCaptureStatus. After a successful + * call, \p *pCaptureStatus will contain one of the following: + * - ::cudaStreamCaptureStatusNone: The stream is not capturing. + * - ::cudaStreamCaptureStatusActive: The stream is capturing. + * - ::cudaStreamCaptureStatusInvalidated: The stream was capturing but an error + * has invalidated the capture sequence. The capture sequence must be terminated + * with ::cudaStreamEndCapture on the stream where it was initiated in order to + * continue using \p stream. 
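+ *
+ * An illustrative sketch (\p stream is a non-default stream created elsewhere;
+ * \c step is a placeholder kernel) showing how the status changes across a
+ * capture sequence: \code
+     enum cudaStreamCaptureStatus st;
+     cudaGraph_t graph;
+     cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
+     cudaStreamIsCapturing(stream, &st);   // st == cudaStreamCaptureStatusActive
+     step<<<32, 128, 0, stream>>>();       // captured into the graph, not executed
+     cudaStreamEndCapture(stream, &graph);
+     cudaStreamIsCapturing(stream, &st);   // st == cudaStreamCaptureStatusNone
+ * \endcode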
+ * + * Note that, if this is called on ::cudaStreamLegacy (the "null stream") while + * a blocking stream on the same device is capturing, it will return + * ::cudaErrorStreamCaptureImplicit and \p *pCaptureStatus is unspecified + * after the call. The blocking stream capture is not invalidated. + * + * When a blocking stream is capturing, the legacy stream is in an + * unusable state until the blocking stream capture is terminated. The legacy + * stream is not supported for stream capture, but attempted use would have an + * implicit dependency on the capturing stream(s). + * + * \param stream - Stream to query + * \param pCaptureStatus - Returns the stream's capture status + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorStreamCaptureImplicit + * \notefnerr + * + * \sa + * ::cudaStreamCreate, + * ::cudaStreamBeginCapture, + * ::cudaStreamEndCapture + */ +extern __host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus); + +/** + * \brief Query capture status of a stream + * + * Note there is a later version of this API, ::cudaStreamGetCaptureInfo_v2. It will + * supplant this version in 12.0, which is retained for minor version compatibility. + * + * Query the capture status of a stream and get a unique id representing + * the capture sequence over the lifetime of the process. + * + * If called on ::cudaStreamLegacy (the "null stream") while a stream not created + * with ::cudaStreamNonBlocking is capturing, returns ::cudaErrorStreamCaptureImplicit. + * + * A valid id is returned only if both of the following are true: + * - the call returns ::cudaSuccess + * - captureStatus is set to ::cudaStreamCaptureStatusActive + * + * \param stream - Stream to query + * \param pCaptureStatus - Returns the stream's capture status + * \param pId - Returns the unique id of the capture sequence + * + * \return + * ::cudaSuccess, + * ::cudaErrorStreamCaptureImplicit + * \notefnerr + * + * \sa + * ::cudaStreamGetCaptureInfo_v2, + * ::cudaStreamBeginCapture, + * ::cudaStreamIsCapturing + */ +extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus, unsigned long long *pId); + +/** + * \brief Query a stream's capture state (11.3+) + * + * Query stream state related to stream capture. + * + * If called on ::cudaStreamLegacy (the "null stream") while a stream not created + * with ::cudaStreamNonBlocking is capturing, returns ::cudaErrorStreamCaptureImplicit. + * + * Valid data (other than capture status) is returned only if both of the following are true: + * - the call returns cudaSuccess + * - the returned capture status is ::cudaStreamCaptureStatusActive + * + * This version of cudaStreamGetCaptureInfo is introduced in CUDA 11.3 and will supplant the + * previous version ::cudaStreamGetCaptureInfo in 12.0. Developers requiring compatibility + * across minor versions to CUDA 11.0 (driver version 445) can do one of the following: + * - Use the older version of the API, ::cudaStreamGetCaptureInfo + * - Pass null for all of \p graph_out, \p dependencies_out, and \p numDependencies_out. + * + * \param stream - The stream to query + * \param captureStatus_out - Location to return the capture status of the stream; required + * \param id_out - Optional location to return an id for the capture sequence, which is + * unique over the lifetime of the process + * \param graph_out - Optional location to return the graph being captured into. 
All + * operations other than destroy and node removal are permitted on the graph + * while the capture sequence is in progress. This API does not transfer + * ownership of the graph, which is transferred or destroyed at + * ::cudaStreamEndCapture. Note that the graph handle may be invalidated before + * end of capture for certain errors. Nodes that are or become + * unreachable from the original stream at ::cudaStreamEndCapture due to direct + * actions on the graph do not trigger ::cudaErrorStreamCaptureUnjoined. + * \param dependencies_out - Optional location to store a pointer to an array of nodes. + * The next node to be captured in the stream will depend on this set of nodes, + * absent operations such as event wait which modify this set. The array pointer + * is valid until the next API call which operates on the stream or until end of + * capture. The node handles may be copied out and are valid until they or the + * graph is destroyed. The driver-owned array may also be passed directly to + * APIs that operate on the graph (not the stream) without copying. + * \param numDependencies_out - Optional location to store the size of the array + * returned in dependencies_out. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorStreamCaptureImplicit + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cudaStreamGetCaptureInfo, + * ::cudaStreamBeginCapture, + * ::cudaStreamIsCapturing, + * ::cudaStreamUpdateCaptureDependencies + */ +extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_v2(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0), const cudaGraphNode_t **dependencies_out __dv(0), size_t *numDependencies_out __dv(0)); + +/** + * \brief Update the set of dependencies in a capturing stream (11.3+) + * + * Modifies the dependency set of a capturing stream. The dependency set is the set + * of nodes that the next captured node in the stream will depend on. + * + * Valid flags are ::cudaStreamAddCaptureDependencies and + * ::cudaStreamSetCaptureDependencies. These control whether the set passed to + * the API is added to the existing set or replaces it. A flags value of 0 defaults + * to ::cudaStreamAddCaptureDependencies. + * + * Nodes that are removed from the dependency set via this API do not result in + * ::cudaErrorStreamCaptureUnjoined if they are unreachable from the stream at + * ::cudaStreamEndCapture. + * + * Returns ::cudaErrorIllegalState if the stream is not capturing. + * + * This API is new in CUDA 11.3. Developers requiring compatibility across minor + * versions of the CUDA driver to 11.0 should not use this API or provide a fallback. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorIllegalState + * \notefnerr + * + * \sa + * ::cudaStreamBeginCapture, + * ::cudaStreamGetCaptureInfo, + * ::cudaStreamGetCaptureInfo_v2 + */ +extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies(cudaStream_t stream, cudaGraphNode_t *dependencies, size_t numDependencies, unsigned int flags __dv(0)); +/** @} */ /* END CUDART_STREAM */ + +/** + * \defgroup CUDART_EVENT Event Management + * + * ___MANBRIEF___ event management functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the event management functions of the CUDA runtime + * application programming interface. 
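+ *
+ * As an illustrative sketch of the typical create/record/synchronize/elapsed-time
+ * pattern (the kernel name \p myKernel and its launch configuration are
+ * placeholders; error checking is omitted):
+ * \code
+ cudaEvent_t start, stop;
+ cudaEventCreate(&start);
+ cudaEventCreate(&stop);
+ cudaEventRecord(start, 0);
+ myKernel<<<grid, block>>>();
+ cudaEventRecord(stop, 0);
+ cudaEventSynchronize(stop);
+ float ms = 0.0f;
+ cudaEventElapsedTime(&ms, start, stop);
+ cudaEventDestroy(start);
+ cudaEventDestroy(stop);
+ * \endcode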
+ * + * @{ + */ + +/** + * \brief Creates an event object + * + * Creates an event object for the current device using ::cudaEventDefault. + * + * \param event - Newly created event + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorLaunchFailure, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*, unsigned int) "cudaEventCreate (C++ API)", + * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery, + * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cudaStreamWaitEvent, + * ::cuEventCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaEventCreate(cudaEvent_t *event); + +/** + * \brief Creates an event object with the specified flags + * + * Creates an event object for the current device with the specified flags. Valid + * flags include: + * - ::cudaEventDefault: Default event creation flag. + * - ::cudaEventBlockingSync: Specifies that event should use blocking + * synchronization. A host thread that uses ::cudaEventSynchronize() to wait + * on an event created with this flag will block until the event actually + * completes. + * - ::cudaEventDisableTiming: Specifies that the created event does not need + * to record timing data. Events created with this flag specified and + * the ::cudaEventBlockingSync flag not specified will provide the best + * performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery(). + * - ::cudaEventInterprocess: Specifies that the created event may be used as an + * interprocess event by ::cudaIpcGetEventHandle(). ::cudaEventInterprocess must + * be specified along with ::cudaEventDisableTiming. + * + * \param event - Newly created event + * \param flags - Flags for new event + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorLaunchFailure, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cudaStreamWaitEvent, + * ::cuEventCreate + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags); + +/** + * \brief Records an event + * + * Captures in \p event the contents of \p stream at the time of this call. + * \p event and \p stream must be on the same CUDA context. + * Calls such as ::cudaEventQuery() or ::cudaStreamWaitEvent() will then + * examine or wait for completion of the work that was captured. Uses of + * \p stream after this call do not modify \p event. See note on default + * stream behavior for what is captured in the default case. + * + * ::cudaEventRecord() can be called multiple times on the same event and + * will overwrite the previously captured state. Other APIs such as + * ::cudaStreamWaitEvent() use the most recently captured state at the time + * of the API call, and are not affected by later calls to + * ::cudaEventRecord(). Before the first call to ::cudaEventRecord(), an + * event represents an empty set of work, so for example ::cudaEventQuery() + * would return ::cudaSuccess. 
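+ *
+ * For example, an event recorded in one stream can be used to order work in
+ * another stream (a sketch only; \p ev, \p streamA, \p streamB and
+ * \p consumerKernel are placeholders assumed to be set up elsewhere):
+ * \code
+ cudaEventRecord(ev, streamA);          // capture the work submitted to streamA so far
+ cudaStreamWaitEvent(streamB, ev, 0);   // streamB waits for that captured work
+ consumerKernel<<<grid, block, 0, streamB>>>();
+ * \endcode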
+ * + * \param event - Event to record + * \param stream - Stream in which to record event + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorLaunchFailure + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventQuery, + * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cudaStreamWaitEvent, + * ::cudaEventRecordWithFlags, + * ::cuEventRecord + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)); + +/** + * \brief Records an event + * + * Captures in \p event the contents of \p stream at the time of this call. + * \p event and \p stream must be on the same CUDA context. + * Calls such as ::cudaEventQuery() or ::cudaStreamWaitEvent() will then + * examine or wait for completion of the work that was captured. Uses of + * \p stream after this call do not modify \p event. See note on default + * stream behavior for what is captured in the default case. + * + * ::cudaEventRecordWithFlags() can be called multiple times on the same event and + * will overwrite the previously captured state. Other APIs such as + * ::cudaStreamWaitEvent() use the most recently captured state at the time + * of the API call, and are not affected by later calls to + * ::cudaEventRecordWithFlags(). Before the first call to ::cudaEventRecordWithFlags(), an + * event represents an empty set of work, so for example ::cudaEventQuery() + * would return ::cudaSuccess. + * + * flags include: + * - ::cudaEventRecordDefault: Default event creation flag. + * - ::cudaEventRecordExternal: Event is captured in the graph as an external + * event node when performing stream capture. + * + * \param event - Event to record + * \param stream - Stream in which to record event + * \param flags - Parameters for the operation(See above) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorLaunchFailure + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventQuery, + * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cudaStreamWaitEvent, + * ::cudaEventRecord, + * ::cuEventRecord, + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream __dv(0), unsigned int flags __dv(0)); +#endif + +/** + * \brief Queries an event's status + * + * Queries the status of all work currently captured by \p event. See + * ::cudaEventRecord() for details on what is captured by an event. + * + * Returns ::cudaSuccess if all captured work has been completed, or + * ::cudaErrorNotReady if any captured work is incomplete. + * + * For the purposes of Unified Memory, a return value of ::cudaSuccess + * is equivalent to having called ::cudaEventSynchronize(). 
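+ *
+ * A small usage sketch that overlaps host work with outstanding device work
+ * (\p ev, \p stream and \p doHostWork() are placeholders):
+ * \code
+ cudaEventRecord(ev, stream);
+ while (cudaEventQuery(ev) == cudaErrorNotReady) {
+     doHostWork();                      // keep the host busy while the device finishes
+ }
+ * \endcode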
+ * + * \param event - Event to query + * + * \return + * ::cudaSuccess, + * ::cudaErrorNotReady, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorLaunchFailure + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventRecord, + * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cuEventQuery + */ +extern __host__ cudaError_t CUDARTAPI cudaEventQuery(cudaEvent_t event); + +/** + * \brief Waits for an event to complete + * + * Waits until the completion of all work currently captured in \p event. + * See ::cudaEventRecord() for details on what is captured by an event. + * + * Waiting for an event that was created with the ::cudaEventBlockingSync + * flag will cause the calling CPU thread to block until the event has + * been completed by the device. If the ::cudaEventBlockingSync flag has + * not been set, then the CPU thread will busy-wait until the event has + * been completed by the device. + * + * \param event - Event to wait for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorLaunchFailure + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventRecord, + * ::cudaEventQuery, ::cudaEventDestroy, ::cudaEventElapsedTime, + * ::cuEventSynchronize + */ +extern __host__ cudaError_t CUDARTAPI cudaEventSynchronize(cudaEvent_t event); + +/** + * \brief Destroys an event object + * + * Destroys the event specified by \p event. + * + * An event may be destroyed before it is complete (i.e., while + * ::cudaEventQuery() would return ::cudaErrorNotReady). In this case, the + * call does not block on completion of the event, and any associated + * resources will automatically be released asynchronously at completion. + * + * \param event - Event to destroy + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorLaunchFailure + * \notefnerr + * \note_init_rt + * \note_callback + * \note_destroy_ub + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventQuery, + * ::cudaEventSynchronize, ::cudaEventRecord, ::cudaEventElapsedTime, + * ::cuEventDestroy + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventDestroy(cudaEvent_t event); + +/** + * \brief Computes the elapsed time between events + * + * Computes the elapsed time between two events (in milliseconds with a + * resolution of around 0.5 microseconds). + * + * If either event was last recorded in a non-NULL stream, the resulting time + * may be greater than expected (even if both used the same stream handle). This + * happens because the ::cudaEventRecord() operation takes place asynchronously + * and there is no guarantee that the measured latency is actually just between + * the two events. Any number of other different stream operations could execute + * in between the two measured events, thus altering the timing in a significant + * way. + * + * If ::cudaEventRecord() has not been called on either event, then + * ::cudaErrorInvalidResourceHandle is returned. 
If ::cudaEventRecord() has been + * called on both events but one or both of them has not yet been completed + * (that is, ::cudaEventQuery() would return ::cudaErrorNotReady on at least one + * of the events), ::cudaErrorNotReady is returned. If either event was created + * with the ::cudaEventDisableTiming flag, then this function will return + * ::cudaErrorInvalidResourceHandle. + * + * \param ms - Time between \p start and \p end in ms + * \param start - Starting event + * \param end - Ending event + * + * \return + * ::cudaSuccess, + * ::cudaErrorNotReady, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorLaunchFailure, + * ::cudaErrorUnknown + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)", + * ::cudaEventCreateWithFlags, ::cudaEventQuery, + * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventRecord, + * ::cuEventElapsedTime + */ +extern __host__ cudaError_t CUDARTAPI cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end); + +/** @} */ /* END CUDART_EVENT */ + +/** + * \defgroup CUDART_EXTRES_INTEROP External Resource Interoperability + * + * ___MANBRIEF___ External resource interoperability functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the external resource interoperability functions of the CUDA + * runtime application programming interface. + * + * @{ + */ + +/** + * \brief Imports an external memory object + * + * Imports an externally allocated memory object and returns + * a handle to that in \p extMem_out. + * + * The properties of the handle being imported must be described in + * \p memHandleDesc. The ::cudaExternalMemoryHandleDesc structure + * is defined as follows: + * + * \code + typedef struct cudaExternalMemoryHandleDesc_st { + cudaExternalMemoryHandleType type; + union { + int fd; + struct { + void *handle; + const void *name; + } win32; + const void *nvSciBufObject; + } handle; + unsigned long long size; + unsigned int flags; + } cudaExternalMemoryHandleDesc; + * \endcode + * + * where ::cudaExternalMemoryHandleDesc::type specifies the type + * of handle being imported. ::cudaExternalMemoryHandleType is + * defined as: + * + * \code + typedef enum cudaExternalMemoryHandleType_enum { + cudaExternalMemoryHandleTypeOpaqueFd = 1, + cudaExternalMemoryHandleTypeOpaqueWin32 = 2, + cudaExternalMemoryHandleTypeOpaqueWin32Kmt = 3, + cudaExternalMemoryHandleTypeD3D12Heap = 4, + cudaExternalMemoryHandleTypeD3D12Resource = 5, + cudaExternalMemoryHandleTypeD3D11Resource = 6, + cudaExternalMemoryHandleTypeD3D11ResourceKmt = 7, + cudaExternalMemoryHandleTypeNvSciBuf = 8 + } cudaExternalMemoryHandleType; + * \endcode + * + * If ::cudaExternalMemoryHandleDesc::type is + * ::cudaExternalMemoryHandleTypeOpaqueFd, then + * ::cudaExternalMemoryHandleDesc::handle::fd must be a valid + * file descriptor referencing a memory object. Ownership of + * the file descriptor is transferred to the CUDA driver when the + * handle is imported successfully. Performing any operations on the + * file descriptor after it is imported results in undefined behavior. + * + * If ::cudaExternalMemoryHandleDesc::type is + * ::cudaExternalMemoryHandleTypeOpaqueWin32, then exactly one + * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and + * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be + * NULL. 
If ::cudaExternalMemoryHandleDesc::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a memory object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::cudaExternalMemoryHandleDesc::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a memory object. + * + * If ::cudaExternalMemoryHandleDesc::type is + * ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt, then + * ::cudaExternalMemoryHandleDesc::handle::win32::handle must + * be non-NULL and + * ::cudaExternalMemoryHandleDesc::handle::win32::name + * must be NULL. The handle specified must be a globally shared KMT + * handle. This handle does not hold a reference to the underlying + * object, and thus will be invalid when all references to the + * memory object are destroyed. + * + * If ::cudaExternalMemoryHandleDesc::type is + * ::cudaExternalMemoryHandleTypeD3D12Heap, then exactly one + * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and + * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be + * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3D12Device::CreateSharedHandle when referring to a + * ID3D12Heap object. This handle holds a reference to the underlying + * object. If ::cudaExternalMemoryHandleDesc::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D12Heap object. + * + * If ::cudaExternalMemoryHandleDesc::type is + * ::cudaExternalMemoryHandleTypeD3D12Resource, then exactly one + * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and + * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be + * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3D12Device::CreateSharedHandle when referring to a + * ID3D12Resource object. This handle holds a reference to the + * underlying object. If + * ::cudaExternalMemoryHandleDesc::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D12Resource object. + * + * If ::cudaExternalMemoryHandleDesc::type is + * ::cudaExternalMemoryHandleTypeD3D11Resource,then exactly one + * of ::cudaExternalMemoryHandleDesc::handle::win32::handle and + * ::cudaExternalMemoryHandleDesc::handle::win32::name must not be + * NULL. If ::cudaExternalMemoryHandleDesc::handle::win32::handle is + * not NULL, then it must represent a valid shared NT handle that is + * returned by IDXGIResource1::CreateSharedHandle when referring to a + * ID3D11Resource object. If + * ::cudaExternalMemoryHandleDesc::handle::win32::name + * is not NULL, then it must point to a NULL-terminated array of + * UTF-16 characters that refers to a ID3D11Resource object. + * + * If ::cudaExternalMemoryHandleDesc::type is + * ::cudaExternalMemoryHandleTypeD3D11ResourceKmt, then + * ::cudaExternalMemoryHandleDesc::handle::win32::handle must + * be non-NULL and ::cudaExternalMemoryHandleDesc::handle::win32::name + * must be NULL. The handle specified must be a valid shared KMT + * handle that is returned by IDXGIResource::GetSharedHandle when + * referring to a ID3D11Resource object. 
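+ *
+ * As an illustrative sketch for the opaque file descriptor case (the values
+ * of \p fd and \p size are assumed to be provided by the exporting API;
+ * error checking is omitted):
+ * \code
+ struct cudaExternalMemoryHandleDesc memDesc;
+ memset(&memDesc, 0, sizeof(memDesc));
+ memDesc.type = cudaExternalMemoryHandleTypeOpaqueFd;
+ memDesc.handle.fd = fd;                // descriptor exported by the other API
+ memDesc.size = size;                   // size of the memory object in bytes
+
+ cudaExternalMemory_t extMem;
+ cudaImportExternalMemory(&extMem, &memDesc);
+
+ struct cudaExternalMemoryBufferDesc bufDesc;
+ memset(&bufDesc, 0, sizeof(bufDesc));
+ bufDesc.offset = 0;
+ bufDesc.size = size;
+
+ void *devPtr = NULL;
+ cudaExternalMemoryGetMappedBuffer(&devPtr, extMem, &bufDesc);
+ * \endcode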
+ * + * If ::cudaExternalMemoryHandleDesc::type is + * ::cudaExternalMemoryHandleTypeNvSciBuf, then + * ::cudaExternalMemoryHandleDesc::handle::nvSciBufObject must be NON-NULL + * and reference a valid NvSciBuf object. + * If the NvSciBuf object imported into CUDA is also mapped by other drivers, then the + * application must use ::cudaWaitExternalSemaphoresAsync or ::cudaSignalExternalSemaphoresAsync + * as approprriate barriers to maintain coherence between CUDA and the other drivers. + * See ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync and ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync + * for memory synchronization. + * + * The size of the memory object must be specified in + * ::cudaExternalMemoryHandleDesc::size. + * + * Specifying the flag ::cudaExternalMemoryDedicated in + * ::cudaExternalMemoryHandleDesc::flags indicates that the + * resource is a dedicated resource. The definition of what a + * dedicated resource is outside the scope of this extension. + * This flag must be set if ::cudaExternalMemoryHandleDesc::type + * is one of the following: + * ::cudaExternalMemoryHandleTypeD3D12Resource + * ::cudaExternalMemoryHandleTypeD3D11Resource + * ::cudaExternalMemoryHandleTypeD3D11ResourceKmt + * + * \param extMem_out - Returned handle to an external memory object + * \param memHandleDesc - Memory import handle descriptor + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * + * \note If the Vulkan memory imported into CUDA is mapped on the CPU then the + * application must use vkInvalidateMappedMemoryRanges/vkFlushMappedMemoryRanges + * as well as appropriate Vulkan pipeline barriers to maintain coherence between + * CPU and GPU. For more information on these APIs, please refer to "Synchronization + * and Cache Control" chapter from Vulkan specification. + * + * + * \sa ::cudaDestroyExternalMemory, + * ::cudaExternalMemoryGetMappedBuffer, + * ::cudaExternalMemoryGetMappedMipmappedArray + */ +extern __host__ cudaError_t CUDARTAPI cudaImportExternalMemory(cudaExternalMemory_t *extMem_out, const struct cudaExternalMemoryHandleDesc *memHandleDesc); + +/** + * \brief Maps a buffer onto an imported memory object + * + * Maps a buffer onto an imported memory object and returns a device + * pointer in \p devPtr. + * + * The properties of the buffer being mapped must be described in + * \p bufferDesc. The ::cudaExternalMemoryBufferDesc structure is + * defined as follows: + * + * \code + typedef struct cudaExternalMemoryBufferDesc_st { + unsigned long long offset; + unsigned long long size; + unsigned int flags; + } cudaExternalMemoryBufferDesc; + * \endcode + * + * where ::cudaExternalMemoryBufferDesc::offset is the offset in + * the memory object where the buffer's base address is. + * ::cudaExternalMemoryBufferDesc::size is the size of the buffer. + * ::cudaExternalMemoryBufferDesc::flags must be zero. + * + * The offset and size have to be suitably aligned to match the + * requirements of the external API. Mapping two buffers whose ranges + * overlap may or may not result in the same virtual address being + * returned for the overlapped portion. In such cases, the application + * must ensure that all accesses to that region from the GPU are + * volatile. Otherwise writes made via one address are not guaranteed + * to be visible via the other address, even if they're issued by the + * same thread. 
It is recommended that applications map the combined + * range instead of mapping separate buffers and then apply the + * appropriate offsets to the returned pointer to derive the + * individual buffers. + * + * The returned pointer \p devPtr must be freed using ::cudaFree. + * + * \param devPtr - Returned device pointer to buffer + * \param extMem - Handle to external memory object + * \param bufferDesc - Buffer descriptor + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaImportExternalMemory, + * ::cudaDestroyExternalMemory, + * ::cudaExternalMemoryGetMappedMipmappedArray + */ +extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedBuffer(void **devPtr, cudaExternalMemory_t extMem, const struct cudaExternalMemoryBufferDesc *bufferDesc); + +/** + * \brief Maps a CUDA mipmapped array onto an external memory object + * + * Maps a CUDA mipmapped array onto an external object and returns a + * handle to it in \p mipmap. + * + * The properties of the CUDA mipmapped array being mapped must be + * described in \p mipmapDesc. The structure + * ::cudaExternalMemoryMipmappedArrayDesc is defined as follows: + * + * \code + typedef struct cudaExternalMemoryMipmappedArrayDesc_st { + unsigned long long offset; + cudaChannelFormatDesc formatDesc; + cudaExtent extent; + unsigned int flags; + unsigned int numLevels; + } cudaExternalMemoryMipmappedArrayDesc; + * \endcode + * + * where ::cudaExternalMemoryMipmappedArrayDesc::offset is the + * offset in the memory object where the base level of the mipmap + * chain is. + * ::cudaExternalMemoryMipmappedArrayDesc::formatDesc describes the + * format of the data. + * ::cudaExternalMemoryMipmappedArrayDesc::extent specifies the + * dimensions of the base level of the mipmap chain. + * ::cudaExternalMemoryMipmappedArrayDesc::flags are flags associated + * with CUDA mipmapped arrays. For further details, please refer to + * the documentation for ::cudaMalloc3DArray. Note that if the mipmapped + * array is bound as a color target in the graphics API, then the flag + * ::cudaArrayColorAttachment must be specified in + * ::cudaExternalMemoryMipmappedArrayDesc::flags. + * ::cudaExternalMemoryMipmappedArrayDesc::numLevels specifies + * the total number of levels in the mipmap chain. + * + * The returned CUDA mipmapped array must be freed using ::cudaFreeMipmappedArray. + * + * \param mipmap - Returned CUDA mipmapped array + * \param extMem - Handle to external memory object + * \param mipmapDesc - CUDA array descriptor + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * \note On Tegra devices, this API will always attempt to do a compressed mapping when the ::cudaExternalMemoryHandleDesc::type is + * ::cudaExternalMemoryHandleTypeOpaqueFd + * + * \sa ::cudaImportExternalMemory, + * ::cudaDestroyExternalMemory, + * ::cudaExternalMemoryGetMappedBuffer + * + * \note If ::cudaExternalMemoryHandleDesc::type is + * ::cudaExternalMemoryHandleTypeNvSciBuf, then + * ::cudaExternalMemoryMipmappedArrayDesc::numLevels must not be greater than 1. + */ +extern __host__ cudaError_t CUDARTAPI cudaExternalMemoryGetMappedMipmappedArray(cudaMipmappedArray_t *mipmap, cudaExternalMemory_t extMem, const struct cudaExternalMemoryMipmappedArrayDesc *mipmapDesc); + +/** + * \brief Destroys an external memory object. 
+ * + * Destroys the specified external memory object. Any existing buffers + * and CUDA mipmapped arrays mapped onto this object must no longer be + * used and must be explicitly freed using ::cudaFree and + * ::cudaFreeMipmappedArray respectively. + * + * \param extMem - External memory object to be destroyed + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * \note_destroy_ub + * + * \sa ::cudaImportExternalMemory, + * ::cudaExternalMemoryGetMappedBuffer, + * ::cudaExternalMemoryGetMappedMipmappedArray + */ +extern __host__ cudaError_t CUDARTAPI cudaDestroyExternalMemory(cudaExternalMemory_t extMem); + +/** + * \brief Imports an external semaphore + * + * Imports an externally allocated synchronization object and returns + * a handle to that in \p extSem_out. + * + * The properties of the handle being imported must be described in + * \p semHandleDesc. The ::cudaExternalSemaphoreHandleDesc is defined + * as follows: + * + * \code + typedef struct cudaExternalSemaphoreHandleDesc_st { + cudaExternalSemaphoreHandleType type; + union { + int fd; + struct { + void *handle; + const void *name; + } win32; + const void* NvSciSyncObj; + } handle; + unsigned int flags; + } cudaExternalSemaphoreHandleDesc; + * \endcode + * + * where ::cudaExternalSemaphoreHandleDesc::type specifies the type of + * handle being imported. ::cudaExternalSemaphoreHandleType is defined + * as: + * + * \code + typedef enum cudaExternalSemaphoreHandleType_enum { + cudaExternalSemaphoreHandleTypeOpaqueFd = 1, + cudaExternalSemaphoreHandleTypeOpaqueWin32 = 2, + cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt = 3, + cudaExternalSemaphoreHandleTypeD3D12Fence = 4, + cudaExternalSemaphoreHandleTypeD3D11Fence = 5, + cudaExternalSemaphoreHandleTypeNvSciSync = 6, + cudaExternalSemaphoreHandleTypeKeyedMutex = 7, + cudaExternalSemaphoreHandleTypeKeyedMutexKmt = 8, + cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd = 9, + cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32 = 10 + } cudaExternalSemaphoreHandleType; + * \endcode + * + * If ::cudaExternalSemaphoreHandleDesc::type is + * ::cudaExternalSemaphoreHandleTypeOpaqueFd, then + * ::cudaExternalSemaphoreHandleDesc::handle::fd must be a valid file + * descriptor referencing a synchronization object. Ownership of the + * file descriptor is transferred to the CUDA driver when the handle + * is imported successfully. Performing any operations on the file + * descriptor after it is imported results in undefined behavior. + * + * If ::cudaExternalSemaphoreHandleDesc::type is + * ::cudaExternalSemaphoreHandleTypeOpaqueWin32, then exactly one of + * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and + * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be + * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a synchronization object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::cudaExternalSemaphoreHandleDesc::handle::win32::name is + * not NULL, then it must name a valid synchronization object. 
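+ *
+ * As an illustrative sketch for the opaque file descriptor case (\p fd is
+ * assumed to be provided by the exporting API and \p stream by the caller;
+ * a zero-initialized parameter block is used and error checking is omitted):
+ * \code
+ struct cudaExternalSemaphoreHandleDesc semDesc;
+ memset(&semDesc, 0, sizeof(semDesc));
+ semDesc.type = cudaExternalSemaphoreHandleTypeOpaqueFd;
+ semDesc.handle.fd = fd;
+
+ cudaExternalSemaphore_t extSem;
+ cudaImportExternalSemaphore(&extSem, &semDesc);
+
+ struct cudaExternalSemaphoreSignalParams sigParams;
+ memset(&sigParams, 0, sizeof(sigParams));
+ cudaSignalExternalSemaphoresAsync(&extSem, &sigParams, 1, stream);
+ * \endcode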
+ * + * If ::cudaExternalSemaphoreHandleDesc::type is + * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt, then + * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle must be + * non-NULL and ::cudaExternalSemaphoreHandleDesc::handle::win32::name + * must be NULL. The handle specified must be a globally shared KMT + * handle. This handle does not hold a reference to the underlying + * object, and thus will be invalid when all references to the + * synchronization object are destroyed. + * + * If ::cudaExternalSemaphoreHandleDesc::type is + * ::cudaExternalSemaphoreHandleTypeD3D12Fence, then exactly one of + * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and + * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be + * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3D12Device::CreateSharedHandle when referring to a + * ID3D12Fence object. This handle holds a reference to the underlying + * object. If ::cudaExternalSemaphoreHandleDesc::handle::win32::name + * is not NULL, then it must name a valid synchronization object that + * refers to a valid ID3D12Fence object. + * + * If ::cudaExternalSemaphoreHandleDesc::type is + * ::cudaExternalSemaphoreHandleTypeD3D11Fence, then exactly one of + * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and + * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be + * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * is returned by ID3D11Fence::CreateSharedHandle. If + * ::cudaExternalSemaphoreHandleDesc::handle::win32::name + * is not NULL, then it must name a valid synchronization object that + * refers to a valid ID3D11Fence object. + * + * If ::cudaExternalSemaphoreHandleDesc::type is + * ::cudaExternalSemaphoreHandleTypeNvSciSync, then + * ::cudaExternalSemaphoreHandleDesc::handle::nvSciSyncObj + * represents a valid NvSciSyncObj. + * + * ::cudaExternalSemaphoreHandleTypeKeyedMutex, then exactly one of + * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and + * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be + * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle + * is not NULL, then it represent a valid shared NT handle that + * is returned by IDXGIResource1::CreateSharedHandle when referring to + * a IDXGIKeyedMutex object. + * + * If ::cudaExternalSemaphoreHandleDesc::type is + * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt, then + * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle must be + * non-NULL and ::cudaExternalSemaphoreHandleDesc::handle::win32::name + * must be NULL. The handle specified must represent a valid KMT + * handle that is returned by IDXGIResource::GetSharedHandle when + * referring to a IDXGIKeyedMutex object. + * + * If ::cudaExternalSemaphoreHandleDesc::type is + * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd, then + * ::cudaExternalSemaphoreHandleDesc::handle::fd must be a valid file + * descriptor referencing a synchronization object. Ownership of the + * file descriptor is transferred to the CUDA driver when the handle + * is imported successfully. Performing any operations on the file + * descriptor after it is imported results in undefined behavior. 
+ * + * If ::cudaExternalSemaphoreHandleDesc::type is + * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32, then exactly one of + * ::cudaExternalSemaphoreHandleDesc::handle::win32::handle and + * ::cudaExternalSemaphoreHandleDesc::handle::win32::name must not be + * NULL. If ::cudaExternalSemaphoreHandleDesc::handle::win32::handle + * is not NULL, then it must represent a valid shared NT handle that + * references a synchronization object. Ownership of this handle is + * not transferred to CUDA after the import operation, so the + * application must release the handle using the appropriate system + * call. If ::cudaExternalSemaphoreHandleDesc::handle::win32::name is + * not NULL, then it must name a valid synchronization object. + * + * \param extSem_out - Returned handle to an external semaphore + * \param semHandleDesc - Semaphore import handle descriptor + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDestroyExternalSemaphore, + * ::cudaSignalExternalSemaphoresAsync, + * ::cudaWaitExternalSemaphoresAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaImportExternalSemaphore(cudaExternalSemaphore_t *extSem_out, const struct cudaExternalSemaphoreHandleDesc *semHandleDesc); + +/** + * \brief Signals a set of external semaphore objects + * + * Enqueues a signal operation on a set of externally allocated + * semaphore object in the specified stream. The operations will be + * executed when all prior operations in the stream complete. + * + * The exact semantics of signaling a semaphore depends on the type of + * the object. + * + * If the semaphore object is any one of the following types: + * ::cudaExternalSemaphoreHandleTypeOpaqueFd, + * ::cudaExternalSemaphoreHandleTypeOpaqueWin32, + * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt + * then signaling the semaphore will set it to the signaled state. + * + * If the semaphore object is any one of the following types: + * ::cudaExternalSemaphoreHandleTypeD3D12Fence, + * ::cudaExternalSemaphoreHandleTypeD3D11Fence, + * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd, + * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32 + * then the semaphore will be set to the value specified in + * ::cudaExternalSemaphoreSignalParams::params::fence::value. + * + * If the semaphore object is of the type ::cudaExternalSemaphoreHandleTypeNvSciSync + * this API sets ::cudaExternalSemaphoreSignalParams::params::nvSciSync::fence to a + * value that can be used by subsequent waiters of the same NvSciSync object to + * order operations with those currently submitted in \p stream. Such an update + * will overwrite previous contents of + * ::cudaExternalSemaphoreSignalParams::params::nvSciSync::fence. By deefault, + * signaling such an external semaphore object causes appropriate memory synchronization + * operations to be performed over all the external memory objects that are imported as + * ::cudaExternalMemoryHandleTypeNvSciBuf. This ensures that any subsequent accesses + * made by other importers of the same set of NvSciBuf memory object(s) are coherent. + * These operations can be skipped by specifying the flag + * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync, which can be used as a + * performance optimization when data coherency is not required. But specifying this + * flag in scenarios where data coherency is required results in undefined behavior. 
+ * Also, for semaphore object of the type ::cudaExternalSemaphoreHandleTypeNvSciSync, + * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in + * ::cudaDeviceGetNvSciSyncAttributes to cudaNvSciSyncAttrSignal, this API will return + * cudaErrorNotSupported. + * + * If the semaphore object is any one of the following types: + * ::cudaExternalSemaphoreHandleTypeKeyedMutex, + * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt, + * then the keyed mutex will be released with the key specified in + * ::cudaExternalSemaphoreSignalParams::params::keyedmutex::key. + * + * \param extSemArray - Set of external semaphores to be signaled + * \param paramsArray - Array of semaphore parameters + * \param numExtSems - Number of semaphores to signal + * \param stream - Stream to enqueue the signal operations in + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaImportExternalSemaphore, + * ::cudaDestroyExternalSemaphore, + * ::cudaWaitExternalSemaphoresAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0)); + +/** + * \brief Waits on a set of external semaphore objects + * + * Enqueues a wait operation on a set of externally allocated + * semaphore object in the specified stream. The operations will be + * executed when all prior operations in the stream complete. + * + * The exact semantics of waiting on a semaphore depends on the type + * of the object. + * + * If the semaphore object is any one of the following types: + * ::cudaExternalSemaphoreHandleTypeOpaqueFd, + * ::cudaExternalSemaphoreHandleTypeOpaqueWin32, + * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt + * then waiting on the semaphore will wait until the semaphore reaches + * the signaled state. The semaphore will then be reset to the + * unsignaled state. Therefore for every signal operation, there can + * only be one wait operation. + * + * If the semaphore object is any one of the following types: + * ::cudaExternalSemaphoreHandleTypeD3D12Fence, + * ::cudaExternalSemaphoreHandleTypeD3D11Fence, + * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd, + * ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32 + * then waiting on the semaphore will wait until the value of the + * semaphore is greater than or equal to + * ::cudaExternalSemaphoreWaitParams::params::fence::value. + * + * If the semaphore object is of the type ::cudaExternalSemaphoreHandleTypeNvSciSync + * then, waiting on the semaphore will wait until the + * ::cudaExternalSemaphoreSignalParams::params::nvSciSync::fence is signaled by the + * signaler of the NvSciSyncObj that was associated with this semaphore object. + * By default, waiting on such an external semaphore object causes appropriate + * memory synchronization operations to be performed over all external memory objects + * that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf. This ensures that + * any subsequent accesses made by other importers of the same set of NvSciBuf memory + * object(s) are coherent. These operations can be skipped by specifying the flag + * ::cudaExternalSemaphoreWaitSkipNvSciBufMemSync, which can be used as a + * performance optimization when data coherency is not required. But specifying this + * flag in scenarios where data coherency is required results in undefined behavior. 
+ * Also, for semaphore object of the type ::cudaExternalSemaphoreHandleTypeNvSciSync, + * if the NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags in + * ::cudaDeviceGetNvSciSyncAttributes to cudaNvSciSyncAttrWait, this API will return + * cudaErrorNotSupported. + * + * If the semaphore object is any one of the following types: + * ::cudaExternalSemaphoreHandleTypeKeyedMutex, + * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt, + * then the keyed mutex will be acquired when it is released with the key specified + * in ::cudaExternalSemaphoreSignalParams::params::keyedmutex::key or + * until the timeout specified by + * ::cudaExternalSemaphoreSignalParams::params::keyedmutex::timeoutMs + * has lapsed. The timeout interval can either be a finite value + * specified in milliseconds or an infinite value. In case an infinite + * value is specified the timeout never elapses. The windows INFINITE + * macro must be used to specify infinite timeout + * + * \param extSemArray - External semaphores to be waited on + * \param paramsArray - Array of semaphore parameters + * \param numExtSems - Number of semaphores to wait on + * \param stream - Stream to enqueue the wait operations in + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle + * ::cudaErrorTimeout + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaImportExternalSemaphore, + * ::cudaDestroyExternalSemaphore, + * ::cudaSignalExternalSemaphoresAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0)); + +/** + * \brief Destroys an external semaphore + * + * Destroys an external semaphore object and releases any references + * to the underlying resource. Any outstanding signals or waits must + * have completed before the semaphore is destroyed. + * + * \param extSem - External semaphore to be destroyed + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * \note_destroy_ub + * + * \sa ::cudaImportExternalSemaphore, + * ::cudaSignalExternalSemaphoresAsync, + * ::cudaWaitExternalSemaphoresAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaDestroyExternalSemaphore(cudaExternalSemaphore_t extSem); + +/** @} */ /* END CUDART_EXTRES_INTEROP */ + +/** + * \defgroup CUDART_EXECUTION Execution Control + * + * ___MANBRIEF___ execution control functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the execution control functions of the CUDA runtime + * application programming interface. + * + * Some functions have overloaded C++ API template versions documented separately in the + * \ref CUDART_HIGHLEVEL "C++ API Routines" module. + * + * @{ + */ + +/** + * \brief Launches a device function + * + * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y + * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x × + * \p blockDim.y × \p blockDim.z) threads. + * + * If the kernel has N parameters the \p args should point to array of N pointers. + * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, point to the region + * of memory from which the actual parameter will be copied. 
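+ *
+ * For example (a sketch only; \p myKernel, its parameters and the launch
+ * configuration are placeholders):
+ * \code
+ __global__ void myKernel(int n, float *x);   // example kernel
+
+ int n = 1 << 20;
+ float *d_x;
+ cudaMalloc((void**)&d_x, n * sizeof(float));
+ void *args[] = { &n, &d_x };
+ cudaLaunchKernel((const void*)myKernel, dim3(4096), dim3(256), args, 0, 0);
+ * \endcode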
+ * + * For templated functions, pass the function symbol as follows: + * func_name<template_arg_0,...,template_arg_N> + * + * \p sharedMem sets the amount of dynamic shared memory that will be available to + * each thread block. + * + * \p stream specifies a stream the invocation is associated to. + * + * \param func - Device function symbol + * \param gridDim - Grid dimentions + * \param blockDim - Block dimentions + * \param args - Arguments + * \param sharedMem - Shared memory + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed, + * ::cudaErrorInvalidPtx, + * ::cudaErrorUnsupportedPtxVersion, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound, + * ::cudaErrorJitCompilationDisabled + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * ::cuLaunchKernel + */ +extern __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); + +/** + * \brief Launches a CUDA function with launch-time configuration + * + * Note that the functionally equivalent variadic template ::cudaLaunchKernelEx + * is available for C++11 and newer. + * + * Invokes the kernel \p func on \p config->gridDim (\p config->gridDim.x + * × \p config->gridDim.y × \p config->gridDim.z) grid of blocks. + * Each block contains \p config->blockDim (\p config->blockDim.x × + * \p config->blockDim.y × \p config->blockDim.z) threads. + * + * \p config->dynamicSmemBytes sets the amount of dynamic shared memory that + * will be available to each thread block. + * + * \p config->stream specifies a stream the invocation is associated to. + * + * Configuration beyond grid and block dimensions, dynamic shared memory size, + * and stream can be provided with the following two fields of \p config: + * + * \p config->attrs is an array of \p config->numAttrs contiguous + * ::cudaLaunchAttribute elements. The value of this pointer is not considered + * if \p config->numAttrs is zero. However, in that case, it is recommended to + * set the pointer to NULL. + * \p config->numAttrs is the number of attributes populating the first + * \p config->numAttrs positions of the \p config->attrs array. + * + * If the kernel has N parameters the \p args should point to array of N + * pointers. Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, point + * to the region of memory from which the actual parameter will be copied. + * + * N.B. This function is so named to avoid unintentionally invoking the + * templated version, \p cudaLaunchKernelEx, for kernels taking a single + * void** or void* parameter. 
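+ *
+ * A minimal sketch with no launch attributes (\p myKernel, its arguments
+ * \p n and \p d_x, and \p stream are placeholders):
+ * \code
+ cudaLaunchConfig_t config;
+ memset(&config, 0, sizeof(config));
+ config.gridDim  = dim3(4096);
+ config.blockDim = dim3(256);
+ config.dynamicSmemBytes = 0;
+ config.stream   = stream;
+ config.attrs    = NULL;                // no launch attributes
+ config.numAttrs = 0;
+
+ void *args[] = { &n, &d_x };
+ cudaLaunchKernelExC(&config, (const void*)myKernel, args);
+ * \endcode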
+ * + * \param config - Launch configuration + * \param func - Kernel to launch + * \param args - Array of pointers to kernel parameters + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorSharedObjectInitFailed, + * ::cudaErrorInvalidPtx, + * ::cudaErrorUnsupportedPtxVersion, + * ::cudaErrorNoKernelImageForDevice, + * ::cudaErrorJitCompilerNotFound, + * ::cudaErrorJitCompilationDisabled + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * \ref ::cudaLaunchKernelEx(const cudaLaunchConfig_t *config, void (*kernel)(ExpTypes...), ActTypes &&... args) "cudaLaunchKernelEx (C++ API)", + * ::cuLaunchKernelEx + */ +extern __host__ cudaError_t CUDARTAPI cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args); + +/** + * \brief Launches a device function where thread blocks can cooperate and synchronize as they execute + * + * The function invokes kernel \p func on \p gridDim (\p gridDim.x × \p gridDim.y + * × \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x × + * \p blockDim.y × \p blockDim.z) threads. + * + * The device on which this kernel is invoked must have a non-zero value for + * the device attribute ::cudaDevAttrCooperativeLaunch. + * + * The total number of blocks launched cannot exceed the maximum number of blocks per + * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. + * + * The kernel cannot make use of CUDA dynamic parallelism. + * + * If the kernel has N parameters the \p args should point to array of N pointers. + * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, point to the region + * of memory from which the actual parameter will be copied. + * + * For templated functions, pass the function symbol as follows: + * func_name<template_arg_0,...,template_arg_N> + * + * \p sharedMem sets the amount of dynamic shared memory that will be available to + * each thread block. + * + * \p stream specifies a stream the invocation is associated to. 
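+ *
+ * A usage sketch that sizes the grid from the occupancy and multiprocessor
+ * count described above (\p coopKernel, \p d_data and \p stream are
+ * placeholders; a block size of 256 is assumed and error checking is omitted):
+ * \code
+ int dev = 0, numSms = 0, numBlocksPerSm = 0;
+ cudaDeviceGetAttribute(&numSms, cudaDevAttrMultiProcessorCount, dev);
+ cudaOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocksPerSm,
+                                               (const void*)coopKernel, 256, 0);
+ void *args[] = { &d_data };
+ cudaLaunchCooperativeKernel((const void*)coopKernel,
+                             dim3(numBlocksPerSm * numSms), dim3(256),
+                             args, 0, stream);
+ * \endcode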
+ * + * \param func - Device function symbol + * \param gridDim - Grid dimentions + * \param blockDim - Block dimentions + * \param args - Arguments + * \param sharedMem - Shared memory + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorCooperativeLaunchTooLarge, + * ::cudaErrorSharedObjectInitFailed + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * \ref ::cudaLaunchCooperativeKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C++ API)", + * ::cudaLaunchCooperativeKernelMultiDevice, + * ::cuLaunchCooperativeKernel + */ +extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); + +/** + * \brief Launches device functions on multiple devices where thread blocks can cooperate and synchronize as they execute + * + * \deprecated This function is deprecated as of CUDA 11.3. + * + * Invokes kernels as specified in the \p launchParamsList array where each element + * of the array specifies all the parameters required to perform a single kernel launch. + * These kernels can cooperate and synchronize as they execute. The size of the array is + * specified by \p numDevices. + * + * No two kernels can be launched on the same device. All the devices targeted by this + * multi-device launch must be identical. All devices must have a non-zero value for the + * device attribute ::cudaDevAttrCooperativeMultiDeviceLaunch. + * + * The same kernel must be launched on all devices. Note that any __device__ or __constant__ + * variables are independently instantiated on every device. It is the application's + * responsiblity to ensure these variables are initialized and used appropriately. + * + * The size of the grids as specified in blocks, the size of the blocks themselves and the + * amount of shared memory used by each thread block must also match across all launched kernels. + * + * The streams used to launch these kernels must have been created via either ::cudaStreamCreate + * or ::cudaStreamCreateWithPriority or ::cudaStreamCreateWithPriority. The NULL stream or + * ::cudaStreamLegacy or ::cudaStreamPerThread cannot be used. + * + * The total number of blocks launched per kernel cannot exceed the maximum number of blocks + * per multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors + * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. Since the + * total number of blocks launched per device has to match across all devices, the maximum + * number of blocks that can be launched per device will be limited by the device with the + * least number of multiprocessors. + * + * The kernel cannot make use of CUDA dynamic parallelism. + * + * The ::cudaLaunchParams structure is defined as: + * \code + struct cudaLaunchParams + { + void *func; + dim3 gridDim; + dim3 blockDim; + void **args; + size_t sharedMem; + cudaStream_t stream; + }; + * \endcode + * where: + * - ::cudaLaunchParams::func specifies the kernel to be launched. This same functions must + * be launched on all devices. 
For templated functions, pass the function symbol as follows: + * func_name<template_arg_0,...,template_arg_N> + * - ::cudaLaunchParams::gridDim specifies the width, height and depth of the grid in blocks. + * This must match across all kernels launched. + * - ::cudaLaunchParams::blockDim is the width, height and depth of each thread block. This + * must match across all kernels launched. + * - ::cudaLaunchParams::args specifies the arguments to the kernel. If the kernel has + * N parameters then ::cudaLaunchParams::args should point to array of N pointers. Each + * pointer, from <tt>::cudaLaunchParams::args[0]</tt> to <tt>::cudaLaunchParams::args[N - 1]</tt>, + * point to the region of memory from which the actual parameter will be copied. + * - ::cudaLaunchParams::sharedMem is the dynamic shared-memory size per thread block in bytes. + * This must match across all kernels launched. + * - ::cudaLaunchParams::stream is the handle to the stream to perform the launch in. This cannot + * be the NULL stream or ::cudaStreamLegacy or ::cudaStreamPerThread. + * + * By default, the kernel won't begin execution on any GPU until all prior work in all the specified + * streams has completed. This behavior can be overridden by specifying the flag + * ::cudaCooperativeLaunchMultiDeviceNoPreSync. When this flag is specified, each kernel + * will only wait for prior work in the stream corresponding to that GPU to complete before it begins + * execution. + * + * Similarly, by default, any subsequent work pushed in any of the specified streams will not begin + * execution until the kernels on all GPUs have completed. This behavior can be overridden by specifying + * the flag ::cudaCooperativeLaunchMultiDeviceNoPostSync. When this flag is specified, + * any subsequent work pushed in any of the specified streams will only wait for the kernel launched + * on the GPU corresponding to that stream to complete before it begins execution. + * + * \param launchParamsList - List of launch parameters, one per device + * \param numDevices - Size of the \p launchParamsList array + * \param flags - Flags to control launch behavior + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidConfiguration, + * ::cudaErrorLaunchFailure, + * ::cudaErrorLaunchTimeout, + * ::cudaErrorLaunchOutOfResources, + * ::cudaErrorCooperativeLaunchTooLarge, + * ::cudaErrorSharedObjectInitFailed + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * \ref ::cudaLaunchCooperativeKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C++ API)", + * ::cudaLaunchCooperativeKernel, + * ::cuLaunchCooperativeKernelMultiDevice + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernelMultiDevice(struct cudaLaunchParams *launchParamsList, unsigned int numDevices, unsigned int flags __dv(0)); + +/** + * \brief Sets the preferred cache configuration for a device function + * + * On devices where the L1 cache and shared memory use the same hardware + * resources, this sets through \p cacheConfig the preferred cache configuration + * for the function specified via \p func. This is only a preference. The + * runtime will use the requested configuration if possible, but it is free to + * choose a different configuration if required to execute \p func. + * + * \p func is a device function symbol and must be declared as a + * \c __global__ function. 
If the specified function does not exist, + * then ::cudaErrorInvalidDeviceFunction is returned. For templated functions, + * pass the function symbol as follows: func_name<template_arg_0,...,template_arg_N> + * + * This setting does nothing on devices where the size of the L1 cache and + * shared memory are fixed. + * + * Launching a kernel with a different preference than the most recent + * preference setting may insert a device-side synchronization point. + * + * The supported cache configurations are: + * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default) + * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache + * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory + * - ::cudaFuncCachePreferEqual: prefer equal size L1 cache and shared memory + * + * \param func - Device function symbol + * \param cacheConfig - Requested cache configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction + * \notefnerr + * \note_string_api_deprecation2 + * \note_init_rt + * \note_callback + * + * \sa + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)", + * ::cudaThreadGetCacheConfig, + * ::cudaThreadSetCacheConfig, + * ::cuFuncSetCacheConfig + */ +extern __host__ cudaError_t CUDARTAPI cudaFuncSetCacheConfig(const void *func, enum cudaFuncCache cacheConfig); + +/** + * \brief Sets the shared memory configuration for a device function + * + * On devices with configurable shared memory banks, this function will + * force all subsequent launches of the specified device function to have + * the given shared memory bank size configuration. On any given launch of the + * function, the shared memory configuration of the device will be temporarily + * changed if needed to suit the function's preferred configuration. Changes in + * shared memory configuration between subsequent launches of functions, + * may introduce a device side synchronization point. + * + * Any per-function setting of shared memory bank size set via + * ::cudaFuncSetSharedMemConfig will override the device wide setting set by + * ::cudaDeviceSetSharedMemConfig. + * + * Changing the shared memory bank size will not increase shared memory usage + * or affect occupancy of kernels, but may have major effects on performance. + * Larger bank sizes will allow for greater potential bandwidth to shared memory, + * but will change what kinds of accesses to shared memory will result in bank + * conflicts. + * + * This function will do nothing on devices with fixed shared memory bank size. + * + * For templated functions, pass the function symbol as follows: + * func_name<template_arg_0,...,template_arg_N> + * + * The supported bank configurations are: + * - ::cudaSharedMemBankSizeDefault: use the device's shared memory configuration + * when launching this function. + * - ::cudaSharedMemBankSizeFourByte: set shared memory bank width to be + * four bytes natively when launching this function. + * - ::cudaSharedMemBankSizeEightByte: set shared memory bank width to be eight + * bytes natively when launching this function. 
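+ *
+ * A minimal usage sketch (illustrative only; \c myKernel is a placeholder
+ * kernel, and the choice of eight-byte banks simply assumes a kernel dominated
+ * by double-precision shared-memory traffic):
+ * \code
+ __global__ void myKernel(double *data) { /* ... */ }
+
+ void preferEightByteBanks(void)
+ {
+     // Request 8-byte shared memory banks for all subsequent launches of myKernel.
+     cudaFuncSetSharedMemConfig((const void*)myKernel, cudaSharedMemBankSizeEightByte);
+ }
+ * \endcode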
+ * + * \param func - Device function symbol + * \param config - Requested shared memory configuration + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * \notefnerr + * \note_string_api_deprecation2 + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceSetSharedMemConfig, + * ::cudaDeviceGetSharedMemConfig, + * ::cudaDeviceSetCacheConfig, + * ::cudaDeviceGetCacheConfig, + * ::cudaFuncSetCacheConfig, + * ::cuFuncSetSharedMemConfig + */ +extern __host__ cudaError_t CUDARTAPI cudaFuncSetSharedMemConfig(const void *func, enum cudaSharedMemConfig config); + +/** + * \brief Find out attributes for a given function + * + * This function obtains the attributes of a function specified via \p func. + * \p func is a device function symbol and must be declared as a + * \c __global__ function. The fetched attributes are placed in \p attr. + * If the specified function does not exist, then + * ::cudaErrorInvalidDeviceFunction is returned. For templated functions, pass + * the function symbol as follows: func_name<template_arg_0,...,template_arg_N> + * + * Note that some function attributes such as + * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock" + * may vary based on the device that is currently being used. + * + * \param attr - Return pointer to function's attributes + * \param func - Device function symbol + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction + * \notefnerr + * \note_string_api_deprecation2 + * \note_init_rt + * \note_callback + * + * \sa + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)", + * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)", + * ::cuFuncGetAttribute + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func); + + +/** + * \brief Set attributes for a given function + * + * This function sets the attributes of a function specified via \p func. + * The parameter \p func must be a pointer to a function that executes + * on the device. The parameter specified by \p func must be declared as a \p __global__ + * function. The enumeration defined by \p attr is set to the value defined by \p value. + * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned. + * If the specified attribute cannot be written, or if the value is incorrect, + * then ::cudaErrorInvalidValue is returned. + * + * Valid values for \p attr are: + * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - The requested maximum size in bytes of dynamically-allocated shared memory. The sum of this value and the function attribute ::sharedSizeBytes + * cannot exceed the device attribute ::cudaDevAttrMaxSharedMemoryPerBlockOptin. The maximal size of requestable dynamic shared memory may differ by GPU architecture. + * - ::cudaFuncAttributePreferredSharedMemoryCarveout - On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the total shared memory. See ::cudaDevAttrMaxSharedMemoryPerMultiprocessor. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. 
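+ *
+ * A minimal sketch of opting a kernel into a larger dynamic shared-memory
+ * allocation (illustrative only; \c myKernel and the 64 KB figure are
+ * placeholder assumptions, and the real upper bound comes from
+ * ::cudaDevAttrMaxSharedMemoryPerBlockOptin on the target device):
+ * \code
+ __global__ void myKernel(float *data) { /* ... */ }
+
+ void enableLargeDynamicSmem(void)
+ {
+     int bytes = 64 * 1024;   // requested dynamic shared memory per block
+     cudaFuncSetAttribute((const void*)myKernel,
+                          cudaFuncAttributeMaxDynamicSharedMemorySize, bytes);
+     // A subsequent launch may now pass up to `bytes` as its sharedMem argument.
+     myKernel<<<128, 256, bytes>>>(NULL);
+ }
+ * \endcode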
+ * + * \param func - Function to get attributes of + * \param attr - Attribute to set + * \param value - Value to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)", + * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFuncSetAttribute(const void *func, enum cudaFuncAttribute attr, int value); + +/** + * \brief Converts a double argument to be executed on a device + * + * \param d - Double to convert + * + * \deprecated This function is deprecated as of CUDA 7.5 + * + * Converts the double value of \p d to an internal float representation if + * the device does not support double arithmetic. If the device does natively + * support doubles, then this function does nothing. + * + * \return + * ::cudaSuccess + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * ::cudaSetDoubleForHost + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaSetDoubleForDevice(double *d); + +/** + * \brief Converts a double argument after execution on a device + * + * \deprecated This function is deprecated as of CUDA 7.5 + * + * Converts the double value of \p d from a potentially internal float + * representation if the device does not support double arithmetic. If the + * device does natively support doubles, then this function does nothing. + * + * \param d - Double to convert + * + * \return + * ::cudaSuccess + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)", + * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)", + * ::cudaSetDoubleForDevice + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaSetDoubleForHost(double *d); + +/** + * \brief Enqueues a host function call in a stream + * + * Enqueues a host function to run in a stream. The function will be called + * after currently enqueued work and will block work added after it. + * + * The host function must not make any CUDA API calls. Attempting to use a + * CUDA API may result in ::cudaErrorNotPermitted, but this is not required. + * The host function must not perform any synchronization that may depend on + * outstanding CUDA work not mandated to run earlier. Host functions without a + * mandated order (such as in independent streams) execute in undefined order + * and may be serialized. + * + * For the purposes of Unified Memory, execution makes a number of guarantees: + * <ul> + * <li>The stream is considered idle for the duration of the function's + * execution. Thus, for example, the function may always use memory attached + * to the stream it was enqueued in.</li> + * <li>The start of execution of the function has the same effect as + * synchronizing an event recorded in the same stream immediately prior to + * the function. 
It thus synchronizes streams which have been "joined" + * prior to the function.</li> + * <li>Adding device work to any stream does not have the effect of making + * the stream active until all preceding host functions and stream callbacks + * have executed. Thus, for + * example, a function might use global attached memory even if work has + * been added to another stream, if the work has been ordered behind the + * function call with an event.</li> + * <li>Completion of the function does not cause a stream to become + * active except as described above. The stream will remain idle + * if no device work follows the function, and will remain idle across + * consecutive host functions or stream callbacks without device work in + * between. Thus, for example, + * stream synchronization can be done by signaling from a host function at the + * end of the stream.</li> + * </ul> + * + * Note that, in contrast to ::cuStreamAddCallback, the function will not be + * called in the event of an error in the CUDA context. + * + * \param stream - Stream to enqueue function call in + * \param fn - The function to call once preceding stream operations are complete + * \param userData - User-specified data to be passed to the function + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorInvalidValue, + * ::cudaErrorNotSupported + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaStreamCreate, + * ::cudaStreamQuery, + * ::cudaStreamSynchronize, + * ::cudaStreamWaitEvent, + * ::cudaStreamDestroy, + * ::cudaMallocManaged, + * ::cudaStreamAttachMemAsync, + * ::cudaStreamAddCallback, + * ::cuLaunchHostFunc + */ +extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream, cudaHostFn_t fn, void *userData); + +/** @} */ /* END CUDART_EXECUTION */ + +/** + * \defgroup CUDART_OCCUPANCY Occupancy + * + * ___MANBRIEF___ occupancy calculation functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the occupancy calculation functions of the CUDA runtime + * application programming interface. + * + * Besides the occupancy calculator functions + * (\ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessor and \ref ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags), + * there are also C++ only occupancy-based launch configuration functions documented in + * \ref CUDART_HIGHLEVEL "C++ API Routines" module. + * + * See + * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSize (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)" + * \ref ::cudaOccupancyAvailableDynamicSMemPerBlock(size_t*, T, int, int) "cudaOccupancyAvailableDynamicSMemPerBlock (C++ API)", + * + * @{ + */ + +/** + * \brief Returns occupancy for a device function + * + * Returns in \p *numBlocks the maximum number of active blocks per + * streaming multiprocessor for the device function.
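+ *
+ * A minimal sketch of turning the per-SM result into a device-wide estimate
+ * (illustrative only; \c myKernel, the 256-thread block and the zero dynamic
+ * shared-memory size are placeholder assumptions):
+ * \code
+ __global__ void myKernel(float *data) { /* ... */ }
+
+ void estimateMaxActiveBlocks(void)
+ {
+     int device = 0, numSMs = 0, blocksPerSM = 0;
+     cudaGetDevice(&device);
+     cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, device);
+     cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM,
+                                                   (const void*)myKernel, 256, 0);
+     int maxActiveBlocks = blocksPerSM * numSMs;   // device-wide upper bound
+     (void)maxActiveBlocks;
+ }
+ * \endcode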
+ * + * \param numBlocks - Returned occupancy + * \param func - Kernel function for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, + * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)", + * \ref ::cudaOccupancyAvailableDynamicSMemPerBlock(size_t*, T, int, int) "cudaOccupancyAvailableDynamicSMemPerBlock (C++ API)", + * ::cuOccupancyMaxActiveBlocksPerMultiprocessor + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize); + +/** + * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM. + * + * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. + * + * \param dynamicSmemSize - Returned maximum dynamic shared memory + * \param func - Kernel function for which occupancy is calculated + * \param numBlocks - Number of blocks to fit on SM + * \param blockSize - Size of the block + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, + * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)", + * ::cudaOccupancyAvailableDynamicSMemPerBlock + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, const void *func, int numBlocks, int blockSize); + +/** + * \brief Returns occupancy for a device function with the specified flags + * + * Returns in \p *numBlocks the maximum number of active blocks per + * streaming multiprocessor for the device function. + * + * The \p flags parameter controls how special cases are handled. 
Valid flags include: + * + * - ::cudaOccupancyDefault: keeps the default behavior as + * ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * + * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior + * on platforms where global caching affects occupancy. On such platforms, if caching + * is enabled, but per-block SM resource usage would result in zero occupancy, the + * occupancy calculator will calculate the occupancy as if caching is disabled. + * Setting this flag makes the occupancy calculator return 0 in such cases. + * More information about this feature can be found in the "Unified L1/Texture Cache" + * section of the Maxwell tuning guide. + * + * \param numBlocks - Returned occupancy + * \param func - Kernel function for which occupancy is calculated + * \param blockSize - Block size the kernel is intended to be launched with + * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes + * \param flags - Requested behavior for the occupancy calculator + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor, + * \ref ::cudaOccupancyMaxPotentialBlockSize(int*, int*, T, size_t, int) "cudaOccupancyMaxPotentialBlockSize (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeWithFlags(int*, int*, T, size_t, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMem(int*, int*, T, UnaryFunction, int) "cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API)", + * \ref ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int*, int*, T, UnaryFunction, int, unsigned int) "cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API)", + * \ref ::cudaOccupancyAvailableDynamicSMemPerBlock(size_t*, T, int, int) "cudaOccupancyAvailableDynamicSMemPerBlock (C++ API)", + * ::cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags); + +/** + * \brief Given the kernel function (\p func) and launch configuration + * (\p config), return the maximum cluster size in \p *clusterSize. + * + * The cluster dimensions in \p config are ignored. If \p func has a required + * cluster size set (see ::cudaFuncGetAttributes), \p *clusterSize will reflect + * the required cluster size. + * + * By default this function will always return a value that's portable on + * future hardware. A higher value may be returned if the kernel function + * allows non-portable cluster sizes. + * + * This function will respect the compile-time launch bounds.
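+ *
+ * A minimal sketch of querying the maximum cluster size (illustrative only;
+ * \c myKernel is a placeholder, the grid/block shape is an arbitrary
+ * assumption, and only the \c gridDim, \c blockDim and \c dynamicSmemBytes
+ * fields of the launch configuration are filled in here):
+ * \code
+ __global__ void myKernel(float *data) { /* ... */ }
+
+ void queryMaxClusterSize(void)
+ {
+     cudaLaunchConfig_t config = {};
+     config.gridDim          = dim3(1024, 1, 1);
+     config.blockDim         = dim3(128, 1, 1);
+     config.dynamicSmemBytes = 0;
+     int clusterSize = 0;
+     cudaOccupancyMaxPotentialClusterSize(&clusterSize, (const void*)myKernel, &config);
+     (void)clusterSize;
+ }
+ * \endcode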
+ * + * \param clusterSize - Returned maximum cluster size that can be launched + * for the given kernel function and launch configuration + * \param func - Kernel function for which maximum cluster + * size is calculated + * \param config - Launch configuration for the given kernel function + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaFuncGetAttributes + * \ref ::cudaOccupancyMaxPotentialClusterSize(int*, T, const cudaLaunchConfig_t*) "cudaOccupancyMaxPotentialClusterSize (C++ API)", + * ::cuOccupancyMaxPotentialClusterSize + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxPotentialClusterSize(int *clusterSize, const void *func, const cudaLaunchConfig_t *launchConfig); + + +/** + * \brief Given the kernel function (\p func) and launch configuration + * (\p config), return the maximum number of clusters that could co-exist + * on the target device in \p *numClusters. + * + * If the function has required cluster size already set (see + * ::cudaFuncGetAttributes), the cluster size from config must either be + * unspecified or match the required size. + * Without required sizes, the cluster size must be specified in config, + * else the function will return an error. + * + * Note that various attributes of the kernel function may affect occupancy + * calculation. Runtime environment may affect how the hardware schedules + * the clusters, so the calculated occupancy is not guaranteed to be achievable. + * + * \param numClusters - Returned maximum number of clusters that + * could co-exist on the target device + * \param func - Kernel function for which maximum number + * of clusters are calculated + * \param config - Launch configuration for the given kernel function + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDeviceFunction, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidClusterSize, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaFuncGetAttributes + * \ref ::cudaOccupancyMaxActiveClusters(int*, T, const cudaLaunchConfig_t*) "cudaOccupancyMaxActiveClusters (C++ API)", + * ::cuOccupancyMaxActiveClusters + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaOccupancyMaxActiveClusters(int *numClusters, const void *func, const cudaLaunchConfig_t *launchConfig); +/** @} */ /* END CUDA_OCCUPANCY */ + +/** + * \defgroup CUDART_MEMORY Memory Management + * + * ___MANBRIEF___ memory management functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the memory management functions of the CUDA runtime + * application programming interface. + * + * Some functions have overloaded C++ API template versions documented separately in the + * \ref CUDART_HIGHLEVEL "C++ API Routines" module. + * + * @{ + */ + +/** + * \brief Allocates memory that will be automatically managed by the Unified Memory system + * + * Allocates \p size bytes of managed memory on the device and returns in + * \p *devPtr a pointer to the allocated memory. If the device doesn't support + * allocating managed memory, ::cudaErrorNotSupported is returned. Support + * for managed memory can be queried using the device attribute + * ::cudaDevAttrManagedMemory. The allocated memory is suitably + * aligned for any kind of variable. The memory is not cleared. If \p size + * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. 
The pointer + * is valid on the CPU and on all GPUs in the system that support managed memory. + * All accesses to this pointer must obey the Unified Memory programming model. + * + * \p flags specifies the default stream association for this allocation. + * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The + * default value for \p flags is ::cudaMemAttachGlobal. + * If ::cudaMemAttachGlobal is specified, then this memory is accessible from + * any stream on any device. If ::cudaMemAttachHost is specified, then the + * allocation should not be accessed from devices that have a zero value for the + * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to + * ::cudaStreamAttachMemAsync will be required to enable access on such devices. + * + * If the association is later changed via ::cudaStreamAttachMemAsync to + * a single stream, the default association, as specified during ::cudaMallocManaged, + * is restored when that stream is destroyed. For __managed__ variables, the + * default association is always ::cudaMemAttachGlobal. Note that destroying a + * stream is an asynchronous operation, and as a result, the change to default + * association won't happen until all work in the stream has completed. + * + * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree. + * + * Device memory oversubscription is possible for GPUs that have a non-zero value for the + * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on + * such GPUs may be evicted from device memory to host memory at any time by the Unified + * Memory driver in order to make room for other allocations. + * + * In a multi-GPU system where all GPUs have a non-zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this + * API returns and instead may be populated on access. In such systems, managed memory can + * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to + * maintain data locality and prevent excessive page faults to the extent possible. The application + * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application + * can also explicitly migrate memory to a desired processor's memory via + * ::cudaMemPrefetchAsync. + * + * In a multi-GPU system where all of the GPUs have a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support + * with each other, the physical storage for managed memory is created on the GPU which is active + * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced + * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate + * memory among such GPUs. + * + * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and + * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess + * is zero for at least one of those GPUs, the location chosen for physical storage of managed + * memory is system-dependent. + * - On Linux, the location chosen will be device memory as long as the current set of active + * contexts are on devices that either have peer-to-peer support with each other or have a + * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If there is an active context on a GPU that does not have a non-zero value for that device + * attribute and it does not have peer-to-peer support with the other devices that have active + * contexts on them, then the location for physical storage will be 'zero-copy' or host memory. + * Note that this means that managed memory that is located in device memory is migrated to + * host memory if a new context is created on a GPU that doesn't have a non-zero value for + * the device attribute and does not support peer-to-peer with at least one of the other devices + * that has an active context. This in turn implies that context creation may fail if there is + * insufficient host memory to migrate all managed allocations. + * - On Windows, the physical storage is always created in 'zero-copy' or host memory. + * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these + * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to + * restrict CUDA to only use those GPUs that have peer-to-peer support. + * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero + * value to force the driver to always use device memory for physical storage. + * When this environment variable is set to a non-zero value, all devices used in + * that process that support managed memory have to be peer-to-peer compatible + * with each other. The error ::cudaErrorInvalidDevice will be returned if a device + * that supports managed memory is used and it is not peer-to-peer compatible with + * any of the other managed memory supporting devices that were previously used in + * that process, even if ::cudaDeviceReset has been called on those devices. These + * environment variables are described in the CUDA programming guide under the + * "CUDA environment variables" section. + * + * \param devPtr - Pointer to allocated device memory + * \param size - Requested allocation size in bytes + * \param flags - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal) + * + * \return + * ::cudaSuccess, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, + * ::cudaMalloc3D, ::cudaMalloc3DArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync, + * ::cuMemAllocManaged + */ +#if defined(__cplusplus) +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags = cudaMemAttachGlobal); +#else +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMallocManaged(void **devPtr, size_t size, unsigned int flags); +#endif + +/** + * \brief Allocate memory on the device + * + * Allocates \p size bytes of linear memory on the device and returns in + * \p *devPtr a pointer to the allocated memory. The allocated memory is + * suitably aligned for any kind of variable. The memory is not cleared. + * ::cudaMalloc() returns ::cudaErrorMemoryAllocation in case of failure. + * + * The device version of ::cudaFree cannot be used with a \p *devPtr + * allocated using the host API, and vice versa. 
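+ *
+ * A minimal sketch of the usual allocate/copy/free pattern (illustrative only;
+ * the element count and the use of \c float are placeholder assumptions):
+ * \code
+ void copyToDevice(const float *hostData, size_t n)
+ {
+     float *devPtr = NULL;
+     if (cudaMalloc((void**)&devPtr, n * sizeof(float)) != cudaSuccess)
+         return;                                    // allocation failed
+     cudaMemcpy(devPtr, hostData, n * sizeof(float), cudaMemcpyHostToDevice);
+     /* ... launch kernels that read devPtr ... */
+     cudaFree(devPtr);
+ }
+ * \endcode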
+ * + * \param devPtr - Pointer to allocated device memory + * \param size - Requested allocation size in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, + * ::cudaMalloc3D, ::cudaMalloc3DArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, + * ::cuMemAlloc + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMalloc(void **devPtr, size_t size); + +/** + * \brief Allocates page-locked memory on the host + * + * Allocates \p size bytes of host memory that is page-locked and accessible + * to the device. The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cudaMemcpy*(). Since the memory can be accessed directly by the device, + * it can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * memory with ::cudaMallocHost() may degrade system performance, since it + * reduces the amount of memory available to the system for paging. As a + * result, this function is best used sparingly to allocate staging areas for + * data exchange between host and device. + * + * \param ptr - Pointer to allocated host memory + * \param size - Requested allocation size in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocArray, ::cudaMalloc3D, + * ::cudaMalloc3DArray, ::cudaHostAlloc, ::cudaFree, ::cudaFreeArray, + * \ref ::cudaMallocHost(void**, size_t, unsigned int) "cudaMallocHost (C++ API)", + * ::cudaFreeHost, ::cudaHostAlloc, + * ::cuMemAllocHost + */ +extern __host__ cudaError_t CUDARTAPI cudaMallocHost(void **ptr, size_t size); + +/** + * \brief Allocates pitched memory on the device + * + * Allocates at least \p width (in bytes) * \p height bytes of linear memory + * on the device and returns in \p *devPtr a pointer to the allocated memory. + * The function may pad the allocation to ensure that corresponding pointers + * in any given row will continue to meet the alignment requirements for + * coalescing as the address is updated from row to row. The pitch returned in + * \p *pitch by ::cudaMallocPitch() is the width in bytes of the allocation. + * The intended usage of \p pitch is as a separate parameter of the allocation, + * used to compute addresses within the 2D array. Given the row and column of + * an array element of type \p T, the address is computed as: + * \code + T* pElement = (T*)((char*)BaseAddress + Row * pitch) + Column; + \endcode + * + * For allocations of 2D arrays, it is recommended that programmers consider + * performing pitch allocations using ::cudaMallocPitch(). Due to pitch + * alignment restrictions in the hardware, this is especially true if the + * application will be performing 2D memory copies between different regions + * of device memory (whether linear memory or CUDA arrays). 
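+ *
+ * A minimal sketch of a pitched 2D allocation (illustrative only; the use of
+ * \c float elements and ::cudaMemset2D for initialization are placeholder
+ * assumptions):
+ * \code
+ void allocatePitched2D(size_t widthInElems, size_t height)
+ {
+     float  *devPtr = NULL;
+     size_t  pitch  = 0;       // row stride in bytes, returned by the runtime
+     cudaMallocPitch((void**)&devPtr, &pitch, widthInElems * sizeof(float), height);
+     // Row r starts at (char*)devPtr + r * pitch; pitch may exceed widthInElems * sizeof(float).
+     cudaMemset2D(devPtr, pitch, 0, widthInElems * sizeof(float), height);
+     cudaFree(devPtr);
+ }
+ * \endcode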
+ * + * \param devPtr - Pointer to allocated pitched device memory + * \param pitch - Pitch for allocation + * \param width - Requested pitched allocation width (in bytes) + * \param height - Requested pitched allocation height + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMalloc, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray, + * ::cudaHostAlloc, + * ::cuMemAllocPitch + */ +extern __host__ cudaError_t CUDARTAPI cudaMallocPitch(void **devPtr, size_t *pitch, size_t width, size_t height); + +/** + * \brief Allocate an array on the device + * + * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure + * \p desc and returns a handle to the new CUDA array in \p *array. + * + * The ::cudaChannelFormatDesc is defined as: + * \code + struct cudaChannelFormatDesc { + int x, y, z, w; + enum cudaChannelFormatKind f; + }; + \endcode + * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, + * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. + * + * The \p flags parameter enables different options to be specified that affect + * the allocation, as follows. + * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation + * - ::cudaArraySurfaceLoadStore: Allocates an array that can be read from or written to using a surface reference + * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the array. + * - ::cudaArraySparse: Allocates a CUDA array without physical backing memory. The subregions within this sparse array + * can later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. + * The physical backing memory must be allocated via ::cuMemCreate. + * - ::cudaArrayDeferredMapping: Allocates a CUDA array without physical backing memory. The entire array can + * later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. + * The physical backing memory must be allocated via ::cuMemCreate. + * + * \p width and \p height must meet certain size requirements. See ::cudaMalloc3DArray() for more details. + * + * \param array - Pointer to allocated array in device memory + * \param desc - Requested channel format + * \param width - Requested array allocation width + * \param height - Requested array allocation height + * \param flags - Requested properties of allocated array + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaFreeArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray, + * ::cudaHostAlloc, + * ::cuArrayCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaMallocArray(cudaArray_t *array, const struct cudaChannelFormatDesc *desc, size_t width, size_t height __dv(0), unsigned int flags __dv(0)); + +/** + * \brief Frees memory on the device + * + * Frees the memory space pointed to by \p devPtr, which must have been + * returned by a previous call to one of the following memory allocation APIs - + * ::cudaMalloc(), ::cudaMallocPitch(), ::cudaMallocManaged(), ::cudaMallocAsync(), + * ::cudaMallocFromPoolAsync(). 
+ * + * Note - This API will not perform any implicit synchronization when the pointer was + * allocated with ::cudaMallocAsync or ::cudaMallocFromPoolAsync. Callers must ensure + * that all accesses to the pointer have completed before invoking ::cudaFree. For + * best performance and memory reuse, users should use ::cudaFreeAsync to free memory + * allocated via the stream ordered memory allocator. + * + * If ::cudaFree(\p devPtr) has already been called before, + * an error is returned. If \p devPtr is 0, no operation is performed. + * ::cudaFree() returns ::cudaErrorInvalidValue in case of failure. + * + * The device version of ::cudaFree cannot be used with a \p *devPtr + * allocated using the host API, and vice versa. + * + * \param devPtr - Device pointer to memory to free + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaMallocManaged, ::cudaMallocArray, ::cudaFreeArray, ::cudaMallocAsync, ::cudaMallocFromPoolAsync, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaFreeAsync, + * ::cudaHostAlloc, + * ::cuMemFree + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaFree(void *devPtr); + +/** + * \brief Frees page-locked memory + * + * Frees the memory space pointed to by \p ptr, which must have been + * returned by a previous call to ::cudaMallocHost() or ::cudaHostAlloc(). + * + * \param ptr - Pointer to memory to free + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, + * ::cudaFreeArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaHostAlloc, + * ::cuMemFreeHost + */ +extern __host__ cudaError_t CUDARTAPI cudaFreeHost(void *ptr); + +/** + * \brief Frees an array on the device + * + * Frees the CUDA array \p array, which must have been returned by a + * previous call to ::cudaMallocArray(). If \p array is 0, + * no operation is performed. + * + * \param array - Pointer to array to free + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, + * ::cuArrayDestroy + */ +extern __host__ cudaError_t CUDARTAPI cudaFreeArray(cudaArray_t array); + +/** + * \brief Frees a mipmapped array on the device + * + * Frees the CUDA mipmapped array \p mipmappedArray, which must have been + * returned by a previous call to ::cudaMallocMipmappedArray(). If \p mipmappedArray + * is 0, no operation is performed. + * + * \param mipmappedArray - Pointer to mipmapped array to free + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, + * ::cuMipmappedArrayDestroy + */ +extern __host__ cudaError_t CUDARTAPI cudaFreeMipmappedArray(cudaMipmappedArray_t mipmappedArray); + + +/** + * \brief Allocates page-locked memory on the host + * + * Allocates \p size bytes of host memory that is page-locked and accessible + * to the device.
The driver tracks the virtual memory ranges allocated with + * this function and automatically accelerates calls to functions such as + * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it + * can be read or written with much higher bandwidth than pageable memory + * obtained with functions such as ::malloc(). Allocating excessive amounts of + * pinned memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to allocate staging areas for data exchange between host + * and device. + * + * The \p flags parameter enables different options to be specified that affect + * the allocation, as follows. + * - ::cudaHostAllocDefault: This flag's value is defined to be 0 and causes + * ::cudaHostAlloc() to emulate ::cudaMallocHost(). + * - ::cudaHostAllocPortable: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space. + * The device pointer to the memory may be obtained by calling + * ::cudaHostGetDevicePointer(). + * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC). + * WC memory can be transferred across the PCI Express bus more quickly on some + * system configurations, but cannot be read efficiently by most CPUs. WC + * memory is a good option for buffers that will be written by the CPU and read + * by the device via mapped pinned memory or host->device transfers. + * + * All of these flags are orthogonal to one another: a developer may allocate + * memory that is portable, mapped and/or write-combined with no restrictions. + * + * In order for the ::cudaHostAllocMapped flag to have any effect, the CUDA context + * must support the ::cudaDeviceMapHost flag, which can be checked via + * ::cudaGetDeviceFlags(). The ::cudaDeviceMapHost flag is implicitly set for + * contexts created via the runtime API. + * + * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices + * that do not support mapped pinned memory. The failure is deferred to + * ::cudaHostGetDevicePointer() because the memory may be mapped into other + * CUDA contexts via the ::cudaHostAllocPortable flag. + * + * Memory allocated by this function must be freed with ::cudaFreeHost(). + * + * \param pHost - Device pointer to allocated memory + * \param size - Requested allocation size in bytes + * \param flags - Requested properties of allocated memory + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaSetDeviceFlags, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, + * ::cudaGetDeviceFlags, + * ::cuMemHostAlloc + */ +extern __host__ cudaError_t CUDARTAPI cudaHostAlloc(void **pHost, size_t size, unsigned int flags); + +/** + * \brief Registers an existing host memory range for use by CUDA + * + * Page-locks the memory range specified by \p ptr and \p size and maps it + * for the device(s) as specified by \p flags. This memory range also is added + * to the same tracking mechanism as ::cudaHostAlloc() to automatically accelerate + * calls to functions such as ::cudaMemcpy(). 
Since the memory can be accessed + * directly by the device, it can be read or written with much higher bandwidth + * than pageable memory that has not been registered. Page-locking excessive + * amounts of memory may degrade system performance, since it reduces the amount + * of memory available to the system for paging. As a result, this function is + * best used sparingly to register staging areas for data exchange between + * host and device. + * + * ::cudaHostRegister is supported only on I/O coherent devices that have a non-zero + * value for the device attribute ::cudaDevAttrHostRegisterSupported. + * + * The \p flags parameter enables different options to be specified that + * affect the allocation, as follows. + * + * - ::cudaHostRegisterDefault: On a system with unified virtual addressing, + * the memory will be both mapped and portable. On a system with no unified + * virtual addressing, the memory will be neither mapped nor portable. + * + * - ::cudaHostRegisterPortable: The memory returned by this call will be + * considered as pinned memory by all CUDA contexts, not just the one that + * performed the allocation. + * + * - ::cudaHostRegisterMapped: Maps the allocation into the CUDA address + * space. The device pointer to the memory may be obtained by calling + * ::cudaHostGetDevicePointer(). + * + * - ::cudaHostRegisterIoMemory: The passed memory pointer is treated as + * pointing to some memory-mapped I/O space, e.g. belonging to a + * third-party PCIe device, and it will be marked as non-cache-coherent and + * contiguous. + * + * - ::cudaHostRegisterReadOnly: The passed memory pointer is treated as + * pointing to memory that is considered read-only by the device. On + * platforms without ::cudaDevAttrPageableMemoryAccessUsesHostPageTables, this + * flag is required in order to register memory mapped to the CPU as + * read-only. Support for the use of this flag can be queried from the device + * attribute cudaDevAttrReadOnlyHostRegisterSupported. Using this flag with + * a current context associated with a device that does not have this attribute + * set will cause ::cudaHostRegister to error with cudaErrorNotSupported. + * + * All of these flags are orthogonal to one another: a developer may page-lock + * memory that is portable or mapped with no restrictions. + * + * The CUDA context must have been created with the ::cudaMapHost flag in + * order for the ::cudaHostRegisterMapped flag to have any effect. + * + * The ::cudaHostRegisterMapped flag may be specified on CUDA contexts for + * devices that do not support mapped pinned memory. The failure is deferred + * to ::cudaHostGetDevicePointer() because the memory may be mapped into + * other CUDA contexts via the ::cudaHostRegisterPortable flag. + * + * For devices that have a non-zero value for the device attribute + * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory + * can also be accessed from the device using the host pointer \p ptr. + * The device pointer returned by ::cudaHostGetDevicePointer() may or may not + * match the original host pointer \p ptr and depends on the devices visible to the + * application. If all devices visible to the application have a non-zero value for the + * device attribute, the device pointer returned by ::cudaHostGetDevicePointer() + * will match the original pointer \p ptr.
If any device visible to the application + * has a zero value for the device attribute, the device pointer returned by + * ::cudaHostGetDevicePointer() will not match the original host pointer \p ptr, + * but it will be suitable for use on all devices provided Unified Virtual Addressing + * is enabled. In such systems, it is valid to access the memory using either pointer + * on devices that have a non-zero value for the device attribute. Note however that + * such devices should access the memory using only one of the two pointers and not both. + * + * The memory page-locked by this function must be unregistered with ::cudaHostUnregister(). + * + * \param ptr - Host pointer to memory to page-lock + * \param size - Size in bytes of the address range to page-lock + * \param flags - Flags for allocation request + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation, + * ::cudaErrorHostMemoryAlreadyRegistered, + * ::cudaErrorNotSupported + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaHostUnregister, ::cudaHostGetFlags, ::cudaHostGetDevicePointer, + * ::cuMemHostRegister + */ +extern __host__ cudaError_t CUDARTAPI cudaHostRegister(void *ptr, size_t size, unsigned int flags); + +/** + * \brief Unregisters a memory range that was registered with cudaHostRegister + * + * Unmaps the memory range whose base address is specified by \p ptr, and makes + * it pageable again. + * + * The base address must be the same one specified to ::cudaHostRegister(). + * + * \param ptr - Host pointer to memory to unregister + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorHostMemoryNotRegistered + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaHostUnregister, + * ::cuMemHostUnregister + */ +extern __host__ cudaError_t CUDARTAPI cudaHostUnregister(void *ptr); + +/** + * \brief Passes back device pointer of mapped host memory allocated by + * cudaHostAlloc or registered by cudaHostRegister + * + * Passes back the device pointer corresponding to the mapped, pinned host + * buffer allocated by ::cudaHostAlloc() or registered by ::cudaHostRegister(). + * + * ::cudaHostGetDevicePointer() will fail if the ::cudaDeviceMapHost flag was + * not specified before deferred context creation occurred, or if called on a + * device that does not support mapped, pinned memory. + * + * For devices that have a non-zero value for the device attribute + * ::cudaDevAttrCanUseHostPointerForRegisteredMem, the memory + * can also be accessed from the device using the host pointer \p pHost. + * The device pointer returned by ::cudaHostGetDevicePointer() may or may not + * match the original host pointer \p pHost and depends on the devices visible to the + * application. If all devices visible to the application have a non-zero value for the + * device attribute, the device pointer returned by ::cudaHostGetDevicePointer() + * will match the original pointer \p pHost. If any device visible to the application + * has a zero value for the device attribute, the device pointer returned by + * ::cudaHostGetDevicePointer() will not match the original host pointer \p pHost, + * but it will be suitable for use on all devices provided Unified Virtual Addressing + * is enabled. In such systems, it is valid to access the memory using either pointer + * on devices that have a non-zero value for the device attribute. Note however that + * such devices should access the memory using only one of the two pointers and not both.
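+ *
+ * A minimal sketch pairing ::cudaHostAlloc and ::cudaHostGetDevicePointer
+ * (illustrative only; it assumes the device supports mapped pinned memory,
+ * and the buffer size is a placeholder):
+ * \code
+ void mapHostBuffer(size_t bytes)
+ {
+     void *hostPtr = NULL, *devPtr = NULL;
+     cudaHostAlloc(&hostPtr, bytes, cudaHostAllocMapped);   // mapped, page-locked host memory
+     cudaHostGetDevicePointer(&devPtr, hostPtr, 0);         // flags must currently be 0
+     /* ... pass devPtr to kernels while the CPU keeps using hostPtr ... */
+     cudaFreeHost(hostPtr);
+ }
+ * \endcode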
+ * + * \p flags provides for future releases. For now, it must be set to 0. + * + * \param pDevice - Returned device pointer for mapped memory + * \param pHost - Requested host pointer mapping + * \param flags - Flags for extensions (must be 0 for now) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaSetDeviceFlags, ::cudaHostAlloc, + * ::cuMemHostGetDevicePointer + */ +extern __host__ cudaError_t CUDARTAPI cudaHostGetDevicePointer(void **pDevice, void *pHost, unsigned int flags); + +/** + * \brief Passes back flags used to allocate pinned host memory allocated by + * cudaHostAlloc + * + * ::cudaHostGetFlags() will fail if the input pointer does not + * reside in an address range allocated by ::cudaHostAlloc(). + * + * \param pFlags - Returned flags word + * \param pHost - Host pointer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaHostAlloc, + * ::cuMemHostGetFlags + */ +extern __host__ cudaError_t CUDARTAPI cudaHostGetFlags(unsigned int *pFlags, void *pHost); + +/** + * \brief Allocates logical 1D, 2D, or 3D memory objects on the device + * + * Allocates at least \p width * \p height * \p depth bytes of linear memory + * on the device and returns a ::cudaPitchedPtr in which \p ptr is a pointer + * to the allocated memory. The function may pad the allocation to ensure + * hardware alignment requirements are met. The pitch returned in the \p pitch + * field of \p pitchedDevPtr is the width in bytes of the allocation. + * + * The returned ::cudaPitchedPtr contains additional fields \p xsize and + * \p ysize, the logical width and height of the allocation, which are + * equivalent to the \p width and \p height \p extent parameters provided by + * the programmer during allocation. + * + * For allocations of 2D and 3D objects, it is highly recommended that + * programmers perform allocations using ::cudaMalloc3D() or + * ::cudaMallocPitch(). Due to alignment restrictions in the hardware, this is + * especially true if the application will be performing memory copies + * involving 2D or 3D objects (whether linear memory or CUDA arrays). + * + * \param pitchedDevPtr - Pointer to allocated pitched device memory + * \param extent - Requested allocation size (\p width field in bytes) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMemcpy3D, ::cudaMemset3D, + * ::cudaMalloc3DArray, ::cudaMallocArray, ::cudaFreeArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, ::make_cudaPitchedPtr, ::make_cudaExtent, + * ::cuMemAllocPitch + */ +extern __host__ cudaError_t CUDARTAPI cudaMalloc3D(struct cudaPitchedPtr* pitchedDevPtr, struct cudaExtent extent); + +/** + * \brief Allocate an array on the device + * + * Allocates a CUDA array according to the ::cudaChannelFormatDesc structure + * \p desc and returns a handle to the new CUDA array in \p *array. + * + * The ::cudaChannelFormatDesc is defined as: + * \code + struct cudaChannelFormatDesc { + int x, y, z, w; + enum cudaChannelFormatKind f; + }; + \endcode + * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, + * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. 
+ * + * ::cudaMalloc3DArray() can allocate the following: + * + * - A 1D array is allocated if the height and depth extents are both zero. + * - A 2D array is allocated if only the depth extent is zero. + * - A 3D array is allocated if all three extents are non-zero. + * - A 1D layered CUDA array is allocated if only the height extent is zero and + * the cudaArrayLayered flag is set. Each layer is a 1D array. The number of layers is + * determined by the depth extent. + * - A 2D layered CUDA array is allocated if all three extents are non-zero and + * the cudaArrayLayered flag is set. Each layer is a 2D array. The number of layers is + * determined by the depth extent. + * - A cubemap CUDA array is allocated if all three extents are non-zero and the + * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six. A cubemap is + * a special type of 2D layered CUDA array, where the six layers represent the six faces of a cube. + * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace. + * - A cubemap layered CUDA array is allocated if all three extents are non-zero, and both, + * cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be + * a multiple of six. A cubemap layered CUDA array is a special type of 2D layered CUDA array that consists + * of a collection of cubemaps. The first six layers represent the first cubemap, the next six layers form + * the second cubemap, and so on. + * + * + * The \p flags parameter enables different options to be specified that affect + * the allocation, as follows. + * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default array allocation + * - ::cudaArrayLayered: Allocates a layered CUDA array, with the depth extent indicating the number of layers + * - ::cudaArrayCubemap: Allocates a cubemap CUDA array. Width must be equal to height, and depth must be six. + * If the cudaArrayLayered flag is also set, depth must be a multiple of six. + * - ::cudaArraySurfaceLoadStore: Allocates a CUDA array that could be read from or written to using a surface + * reference. + * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA + * array. Texture gather can only be performed on 2D CUDA arrays. + * - ::cudaArraySparse: Allocates a CUDA array without physical backing memory. The subregions within this sparse array + * can later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. This flag can only be used for + * creating 2D, 3D or 2D layered sparse CUDA arrays. The physical backing memory must be allocated via ::cuMemCreate. + * - ::cudaArrayDeferredMapping: Allocates a CUDA array without physical backing memory. The entire array can + * later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. The physical backing memory must be allocated + * via ::cuMemCreate. + * + * The width, height and depth extents must meet certain size requirements as listed in the following table. + * All values are specified in elements. + * + * Note that 2D CUDA arrays have different size requirements if the ::cudaArrayTextureGather flag is set. In that + * case, the valid range for (width, height, depth) is ((1,maxTexture2DGather[0]), (1,maxTexture2DGather[1]), 0). 
+ * + * \xmlonly + * <table outputclass="xmlonly"> + * <tgroup cols="3" colsep="1" rowsep="1"> + * <colspec colname="c1" colwidth="1.0*"/> + * <colspec colname="c2" colwidth="3.0*"/> + * <colspec colname="c3" colwidth="3.0*"/> + * <thead> + * <row> + * <entry>CUDA array type</entry> + * <entry>Valid extents that must always be met {(width range in elements), + * (height range), (depth range)}</entry> + * <entry>Valid extents with cudaArraySurfaceLoadStore set {(width range in + * elements), (height range), (depth range)}</entry> + * </row> + * </thead> + * <tbody> + * <row> + * <entry>1D</entry> + * <entry>{ (1,maxTexture1D), 0, 0 }</entry> + * <entry>{ (1,maxSurface1D), 0, 0 }</entry> + * </row> + * <row> + * <entry>2D</entry> + * <entry>{ (1,maxTexture2D[0]), (1,maxTexture2D[1]), 0 }</entry> + * <entry>{ (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 }</entry> + * </row> + * <row> + * <entry>3D</entry> + * <entry>{ (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) } + * OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]), + * (1,maxTexture3DAlt[2]) }</entry> + * <entry>{ (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) }</entry> + * </row> + * <row> + * <entry>1D Layered</entry> + * <entry>{ (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) }</entry> + * <entry>{ (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) }</entry> + * </row> + * <row> + * <entry>2D Layered</entry> + * <entry>{ (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]), + * (1,maxTexture2DLayered[2]) }</entry> + * <entry>{ (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]), + * (1,maxSurface2DLayered[2]) }</entry> + * </row> + * <row> + * <entry>Cubemap</entry> + * <entry>{ (1,maxTextureCubemap), (1,maxTextureCubemap), 6 }</entry> + * <entry>{ (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 }</entry> + * </row> + * <row> + * <entry>Cubemap Layered</entry> + * <entry>{ (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]), + * (1,maxTextureCubemapLayered[1]) }</entry> + * <entry>{ (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]), + * (1,maxSurfaceCubemapLayered[1]) }</entry> + * </row> + * </tbody> + * </tgroup> + * </table> + * \endxmlonly + * + * \param array - Pointer to allocated array in device memory + * \param desc - Requested channel format + * \param extent - Requested allocation size (\p width field in elements) + * \param flags - Flags for extensions + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, + * ::cudaFreeArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, + * ::make_cudaExtent, + * ::cuArray3DCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaMalloc3DArray(cudaArray_t *array, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int flags __dv(0)); + +/** + * \brief Allocate a mipmapped array on the device + * + * Allocates a CUDA mipmapped array according to the ::cudaChannelFormatDesc structure + * \p desc and returns a handle to the new CUDA mipmapped array in \p *mipmappedArray. + * \p numLevels specifies the number of mipmap levels to be allocated. This value is + * clamped to the range [1, 1 + floor(log2(max(width, height, depth)))]. 
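+ *
+ * For illustration only (the 256 x 256 size and variable names are assumed), a full
+ * mip chain for a 2D float4 image would use 1 + floor(log2(256)) = 9 levels:
+ * \code
+ * struct cudaChannelFormatDesc fmt =
+ *     cudaCreateChannelDesc(32, 32, 32, 32, cudaChannelFormatKindFloat); // float4 texels
+ * cudaMipmappedArray_t mipChain = NULL;
+ * struct cudaExtent ext = make_cudaExtent(256, 256, 0);  // 2D: depth extent is zero
+ * cudaError_t err = cudaMallocMipmappedArray(&mipChain, &fmt, ext, 9, cudaArrayDefault);
+ * if (err == cudaSuccess) {
+ *     // ... create texture objects per level, upload data, etc. ...
+ *     cudaFreeMipmappedArray(mipChain);
+ * }
+ * \endcode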
+ * + * The ::cudaChannelFormatDesc is defined as: + * \code + struct cudaChannelFormatDesc { + int x, y, z, w; + enum cudaChannelFormatKind f; + }; + \endcode + * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, + * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. + * + * ::cudaMallocMipmappedArray() can allocate the following: + * + * - A 1D mipmapped array is allocated if the height and depth extents are both zero. + * - A 2D mipmapped array is allocated if only the depth extent is zero. + * - A 3D mipmapped array is allocated if all three extents are non-zero. + * - A 1D layered CUDA mipmapped array is allocated if only the height extent is zero and + * the cudaArrayLayered flag is set. Each layer is a 1D mipmapped array. The number of layers is + * determined by the depth extent. + * - A 2D layered CUDA mipmapped array is allocated if all three extents are non-zero and + * the cudaArrayLayered flag is set. Each layer is a 2D mipmapped array. The number of layers is + * determined by the depth extent. + * - A cubemap CUDA mipmapped array is allocated if all three extents are non-zero and the + * cudaArrayCubemap flag is set. Width must be equal to height, and depth must be six. + * The order of the six layers in memory is the same as that listed in ::cudaGraphicsCubeFace. + * - A cubemap layered CUDA mipmapped array is allocated if all three extents are non-zero, and both, + * cudaArrayCubemap and cudaArrayLayered flags are set. Width must be equal to height, and depth must be + * a multiple of six. A cubemap layered CUDA mipmapped array is a special type of 2D layered CUDA mipmapped + * array that consists of a collection of cubemap mipmapped arrays. The first six layers represent the + * first cubemap mipmapped array, the next six layers form the second cubemap mipmapped array, and so on. + * + * + * The \p flags parameter enables different options to be specified that affect + * the allocation, as follows. + * - ::cudaArrayDefault: This flag's value is defined to be 0 and provides default mipmapped array allocation + * - ::cudaArrayLayered: Allocates a layered CUDA mipmapped array, with the depth extent indicating the number of layers + * - ::cudaArrayCubemap: Allocates a cubemap CUDA mipmapped array. Width must be equal to height, and depth must be six. + * If the cudaArrayLayered flag is also set, depth must be a multiple of six. + * - ::cudaArraySurfaceLoadStore: This flag indicates that individual mipmap levels of the CUDA mipmapped array + * will be read from or written to using a surface reference. + * - ::cudaArrayTextureGather: This flag indicates that texture gather operations will be performed on the CUDA + * array. Texture gather can only be performed on 2D CUDA mipmapped arrays, and the gather operations are + * performed only on the most detailed mipmap level. + * - ::cudaArraySparse: Allocates a CUDA mipmapped array without physical backing memory. The subregions within this sparse array + * can later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. This flag can only be used for creating + * 2D, 3D or 2D layered sparse CUDA mipmapped arrays. The physical backing memory must be allocated via ::cuMemCreate. + * - ::cudaArrayDeferredMapping: Allocates a CUDA mipmapped array without physical backing memory. The entire array can + * later be mapped onto a physical memory allocation by calling ::cuMemMapArrayAsync. The physical backing memory must be allocated + * via ::cuMemCreate. 
+ * + * The width, height and depth extents must meet certain size requirements as listed in the following table. + * All values are specified in elements. + * + * \xmlonly + * <table outputclass="xmlonly"> + * <tgroup cols="3" colsep="1" rowsep="1"> + * <colspec colname="c1" colwidth="1.0*"/> + * <colspec colname="c2" colwidth="3.0*"/> + * <colspec colname="c3" colwidth="3.0*"/> + * <thead> + * <row> + * <entry>CUDA array type</entry> + * <entry>Valid extents that must always be met {(width range in elements), + * (height range), (depth range)}</entry> + * <entry>Valid extents with cudaArraySurfaceLoadStore set {(width range in + * elements), (height range), (depth range)}</entry> + * </row> + * </thead> + * <tbody> + * <row> + * <entry>1D</entry> + * <entry>{ (1,maxTexture1DMipmap), 0, 0 }</entry> + * <entry>{ (1,maxSurface1D), 0, 0 }</entry> + * </row> + * <row> + * <entry>2D</entry> + * <entry>{ (1,maxTexture2DMipmap[0]), (1,maxTexture2DMipmap[1]), 0 }</entry> + * <entry>{ (1,maxSurface2D[0]), (1,maxSurface2D[1]), 0 }</entry> + * </row> + * <row> + * <entry>3D</entry> + * <entry>{ (1,maxTexture3D[0]), (1,maxTexture3D[1]), (1,maxTexture3D[2]) } + * OR { (1,maxTexture3DAlt[0]), (1,maxTexture3DAlt[1]), + * (1,maxTexture3DAlt[2]) }</entry> + * <entry>{ (1,maxSurface3D[0]), (1,maxSurface3D[1]), (1,maxSurface3D[2]) }</entry> + * </row> + * <row> + * <entry>1D Layered</entry> + * <entry>{ (1,maxTexture1DLayered[0]), 0, (1,maxTexture1DLayered[1]) }</entry> + * <entry>{ (1,maxSurface1DLayered[0]), 0, (1,maxSurface1DLayered[1]) }</entry> + * </row> + * <row> + * <entry>2D Layered</entry> + * <entry>{ (1,maxTexture2DLayered[0]), (1,maxTexture2DLayered[1]), + * (1,maxTexture2DLayered[2]) }</entry> + * <entry>{ (1,maxSurface2DLayered[0]), (1,maxSurface2DLayered[1]), + * (1,maxSurface2DLayered[2]) }</entry> + * </row> + * <row> + * <entry>Cubemap</entry> + * <entry>{ (1,maxTextureCubemap), (1,maxTextureCubemap), 6 }</entry> + * <entry>{ (1,maxSurfaceCubemap), (1,maxSurfaceCubemap), 6 }</entry> + * </row> + * <row> + * <entry>Cubemap Layered</entry> + * <entry>{ (1,maxTextureCubemapLayered[0]), (1,maxTextureCubemapLayered[0]), + * (1,maxTextureCubemapLayered[1]) }</entry> + * <entry>{ (1,maxSurfaceCubemapLayered[0]), (1,maxSurfaceCubemapLayered[0]), + * (1,maxSurfaceCubemapLayered[1]) }</entry> + * </row> + * </tbody> + * </tgroup> + * </table> + * \endxmlonly + * + * \param mipmappedArray - Pointer to allocated mipmapped array in device memory + * \param desc - Requested channel format + * \param extent - Requested allocation size (\p width field in elements) + * \param numLevels - Number of mipmap levels to allocate + * \param flags - Flags for extensions + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, + * ::cudaFreeArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, + * ::make_cudaExtent, + * ::cuMipmappedArrayCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaMallocMipmappedArray(cudaMipmappedArray_t *mipmappedArray, const struct cudaChannelFormatDesc* desc, struct cudaExtent extent, unsigned int numLevels, unsigned int flags __dv(0)); + +/** + * \brief Gets a mipmap level of a CUDA mipmapped array + * + * Returns in \p *levelArray a CUDA array that represents a single mipmap level + * of the CUDA mipmapped array \p mipmappedArray. 
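+ *
+ * A brief sketch (\p mipChain is a hypothetical, previously allocated mipmapped array):
+ * \code
+ * cudaArray_t level0 = NULL;
+ * cudaError_t err = cudaGetMipmappedArrayLevel(&level0, mipChain, 0); // most detailed level
+ * // level0 is owned by mipChain and is released when the mipmapped array is freed
+ * \endcode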
+ * + * If \p level is greater than the maximum number of levels in this mipmapped array, + * ::cudaErrorInvalidValue is returned. + * + * If \p mipmappedArray is NULL, + * ::cudaErrorInvalidResourceHandle is returned. + * + * \param levelArray - Returned mipmap level CUDA array + * \param mipmappedArray - CUDA mipmapped array + * \param level - Mipmap level + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMalloc3D, ::cudaMalloc, ::cudaMallocPitch, ::cudaFree, + * ::cudaFreeArray, + * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)", + * ::cudaFreeHost, ::cudaHostAlloc, + * ::make_cudaExtent, + * ::cuMipmappedArrayGetLevel + */ +extern __host__ cudaError_t CUDARTAPI cudaGetMipmappedArrayLevel(cudaArray_t *levelArray, cudaMipmappedArray_const_t mipmappedArray, unsigned int level); + +/** + * \brief Copies data between 3D objects + * +\code +struct cudaExtent { + size_t width; + size_t height; + size_t depth; +}; +struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d); + +struct cudaPos { + size_t x; + size_t y; + size_t z; +}; +struct cudaPos make_cudaPos(size_t x, size_t y, size_t z); + +struct cudaMemcpy3DParms { + cudaArray_t srcArray; + struct cudaPos srcPos; + struct cudaPitchedPtr srcPtr; + cudaArray_t dstArray; + struct cudaPos dstPos; + struct cudaPitchedPtr dstPtr; + struct cudaExtent extent; + enum cudaMemcpyKind kind; +}; +\endcode + * + * ::cudaMemcpy3D() copies data betwen two 3D objects. The source and + * destination objects may be in either host memory, device memory, or a CUDA + * array. The source, destination, extent, and kind of copy performed is + * specified by the ::cudaMemcpy3DParms struct which should be initialized to + * zero before use: +\code +cudaMemcpy3DParms myParms = {0}; +\endcode + * + * The struct passed to ::cudaMemcpy3D() must specify one of \p srcArray or + * \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one + * non-zero source or destination will cause ::cudaMemcpy3D() to return an + * error. + * + * The \p srcPos and \p dstPos fields are optional offsets into the source and + * destination objects and are defined in units of each object's elements. The + * element for a host or device pointer is assumed to be <b>unsigned char</b>. + * + * The \p extent field defines the dimensions of the transferred area in + * elements. If a CUDA array is participating in the copy, the extent is + * defined in terms of that array's elements. If no CUDA array is + * participating in the copy then the extents are defined in elements of + * <b>unsigned char</b>. + * + * The \p kind field defines the direction of the copy. It must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * For ::cudaMemcpyHostToHost or ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost + * passed as kind and cudaArray type passed as source or destination, if the kind + * implies cudaArray type to be present on the host, ::cudaMemcpy3D() will + * disregard that implication and silently correct the kind based on the fact that + * cudaArray type can only be present on the device. 
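+ *
+ * A minimal host-to-device sketch (the 64 x 64 x 64 float volume and the
+ * \p hostVol / \p devVol names are assumptions made for the example):
+ * \code
+ * size_t widthBytes = 64 * sizeof(float);
+ * float *hostVol = (float*)malloc(widthBytes * 64 * 64);
+ * struct cudaPitchedPtr devVol;
+ * struct cudaExtent ext = make_cudaExtent(widthBytes, 64, 64); // width in bytes for linear memory
+ * cudaMalloc3D(&devVol, ext);
+ *
+ * struct cudaMemcpy3DParms p = {0};                 // zero-initialize as recommended above
+ * p.srcPtr = make_cudaPitchedPtr(hostVol, widthBytes, 64, 64); // pitch in bytes, width/height in elements
+ * p.dstPtr = devVol;
+ * p.extent = ext;
+ * p.kind   = cudaMemcpyHostToDevice;
+ * cudaError_t err = cudaMemcpy3D(&p);
+ * \endcode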
+ * + * If the source and destination are both arrays, ::cudaMemcpy3D() will return + * an error if they do not have the same element size. + * + * The source and destination object may not overlap. If overlapping source + * and destination objects are specified, undefined behavior will result. + * + * The source object must entirely contain the region defined by \p srcPos + * and \p extent. The destination object must entirely contain the region + * defined by \p dstPos and \p extent. + * + * ::cudaMemcpy3D() returns an error if the pitch of \p srcPtr or \p dstPtr + * exceeds the maximum allowed. The pitch of a ::cudaPitchedPtr allocated + * with ::cudaMalloc3D() will always be valid. + * + * \param p - 3D memory copy parameters + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_sync + * \note_init_rt + * \note_callback + * + * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3DAsync, + * ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::make_cudaExtent, ::make_cudaPos, + * ::cuMemcpy3D + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p); + +/** + * \brief Copies memory between devices + * + * Perform a 3D memory copy according to the parameters specified in + * \p p. See the definition of the ::cudaMemcpy3DPeerParms structure + * for documentation of its parameters. + * + * Note that this function is synchronous with respect to the host only if + * the source or destination of the transfer is host memory. Note also + * that this copy is serialized with respect to all pending and future + * asynchronous work in to the current device, the copy's source device, + * and the copy's destination device (use ::cudaMemcpy3DPeerAsync to avoid + * this synchronization). + * + * \param p - Parameters for the memory copy + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_sync + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync, + * ::cudaMemcpy3DPeerAsync, + * ::cuMemcpy3DPeer + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p); + +/** + * \brief Copies data between 3D objects + * +\code +struct cudaExtent { + size_t width; + size_t height; + size_t depth; +}; +struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d); + +struct cudaPos { + size_t x; + size_t y; + size_t z; +}; +struct cudaPos make_cudaPos(size_t x, size_t y, size_t z); + +struct cudaMemcpy3DParms { + cudaArray_t srcArray; + struct cudaPos srcPos; + struct cudaPitchedPtr srcPtr; + cudaArray_t dstArray; + struct cudaPos dstPos; + struct cudaPitchedPtr dstPtr; + struct cudaExtent extent; + enum cudaMemcpyKind kind; +}; +\endcode + * + * ::cudaMemcpy3DAsync() copies data betwen two 3D objects. The source and + * destination objects may be in either host memory, device memory, or a CUDA + * array. 
The source, destination, extent, and kind of copy performed is + * specified by the ::cudaMemcpy3DParms struct which should be initialized to + * zero before use: +\code +cudaMemcpy3DParms myParms = {0}; +\endcode + * + * The struct passed to ::cudaMemcpy3DAsync() must specify one of \p srcArray + * or \p srcPtr and one of \p dstArray or \p dstPtr. Passing more than one + * non-zero source or destination will cause ::cudaMemcpy3DAsync() to return an + * error. + * + * The \p srcPos and \p dstPos fields are optional offsets into the source and + * destination objects and are defined in units of each object's elements. The + * element for a host or device pointer is assumed to be <b>unsigned char</b>. + * For CUDA arrays, positions must be in the range [0, 2048) for any + * dimension. + * + * The \p extent field defines the dimensions of the transferred area in + * elements. If a CUDA array is participating in the copy, the extent is + * defined in terms of that array's elements. If no CUDA array is + * participating in the copy then the extents are defined in elements of + * <b>unsigned char</b>. + * + * The \p kind field defines the direction of the copy. It must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * For ::cudaMemcpyHostToHost or ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost + * passed as kind and cudaArray type passed as source or destination, if the kind + * implies cudaArray type to be present on the host, ::cudaMemcpy3DAsync() will + * disregard that implication and silently correct the kind based on the fact that + * cudaArray type can only be present on the device. + * + * If the source and destination are both arrays, ::cudaMemcpy3DAsync() will + * return an error if they do not have the same element size. + * + * The source and destination object may not overlap. If overlapping source + * and destination objects are specified, undefined behavior will result. + * + * The source object must lie entirely within the region defined by \p srcPos + * and \p extent. The destination object must lie entirely within the region + * defined by \p dstPos and \p extent. + * + * ::cudaMemcpy3DAsync() returns an error if the pitch of \p srcPtr or + * \p dstPtr exceeds the maximum allowed. The pitch of a + * ::cudaPitchedPtr allocated with ::cudaMalloc3D() will always be valid. + * + * ::cudaMemcpy3DAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If + * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream + * is non-zero, the copy may overlap with operations in other streams. + * + * The device version of this function only handles device to device copies and + * cannot be given local or shared pointers. 
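+ *
+ * A short sketch of the asynchronous form (\p p is assumed to be filled in as in
+ * the synchronous example above, ideally with pinned host memory):
+ * \code
+ * cudaStream_t stream;
+ * cudaStreamCreate(&stream);
+ * cudaError_t err = cudaMemcpy3DAsync(&p, stream); // may return before the copy completes
+ * cudaStreamSynchronize(stream);                   // wait only when the result is needed
+ * cudaStreamDestroy(stream);
+ * \endcode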
+ * + * \param p - 3D memory copy parameters + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \sa ::cudaMalloc3D, ::cudaMalloc3DArray, ::cudaMemset3D, ::cudaMemcpy3D, + * ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, :::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::make_cudaExtent, ::make_cudaPos, + * ::cuMemcpy3DAsync + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)); + +/** + * \brief Copies memory between devices asynchronously. + * + * Perform a 3D memory copy according to the parameters specified in + * \p p. See the definition of the ::cudaMemcpy3DPeerParms structure + * for documentation of its parameters. + * + * \param p - Parameters for the memory copy + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync, + * ::cudaMemcpy3DPeerAsync, + * ::cuMemcpy3DPeerAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)); + +/** + * \brief Gets free and total device memory + * + * Returns in \p *total the total amount of memory available to the the current context. + * Returns in \p *free the amount of memory on the device that is free according to the OS. + * CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free. + * In a multi-tenet situation, free estimate returned is prone to race condition where + * a new allocation/free done by a different process or a different thread in the same + * process between the time when free memory was estimated and reported, will result in + * deviation in free value reported and actual free memory. + * + * The integrated GPU on Tegra shares memory with CPU and other component + * of the SoC. The free and total values returned by the API excludes + * the SWAP memory space maintained by the OS on some platforms. + * The OS may move some of the memory pages into swap area as the GPU or + * CPU allocate or access memory. See Tegra app note on how to calculate + * total and free memory on Tegra. + * + * \param free - Returned free memory in bytes + * \param total - Returned total memory in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorLaunchFailure + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cuMemGetInfo + */ +extern __host__ cudaError_t CUDARTAPI cudaMemGetInfo(size_t *free, size_t *total); + +/** + * \brief Gets info about the specified cudaArray + * + * Returns in \p *desc, \p *extent and \p *flags respectively, the type, shape + * and flags of \p array. + * + * Any of \p *desc, \p *extent and \p *flags may be specified as NULL. + * + * \param desc - Returned array type + * \param extent - Returned array shape. 
2D arrays will have depth of zero + * \param flags - Returned array flags + * \param array - The ::cudaArray to get info for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cuArrayGetDescriptor, + * ::cuArray3DGetDescriptor + */ +extern __host__ cudaError_t CUDARTAPI cudaArrayGetInfo(struct cudaChannelFormatDesc *desc, struct cudaExtent *extent, unsigned int *flags, cudaArray_t array); + +/** + * \brief Gets a CUDA array plane from a CUDA array + * + * Returns in \p pPlaneArray a CUDA array that represents a single format plane + * of the CUDA array \p hArray. + * + * If \p planeIdx is greater than the maximum number of planes in this array or if the array does + * not have a multi-planar format e.g: ::cudaChannelFormatKindNV12, then ::cudaErrorInvalidValue is returned. + * + * Note that if the \p hArray has format ::cudaChannelFormatKindNV12, then passing in 0 for \p planeIdx returns + * a CUDA array of the same size as \p hArray but with one 8-bit channel and ::cudaChannelFormatKindUnsigned as its format kind. + * If 1 is passed for \p planeIdx, then the returned CUDA array has half the height and width + * of \p hArray with two 8-bit channels and ::cudaChannelFormatKindUnsigned as its format kind. + * + * \param pPlaneArray - Returned CUDA array referenced by the \p planeIdx + * \param hArray - CUDA array + * \param planeIdx - Plane index + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * + * \sa + * ::cuArrayGetPlane + */ +extern __host__ cudaError_t CUDARTAPI cudaArrayGetPlane(cudaArray_t *pPlaneArray, cudaArray_t hArray, unsigned int planeIdx); + +/** + * \brief Returns the memory requirements of a CUDA array + * + * Returns the memory requirements of a CUDA array in \p memoryRequirements + * If the CUDA array is not allocated with flag ::cudaArrayDeferredMapping + * ::cudaErrorInvalidValue will be returned. + * + * The returned value in ::cudaArrayMemoryRequirements::size + * represents the total size of the CUDA array. + * The returned value in ::cudaArrayMemoryRequirements::alignment + * represents the alignment necessary for mapping the CUDA array. + * + * \return + * ::cudaSuccess + * ::cudaErrorInvalidValue + * + * \param[out] memoryRequirements - Pointer to ::cudaArrayMemoryRequirements + * \param[in] array - CUDA array to get the memory requirements of + * \param[in] device - Device to get the memory requirements for + * \sa ::cudaMipmappedArrayGetMemoryRequirements + */ +extern __host__ cudaError_t CUDARTAPI cudaArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements *memoryRequirements, cudaArray_t array, int device); + +/** + * \brief Returns the memory requirements of a CUDA mipmapped array + * + * Returns the memory requirements of a CUDA mipmapped array in \p memoryRequirements + * If the CUDA mipmapped array is not allocated with flag ::cudaArrayDeferredMapping + * ::cudaErrorInvalidValue will be returned. + * + * The returned value in ::cudaArrayMemoryRequirements::size + * represents the total size of the CUDA mipmapped array. + * The returned value in ::cudaArrayMemoryRequirements::alignment + * represents the alignment necessary for mapping the CUDA mipmapped + * array. 
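+ *
+ * Illustrative sketch only (\p mipChain is assumed to be a mipmapped array that was
+ * allocated with ::cudaArrayDeferredMapping, and device 0 is assumed):
+ * \code
+ * struct cudaArrayMemoryRequirements req;
+ * cudaError_t err = cudaMipmappedArrayGetMemoryRequirements(&req, mipChain, 0);
+ * if (err == cudaSuccess) {
+ *     // req.size is the total backing-store size, req.alignment the required mapping alignment
+ * }
+ * \endcode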
+ * + * \return + * ::cudaSuccess + * ::cudaErrorInvalidValue + * + * \param[out] memoryRequirements - Pointer to ::cudaArrayMemoryRequirements + * \param[in] mipmap - CUDA mipmapped array to get the memory requirements of + * \param[in] device - Device to get the memory requirements for + * \sa ::cudaArrayGetMemoryRequirements + */ +extern __host__ cudaError_t CUDARTAPI cudaMipmappedArrayGetMemoryRequirements(struct cudaArrayMemoryRequirements *memoryRequirements, cudaMipmappedArray_t mipmap, int device); + +/** + * \brief Returns the layout properties of a sparse CUDA array + * + * Returns the layout properties of a sparse CUDA array in \p sparseProperties. + * If the CUDA array is not allocated with flag ::cudaArraySparse + * ::cudaErrorInvalidValue will be returned. + * + * If the returned value in ::cudaArraySparseProperties::flags contains ::cudaArraySparsePropertiesSingleMipTail, + * then ::cudaArraySparseProperties::miptailSize represents the total size of the array. Otherwise, it will be zero. + * Also, the returned value in ::cudaArraySparseProperties::miptailFirstLevel is always zero. + * Note that the \p array must have been allocated using ::cudaMallocArray or ::cudaMalloc3DArray. For CUDA arrays obtained + * using ::cudaMipmappedArrayGetLevel, ::cudaErrorInvalidValue will be returned. Instead, ::cudaMipmappedArrayGetSparseProperties + * must be used to obtain the sparse properties of the entire CUDA mipmapped array to which \p array belongs to. + * + * \return + * ::cudaSuccess + * ::cudaErrorInvalidValue + * + * \param[out] sparseProperties - Pointer to return the ::cudaArraySparseProperties + * \param[in] array - The CUDA array to get the sparse properties of + * + * \sa + * ::cudaMipmappedArrayGetSparseProperties, + * ::cuMemMapArrayAsync + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaArrayGetSparseProperties(struct cudaArraySparseProperties *sparseProperties, cudaArray_t array); +#endif + +/** + * \brief Returns the layout properties of a sparse CUDA mipmapped array + * + * Returns the sparse array layout properties in \p sparseProperties. + * If the CUDA mipmapped array is not allocated with flag ::cudaArraySparse + * ::cudaErrorInvalidValue will be returned. + * + * For non-layered CUDA mipmapped arrays, ::cudaArraySparseProperties::miptailSize returns the + * size of the mip tail region. The mip tail region includes all mip levels whose width, height or depth + * is less than that of the tile. + * For layered CUDA mipmapped arrays, if ::cudaArraySparseProperties::flags contains ::cudaArraySparsePropertiesSingleMipTail, + * then ::cudaArraySparseProperties::miptailSize specifies the size of the mip tail of all layers combined. + * Otherwise, ::cudaArraySparseProperties::miptailSize specifies mip tail size per layer. + * The returned value of ::cudaArraySparseProperties::miptailFirstLevel is valid only if ::cudaArraySparseProperties::miptailSize is non-zero. 
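+ *
+ * A possible query sketch (\p sparseMips is assumed to be a mipmapped array that was
+ * allocated with the ::cudaArraySparse flag):
+ * \code
+ * struct cudaArraySparseProperties props;
+ * cudaError_t err = cudaMipmappedArrayGetSparseProperties(&props, sparseMips);
+ * if (err == cudaSuccess && (props.flags & cudaArraySparsePropertiesSingleMipTail)) {
+ *     // props.miptailSize covers the mip tail of all layers combined
+ * }
+ * \endcode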
+ * + * \return + * ::cudaSuccess + * ::cudaErrorInvalidValue + * + * \param[out] sparseProperties - Pointer to return ::cudaArraySparseProperties + * \param[in] mipmap - The CUDA mipmapped array to get the sparse properties of + * + * \sa + * ::cudaArrayGetSparseProperties, + * ::cuMemMapArrayAsync + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaMipmappedArrayGetSparseProperties(struct cudaArraySparseProperties *sparseProperties, cudaMipmappedArray_t mipmap); +#endif + +/** + * \brief Copies data between host and device + * + * Copies \p count bytes from the memory area pointed to by \p src to the + * memory area pointed to by \p dst, where \p kind specifies the direction + * of the copy, and must be one of ::cudaMemcpyHostToHost, + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. Calling + * ::cudaMemcpy() with dst and src pointers that do not match the direction of + * the copy results in an undefined behavior. + * + * \param dst - Destination memory address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_init_rt + * \note_callback + * + * \note_sync + * \note_memcpy + * + * \sa ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpyDtoH, + * ::cuMemcpyHtoD, + * ::cuMemcpyDtoD, + * ::cuMemcpy + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); + +/** + * \brief Copies memory between two devices + * + * Copies memory from one device to memory on another device. \p dst is the + * base device pointer of the destination memory and \p dstDevice is the + * destination device. \p src is the base device pointer of the source memory + * and \p srcDevice is the source device. \p count specifies the number of bytes + * to copy. + * + * Note that this function is asynchronous with respect to the host, but + * serialized with respect all pending and future asynchronous work in to the + * current device, \p srcDevice, and \p dstDevice (use ::cudaMemcpyPeerAsync + * to avoid this synchronization). 
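+ *
+ * A minimal sketch copying a hypothetical 1 MiB buffer from device 0 to device 1:
+ * \code
+ * void *src = NULL, *dst = NULL;
+ * size_t bytes = 1 << 20;
+ * cudaSetDevice(0); cudaMalloc(&src, bytes);
+ * cudaSetDevice(1); cudaMalloc(&dst, bytes);
+ * cudaError_t err = cudaMemcpyPeer(dst, 1, src, 0, bytes); // device 0 -> device 1
+ * \endcode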
+ * + * \param dst - Destination device pointer + * \param dstDevice - Destination device + * \param src - Source device pointer + * \param srcDevice - Source device + * \param count - Size of memory copy in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_sync + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpyAsync, ::cudaMemcpyPeerAsync, + * ::cudaMemcpy3DPeerAsync, + * ::cuMemcpyPeer + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeer(void *dst, int dstDevice, const void *src, int srcDevice, size_t count); + +/** + * \brief Copies data between host and device + * + * Copies a matrix (\p height rows of \p width bytes each) from the memory + * area pointed to by \p src to the memory area pointed to by \p dst, where + * \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. \p dpitch and + * \p spitch are the widths in memory in bytes of the 2D arrays pointed to by + * \p dst and \p src, including any padding added to the end of each row. The + * memory areas may not overlap. \p width must not exceed either \p dpitch or + * \p spitch. Calling ::cudaMemcpy2D() with \p dst and \p src pointers that do + * not match the direction of the copy results in an undefined behavior. + * ::cudaMemcpy2D() returns an error if \p dpitch or \p spitch exceeds + * the maximum allowed. + * + * \param dst - Destination memory address + * \param dpitch - Pitch of destination memory + * \param src - Source memory address + * \param spitch - Pitch of source memory + * \param width - Width of matrix transfer (columns in bytes) + * \param height - Height of matrix transfer (rows) + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_init_rt + * \note_callback + * \note_memcpy + * + * \sa ::cudaMemcpy, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy2D, + * ::cuMemcpy2DUnaligned + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); + +/** + * \brief Copies data between host and device + * + * Copies a matrix (\p height rows of \p width bytes each) from the memory + * area pointed to by \p src to the CUDA array \p dst starting at + * \p hOffset rows and \p wOffset bytes from the upper left corner, + * where \p kind specifies the direction of the copy, and must be one + * of ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. 
+ * \p spitch is the width in memory in bytes of the 2D array pointed to by + * \p src, including any padding added to the end of each row. \p wOffset + + * \p width must not exceed the width of the CUDA array \p dst. \p width must + * not exceed \p spitch. ::cudaMemcpy2DToArray() returns an error if \p spitch + * exceeds the maximum allowed. + * + * \param dst - Destination memory address + * \param wOffset - Destination starting X offset (columns in bytes) + * \param hOffset - Destination starting Y offset (rows) + * \param src - Source memory address + * \param spitch - Pitch of source memory + * \param width - Width of matrix transfer (columns in bytes) + * \param height - Height of matrix transfer (rows) + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_sync + * \note_init_rt + * \note_callback + * \note_memcpy + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy2D, + * ::cuMemcpy2DUnaligned + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); + +/** + * \brief Copies data between host and device + * + * Copies a matrix (\p height rows of \p width bytes each) from the CUDA + * array \p src starting at \p hOffset rows and \p wOffset bytes from the + * upper left corner to the memory area pointed to by \p dst, where + * \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. \p dpitch is the + * width in memory in bytes of the 2D array pointed to by \p dst, including any + * padding added to the end of each row. \p wOffset + \p width must not exceed + * the width of the CUDA array \p src. \p width must not exceed \p dpitch. + * ::cudaMemcpy2DFromArray() returns an error if \p dpitch exceeds the maximum + * allowed. 
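+ *
+ * A brief sketch (\p srcArray is assumed to be a 64 x 64 float CUDA array; the host
+ * buffer uses tightly packed rows, so \p dpitch equals the row width in bytes):
+ * \code
+ * size_t widthBytes = 64 * sizeof(float), height = 64;
+ * float *hostImg = (float*)malloc(widthBytes * height);
+ * cudaError_t err = cudaMemcpy2DFromArray(hostImg, widthBytes,
+ *                                         srcArray, 0, 0,      // from the upper-left corner
+ *                                         widthBytes, height,
+ *                                         cudaMemcpyDeviceToHost);
+ * \endcode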
+ * + * \param dst - Destination memory address + * \param dpitch - Pitch of destination memory + * \param src - Source memory address + * \param wOffset - Source starting X offset (columns in bytes) + * \param hOffset - Source starting Y offset (rows) + * \param width - Width of matrix transfer (columns in bytes) + * \param height - Height of matrix transfer (rows) + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_sync + * \note_init_rt + * \note_callback + * \note_memcpy + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy2D, + * ::cuMemcpy2DUnaligned + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind); + +/** + * \brief Copies data between host and device + * + * Copies a matrix (\p height rows of \p width bytes each) from the CUDA + * array \p src starting at \p hOffsetSrc rows and \p wOffsetSrc bytes from the + * upper left corner to the CUDA array \p dst starting at \p hOffsetDst rows + * and \p wOffsetDst bytes from the upper left corner, where \p kind + * specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * \p wOffsetDst + \p width must not exceed the width of the CUDA array \p dst. + * \p wOffsetSrc + \p width must not exceed the width of the CUDA array \p src. 
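+ *
+ * Illustrative only (\p dstArray and \p srcArray are assumed to be compatible
+ * 64 x 64 float CUDA arrays):
+ * \code
+ * cudaError_t err = cudaMemcpy2DArrayToArray(dstArray, 0, 0,
+ *                                            srcArray, 0, 0,
+ *                                            64 * sizeof(float), 64, // width in bytes, height in rows
+ *                                            cudaMemcpyDeviceToDevice);
+ * \endcode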
+ * + * \param dst - Destination memory address + * \param wOffsetDst - Destination starting X offset (columns in bytes) + * \param hOffsetDst - Destination starting Y offset (rows) + * \param src - Source memory address + * \param wOffsetSrc - Source starting X offset (columns in bytes) + * \param hOffsetSrc - Source starting Y offset (rows) + * \param width - Width of matrix transfer (columns in bytes) + * \param height - Height of matrix transfer (rows) + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_sync + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy2D, + * ::cuMemcpy2DUnaligned + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); + +/** + * \brief Copies data to the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p src + * to the memory area pointed to by \p offset bytes from the start of symbol + * \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of + * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault + * is only allowed on systems that support unified virtual addressing. + * + * \param symbol - Device symbol address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy, + * ::cuMemcpyHtoD, + * ::cuMemcpyDtoD + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)); + +/** + * \brief Copies data from the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p offset bytes + * from the start of symbol \p symbol to the memory area pointed to by \p dst. + * The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. 
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of + * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault + * is only allowed on systems that support unified virtual addressing. + * + * \param dst - Destination memory address + * \param symbol - Device symbol address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_sync + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy, + * ::cuMemcpyDtoH, + * ::cuMemcpyDtoD + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)); + + +/** + * \brief Copies data between host and device + * + * Copies \p count bytes from the memory area pointed to by \p src to the + * memory area pointed to by \p dst, where \p kind specifies the + * direction of the copy, and must be one of ::cudaMemcpyHostToHost, + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * The memory areas may not overlap. Calling ::cudaMemcpyAsync() with \p dst and + * \p src pointers that do not match the direction of the copy results in an + * undefined behavior. + * + * ::cudaMemcpyAsync() is asynchronous with respect to the host, so the call + * may return before the copy is complete. The copy can optionally be + * associated to a stream by passing a non-zero \p stream argument. If \p kind + * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and the \p stream is + * non-zero, the copy may overlap with operations in other streams. + * + * The device version of this function only handles device to device copies and + * cannot be given local or shared pointers. 
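+ *
+ * A minimal sketch using pinned host memory and a non-default stream (buffer size
+ * and names are assumptions for the example):
+ * \code
+ * float *hostBuf = NULL, *devBuf = NULL;
+ * size_t bytes = 1 << 20;
+ * cudaMallocHost((void**)&hostBuf, bytes); // pinned, so the copy can overlap other work
+ * cudaMalloc((void**)&devBuf, bytes);
+ * cudaStream_t stream;
+ * cudaStreamCreate(&stream);
+ * cudaError_t err = cudaMemcpyAsync(devBuf, hostBuf, bytes, cudaMemcpyHostToDevice, stream);
+ * cudaStreamSynchronize(stream);           // wait before reusing hostBuf or reading devBuf
+ * \endcode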
+ * + * \param dst - Destination memory address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * \note_memcpy + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpyAsync, + * ::cuMemcpyDtoHAsync, + * ::cuMemcpyHtoDAsync, + * ::cuMemcpyDtoDAsync + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + +/** + * \brief Copies memory between two devices asynchronously. + * + * Copies memory from one device to memory on another device. \p dst is the + * base device pointer of the destination memory and \p dstDevice is the + * destination device. \p src is the base device pointer of the source memory + * and \p srcDevice is the source device. \p count specifies the number of bytes + * to copy. + * + * Note that this function is asynchronous with respect to the host and all work + * on other devices. + * + * \param dst - Destination device pointer + * \param dstDevice - Destination device + * \param src - Source device pointer + * \param srcDevice - Source device + * \param count - Size of memory copy in bytes + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, + * ::cudaMemcpy3DPeerAsync, + * ::cuMemcpyPeerAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyPeerAsync(void *dst, int dstDevice, const void *src, int srcDevice, size_t count, cudaStream_t stream __dv(0)); + +/** + * \brief Copies data between host and device + * + * Copies a matrix (\p height rows of \p width bytes each) from the memory + * area pointed to by \p src to the memory area pointed to by \p dst, where + * \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * \p dpitch and \p spitch are the widths in memory in bytes of the 2D arrays + * pointed to by \p dst and \p src, including any padding added to the end of + * each row. The memory areas may not overlap. \p width must not exceed either + * \p dpitch or \p spitch. + * + * Calling ::cudaMemcpy2DAsync() with \p dst and \p src pointers that do not + * match the direction of the copy results in an undefined behavior. + * ::cudaMemcpy2DAsync() returns an error if \p dpitch or \p spitch is greater + * than the maximum allowed. + * + * ::cudaMemcpy2DAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. 
The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If + * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and + * \p stream is non-zero, the copy may overlap with operations in other + * streams. + * + * The device version of this function only handles device to device copies and + * cannot be given local or shared pointers. + * + * \param dst - Destination memory address + * \param dpitch - Pitch of destination memory + * \param src - Source memory address + * \param spitch - Pitch of source memory + * \param width - Width of matrix transfer (columns in bytes) + * \param height - Height of matrix transfer (rows) + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * \note_memcpy + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy2DAsync + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + +/** + * \brief Copies data between host and device + * + * Copies a matrix (\p height rows of \p width bytes each) from the memory + * area pointed to by \p src to the CUDA array \p dst starting at \p hOffset + * rows and \p wOffset bytes from the upper left corner, where \p kind specifies + * the direction of the copy, and must be one of ::cudaMemcpyHostToHost, + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * \p spitch is the width in memory in bytes of the 2D array pointed to by + * \p src, including any padding added to the end of each row. \p wOffset + + * \p width must not exceed the width of the CUDA array \p dst. \p width must + * not exceed \p spitch. ::cudaMemcpy2DToArrayAsync() returns an error if + * \p spitch exceeds the maximum allowed. + * + * ::cudaMemcpy2DToArrayAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If + * \p kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and + * \p stream is non-zero, the copy may overlap with operations in other + * streams. 
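+ *
+ * A short sketch (\p dstArray, \p hostImg, \p widthBytes, \p height and \p stream
+ * are assumed to exist, with \p hostImg in pinned memory and tightly packed rows):
+ * \code
+ * cudaError_t err = cudaMemcpy2DToArrayAsync(dstArray, 0, 0,
+ *                                            hostImg, widthBytes,
+ *                                            widthBytes, height,
+ *                                            cudaMemcpyHostToDevice, stream);
+ * \endcode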
+ * + * \param dst - Destination memory address + * \param wOffset - Destination starting X offset (columns in bytes) + * \param hOffset - Destination starting Y offset (rows) + * \param src - Source memory address + * \param spitch - Pitch of source memory + * \param width - Width of matrix transfer (columns in bytes) + * \param height - Height of matrix transfer (rows) + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * \note_memcpy + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy2DAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + +/** + * \brief Copies data between host and device + * + * Copies a matrix (\p height rows of \p width bytes each) from the CUDA + * array \p src starting at \p hOffset rows and \p wOffset bytes from the + * upper left corner to the memory area pointed to by \p dst, + * where \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * \p dpitch is the width in memory in bytes of the 2D + * array pointed to by \p dst, including any padding added to the end of each + * row. \p wOffset + \p width must not exceed the width of the CUDA array + * \p src. \p width must not exceed \p dpitch. ::cudaMemcpy2DFromArrayAsync() + * returns an error if \p dpitch exceeds the maximum allowed. + * + * ::cudaMemcpy2DFromArrayAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally be + * associated to a stream by passing a non-zero \p stream argument. If \p kind + * is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream is + * non-zero, the copy may overlap with operations in other streams. 
+ * + * \param dst - Destination memory address + * \param dpitch - Pitch of destination memory + * \param src - Source memory address + * \param wOffset - Source starting X offset (columns in bytes) + * \param hOffset - Source starting Y offset (rows) + * \param width - Width of matrix transfer (columns in bytes) + * \param height - Height of matrix transfer (rows) + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidPitchValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * \note_memcpy + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpy2DAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + +/** + * \brief Copies data to the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p src + * to the memory area pointed to by \p offset bytes from the start of symbol + * \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer + * is inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If + * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy + * may overlap with operations in other streams. 
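+ *
+ * A minimal sketch (the \p coeffs symbol and \p stream are assumptions for the example):
+ * \code
+ * __constant__ float coeffs[16];       // device symbol in constant memory
+ *
+ * float hostCoeffs[16] = { 1.0f };     // remaining elements zero-initialized
+ * cudaError_t err = cudaMemcpyToSymbolAsync(coeffs, hostCoeffs, sizeof(hostCoeffs),
+ *                                           0, cudaMemcpyHostToDevice, stream);
+ * \endcode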
+ * + * \param symbol - Device symbol address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_async + * \note_null_stream + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpyAsync, + * ::cuMemcpyHtoDAsync, + * ::cuMemcpyDtoDAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + +/** + * \brief Copies data from the given symbol on the device + * + * Copies \p count bytes from the memory area pointed to by \p offset bytes + * from the start of symbol \p symbol to the memory area pointed to by \p dst. + * The memory areas may not overlap. \p symbol is a variable that resides in + * global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer + * is inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally be + * associated to a stream by passing a non-zero \p stream argument. If \p kind + * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap + * with operations in other streams. + * + * \param dst - Destination memory address + * \param symbol - Device symbol address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorInvalidMemcpyDirection, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_async + * \note_null_stream + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, + * ::cuMemcpyAsync, + * ::cuMemcpyDtoHAsync, + * ::cuMemcpyDtoDAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + + +/** + * \brief Initializes or sets device memory to a value + * + * Fills the first \p count bytes of the memory area pointed to by \p devPtr + * with the constant byte value \p value. 
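As a rough usage sketch for the two symbol copies declared above (assuming an nvcc-compiled translation unit; the __constant__ array, its size, and the helper name are illustrative, and the host buffers should be pinned for real overlap):

    #include <cuda_runtime.h>

    __constant__ float d_coeffs[16];            // device symbol in constant memory

    // Push new coefficients to the device and read them back, both enqueued on
    // `stream`; neither call blocks the host.
    cudaError_t updateCoeffsAsync(const float *h_in, float *h_out,
                                  cudaStream_t stream)
    {
        cudaError_t err = cudaMemcpyToSymbolAsync(d_coeffs, h_in,
                                                  16 * sizeof(float), /*offset*/ 0,
                                                  cudaMemcpyHostToDevice, stream);
        if (err != cudaSuccess) return err;

        return cudaMemcpyFromSymbolAsync(h_out, d_coeffs,
                                         16 * sizeof(float), /*offset*/ 0,
                                         cudaMemcpyDeviceToHost, stream);
    }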
+ * + * Note that this function is asynchronous with respect to the host unless + * \p devPtr refers to pinned host memory. + * + * \param devPtr - Pointer to device memory + * \param value - Value to set for each byte of specified memory + * \param count - Size in bytes to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \notefnerr + * \note_memset + * \note_init_rt + * \note_callback + * + * \sa + * ::cuMemsetD8, + * ::cuMemsetD16, + * ::cuMemsetD32 + */ +extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count); + +/** + * \brief Initializes or sets device memory to a value + * + * Sets to the specified value \p value a matrix (\p height rows of \p width + * bytes each) pointed to by \p dstPtr. \p pitch is the width in bytes of the + * 2D array pointed to by \p dstPtr, including any padding added to the end + * of each row. This function performs fastest when the pitch is one that has + * been passed back by ::cudaMallocPitch(). + * + * Note that this function is asynchronous with respect to the host unless + * \p devPtr refers to pinned host memory. + * + * \param devPtr - Pointer to 2D device memory + * \param pitch - Pitch in bytes of 2D device memory(Unused if \p height is 1) + * \param value - Value to set for each byte of specified memory + * \param width - Width of matrix set (columns in bytes) + * \param height - Height of matrix set (rows) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \notefnerr + * \note_memset + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemset, ::cudaMemset3D, ::cudaMemsetAsync, + * ::cudaMemset2DAsync, ::cudaMemset3DAsync, + * ::cuMemsetD2D8, + * ::cuMemsetD2D16, + * ::cuMemsetD2D32 + */ +extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height); + +/** + * \brief Initializes or sets device memory to a value + * + * Initializes each element of a 3D array to the specified value \p value. + * The object to initialize is defined by \p pitchedDevPtr. The \p pitch field + * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed + * to by \p pitchedDevPtr, including any padding added to the end of each row. + * The \p xsize field specifies the logical width of each row in bytes, while + * the \p ysize field specifies the height of each 2D slice in rows. + * The \p pitch field of \p pitchedDevPtr is ignored when \p height and \p depth + * are both equal to 1. + * + * The extents of the initialized region are specified as a \p width in bytes, + * a \p height in rows, and a \p depth in slices. + * + * Extents with \p width greater than or equal to the \p xsize of + * \p pitchedDevPtr may perform significantly faster than extents narrower + * than the \p xsize. Secondarily, extents with \p height equal to the + * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is + * shorter than the \p ysize. + * + * This function performs fastest when the \p pitchedDevPtr has been allocated + * by ::cudaMalloc3D(). + * + * Note that this function is asynchronous with respect to the host unless + * \p pitchedDevPtr refers to pinned host memory. 
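A small sketch of the two synchronous memsets documented above (buffer sizes are illustrative; per-call error checking is reduced to a final ::cudaGetLastError):

    #include <cuda_runtime.h>

    cudaError_t clearBuffers(void)
    {
        // Linear allocation: count is in bytes.
        void *linear = nullptr;
        cudaMalloc(&linear, 1 << 20);
        cudaMemset(linear, 0, 1 << 20);

        // Pitched allocation: width is in bytes, height in rows, and the pitch
        // returned by cudaMallocPitch is exactly what cudaMemset2D expects.
        void  *pitched = nullptr;
        size_t pitch = 0;
        size_t widthBytes = 640 * sizeof(float), height = 480;
        cudaMallocPitch(&pitched, &pitch, widthBytes, height);
        cudaMemset2D(pitched, pitch, 0, widthBytes, height);

        cudaFree(linear);
        cudaFree(pitched);
        return cudaGetLastError();
    }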
+ * + * \param pitchedDevPtr - Pointer to pitched device memory + * \param value - Value to set for each byte of specified memory + * \param extent - Size parameters for where to set device memory (\p width field in bytes) + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \notefnerr + * \note_memset + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemset, ::cudaMemset2D, + * ::cudaMemsetAsync, ::cudaMemset2DAsync, ::cudaMemset3DAsync, + * ::cudaMalloc3D, ::make_cudaPitchedPtr, + * ::make_cudaExtent + */ +extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent); + +/** + * \brief Initializes or sets device memory to a value + * + * Fills the first \p count bytes of the memory area pointed to by \p devPtr + * with the constant byte value \p value. + * + * ::cudaMemsetAsync() is asynchronous with respect to the host, so + * the call may return before the memset is complete. The operation can optionally + * be associated to a stream by passing a non-zero \p stream argument. + * If \p stream is non-zero, the operation may overlap with operations in other streams. + * + * The device version of this function only handles device to device copies and + * cannot be given local or shared pointers. + * + * \param devPtr - Pointer to device memory + * \param value - Value to set for each byte of specified memory + * \param count - Size in bytes to set + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \notefnerr + * \note_memset + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D, + * ::cudaMemset2DAsync, ::cudaMemset3DAsync, + * ::cuMemsetD8Async, + * ::cuMemsetD16Async, + * ::cuMemsetD32Async + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)); + +/** + * \brief Initializes or sets device memory to a value + * + * Sets to the specified value \p value a matrix (\p height rows of \p width + * bytes each) pointed to by \p dstPtr. \p pitch is the width in bytes of the + * 2D array pointed to by \p dstPtr, including any padding added to the end + * of each row. This function performs fastest when the pitch is one that has + * been passed back by ::cudaMallocPitch(). + * + * ::cudaMemset2DAsync() is asynchronous with respect to the host, so + * the call may return before the memset is complete. The operation can optionally + * be associated to a stream by passing a non-zero \p stream argument. + * If \p stream is non-zero, the operation may overlap with operations in other streams. + * + * The device version of this function only handles device to device copies and + * cannot be given local or shared pointers. 
+ * + * \param devPtr - Pointer to 2D device memory + * \param pitch - Pitch in bytes of 2D device memory(Unused if \p height is 1) + * \param value - Value to set for each byte of specified memory + * \param width - Width of matrix set (columns in bytes) + * \param height - Height of matrix set (rows) + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \notefnerr + * \note_memset + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D, + * ::cudaMemsetAsync, ::cudaMemset3DAsync, + * ::cuMemsetD2D8Async, + * ::cuMemsetD2D16Async, + * ::cuMemsetD2D32Async + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0)); + +/** + * \brief Initializes or sets device memory to a value + * + * Initializes each element of a 3D array to the specified value \p value. + * The object to initialize is defined by \p pitchedDevPtr. The \p pitch field + * of \p pitchedDevPtr is the width in memory in bytes of the 3D array pointed + * to by \p pitchedDevPtr, including any padding added to the end of each row. + * The \p xsize field specifies the logical width of each row in bytes, while + * the \p ysize field specifies the height of each 2D slice in rows. + * The \p pitch field of \p pitchedDevPtr is ignored when \p height and \p depth + * are both equal to 1. + * + * The extents of the initialized region are specified as a \p width in bytes, + * a \p height in rows, and a \p depth in slices. + * + * Extents with \p width greater than or equal to the \p xsize of + * \p pitchedDevPtr may perform significantly faster than extents narrower + * than the \p xsize. Secondarily, extents with \p height equal to the + * \p ysize of \p pitchedDevPtr will perform faster than when the \p height is + * shorter than the \p ysize. + * + * This function performs fastest when the \p pitchedDevPtr has been allocated + * by ::cudaMalloc3D(). + * + * ::cudaMemset3DAsync() is asynchronous with respect to the host, so + * the call may return before the memset is complete. The operation can optionally + * be associated to a stream by passing a non-zero \p stream argument. + * If \p stream is non-zero, the operation may overlap with operations in other streams. + * + * The device version of this function only handles device to device copies and + * cannot be given local or shared pointers. + * + * \param pitchedDevPtr - Pointer to pitched device memory + * \param value - Value to set for each byte of specified memory + * \param extent - Size parameters for where to set device memory (\p width field in bytes) + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \notefnerr + * \note_memset + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemset, ::cudaMemset2D, ::cudaMemset3D, + * ::cudaMemsetAsync, ::cudaMemset2DAsync, + * ::cudaMalloc3D, ::make_cudaPitchedPtr, + * ::make_cudaExtent + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0)); + +/** + * \brief Finds the address associated with a CUDA symbol + * + * Returns in \p *devPtr the address of symbol \p symbol on the device. + * \p symbol is a variable that resides in global or constant memory space. 
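A hedged sketch of the stream-ordered variant applied to a 3D allocation (the extent, the immediate synchronize, and the helper name are illustrative; a real program would usually enqueue kernels on the stream before synchronizing):

    #include <cuda_runtime.h>

    // Clear a pitched 3D volume in stream order; the extent's width component
    // is given in bytes, as described above.
    cudaError_t clearVolumeAsync(cudaStream_t stream)
    {
        cudaExtent extent = make_cudaExtent(256 * sizeof(float), 128, 32);
        cudaPitchedPtr vol{};
        cudaError_t err = cudaMalloc3D(&vol, extent);
        if (err != cudaSuccess) return err;

        err = cudaMemset3DAsync(vol, 0, extent, stream);   // returns before completion
        if (err == cudaSuccess)
            err = cudaStreamSynchronize(stream);

        cudaFree(vol.ptr);
        return err;
    }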
+ * If \p symbol cannot be found, or if \p symbol is not declared in the + * global or constant memory space, \p *devPtr is unchanged and the error + * ::cudaErrorInvalidSymbol is returned. + * + * \param devPtr - Return device pointer associated with symbol + * \param symbol - Device symbol address + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa + * \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)", + * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)", + * ::cuModuleGetGlobal + */ +extern __host__ cudaError_t CUDARTAPI cudaGetSymbolAddress(void **devPtr, const void *symbol); + +/** + * \brief Finds the size of the object associated with a CUDA symbol + * + * Returns in \p *size the size of symbol \p symbol. \p symbol is a variable that + * resides in global or constant memory space. If \p symbol cannot be found, or + * if \p symbol is not declared in global or constant memory space, \p *size is + * unchanged and the error ::cudaErrorInvalidSymbol is returned. + * + * \param size - Size of object associated with symbol + * \param symbol - Device symbol address + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSymbol, + * ::cudaErrorNoKernelImageForDevice + * \notefnerr + * \note_string_api_deprecation + * \note_init_rt + * \note_callback + * + * \sa + * \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)", + * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)", + * ::cuModuleGetGlobal + */ +extern __host__ cudaError_t CUDARTAPI cudaGetSymbolSize(size_t *size, const void *symbol); + +/** + * \brief Prefetches memory to the specified destination device + * + * Prefetches memory to the specified destination device. \p devPtr is the + * base device pointer of the memory to be prefetched and \p dstDevice is the + * destination device. \p count specifies the number of bytes to copy. \p stream + * is the stream in which the operation is enqueued. The memory range must refer + * to managed memory allocated via ::cudaMallocManaged or declared via __managed__ variables. + * + * Passing in cudaCpuDeviceId for \p dstDevice will prefetch the data to host memory. If + * \p dstDevice is a GPU, then the device attribute ::cudaDevAttrConcurrentManagedAccess + * must be non-zero. Additionally, \p stream must be associated with a device that has a + * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess. + * + * The start address and end address of the memory range will be rounded down and rounded up + * respectively to be aligned to CPU page size before the prefetch operation is enqueued + * in the stream. + * + * If no physical memory has been allocated for this region, then this memory region + * will be populated and mapped on the destination device. If there's insufficient + * memory to prefetch the desired region, the Unified Memory driver may evict pages from other + * ::cudaMallocManaged allocations to host memory in order to make room. Device memory + * allocated using ::cudaMalloc or ::cudaMallocArray will not be evicted. + * + * By default, any mappings to the previous location of the migrated pages are removed and + * mappings for the new location are only setup on \p dstDevice. 
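A short sketch of the two symbol queries declared above (the __device__ array and helper name are illustrative; requires an nvcc-compiled translation unit):

    #include <cstdio>
    #include <cuda_runtime.h>

    __device__ int d_counters[64];               // symbol in global memory

    // Resolve the symbol to a raw device pointer plus its size, then clear it.
    cudaError_t resetCounters(void)
    {
        void  *ptr  = nullptr;
        size_t size = 0;
        cudaError_t err = cudaGetSymbolAddress(&ptr, d_counters);
        if (err == cudaSuccess) err = cudaGetSymbolSize(&size, d_counters);
        if (err != cudaSuccess) return err;

        printf("d_counters: %zu bytes at %p\n", size, ptr);
        return cudaMemset(ptr, 0, size);
    }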
The exact behavior however + * also depends on the settings applied to this memory range via ::cudaMemAdvise as described + * below: + * + * If ::cudaMemAdviseSetReadMostly was set on any subset of this memory range, + * then that subset will create a read-only copy of the pages on \p dstDevice. + * + * If ::cudaMemAdviseSetPreferredLocation was called on any subset of this memory + * range, then the pages will be migrated to \p dstDevice even if \p dstDevice is not the + * preferred location of any pages in the memory range. + * + * If ::cudaMemAdviseSetAccessedBy was called on any subset of this memory range, + * then mappings to those pages from all the appropriate processors are updated to + * refer to the new location if establishing such a mapping is possible. Otherwise, + * those mappings are cleared. + * + * Note that this API is not required for functionality and only serves to improve performance + * by allowing the application to migrate data to a suitable location before it is accessed. + * Memory accesses to this range are always coherent and are allowed even when the data is + * actively being migrated. + * + * Note that this function is asynchronous with respect to the host and all work + * on other devices. + * + * \param devPtr - Pointer to be prefetched + * \param count - Size in bytes + * \param dstDevice - Destination device to prefetch to + * \param stream - Stream to enqueue prefetch operation + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, + * ::cudaMemcpy3DPeerAsync, ::cudaMemAdvise, + * ::cuMemPrefetchAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream __dv(0)); + +/** + * \brief Advise about the usage of a given memory range + * + * Advise the Unified Memory subsystem about the usage pattern for the memory range + * starting at \p devPtr with a size of \p count bytes. The start address and end address of the memory + * range will be rounded down and rounded up respectively to be aligned to CPU page size before the + * advice is applied. The memory range must refer to managed memory allocated via ::cudaMallocManaged + * or declared via __managed__ variables. The memory range could also refer to system-allocated pageable + * memory provided it represents a valid, host-accessible region of memory and all additional constraints + * imposed by \p advice as outlined below are also satisfied. Specifying an invalid system-allocated pageable + * memory range results in an error being returned. + * + * The \p advice parameter can take the following values: + * - ::cudaMemAdviseSetReadMostly: This implies that the data is mostly going to be read + * from and only occasionally written to. Any read accesses from any processor to this region will create a + * read-only copy of at least the accessed pages in that processor's memory. Additionally, if ::cudaMemPrefetchAsync + * is called on this region, it will create a read-only copy of the data on the destination processor. + * If any processor writes to this region, all copies of the corresponding page will be invalidated + * except for the one where the write occurred. The \p device argument is ignored for this advice. 
+ * Note that for a page to be read-duplicated, the accessing processor must either be the CPU or a GPU
+ * that has a non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * Also, if a context is created on a device that does not have the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess set, then read-duplication will not occur until
+ * all such contexts are destroyed.
+ * If the memory region refers to valid system-allocated pageable memory, then the accessing device must
+ * have a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccess for a read-only
+ * copy to be created on that device. Note however that if the accessing device also has a non-zero value for the
+ * device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables, then setting this advice
+ * will not create a read-only copy when that device accesses this memory region.
+ *
+ * - ::cudaMemAdviseUnsetReadMostly: Undoes the effect of ::cudaMemAdviseSetReadMostly and also prevents the
+ * Unified Memory driver from attempting heuristic read-duplication on the memory range. Any read-duplicated
+ * copies of the data will be collapsed into a single copy. The location for the collapsed
+ * copy will be the preferred location if the page has a preferred location and one of the read-duplicated
+ * copies was resident at that location. Otherwise, the location chosen is arbitrary.
+ *
+ * - ::cudaMemAdviseSetPreferredLocation: This advice sets the preferred location for the
+ * data to be the memory belonging to \p device. Passing in cudaCpuDeviceId for \p device sets the
+ * preferred location as host memory. If \p device is a GPU, then it must have a non-zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess. Setting the preferred location
+ * does not cause data to migrate to that location immediately. Instead, it guides the migration policy
+ * when a fault occurs on that memory region. If the data is already in its preferred location and the
+ * faulting processor can establish a mapping without requiring the data to be migrated, then
+ * data migration will be avoided. On the other hand, if the data is not in its preferred location
+ * or if a direct mapping cannot be established, then it will be migrated to the processor accessing
+ * it. It is important to note that setting the preferred location does not prevent data prefetching
+ * done using ::cudaMemPrefetchAsync.
+ * Having a preferred location can override the page thrash detection and resolution logic in the Unified
+ * Memory driver. Normally, if a page is detected to be constantly thrashing between for example host and device
+ * memory, the page may eventually be pinned to host memory by the Unified Memory driver. But
+ * if the preferred location is set as device memory, then the page will continue to thrash indefinitely.
+ * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the
+ * policies associated with that advice will override the policies of this advice, unless read accesses from
+ * \p device will not result in a read-only copy being created on that device as outlined in the description for
+ * the advice ::cudaMemAdviseSetReadMostly.
+ * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero
+ * value for the device attribute ::cudaDevAttrPageableMemoryAccess.
Additionally, if \p device has + * a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables, + * then this call has no effect. Note however that this behavior may change in the future. + * + * - ::cudaMemAdviseUnsetPreferredLocation: Undoes the effect of ::cudaMemAdviseSetPreferredLocation + * and changes the preferred location to none. + * + * - ::cudaMemAdviseSetAccessedBy: This advice implies that the data will be accessed by \p device. + * Passing in ::cudaCpuDeviceId for \p device will set the advice for the CPU. If \p device is a GPU, then + * the device attribute ::cudaDevAttrConcurrentManagedAccess must be non-zero. + * This advice does not cause data migration and has no impact on the location of the data per se. Instead, + * it causes the data to always be mapped in the specified processor's page tables, as long as the + * location of the data permits a mapping to be established. If the data gets migrated for any reason, + * the mappings are updated accordingly. + * This advice is recommended in scenarios where data locality is not important, but avoiding faults is. + * Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the + * data located on one GPU is occasionally accessed by peer GPUs. In such scenarios, migrating data + * over to the other GPUs is not as important because the accesses are infrequent and the overhead of + * migration may be too high. But preventing faults can still help improve performance, and so having + * a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated + * to host memory because the CPU typically cannot access device memory directly. Any GPU that had the + * ::cudaMemAdviceSetAccessedBy flag set for this data will now have its mapping updated to point to the + * page in host memory. + * If ::cudaMemAdviseSetReadMostly is also set on this memory region or any subset of it, then the + * policies associated with that advice will override the policies of this advice. Additionally, if the + * preferred location of this memory region or any subset of it is also \p device, then the policies + * associated with ::cudaMemAdviseSetPreferredLocation will override the policies of this advice. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::cudaDevAttrPageableMemoryAccess. Additionally, if \p device has + * a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables, + * then this call has no effect. + * + * - ::cudaMemAdviseUnsetAccessedBy: Undoes the effect of ::cudaMemAdviseSetAccessedBy. Any mappings to + * the data from \p device may be removed at any time causing accesses to result in non-fatal page faults. + * If the memory region refers to valid system-allocated pageable memory, then \p device must have a non-zero + * value for the device attribute ::cudaDevAttrPageableMemoryAccess. Additionally, if \p device has + * a non-zero value for the device attribute ::cudaDevAttrPageableMemoryAccessUsesHostPageTables, + * then this call has no effect. 
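Putting the advice values above together with ::cudaMemPrefetchAsync, a hedged sketch for a managed buffer that is mostly read on one GPU (the sizes, device id, and helper name are illustrative, and a real program should first confirm ::cudaDevAttrConcurrentManagedAccess is non-zero):

    #include <cuda_runtime.h>

    cudaError_t stageManaged(float **out, size_t n, int device, cudaStream_t stream)
    {
        float *data = nullptr;
        cudaError_t err = cudaMallocManaged(&data, n * sizeof(float));
        if (err != cudaSuccess) return err;

        // ... fill `data` on the host ...

        // Describe the expected access pattern, then migrate ahead of use.
        cudaMemAdvise(data, n * sizeof(float), cudaMemAdviseSetReadMostly, device);
        cudaMemAdvise(data, n * sizeof(float), cudaMemAdviseSetAccessedBy, device);
        err = cudaMemPrefetchAsync(data, n * sizeof(float), device, stream);

        *out = data;
        return err;
    }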
+ * + * \param devPtr - Pointer to memory to set the advice for + * \param count - Size in bytes of the memory range + * \param advice - Advice to be applied for the specified memory range + * \param device - Device to apply the advice for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpyPeer, ::cudaMemcpyAsync, + * ::cudaMemcpy3DPeerAsync, ::cudaMemPrefetchAsync, + * ::cuMemAdvise + */ +extern __host__ cudaError_t CUDARTAPI cudaMemAdvise(const void *devPtr, size_t count, enum cudaMemoryAdvise advice, int device); + +/** +* \brief Query an attribute of a given memory range +* +* Query an attribute about the memory range starting at \p devPtr with a size of \p count bytes. The +* memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via +* __managed__ variables. +* +* The \p attribute parameter can take the following values: +* - ::cudaMemRangeAttributeReadMostly: If this attribute is specified, \p data will be interpreted +* as a 32-bit integer, and \p dataSize must be 4. The result returned will be 1 if all pages in the given +* memory range have read-duplication enabled, or 0 otherwise. +* - ::cudaMemRangeAttributePreferredLocation: If this attribute is specified, \p data will be +* interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be a GPU device +* id if all pages in the memory range have that GPU as their preferred location, or it will be cudaCpuDeviceId +* if all pages in the memory range have the CPU as their preferred location, or it will be cudaInvalidDeviceId +* if either all the pages don't have the same preferred location or some of the pages don't have a +* preferred location at all. Note that the actual location of the pages in the memory range at the time of +* the query may be different from the preferred location. +* - ::cudaMemRangeAttributeAccessedBy: If this attribute is specified, \p data will be interpreted +* as an array of 32-bit integers, and \p dataSize must be a non-zero multiple of 4. The result returned +* will be a list of device ids that had ::cudaMemAdviceSetAccessedBy set for that entire memory range. +* If any device does not have that advice set for the entire memory range, that device will not be included. +* If \p data is larger than the number of devices that have that advice set for that memory range, +* cudaInvalidDeviceId will be returned in all the extra space provided. For ex., if \p dataSize is 12 +* (i.e. \p data has 3 elements) and only device 0 has the advice set, then the result returned will be +* { 0, cudaInvalidDeviceId, cudaInvalidDeviceId }. If \p data is smaller than the number of devices that have +* that advice set, then only as many devices will be returned as can fit in the array. There is no +* guarantee on which specific devices will be returned, however. +* - ::cudaMemRangeAttributeLastPrefetchLocation: If this attribute is specified, \p data will be +* interpreted as a 32-bit integer, and \p dataSize must be 4. The result returned will be the last location +* to which all pages in the memory range were prefetched explicitly via ::cudaMemPrefetchAsync. This will either be +* a GPU id or cudaCpuDeviceId depending on whether the last location for prefetch was a GPU or the CPU +* respectively. 
If any page in the memory range was never explicitly prefetched or if all pages were not
+* prefetched to the same location, cudaInvalidDeviceId will be returned. Note that this simply returns the
+* last location that the application requested to prefetch the memory range to. It gives no indication as to
+* whether the prefetch operation to that location has completed or even begun.
+*
+* \param data - A pointer to a memory location where the result
+* of the attribute query will be written to.
+* \param dataSize - The size of \p data in bytes
+* \param attribute - The attribute to query
+* \param devPtr - Start of the range to query
+* \param count - Size of the range to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemRangeGetAttributes, ::cudaMemPrefetchAsync,
+ * ::cudaMemAdvise,
+ * ::cuMemRangeGetAttribute
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttribute(void *data, size_t dataSize, enum cudaMemRangeAttribute attribute, const void *devPtr, size_t count);
+
+/**
+ * \brief Query attributes of a given memory range.
+ *
+ * Query attributes of the memory range starting at \p devPtr with a size of \p count bytes. The
+ * memory range must refer to managed memory allocated via ::cudaMallocManaged or declared via
+ * __managed__ variables. The \p attributes array will be interpreted to have \p numAttributes
+ * entries. The \p dataSizes array will also be interpreted to have \p numAttributes entries.
+ * The results of the query will be stored in \p data.
+ *
+ * The list of supported attributes is given below. Please refer to ::cudaMemRangeGetAttribute for
+ * attribute descriptions and restrictions.
+ *
+ * - ::cudaMemRangeAttributeReadMostly
+ * - ::cudaMemRangeAttributePreferredLocation
+ * - ::cudaMemRangeAttributeAccessedBy
+ * - ::cudaMemRangeAttributeLastPrefetchLocation
+ *
+ * \param data - A two-dimensional array containing pointers to memory
+ * locations where the result of each attribute query will be written to.
+ * \param dataSizes - Array containing the sizes of each result
+ * \param attributes - An array of attributes to query
+ * (numAttributes and the number of attributes in this array should match)
+ * \param numAttributes - Number of attributes to query
+ * \param devPtr - Start of the range to query
+ * \param count - Size of the range to query
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemRangeGetAttribute, ::cudaMemAdvise,
+ * ::cudaMemPrefetchAsync,
+ * ::cuMemRangeGetAttributes
+ */
+extern __host__ cudaError_t CUDARTAPI cudaMemRangeGetAttributes(void **data, size_t *dataSizes, enum cudaMemRangeAttribute *attributes, size_t numAttributes, const void *devPtr, size_t count);
+
+/** @} */ /* END CUDART_MEMORY */
+
+/**
+ * \defgroup CUDART_MEMORY_DEPRECATED Memory Management [DEPRECATED]
+ *
+ * ___MANBRIEF___ deprecated memory management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes deprecated memory management functions of the CUDA runtime
+ * application programming interface.
+ *
+ * Some functions have overloaded C++ API template versions documented separately in the
+ * \ref CUDART_HIGHLEVEL "C++ API Routines" module.
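Before moving on to the deprecated entry points, a brief sketch of querying the range attributes declared above for a managed allocation (the printout, variable names, and ignored return codes are illustrative):

    #include <cstdio>
    #include <cuda_runtime.h>

    void inspectManagedRange(const void *devPtr, size_t bytes)
    {
        int readMostly = 0;        // 32-bit result, so dataSize must be 4
        cudaMemRangeGetAttribute(&readMostly, sizeof(int),
                                 cudaMemRangeAttributeReadMostly, devPtr, bytes);

        int lastLoc = 0;           // GPU id, cudaCpuDeviceId, or cudaInvalidDeviceId
        cudaMemRangeGetAttribute(&lastLoc, sizeof(int),
                                 cudaMemRangeAttributeLastPrefetchLocation,
                                 devPtr, bytes);

        printf("read-mostly=%d last-prefetch-location=%d\n", readMostly, lastLoc);
    }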
+ * + * @{ + */ + +/** + * \brief Copies data between host and device + * + * \deprecated + * + * Copies \p count bytes from the memory area pointed to by \p src to the + * CUDA array \p dst starting at \p hOffset rows and \p wOffset bytes from + * the upper left corner, where \p kind specifies the direction + * of the copy, and must be one of ::cudaMemcpyHostToHost, + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * \param dst - Destination memory address + * \param wOffset - Destination starting X offset (columns in bytes) + * \param hOffset - Destination starting Y offset (rows) + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_sync + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpyHtoA, + * ::cuMemcpyDtoA + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind); + +/** + * \brief Copies data between host and device + * + * \deprecated + * + * Copies \p count bytes from the CUDA array \p src starting at \p hOffset rows + * and \p wOffset bytes from the upper left corner to the memory area pointed to + * by \p dst, where \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. 
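Since ::cudaMemcpyToArray is deprecated, a hedged sketch of expressing the common single-row case with the non-deprecated ::cudaMemcpy2DToArray instead (the helper name is illustrative):

    #include <cuda_runtime.h>

    // Equivalent of cudaMemcpyToArray(array, wOffsetBytes, row, src, countBytes, kind)
    // when the copy stays within one row of the array: treat it as a 1-row matrix.
    cudaError_t copyRowToArray(cudaArray_t array, size_t wOffsetBytes, size_t row,
                               const void *src, size_t countBytes, cudaMemcpyKind kind)
    {
        return cudaMemcpy2DToArray(array, wOffsetBytes, row,
                                   src, countBytes,      // source pitch: one row
                                   countBytes, 1,        // width in bytes, height = 1
                                   kind);
    }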
+ * + * \param dst - Destination memory address + * \param src - Source memory address + * \param wOffset - Source starting X offset (columns in bytes) + * \param hOffset - Source starting Y offset (rows) + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_sync + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpyAtoH, + * ::cuMemcpyAtoD + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind); + +/** + * \brief Copies data between host and device + * + * \deprecated + * + * Copies \p count bytes from the CUDA array \p src starting at \p hOffsetSrc + * rows and \p wOffsetSrc bytes from the upper left corner to the CUDA array + * \p dst starting at \p hOffsetDst rows and \p wOffsetDst bytes from the upper + * left corner, where \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. 
+ * + * \param dst - Destination memory address + * \param wOffsetDst - Destination starting X offset (columns in bytes) + * \param hOffsetDst - Destination starting Y offset (rows) + * \param src - Source memory address + * \param wOffsetSrc - Source starting X offset (columns in bytes) + * \param hOffsetSrc - Source starting Y offset (rows) + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpyAtoA + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); + +/** + * \brief Copies data between host and device + * + * \deprecated + * + * Copies \p count bytes from the memory area pointed to by \p src to the + * CUDA array \p dst starting at \p hOffset rows and \p wOffset bytes from + * the upper left corner, where \p kind specifies the + * direction of the copy, and must be one of ::cudaMemcpyHostToHost, + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * ::cudaMemcpyToArrayAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If \p + * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream + * is non-zero, the copy may overlap with operations in other streams. 
+ * + * \param dst - Destination memory address + * \param wOffset - Destination starting X offset (columns in bytes) + * \param hOffset - Destination starting Y offset (rows) + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpyFromArrayAsync, ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpyHtoAAsync, + * ::cuMemcpy2DAsync + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + +/** + * \brief Copies data between host and device + * + * \deprecated + * + * Copies \p count bytes from the CUDA array \p src starting at \p hOffset rows + * and \p wOffset bytes from the upper left corner to the memory area pointed to + * by \p dst, where \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * ::cudaMemcpyFromArrayAsync() is asynchronous with respect to the host, so + * the call may return before the copy is complete. The copy can optionally + * be associated to a stream by passing a non-zero \p stream argument. If \p + * kind is ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToHost and \p stream + * is non-zero, the copy may overlap with operations in other streams. 
+ * + * \param dst - Destination memory address + * \param src - Source memory address + * \param wOffset - Source starting X offset (columns in bytes) + * \param hOffset - Source starting Y offset (rows) + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * \param stream - Stream identifier + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidMemcpyDirection + * \notefnerr + * \note_async + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \sa ::cudaMemcpy, ::cudaMemcpy2D, ::cudaMemcpyToArray, + * ::cudaMemcpy2DToArray, ::cudaMemcpyFromArray, ::cudaMemcpy2DFromArray, + * ::cudaMemcpyArrayToArray, ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol, + * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync, + * ::cudaMemcpyToArrayAsync, ::cudaMemcpy2DToArrayAsync, + * ::cudaMemcpy2DFromArrayAsync, + * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync, + * ::cuMemcpyAtoHAsync, + * ::cuMemcpy2DAsync + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + +/** @} */ /* END CUDART_MEMORY_DEPRECATED */ + +/** + * \defgroup CUDART_MEMORY_POOLS Stream Ordered Memory Allocator + * + * ___MANBRIEF___ Functions for performing allocation and free operations in stream order. + * Functions for controlling the behavior of the underlying allocator. + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * + * @{ + * + * \section CUDART_MEMORY_POOLS_overview overview + * + * The asynchronous allocator allows the user to allocate and free in stream order. + * All asynchronous accesses of the allocation must happen between + * the stream executions of the allocation and the free. If the memory is accessed + * outside of the promised stream order, a use before allocation / use after free error + * will cause undefined behavior. + * + * The allocator is free to reallocate the memory as long as it can guarantee + * that compliant memory accesses will not overlap temporally. + * The allocator may refer to internal stream ordering as well as inter-stream dependencies + * (such as CUDA events and null stream dependencies) when establishing the temporal guarantee. + * The allocator may also insert inter-stream dependencies to establish the temporal guarantee. + * + * \section CUDART_MEMORY_POOLS_support Supported Platforms + * + * Whether or not a device supports the integrated stream ordered memory allocator + * may be queried by calling ::cudaDeviceGetAttribute() with the device attribute + * ::cudaDevAttrMemoryPoolsSupported. + */ + +/** + * \brief Allocates memory with stream ordered semantics + * + * Inserts an allocation operation into \p hStream. + * A pointer to the allocated memory is returned immediately in *dptr. + * The allocation must not be accessed until the the allocation operation completes. + * The allocation comes from the memory pool associated with the stream's device. + * + * \note The default memory pool of a device contains device memory from that device. + * \note Basic stream ordering allows future work submitted into the same stream to use the allocation. + * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation + * operation completes before work submitted in a separate stream runs. + * \note During stream capture, this function results in the creation of an allocation node. 
In this case, + * the allocation is owned by the graph instead of the memory pool. The memory pool's properties + * are used to set the node's creation parameters. + * + * \param[out] devPtr - Returned device pointer + * \param[in] size - Number of bytes to allocate + * \param[in] hStream - The stream establishing the stream ordering contract and the memory pool to allocate from + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorNotSupported, + * ::cudaErrorOutOfMemory, + * \notefnerr + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \sa ::cuMemAllocAsync, + * \ref ::cudaMallocAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream) "cudaMallocAsync (C++ API)", + * ::cudaMallocFromPoolAsync, ::cudaFreeAsync, ::cudaDeviceSetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolSetAccess, ::cudaMemPoolSetAttribute, ::cudaMemPoolGetAttribute + */ +extern __host__ cudaError_t CUDARTAPI cudaMallocAsync(void **devPtr, size_t size, cudaStream_t hStream); + +/** + * \brief Frees memory with stream ordered semantics + * + * Inserts a free operation into \p hStream. + * The allocation must not be accessed after stream execution reaches the free. + * After this API returns, accessing the memory from any subsequent work launched on the GPU + * or querying its pointer attributes results in undefined behavior. + * + * \note During stream capture, this function results in the creation of a free node and + * must therefore be passed the address of a graph allocation. + * + * \param dptr - memory to free + * \param hStream - The stream establishing the stream ordering promise + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorNotSupported + * \notefnerr + * \note_null_stream + * \note_init_rt + * \note_callback + * + * \sa ::cuMemFreeAsync, ::cudaMallocAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaFreeAsync(void *devPtr, cudaStream_t hStream); + +/** + * \brief Tries to release memory back to the OS + * + * Releases memory back to the OS until the pool contains fewer than minBytesToKeep + * reserved bytes, or there is no more memory that the allocator can safely release. + * The allocator cannot release OS allocations that back outstanding asynchronous allocations. + * The OS allocations may happen at different granularity from the user allocations. + * + * \note: Allocations that have not been freed count as outstanding. + * \note: Allocations that have been asynchronously freed but whose completion has + * not been observed on the host (eg. by a synchronize) can count as outstanding. + * + * \param[in] pool - The memory pool to trim + * \param[in] minBytesToKeep - If the pool has less than minBytesToKeep reserved, + * the TrimTo operation is a no-op. Otherwise the pool will be guaranteed to have + * at least minBytesToKeep bytes reserved after the operation. + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_callback + * + * \sa ::cuMemPoolTrimTo, ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaMemPoolTrimTo(cudaMemPool_t memPool, size_t minBytesToKeep); + +/** + * \brief Sets attributes of a memory pool + * + * Supported attributes are: + * - ::cudaMemPoolAttrReleaseThreshold: (value type = cuuint64_t) + * Amount of reserved memory in bytes to hold onto before trying + * to release memory back to the OS. 
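A minimal sketch of the stream-ordered allocate/free pair declared above (assuming a CUDA 11.2+ driver and a device that reports ::cudaDevAttrMemoryPoolsSupported; the scratch size and the elided kernel launches are placeholders):

    #include <cuda_runtime.h>

    cudaError_t runWithScratch(cudaStream_t stream, size_t bytes)
    {
        void *scratch = nullptr;
        cudaError_t err = cudaMallocAsync(&scratch, bytes, stream);
        if (err != cudaSuccess) return err;

        // ... launch kernels on `stream` that use `scratch` ...

        // The free is stream ordered too: later work in `stream` (or work made
        // dependent on it via events) may reuse this memory.
        err = cudaFreeAsync(scratch, stream);
        if (err != cudaSuccess) return err;
        return cudaStreamSynchronize(stream);
    }

Raising the pool's release threshold (see ::cudaMemPoolAttrReleaseThreshold below) is the usual way to keep such scratch memory cached across synchronizations instead of returning it to the OS each time.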
When more than the release + * threshold bytes of memory are held by the memory pool, the + * allocator will try to release memory back to the OS on the + * next call to stream, event or context synchronize. (default 0) + * - ::cudaMemPoolReuseFollowEventDependencies: (value type = int) + * Allow ::cudaMallocAsync to use memory asynchronously freed + * in another stream as long as a stream ordering dependency + * of the allocating stream on the free action exists. + * Cuda events and null stream interactions can create the required + * stream ordered dependencies. (default enabled) + * - ::cudaMemPoolReuseAllowOpportunistic: (value type = int) + * Allow reuse of already completed frees when there is no dependency + * between the free and allocation. (default enabled) + * - ::cudaMemPoolReuseAllowInternalDependencies: (value type = int) + * Allow ::cudaMallocAsync to insert new stream dependencies + * in order to establish the stream ordering required to reuse + * a piece of memory released by ::cudaFreeAsync (default enabled). + * - ::cudaMemPoolAttrReservedMemHigh: (value type = cuuint64_t) + * Reset the high watermark that tracks the amount of backing memory that was + * allocated for the memory pool. It is illegal to set this attribute to a non-zero value. + * - ::cudaMemPoolAttrUsedMemHigh: (value type = cuuint64_t) + * Reset the high watermark that tracks the amount of used memory that was + * allocated for the memory pool. It is illegal to set this attribute to a non-zero value. + * + * \param[in] pool - The memory pool to modify + * \param[in] attr - The attribute to modify + * \param[in] value - Pointer to the value to assign + * + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_callback + * + * \sa ::cuMemPoolSetAttribute, ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate + + */ +extern __host__ cudaError_t CUDARTAPI cudaMemPoolSetAttribute(cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value ); + +/** + * \brief Gets attributes of a memory pool + * + * Supported attributes are: + * - ::cudaMemPoolAttrReleaseThreshold: (value type = cuuint64_t) + * Amount of reserved memory in bytes to hold onto before trying + * to release memory back to the OS. When more than the release + * threshold bytes of memory are held by the memory pool, the + * allocator will try to release memory back to the OS on the + * next call to stream, event or context synchronize. (default 0) + * - ::cudaMemPoolReuseFollowEventDependencies: (value type = int) + * Allow ::cudaMallocAsync to use memory asynchronously freed + * in another stream as long as a stream ordering dependency + * of the allocating stream on the free action exists. + * Cuda events and null stream interactions can create the required + * stream ordered dependencies. (default enabled) + * - ::cudaMemPoolReuseAllowOpportunistic: (value type = int) + * Allow reuse of already completed frees when there is no dependency + * between the free and allocation. (default enabled) + * - ::cudaMemPoolReuseAllowInternalDependencies: (value type = int) + * Allow ::cudaMallocAsync to insert new stream dependencies + * in order to establish the stream ordering required to reuse + * a piece of memory released by ::cudaFreeAsync (default enabled). + * - ::cudaMemPoolAttrReservedMemCurrent: (value type = cuuint64_t) + * Amount of backing memory currently allocated for the mempool. 
+ * - ::cudaMemPoolAttrReservedMemHigh: (value type = cuuint64_t) + * High watermark of backing memory allocated for the mempool since + * the last time it was reset. + * - ::cudaMemPoolAttrUsedMemCurrent: (value type = cuuint64_t) + * Amount of memory from the pool that is currently in use by the application. + * - ::cudaMemPoolAttrUsedMemHigh: (value type = cuuint64_t) + * High watermark of the amount of memory from the pool that was in use by the + * application since the last time it was reset. + * + * \param[in] pool - The memory pool to get attributes of + * \param[in] attr - The attribute to get + * \param[in] value - Retrieved value + * + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_callback + * + * \sa ::cuMemPoolGetAttribute, ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate + + */ +extern __host__ cudaError_t CUDARTAPI cudaMemPoolGetAttribute(cudaMemPool_t memPool, enum cudaMemPoolAttr attr, void *value ); + +/** + * \brief Controls visibility of pools between devices + * + * \param[in] pool - The pool being modified + * \param[in] map - Array of access descriptors. Each descriptor instructs the access to enable for a single gpu + * \param[in] count - Number of descriptors in the map array. + * + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * + * \sa ::cuMemPoolSetAccess, ::cudaMemPoolGetAccess, ::cudaMallocAsync, cudaFreeAsync + */ +extern __host__ cudaError_t CUDARTAPI cudaMemPoolSetAccess(cudaMemPool_t memPool, const struct cudaMemAccessDesc *descList, size_t count); + +/** + * \brief Returns the accessibility of a pool from a device + * + * Returns the accessibility of the pool's memory from the specified location. + * + * \param[out] flags - the accessibility of the pool from the specified location + * \param[in] memPool - the pool being queried + * \param[in] location - the location accessing the pool + * + * \sa ::cuMemPoolGetAccess, ::cudaMemPoolSetAccess + */ +extern __host__ cudaError_t CUDARTAPI cudaMemPoolGetAccess(enum cudaMemAccessFlags *flags, cudaMemPool_t memPool, struct cudaMemLocation *location); + +/** + * \brief Creates a memory pool + * + * Creates a CUDA memory pool and returns the handle in \p pool. The \p poolProps determines + * the properties of the pool such as the backing device and IPC capabilities. + * + * By default, the pool's memory will be accessible from the device it is allocated on. + * + * \note Specifying cudaMemHandleTypeNone creates a memory pool that will not support IPC. + * + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorNotSupported + * + * \sa ::cuMemPoolCreate, ::cudaDeviceSetMemPool, ::cudaMallocFromPoolAsync, ::cudaMemPoolExportToShareableHandle, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool + + */ +extern __host__ cudaError_t CUDARTAPI cudaMemPoolCreate(cudaMemPool_t *memPool, const struct cudaMemPoolProps *poolProps); + +/** + * \brief Destroys the specified memory pool + * + * If any pointers obtained from this pool haven't been freed or + * the pool has free operations that haven't completed + * when ::cudaMemPoolDestroy is invoked, the function will return immediately and the + * resources associated with the pool will be released automatically + * once there are no more outstanding allocations. + * + * Destroying the current mempool of a device sets the default mempool of + * that device as the current mempool for that device. + * + * \note A device's default memory pool cannot be destroyed. 
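A hedged sketch of creating and configuring an explicit pool with the calls above (device 0, a 64 MiB release threshold, and no IPC support are illustrative choices; the pool would later be freed with ::cudaMemPoolDestroy):

    #include <cstdint>
    #include <cuda_runtime.h>

    cudaError_t makePool(cudaMemPool_t *poolOut)
    {
        cudaMemPoolProps props = {};
        props.allocType     = cudaMemAllocationTypePinned;
        props.handleTypes   = cudaMemHandleTypeNone;      // pool will not support IPC
        props.location.type = cudaMemLocationTypeDevice;
        props.location.id   = 0;                          // backing device

        cudaError_t err = cudaMemPoolCreate(poolOut, &props);
        if (err != cudaSuccess) return err;

        // Keep up to 64 MiB cached in the pool between synchronizations.
        std::uint64_t threshold = 64ull << 20;
        return cudaMemPoolSetAttribute(*poolOut, cudaMemPoolAttrReleaseThreshold,
                                       &threshold);
    }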
+ * + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * + * \sa cuMemPoolDestroy, ::cudaFreeAsync, ::cudaDeviceSetMemPool, ::cudaDeviceGetDefaultMemPool, ::cudaDeviceGetMemPool, ::cudaMemPoolCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaMemPoolDestroy(cudaMemPool_t memPool); + +/** + * \brief Allocates memory from a specified pool with stream ordered semantics. + * + * Inserts an allocation operation into \p hStream. + * A pointer to the allocated memory is returned immediately in *dptr. + * The allocation must not be accessed until the the allocation operation completes. + * The allocation comes from the specified memory pool. + * + * \note + * - The specified memory pool may be from a device different than that of the specified \p hStream. + * + * - Basic stream ordering allows future work submitted into the same stream to use the allocation. + * Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation + * operation completes before work submitted in a separate stream runs. + * + * \note During stream capture, this function results in the creation of an allocation node. In this case, + * the allocation is owned by the graph instead of the memory pool. The memory pool's properties + * are used to set the node's creation parameters. + * + * \param[out] ptr - Returned device pointer + * \param[in] bytesize - Number of bytes to allocate + * \param[in] memPool - The pool to allocate from + * \param[in] stream - The stream establishing the stream ordering semantic + * + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorNotSupported, + * ::cudaErrorOutOfMemory + * + * \sa ::cuMemAllocFromPoolAsync, + * \ref ::cudaMallocAsync(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream) "cudaMallocAsync (C++ API)", + * ::cudaMallocAsync, ::cudaFreeAsync, ::cudaDeviceGetDefaultMemPool, ::cudaMemPoolCreate, ::cudaMemPoolSetAccess, ::cudaMemPoolSetAttribute + */ +extern __host__ cudaError_t CUDARTAPI cudaMallocFromPoolAsync(void **ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream); + +/** + * \brief Exports a memory pool to the requested handle type. + * + * Given an IPC capable mempool, create an OS handle to share the pool with another process. + * A recipient process can convert the shareable handle into a mempool with ::cudaMemPoolImportFromShareableHandle. + * Individual pointers can then be shared with the ::cudaMemPoolExportPointer and ::cudaMemPoolImportPointer APIs. + * The implementation of what the shareable handle is and how it can be transferred is defined by the requested + * handle type. + * + * \note: To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than cudaMemHandleTypeNone. + * + * \param[out] handle_out - pointer to the location in which to store the requested handle + * \param[in] pool - pool to export + * \param[in] handleType - the type of handle to create + * \param[in] flags - must be 0 + * + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorOutOfMemory + * + * \sa ::cuMemPoolExportToShareableHandle, ::cudaMemPoolImportFromShareableHandle, ::cudaMemPoolExportPointer, ::cudaMemPoolImportPointer + */ +extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportToShareableHandle( + void *shareableHandle, + cudaMemPool_t memPool, + enum cudaMemAllocationHandleType handleType, + unsigned int flags); + +/** + * \brief imports a memory pool from a shared handle. 
+ * + * Specific allocations can be imported from the imported pool with ::cudaMemPoolImportPointer. + * + * \note Imported memory pools do not support creating new allocations. + * As such imported memory pools may not be used in ::cudaDeviceSetMemPool + * or ::cudaMallocFromPoolAsync calls. + * + * \param[out] pool_out - Returned memory pool + * \param[in] handle - OS handle of the pool to open + * \param[in] handleType - The type of handle being imported + * \param[in] flags - must be 0 + * + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorOutOfMemory + * + * \sa ::cuMemPoolImportFromShareableHandle, ::cudaMemPoolExportToShareableHandle, ::cudaMemPoolExportPointer, ::cudaMemPoolImportPointer + */ +extern __host__ cudaError_t CUDARTAPI cudaMemPoolImportFromShareableHandle( + cudaMemPool_t *memPool, + void *shareableHandle, + enum cudaMemAllocationHandleType handleType, + unsigned int flags); + +/** + * \brief Export data to share a memory pool allocation between processes. + * + * Constructs \p shareData_out for sharing a specific allocation from an already shared memory pool. + * The recipient process can import the allocation with the ::cudaMemPoolImportPointer api. + * The data is not a handle and may be shared through any IPC mechanism. + * + * \param[out] shareData_out - Returned export data + * \param[in] ptr - pointer to memory being exported + * + * \returns + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorOutOfMemory + * + * \sa ::cuMemPoolExportPointer, ::cudaMemPoolExportToShareableHandle, ::cudaMemPoolImportFromShareableHandle, ::cudaMemPoolImportPointer + */ +extern __host__ cudaError_t CUDARTAPI cudaMemPoolExportPointer(struct cudaMemPoolPtrExportData *exportData, void *ptr); + +/** + * \brief Import a memory pool allocation from another process. + * + * Returns in \p ptr_out a pointer to the imported memory. + * The imported memory must not be accessed before the allocation operation completes + * in the exporting process. The imported memory must be freed from all importing processes before + * being freed in the exporting process. The pointer may be freed with cudaFree + * or cudaFreeAsync. If ::cudaFreeAsync is used, the free must be completed + * on the importing process before the free operation on the exporting process. + * + * \note The ::cudaFreeAsync api may be used in the exporting process before + * the ::cudaFreeAsync operation completes in its stream as long as the + * ::cudaFreeAsync in the exporting process specifies a stream with + * a stream dependency on the importing process's ::cudaFreeAsync. + * + * \param[out] ptr_out - pointer to imported memory + * \param[in] pool - pool from which to import + * \param[in] shareData - data specifying the memory to import + * + * \returns + * ::CUDA_SUCCESS, + * ::CUDA_ERROR_INVALID_VALUE, + * ::CUDA_ERROR_NOT_INITIALIZED, + * ::CUDA_ERROR_OUT_OF_MEMORY + * + * \sa ::cuMemPoolImportPointer, ::cudaMemPoolExportToShareableHandle, ::cudaMemPoolImportFromShareableHandle, ::cudaMemPoolExportPointer + */ +extern __host__ cudaError_t CUDARTAPI cudaMemPoolImportPointer(void **ptr, cudaMemPool_t memPool, struct cudaMemPoolPtrExportData *exportData); + +/** @} */ /* END CUDART_MEMORY_POOLS */ + +/** + * \defgroup CUDART_UNIFIED Unified Addressing + * + * ___MANBRIEF___ unified addressing functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the unified addressing functions of the CUDA + * runtime application programming interface. 
+ * + * @{ + * + * \section CUDART_UNIFIED_overview Overview + * + * CUDA devices can share a unified address space with the host. + * For these devices there is no distinction between a device + * pointer and a host pointer -- the same pointer value may be + * used to access memory from the host program and from a kernel + * running on the device (with exceptions enumerated below). + * + * \section CUDART_UNIFIED_support Supported Platforms + * + * Whether or not a device supports unified addressing may be + * queried by calling ::cudaGetDeviceProperties() with the device + * property ::cudaDeviceProp::unifiedAddressing. + * + * Unified addressing is automatically enabled in 64-bit processes . + * + * \section CUDART_UNIFIED_lookup Looking Up Information from Pointer Values + * + * It is possible to look up information about the memory which backs a + * pointer value. For instance, one may want to know if a pointer points + * to host or device memory. As another example, in the case of device + * memory, one may want to know on which CUDA device the memory + * resides. These properties may be queried using the function + * ::cudaPointerGetAttributes() + * + * Since pointers are unique, it is not necessary to specify information + * about the pointers specified to ::cudaMemcpy() and other copy functions. + * The copy direction ::cudaMemcpyDefault may be used to specify that the + * CUDA runtime should infer the location of the pointer from its value. + * + * \section CUDART_UNIFIED_automaphost Automatic Mapping of Host Allocated Host Memory + * + * All host memory allocated through all devices using ::cudaMallocHost() and + * ::cudaHostAlloc() is always directly accessible from all devices that + * support unified addressing. This is the case regardless of whether or + * not the flags ::cudaHostAllocPortable and ::cudaHostAllocMapped are + * specified. + * + * The pointer value through which allocated host memory may be accessed + * in kernels on all devices that support unified addressing is the same + * as the pointer value through which that memory is accessed on the host. + * It is not necessary to call ::cudaHostGetDevicePointer() to get the device + * pointer for these allocations. + * + * Note that this is not the case for memory allocated using the flag + * ::cudaHostAllocWriteCombined, as discussed below. + * + * \section CUDART_UNIFIED_autopeerregister Direct Access of Peer Memory + + * Upon enabling direct access from a device that supports unified addressing + * to another peer device that supports unified addressing using + * ::cudaDeviceEnablePeerAccess() all memory allocated in the peer device using + * ::cudaMalloc() and ::cudaMallocPitch() will immediately be accessible + * by the current device. The device pointer value through + * which any peer's memory may be accessed in the current device + * is the same pointer value through which that memory may be + * accessed from the peer device. + * + * \section CUDART_UNIFIED_exceptions Exceptions, Disjoint Addressing + * + * Not all memory may be accessed on devices through the same pointer + * value through which they are accessed on the host. These exceptions + * are host memory registered using ::cudaHostRegister() and host memory + * allocated using the flag ::cudaHostAllocWriteCombined. For these + * exceptions, there exists a distinct host and device address for the + * memory. 
The device address is guaranteed to not overlap any valid host + * pointer range and is guaranteed to have the same value across all devices + * that support unified addressing. + * + * This device address may be queried using ::cudaHostGetDevicePointer() + * when a device using unified addressing is current. Either the host + * or the unified device pointer value may be used to refer to this memory + * in ::cudaMemcpy() and similar functions using the ::cudaMemcpyDefault + * memory direction. + * + */ + +/** + * \brief Returns attributes about a specified pointer + * + * Returns in \p *attributes the attributes of the pointer \p ptr. + * If pointer was not allocated in, mapped by or registered with context + * supporting unified addressing ::cudaErrorInvalidValue is returned. + * + * \note In CUDA 11.0 forward passing host pointer will return ::cudaMemoryTypeUnregistered + * in ::cudaPointerAttributes::type and call will return ::cudaSuccess. + * + * The ::cudaPointerAttributes structure is defined as: + * \code + struct cudaPointerAttributes { + enum cudaMemoryType type; + int device; + void *devicePointer; + void *hostPointer; + } + \endcode + * In this structure, the individual fields mean + * + * - \ref ::cudaPointerAttributes::type identifies type of memory. It can be + * ::cudaMemoryTypeUnregistered for unregistered host memory, + * ::cudaMemoryTypeHost for registered host memory, ::cudaMemoryTypeDevice for device + * memory or ::cudaMemoryTypeManaged for managed memory. + * + * - \ref ::cudaPointerAttributes::device "device" is the device against which + * \p ptr was allocated. If \p ptr has memory type ::cudaMemoryTypeDevice + * then this identifies the device on which the memory referred to by \p ptr + * physically resides. If \p ptr has memory type ::cudaMemoryTypeHost then this + * identifies the device which was current when the allocation was made + * (and if that device is deinitialized then this allocation will vanish + * with that device's state). + * + * - \ref ::cudaPointerAttributes::devicePointer "devicePointer" is + * the device pointer alias through which the memory referred to by \p ptr + * may be accessed on the current device. + * If the memory referred to by \p ptr cannot be accessed directly by the + * current device then this is NULL. + * + * - \ref ::cudaPointerAttributes::hostPointer "hostPointer" is + * the host pointer alias through which the memory referred to by \p ptr + * may be accessed on the host. + * If the memory referred to by \p ptr cannot be accessed directly by the + * host then this is NULL. + * + * \param attributes - Attributes for the specified pointer + * \param ptr - Pointer to get attributes for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa ::cudaGetDeviceCount, ::cudaGetDevice, ::cudaSetDevice, + * ::cudaChooseDevice, + * ::cuPointerGetAttributes + */ +extern __host__ cudaError_t CUDARTAPI cudaPointerGetAttributes(struct cudaPointerAttributes *attributes, const void *ptr); + +/** @} */ /* END CUDART_UNIFIED */ + +/** + * \defgroup CUDART_PEER Peer Device Memory Access + * + * ___MANBRIEF___ peer device memory access functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the peer device memory access functions of the CUDA runtime + * application programming interface. + * + * @{ + */ + +/** + * \brief Queries if a device may directly access a peer device's memory. 
+ * + * Returns in \p *canAccessPeer a value of 1 if device \p device is capable of + * directly accessing memory from \p peerDevice and 0 otherwise. If direct + * access of \p peerDevice from \p device is possible, then access may be + * enabled by calling ::cudaDeviceEnablePeerAccess(). + * + * \param canAccessPeer - Returned access capability + * \param device - Device from which allocations on \p peerDevice are to + * be directly accessed. + * \param peerDevice - Device on which the allocations to be directly accessed + * by \p device reside. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceEnablePeerAccess, + * ::cudaDeviceDisablePeerAccess, + * ::cuDeviceCanAccessPeer + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceCanAccessPeer(int *canAccessPeer, int device, int peerDevice); + +/** + * \brief Enables direct access to memory allocations on a peer device. + * + * On success, all allocations from \p peerDevice will immediately be accessible by + * the current device. They will remain accessible until access is explicitly + * disabled using ::cudaDeviceDisablePeerAccess() or either device is reset using + * ::cudaDeviceReset(). + * + * Note that access granted by this call is unidirectional and that in order to access + * memory on the current device from \p peerDevice, a separate symmetric call + * to ::cudaDeviceEnablePeerAccess() is required. + * + * Note that there are both device-wide and system-wide limitations per system + * configuration, as noted in the CUDA Programming Guide under the section + * "Peer-to-Peer Memory Access". + * + * Returns ::cudaErrorInvalidDevice if ::cudaDeviceCanAccessPeer() indicates + * that the current device cannot directly access memory from \p peerDevice. + * + * Returns ::cudaErrorPeerAccessAlreadyEnabled if direct access of + * \p peerDevice from the current device has already been enabled. + * + * Returns ::cudaErrorInvalidValue if \p flags is not 0. + * + * \param peerDevice - Peer device to enable direct access to from the current device + * \param flags - Reserved for future use and must be set to 0 + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice, + * ::cudaErrorPeerAccessAlreadyEnabled, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceCanAccessPeer, + * ::cudaDeviceDisablePeerAccess, + * ::cuCtxEnablePeerAccess + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags); + +/** + * \brief Disables direct access to memory allocations on a peer device. + * + * Returns ::cudaErrorPeerAccessNotEnabled if direct access to memory on + * \p peerDevice has not yet been enabled from the current device. 
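+ *
+ * \par
+ * A minimal illustrative sketch of the usual enable/use/disable sequence
+ * (assumes two devices, 0 and 1, and omits error checking):
+ * \code
+ int canAccessPeer = 0;
+ cudaDeviceCanAccessPeer(&canAccessPeer, 0, 1);
+ if (canAccessPeer) {
+     cudaSetDevice(0);
+     cudaDeviceEnablePeerAccess(1, 0);   // device 0 may now dereference device 1 allocations
+
+     // ... launch kernels on device 0 that read/write memory allocated on device 1 ...
+
+     cudaDeviceDisablePeerAccess(1);     // revoke access again
+ }
+ * \endcode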
+ * + * \param peerDevice - Peer device to disable direct access to + * + * \return + * ::cudaSuccess, + * ::cudaErrorPeerAccessNotEnabled, + * ::cudaErrorInvalidDevice + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa ::cudaDeviceCanAccessPeer, + * ::cudaDeviceEnablePeerAccess, + * ::cuCtxDisablePeerAccess + */ +extern __host__ cudaError_t CUDARTAPI cudaDeviceDisablePeerAccess(int peerDevice); + +/** @} */ /* END CUDART_PEER */ + +/** \defgroup CUDART_OPENGL OpenGL Interoperability */ + +/** \defgroup CUDART_OPENGL_DEPRECATED OpenGL Interoperability [DEPRECATED] */ + +/** \defgroup CUDART_D3D9 Direct3D 9 Interoperability */ + +/** \defgroup CUDART_D3D9_DEPRECATED Direct3D 9 Interoperability [DEPRECATED] */ + +/** \defgroup CUDART_D3D10 Direct3D 10 Interoperability */ + +/** \defgroup CUDART_D3D10_DEPRECATED Direct3D 10 Interoperability [DEPRECATED] */ + +/** \defgroup CUDART_D3D11 Direct3D 11 Interoperability */ + +/** \defgroup CUDART_D3D11_DEPRECATED Direct3D 11 Interoperability [DEPRECATED] */ + +/** \defgroup CUDART_VDPAU VDPAU Interoperability */ + +/** \defgroup CUDART_EGL EGL Interoperability */ + +/** + * \defgroup CUDART_INTEROP Graphics Interoperability + * + * ___MANBRIEF___ graphics interoperability functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the graphics interoperability functions of the CUDA + * runtime application programming interface. + * + * @{ + */ + +/** + * \brief Unregisters a graphics resource for access by CUDA + * + * Unregisters the graphics resource \p resource so it is not accessible by + * CUDA unless registered again. + * + * If \p resource is invalid then ::cudaErrorInvalidResourceHandle is + * returned. + * + * \param resource - Resource to unregister + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * \note_init_rt + * \note_callback + * \note_destroy_ub + * + * \sa + * ::cudaGraphicsD3D9RegisterResource, + * ::cudaGraphicsD3D10RegisterResource, + * ::cudaGraphicsD3D11RegisterResource, + * ::cudaGraphicsGLRegisterBuffer, + * ::cudaGraphicsGLRegisterImage, + * ::cuGraphicsUnregisterResource + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnregisterResource(cudaGraphicsResource_t resource); + +/** + * \brief Set usage flags for mapping a graphics resource + * + * Set \p flags for mapping the graphics resource \p resource. + * + * Changes to \p flags will take effect the next time \p resource is mapped. + * The \p flags argument may be any of the following: + * - ::cudaGraphicsMapFlagsNone: Specifies no hints about how \p resource will + * be used. It is therefore assumed that CUDA may read from or write to \p resource. + * - ::cudaGraphicsMapFlagsReadOnly: Specifies that CUDA will not write to \p resource. + * - ::cudaGraphicsMapFlagsWriteDiscard: Specifies CUDA will not read from \p resource and will + * write over the entire contents of \p resource, so none of the data + * previously stored in \p resource will be preserved. + * + * If \p resource is presently mapped for access by CUDA then ::cudaErrorUnknown is returned. + * If \p flags is not one of the above values then ::cudaErrorInvalidValue is returned. 
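+ *
+ * \par
+ * An illustrative sketch of setting map flags before mapping a registered buffer
+ * (the resource handle is assumed to come from an earlier registration call such
+ * as ::cudaGraphicsGLRegisterBuffer; error checking omitted):
+ * \code
+ cudaGraphicsResourceSetMapFlags(resource, cudaGraphicsMapFlagsWriteDiscard);
+
+ cudaGraphicsMapResources(1, &resource, 0);
+ void  *devPtr = NULL;
+ size_t size   = 0;
+ cudaGraphicsResourceGetMappedPointer(&devPtr, &size, resource);
+ // ... overwrite the buffer contents through devPtr (kernels or cudaMemcpy) ...
+ cudaGraphicsUnmapResources(1, &resource, 0);
+ * \endcode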
+ * + * \param resource - Registered resource to set flags for + * \param flags - Parameters for resource mapping + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown, + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphicsMapResources, + * ::cuGraphicsResourceSetMapFlags + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceSetMapFlags(cudaGraphicsResource_t resource, unsigned int flags); + +/** + * \brief Map graphics resources for access by CUDA + * + * Maps the \p count graphics resources in \p resources for access by CUDA. + * + * The resources in \p resources may be accessed by CUDA until they + * are unmapped. The graphics API from which \p resources were registered + * should not access any resources while they are mapped by CUDA. If an + * application does so, the results are undefined. + * + * This function provides the synchronization guarantee that any graphics calls + * issued before ::cudaGraphicsMapResources() will complete before any subsequent CUDA + * work issued in \p stream begins. + * + * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle + * is returned. If any of \p resources are presently mapped for access by + * CUDA then ::cudaErrorUnknown is returned. + * + * \param count - Number of resources to map + * \param resources - Resources to map for CUDA + * \param stream - Stream for synchronization + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphicsResourceGetMappedPointer, + * ::cudaGraphicsSubResourceGetMappedArray, + * ::cudaGraphicsUnmapResources, + * ::cuGraphicsMapResources + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsMapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)); + +/** + * \brief Unmap graphics resources. + * + * Unmaps the \p count graphics resources in \p resources. + * + * Once unmapped, the resources in \p resources may not be accessed by CUDA + * until they are mapped again. + * + * This function provides the synchronization guarantee that any CUDA work issued + * in \p stream before ::cudaGraphicsUnmapResources() will complete before any + * subsequently issued graphics work begins. + * + * If \p resources contains any duplicate entries then ::cudaErrorInvalidResourceHandle + * is returned. If any of \p resources are not presently mapped for access by + * CUDA then ::cudaErrorUnknown is returned. + * + * \param count - Number of resources to unmap + * \param resources - Resources to unmap + * \param stream - Stream for synchronization + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \note_null_stream + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphicsMapResources, + * ::cuGraphicsUnmapResources + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsUnmapResources(int count, cudaGraphicsResource_t *resources, cudaStream_t stream __dv(0)); + +/** + * \brief Get an device pointer through which to access a mapped graphics resource. + * + * Returns in \p *devPtr a pointer through which the mapped graphics resource + * \p resource may be accessed. + * Returns in \p *size the size of the memory in bytes which may be accessed from that pointer. + * The value set in \p devPtr may change every time that \p resource is mapped. 
+ * + * If \p resource is not a buffer then it cannot be accessed via a pointer and + * ::cudaErrorUnknown is returned. + * If \p resource is not mapped then ::cudaErrorUnknown is returned. + * * + * \param devPtr - Returned pointer through which \p resource may be accessed + * \param size - Returned size of the buffer accessible starting at \p *devPtr + * \param resource - Mapped resource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphicsMapResources, + * ::cudaGraphicsSubResourceGetMappedArray, + * ::cuGraphicsResourceGetMappedPointer + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedPointer(void **devPtr, size_t *size, cudaGraphicsResource_t resource); + +/** + * \brief Get an array through which to access a subresource of a mapped graphics resource. + * + * Returns in \p *array an array through which the subresource of the mapped + * graphics resource \p resource which corresponds to array index \p arrayIndex + * and mipmap level \p mipLevel may be accessed. The value set in \p array may + * change every time that \p resource is mapped. + * + * If \p resource is not a texture then it cannot be accessed via an array and + * ::cudaErrorUnknown is returned. + * If \p arrayIndex is not a valid array index for \p resource then + * ::cudaErrorInvalidValue is returned. + * If \p mipLevel is not a valid mipmap level for \p resource then + * ::cudaErrorInvalidValue is returned. + * If \p resource is not mapped then ::cudaErrorUnknown is returned. + * + * \param array - Returned array through which a subresource of \p resource may be accessed + * \param resource - Mapped resource to access + * \param arrayIndex - Array index for array textures or cubemap face + * index as defined by ::cudaGraphicsCubeFace for + * cubemap textures for the subresource to access + * \param mipLevel - Mipmap level for the subresource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphicsResourceGetMappedPointer, + * ::cuGraphicsSubResourceGetMappedArray + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsSubResourceGetMappedArray(cudaArray_t *array, cudaGraphicsResource_t resource, unsigned int arrayIndex, unsigned int mipLevel); + +/** + * \brief Get a mipmapped array through which to access a mapped graphics resource. + * + * Returns in \p *mipmappedArray a mipmapped array through which the mapped + * graphics resource \p resource may be accessed. The value set in \p mipmappedArray may + * change every time that \p resource is mapped. + * + * If \p resource is not a texture then it cannot be accessed via an array and + * ::cudaErrorUnknown is returned. + * If \p resource is not mapped then ::cudaErrorUnknown is returned. 
+ * + * \param mipmappedArray - Returned mipmapped array through which \p resource may be accessed + * \param resource - Mapped resource to access + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorUnknown + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphicsResourceGetMappedPointer, + * ::cuGraphicsResourceGetMappedMipmappedArray + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedMipmappedArray(cudaMipmappedArray_t *mipmappedArray, cudaGraphicsResource_t resource); + +/** @} */ /* END CUDART_INTEROP */ + +/** + * \defgroup CUDART_TEXTURE Texture Reference Management [DEPRECATED] + * + * ___MANBRIEF___ texture reference management functions of the CUDA runtime + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the low level texture reference management functions + * of the CUDA runtime application programming interface. + * + * Some functions have overloaded C++ API template versions documented separately in the + * \ref CUDART_HIGHLEVEL "C++ API Routines" module. + * + * @{ + */ + +/** + * \brief Binds a memory area to a texture + * + * \deprecated + * + * Binds \p size bytes of the memory area pointed to by \p devPtr to the + * texture reference \p texref. \p desc describes how the memory is interpreted + * when fetching values from the texture. Any memory previously bound to + * \p texref is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()" + * returns in \p *offset a byte offset that + * must be applied to texture fetches in order to read from the desired memory. + * This offset must be divided by the texel size and passed to kernels that + * read from the texture so they can be applied to the ::tex1Dfetch() function. + * If the device memory pointer was returned from ::cudaMalloc(), the offset is + * guaranteed to be 0 and NULL may be passed as the \p offset parameter. + * + * The total number of elements (or texels) in the linear address range + * cannot exceed ::cudaDeviceProp::maxTexture1DLinear[0]. + * The number of elements is computed as (\p size / elementSize), + * where elementSize is determined from \p desc. 
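+ *
+ * \par
+ * A minimal sketch of the deprecated texture-reference path (requires nvcc and a
+ * file-scope texture reference; n is an assumed element count, error checking is
+ * omitted). The texture object API documented below is the preferred replacement.
+ * \code
+ texture<float, cudaTextureType1D, cudaReadModeElementType> texRef;   // file scope
+
+ float *devPtr = NULL;
+ size_t offset = 0;
+ cudaMalloc(&devPtr, n * sizeof(float));
+ cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
+ cudaBindTexture(&offset, texRef, devPtr, desc, n * sizeof(float));
+ // offset is 0 here because devPtr came from cudaMalloc();
+ // kernels read the bound memory with tex1Dfetch(texRef, i)
+ cudaUnbindTexture(texRef);
+ cudaFree(devPtr);
+ * \endcode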
+ * + * \param offset - Offset in bytes + * \param texref - Texture to bind + * \param devPtr - Memory area on device + * \param desc - Channel format + * \param size - Size of the memory area pointed to by devPtr + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)", + * ::cuTexRefSetAddress, + * ::cuTexRefSetAddressMode, + * ::cuTexRefSetFormat, + * ::cuTexRefSetFlags, + * ::cuTexRefSetBorderColor + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTexture(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t size __dv(UINT_MAX)); + +/** + * \brief Binds a 2D memory area to a texture + * + * \deprecated + * + * Binds the 2D memory area pointed to by \p devPtr to the + * texture reference \p texref. The size of the area is constrained by + * \p width in texel units, \p height in texel units, and \p pitch in byte + * units. \p desc describes how the memory is interpreted when fetching values + * from the texture. Any memory previously bound to \p texref is unbound. + * + * Since the hardware enforces an alignment requirement on texture base + * addresses, ::cudaBindTexture2D() returns in \p *offset a byte offset that + * must be applied to texture fetches in order to read from the desired memory. + * This offset must be divided by the texel size and passed to kernels that + * read from the texture so they can be applied to the ::tex2D() function. + * If the device memory pointer was returned from ::cudaMalloc(), the offset is + * guaranteed to be 0 and NULL may be passed as the \p offset parameter. + * + * \p width and \p height, which are specified in elements (or texels), cannot + * exceed ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1] + * respectively. \p pitch, which is specified in bytes, cannot exceed + * ::cudaDeviceProp::maxTexture2DLinear[2]. + * + * The driver returns ::cudaErrorInvalidValue if \p pitch is not a multiple of + * ::cudaDeviceProp::texturePitchAlignment. 
+ *
+ * \param offset - Offset in bytes
+ * \param texref - Texture reference to bind
+ * \param devPtr - 2D memory area on device
+ * \param desc - Channel format
+ * \param width - Width in texel units
+ * \param height - Height in texel units
+ * \param pitch - Pitch in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture< T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)",
+ * ::cuTexRefSetAddress2D,
+ * ::cuTexRefSetFormat,
+ * ::cuTexRefSetFlags,
+ * ::cuTexRefSetAddressMode,
+ * ::cuTexRefSetBorderColor
+ */
+extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTexture2D(size_t *offset, const struct textureReference *texref, const void *devPtr, const struct cudaChannelFormatDesc *desc, size_t width, size_t height, size_t pitch);
+
+/**
+ * \brief Binds an array to a texture
+ *
+ * \deprecated
+ *
+ * Binds the CUDA array \p array to the texture reference \p texref.
+ * \p desc describes how the memory is interpreted when fetching values from
+ * the texture. Any CUDA array previously bound to \p texref is unbound.
+ * + * \param texref - Texture to bind + * \param array - Memory array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)", + * ::cuTexRefSetArray, + * ::cuTexRefSetFormat, + * ::cuTexRefSetFlags, + * ::cuTexRefSetAddressMode, + * ::cuTexRefSetFilterMode, + * ::cuTexRefSetBorderColor, + * ::cuTexRefSetMaxAnisotropy + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTextureToArray(const struct textureReference *texref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc); + +/** + * \brief Binds a mipmapped array to a texture + * + * \deprecated + * + * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p texref. + * \p desc describes how the memory is interpreted when fetching values from + * the texture. Any CUDA mipmapped array previously bound to \p texref is unbound. 
+ * + * \param texref - Texture to bind + * \param mipmappedArray - Memory mipmapped array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTextureToArray(const struct texture< T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)", + * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)", + * ::cuTexRefSetMipmappedArray, + * ::cuTexRefSetMipmapFilterMode, + * ::cuTexRefSetMipmapLevelClamp, + * ::cuTexRefSetMipmapLevelBias, + * ::cuTexRefSetFormat, + * ::cuTexRefSetFlags, + * ::cuTexRefSetAddressMode, + * ::cuTexRefSetBorderColor, + * ::cuTexRefSetMaxAnisotropy + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindTextureToMipmappedArray(const struct textureReference *texref, cudaMipmappedArray_const_t mipmappedArray, const struct cudaChannelFormatDesc *desc); + +/** + * \brief Unbinds a texture + * + * \deprecated + * + * Unbinds the texture bound to \p texref. If \p texref is not currently bound, no operation is performed. + * + * \param texref - Texture to unbind + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaUnbindTexture(const struct texture< T, dim, readMode>&) "cudaUnbindTexture (C++ API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)" + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaUnbindTexture(const struct textureReference *texref); + +/** + * \brief Get the alignment offset of a texture + * + * \deprecated + * + * Returns in \p *offset the offset that was returned when texture reference + * \p texref was bound. 
+ * + * \param offset - Offset of texture reference in bytes + * \param texref - Texture to get offset of + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidTexture, + * ::cudaErrorInvalidTextureBinding + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", + * ::cudaGetChannelDesc, ::cudaGetTextureReference, + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture< T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)" + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetTextureAlignmentOffset(size_t *offset, const struct textureReference *texref); + +/** + * \brief Get the texture reference associated with a symbol + * + * \deprecated + * + * Returns in \p *texref the structure associated to the texture reference + * defined by symbol \p symbol. + * + * \param texref - Texture reference associated with symbol + * \param symbol - Texture to get reference for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidTexture + * \notefnerr + * \note_string_api_deprecation_50 + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", + * ::cudaGetChannelDesc, + * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)", + * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)", + * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)", + * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)", + * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)", + * ::cuModuleGetTexRef + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetTextureReference(const struct textureReference **texref, const void *symbol); + +/** @} */ /* END CUDART_TEXTURE */ + +/** + * \defgroup CUDART_SURFACE Surface Reference Management [DEPRECATED] + * + * ___MANBRIEF___ surface reference management functions of the CUDA runtime + * API (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the low level surface reference management functions + * of the CUDA runtime application programming interface. + * + * Some functions have overloaded C++ API template versions documented separately in the + * \ref CUDART_HIGHLEVEL "C++ API Routines" module. + * + * @{ + */ + +/** + * \brief Binds an array to a surface + * + * \deprecated + * + * Binds the CUDA array \p array to the surface reference \p surfref. + * \p desc describes how the memory is interpreted when fetching values from + * the surface. 
Any CUDA array previously bound to \p surfref is unbound. + * + * \param surfref - Surface to bind + * \param array - Memory array on device + * \param desc - Channel format + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidSurface + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)", + * \ref ::cudaBindSurfaceToArray(const struct surface< T, dim>&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)", + * ::cudaGetSurfaceReference, + * ::cuSurfRefSetArray + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaBindSurfaceToArray(const struct surfaceReference *surfref, cudaArray_const_t array, const struct cudaChannelFormatDesc *desc); + +/** + * \brief Get the surface reference associated with a symbol + * + * \deprecated + * + * Returns in \p *surfref the structure associated to the surface reference + * defined by symbol \p symbol. + * + * \param surfref - Surface reference associated with symbol + * \param symbol - Surface to get reference for + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidSurface + * \notefnerr + * \note_string_api_deprecation_50 + * \note_init_rt + * \note_callback + * + * \sa + * \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)", + * ::cuModuleGetSurfRef + */ +extern __CUDA_DEPRECATED __host__ cudaError_t CUDARTAPI cudaGetSurfaceReference(const struct surfaceReference **surfref, const void *symbol); + +/** @} */ /* END CUDART_SURFACE */ + +/** + * \defgroup CUDART_TEXTURE_OBJECT Texture Object Management + * + * ___MANBRIEF___ texture object management functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the low level texture object management functions + * of the CUDA runtime application programming interface. The texture + * object API is only supported on devices of compute capability 3.0 or higher. + * + * @{ + */ + +/** + * \brief Get the channel descriptor of an array + * + * Returns in \p *desc the channel descriptor of the CUDA array \p array. + * + * \param desc - Channel format + * \param array - Memory array on device + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa \ref ::cudaCreateChannelDesc(int, int, int, int, cudaChannelFormatKind) "cudaCreateChannelDesc (C API)", + * ::cudaCreateTextureObject, ::cudaCreateSurfaceObject + */ +extern __host__ cudaError_t CUDARTAPI cudaGetChannelDesc(struct cudaChannelFormatDesc *desc, cudaArray_const_t array); + +/** + * \brief Returns a channel descriptor using the specified format + * + * Returns a channel descriptor with format \p f and number of bits of each + * component \p x, \p y, \p z, and \p w. The ::cudaChannelFormatDesc is + * defined as: + * \code + struct cudaChannelFormatDesc { + int x, y, z, w; + enum cudaChannelFormatKind f; + }; + * \endcode + * + * where ::cudaChannelFormatKind is one of ::cudaChannelFormatKindSigned, + * ::cudaChannelFormatKindUnsigned, or ::cudaChannelFormatKindFloat. 
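+ *
+ * \par
+ * A small illustrative sketch (width and height are assumed dimensions; error
+ * checking omitted). The C call below is equivalent to cudaCreateChannelDesc<float>()
+ * from the C++ API:
+ * \code
+ cudaChannelFormatDesc desc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+
+ cudaArray_t array = NULL;
+ cudaMallocArray(&array, &desc, width, height, cudaArrayDefault);
+ // ... use the array, e.g. as the resource of a texture or surface object ...
+ cudaFreeArray(array);
+ * \endcode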
+ * + * \param x - X component + * \param y - Y component + * \param z - Z component + * \param w - W component + * \param f - Channel format + * + * \return + * Channel descriptor with format \p f + * + * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)", + * ::cudaGetChannelDesc, ::cudaCreateTextureObject, ::cudaCreateSurfaceObject + */ +extern __host__ struct cudaChannelFormatDesc CUDARTAPI cudaCreateChannelDesc(int x, int y, int z, int w, enum cudaChannelFormatKind f); + +/** + * \brief Creates a texture object + * + * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes + * the data to texture from. \p pTexDesc describes how the data should be sampled. + * \p pResViewDesc is an optional argument that specifies an alternate format for + * the data described by \p pResDesc, and also describes the subresource region + * to restrict access to when texturing. \p pResViewDesc can only be specified if + * the type of resource is a CUDA array or a CUDA mipmapped array. + * + * Texture objects are only supported on devices of compute capability 3.0 or higher. + * Additionally, a texture object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. + * + * The ::cudaResourceDesc structure is defined as: + * \code + struct cudaResourceDesc { + enum cudaResourceType resType; + + union { + struct { + cudaArray_t array; + } array; + struct { + cudaMipmappedArray_t mipmap; + } mipmap; + struct { + void *devPtr; + struct cudaChannelFormatDesc desc; + size_t sizeInBytes; + } linear; + struct { + void *devPtr; + struct cudaChannelFormatDesc desc; + size_t width; + size_t height; + size_t pitchInBytes; + } pitch2D; + } res; + }; + * \endcode + * where: + * - ::cudaResourceDesc::resType specifies the type of resource to texture from. + * CUresourceType is defined as: + * \code + enum cudaResourceType { + cudaResourceTypeArray = 0x00, + cudaResourceTypeMipmappedArray = 0x01, + cudaResourceTypeLinear = 0x02, + cudaResourceTypePitch2D = 0x03 + }; + * \endcode + * + * \par + * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeArray, ::cudaResourceDesc::res::array::array + * must be set to a valid CUDA array handle. + * + * \par + * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeMipmappedArray, ::cudaResourceDesc::res::mipmap::mipmap + * must be set to a valid CUDA mipmapped array handle and ::cudaTextureDesc::normalizedCoords must be set to true. + * + * \par + * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeLinear, ::cudaResourceDesc::res::linear::devPtr + * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment. + * ::cudaResourceDesc::res::linear::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::linear::sizeInBytes + * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed + * ::cudaDeviceProp::maxTexture1DLinear. The number of elements is computed as (sizeInBytes / sizeof(desc)). + * + * \par + * If ::cudaResourceDesc::resType is set to ::cudaResourceTypePitch2D, ::cudaResourceDesc::res::pitch2D::devPtr + * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment. + * ::cudaResourceDesc::res::pitch2D::desc describes the format and the number of components per array element. 
::cudaResourceDesc::res::pitch2D::width
+ * and ::cudaResourceDesc::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
+ * ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1] respectively.
+ * ::cudaResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to
+ * ::cudaDeviceProp::texturePitchAlignment. Pitch cannot exceed ::cudaDeviceProp::maxTexture2DLinear[2].
+ *
+ *
+ * The ::cudaTextureDesc struct is defined as
+ * \code
+ struct cudaTextureDesc {
+     enum cudaTextureAddressMode addressMode[3];
+     enum cudaTextureFilterMode  filterMode;
+     enum cudaTextureReadMode    readMode;
+     int                         sRGB;
+     float                       borderColor[4];
+     int                         normalizedCoords;
+     unsigned int                maxAnisotropy;
+     enum cudaTextureFilterMode  mipmapFilterMode;
+     float                       mipmapLevelBias;
+     float                       minMipmapLevelClamp;
+     float                       maxMipmapLevelClamp;
+     int                         disableTrilinearOptimization;
+ };
+ * \endcode
+ * where
+ * - ::cudaTextureDesc::addressMode specifies the addressing mode for each dimension of the texture data. ::cudaTextureAddressMode is defined as:
+ * \code
+ enum cudaTextureAddressMode {
+     cudaAddressModeWrap   = 0,
+     cudaAddressModeClamp  = 1,
+     cudaAddressModeMirror = 2,
+     cudaAddressModeBorder = 3
+ };
+ * \endcode
+ * This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear. Also, if ::cudaTextureDesc::normalizedCoords
+ * is set to zero, ::cudaAddressModeWrap and ::cudaAddressModeMirror won't be supported and will be switched to ::cudaAddressModeClamp.
+ *
+ * - ::cudaTextureDesc::filterMode specifies the filtering mode to be used when fetching from the texture. ::cudaTextureFilterMode is defined as:
+ * \code
+ enum cudaTextureFilterMode {
+     cudaFilterModePoint  = 0,
+     cudaFilterModeLinear = 1
+ };
+ * \endcode
+ * This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear.
+ *
+ * - ::cudaTextureDesc::readMode specifies whether integer data should be converted to floating point or not. ::cudaTextureReadMode is defined as:
+ * \code
+ enum cudaTextureReadMode {
+     cudaReadModeElementType     = 0,
+     cudaReadModeNormalizedFloat = 1
+ };
+ * \endcode
+ * Note that this applies only to 8-bit and 16-bit integer formats. 32-bit integer format would not be promoted, regardless of
+ * whether or not ::cudaTextureDesc::readMode is set to ::cudaReadModeNormalizedFloat.
+ *
+ * - ::cudaTextureDesc::sRGB specifies whether sRGB to linear conversion should be performed during texture fetch.
+ *
+ * - ::cudaTextureDesc::borderColor specifies the float values of color, where:
+ * ::cudaTextureDesc::borderColor[0] contains value of 'R',
+ * ::cudaTextureDesc::borderColor[1] contains value of 'G',
+ * ::cudaTextureDesc::borderColor[2] contains value of 'B',
+ * ::cudaTextureDesc::borderColor[3] contains value of 'A'.
+ * Note that applications using integer border color values will need to reinterpret_cast these values to float.
+ * The values are set only when the addressing mode specified by ::cudaTextureDesc::addressMode is cudaAddressModeBorder.
+ *
+ * - ::cudaTextureDesc::normalizedCoords specifies whether the texture coordinates will be normalized or not.
+ *
+ * - ::cudaTextureDesc::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
+ * clamped to the range [1,16].
+ *
+ * - ::cudaTextureDesc::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels.
+ * + * - ::cudaTextureDesc::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. + * + * - ::cudaTextureDesc::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. + * + * - ::cudaTextureDesc::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. + * + * - ::cudaTextureDesc::disableTrilinearOptimization specifies whether the trilinear filtering optimizations will be disabled. + * + * The ::cudaResourceViewDesc struct is defined as + * \code + struct cudaResourceViewDesc { + enum cudaResourceViewFormat format; + size_t width; + size_t height; + size_t depth; + unsigned int firstMipmapLevel; + unsigned int lastMipmapLevel; + unsigned int firstLayer; + unsigned int lastLayer; + }; + * \endcode + * where: + * - ::cudaResourceViewDesc::format specifies how the data contained in the CUDA array or CUDA mipmapped array should + * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block + * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a 32-bit unsigned integer format + * with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have + * a 32-bit unsigned int with 2 channels. The other BC formats require the underlying resource to have the same 32-bit unsigned int + * format but with 4 channels. + * + * - ::cudaResourceViewDesc::width specifies the new width of the texture data. If the resource view format is a block + * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, + * this value has to be equal to that of the original resource. + * + * - ::cudaResourceViewDesc::height specifies the new height of the texture data. If the resource view format is a block + * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, + * this value has to be equal to that of the original resource. + * + * - ::cudaResourceViewDesc::depth specifies the new depth of the texture data. This value has to be equal to that of the + * original resource. + * + * - ::cudaResourceViewDesc::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. + * For non-mipmapped resources, this value has to be zero.::cudaTextureDesc::minMipmapLevelClamp and ::cudaTextureDesc::maxMipmapLevelClamp + * will be relative to this value. For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, + * then the actual minimum mipmap level clamp will be 3.2. + * + * - ::cudaResourceViewDesc::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value + * has to be zero. + * + * - ::cudaResourceViewDesc::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. + * For non-layered resources, this value has to be zero. + * + * - ::cudaResourceViewDesc::lastLayer specifies the last layer index for layered textures. For non-layered resources, + * this value has to be zero. 
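+ *
+ * \par
+ * A minimal illustrative sketch creating a texture object backed by a CUDA array
+ * (the array handle is assumed to exist and hold float data; no resource view is
+ * used and error checking is omitted):
+ * \code
+ struct cudaResourceDesc resDesc = {};
+ resDesc.resType         = cudaResourceTypeArray;
+ resDesc.res.array.array = array;
+
+ struct cudaTextureDesc texDesc = {};
+ texDesc.addressMode[0]   = cudaAddressModeClamp;
+ texDesc.addressMode[1]   = cudaAddressModeClamp;
+ texDesc.filterMode       = cudaFilterModePoint;
+ texDesc.readMode         = cudaReadModeElementType;
+ texDesc.normalizedCoords = 0;
+
+ cudaTextureObject_t tex = 0;
+ cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
+ // ... kernels sample with tex2D<float>(tex, x, y) ...
+ cudaDestroyTextureObject(tex);
+ * \endcode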
+ * + * + * \param pTexObject - Texture object to create + * \param pResDesc - Resource descriptor + * \param pTexDesc - Texture descriptor + * \param pResViewDesc - Resource view descriptor + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaDestroyTextureObject, + * ::cuTexObjectCreate + */ + +extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc); + +/** + * \brief Creates a texture object + * + * Creates a texture object and returns it in \p pTexObject. \p pResDesc describes + * the data to texture from. \p pTexDesc describes how the data should be sampled. + * \p pResViewDesc is an optional argument that specifies an alternate format for + * the data described by \p pResDesc, and also describes the subresource region + * to restrict access to when texturing. \p pResViewDesc can only be specified if + * the type of resource is a CUDA array or a CUDA mipmapped array. + * + * Texture objects are only supported on devices of compute capability 3.0 or higher. + * Additionally, a texture object is an opaque value, and, as such, should only be + * accessed through CUDA API calls. + * + * The ::cudaResourceDesc structure is defined as: + * \code + struct cudaResourceDesc { + enum cudaResourceType resType; + + union { + struct { + cudaArray_t array; + } array; + struct { + cudaMipmappedArray_t mipmap; + } mipmap; + struct { + void *devPtr; + struct cudaChannelFormatDesc desc; + size_t sizeInBytes; + } linear; + struct { + void *devPtr; + struct cudaChannelFormatDesc desc; + size_t width; + size_t height; + size_t pitchInBytes; + } pitch2D; + } res; + }; + * \endcode + * where: + * - ::cudaResourceDesc::resType specifies the type of resource to texture from. + * CUresourceType is defined as: + * \code + enum cudaResourceType { + cudaResourceTypeArray = 0x00, + cudaResourceTypeMipmappedArray = 0x01, + cudaResourceTypeLinear = 0x02, + cudaResourceTypePitch2D = 0x03 + }; + * \endcode + * + * \par + * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeArray, ::cudaResourceDesc::res::array::array + * must be set to a valid CUDA array handle. + * + * \par + * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeMipmappedArray, ::cudaResourceDesc::res::mipmap::mipmap + * must be set to a valid CUDA mipmapped array handle and ::cudaTextureDesc_v2::normalizedCoords must be set to true. + * + * \par + * If ::cudaResourceDesc::resType is set to ::cudaResourceTypeLinear, ::cudaResourceDesc::res::linear::devPtr + * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment. + * ::cudaResourceDesc::res::linear::desc describes the format and the number of components per array element. ::cudaResourceDesc::res::linear::sizeInBytes + * specifies the size of the array in bytes. The total number of elements in the linear address range cannot exceed + * ::cudaDeviceProp::maxTexture1DLinear. The number of elements is computed as (sizeInBytes / sizeof(desc)). + * + * \par + * If ::cudaResourceDesc::resType is set to ::cudaResourceTypePitch2D, ::cudaResourceDesc::res::pitch2D::devPtr + * must be set to a valid device pointer, that is aligned to ::cudaDeviceProp::textureAlignment. + * ::cudaResourceDesc::res::pitch2D::desc describes the format and the number of components per array element. 
::cudaResourceDesc::res::pitch2D::width
+ * and ::cudaResourceDesc::res::pitch2D::height specify the width and height of the array in elements, and cannot exceed
+ * ::cudaDeviceProp::maxTexture2DLinear[0] and ::cudaDeviceProp::maxTexture2DLinear[1] respectively.
+ * ::cudaResourceDesc::res::pitch2D::pitchInBytes specifies the pitch between two rows in bytes and has to be aligned to
+ * ::cudaDeviceProp::texturePitchAlignment. Pitch cannot exceed ::cudaDeviceProp::maxTexture2DLinear[2].
+ *
+ *
+ * The ::cudaTextureDesc_v2 struct is defined as
+ * \code
+ struct cudaTextureDesc_v2 {
+     enum cudaTextureAddressMode addressMode[3];
+     enum cudaTextureFilterMode  filterMode;
+     enum cudaTextureReadMode    readMode;
+     int                         sRGB;
+     float                       borderColor[4];
+     int                         normalizedCoords;
+     unsigned int                maxAnisotropy;
+     enum cudaTextureFilterMode  mipmapFilterMode;
+     float                       mipmapLevelBias;
+     float                       minMipmapLevelClamp;
+     float                       maxMipmapLevelClamp;
+     int                         disableTrilinearOptimization;
+     int                         seamlessCubemap;
+ };
+ * \endcode
+ * where
+ * - ::cudaTextureDesc_v2::addressMode specifies the addressing mode for each dimension of the texture data. ::cudaTextureAddressMode is defined as:
+ * \code
+ enum cudaTextureAddressMode {
+     cudaAddressModeWrap   = 0,
+     cudaAddressModeClamp  = 1,
+     cudaAddressModeMirror = 2,
+     cudaAddressModeBorder = 3
+ };
+ * \endcode
+ * This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear. Also, if ::cudaTextureDesc_v2::normalizedCoords
+ * is set to zero, ::cudaAddressModeWrap and ::cudaAddressModeMirror won't be supported and will be switched to ::cudaAddressModeClamp.
+ *
+ * - ::cudaTextureDesc_v2::filterMode specifies the filtering mode to be used when fetching from the texture. ::cudaTextureFilterMode is defined as:
+ * \code
+ enum cudaTextureFilterMode {
+     cudaFilterModePoint  = 0,
+     cudaFilterModeLinear = 1
+ };
+ * \endcode
+ * This is ignored if ::cudaResourceDesc::resType is ::cudaResourceTypeLinear.
+ *
+ * - ::cudaTextureDesc_v2::readMode specifies whether integer data should be converted to floating point or not. ::cudaTextureReadMode is defined as:
+ * \code
+ enum cudaTextureReadMode {
+     cudaReadModeElementType     = 0,
+     cudaReadModeNormalizedFloat = 1
+ };
+ * \endcode
+ * Note that this applies only to 8-bit and 16-bit integer formats. 32-bit integer format would not be promoted, regardless of
+ * whether or not ::cudaTextureDesc_v2::readMode is set to ::cudaReadModeNormalizedFloat.
+ *
+ * - ::cudaTextureDesc_v2::sRGB specifies whether sRGB to linear conversion should be performed during texture fetch.
+ *
+ * - ::cudaTextureDesc_v2::borderColor specifies the float values of color, where:
+ * ::cudaTextureDesc_v2::borderColor[0] contains value of 'R',
+ * ::cudaTextureDesc_v2::borderColor[1] contains value of 'G',
+ * ::cudaTextureDesc_v2::borderColor[2] contains value of 'B',
+ * ::cudaTextureDesc_v2::borderColor[3] contains value of 'A'.
+ * Note that applications using integer border color values will need to reinterpret_cast these values to float.
+ * The values are set only when the addressing mode specified by ::cudaTextureDesc_v2::addressMode is cudaAddressModeBorder.
+ *
+ * - ::cudaTextureDesc_v2::normalizedCoords specifies whether the texture coordinates will be normalized or not.
+ *
+ * - ::cudaTextureDesc_v2::maxAnisotropy specifies the maximum anisotropy ratio to be used when doing anisotropic filtering. This value will be
+ * clamped to the range [1,16].
+ * + * - ::cudaTextureDesc_v2::mipmapFilterMode specifies the filter mode when the calculated mipmap level lies between two defined mipmap levels. + * + * - ::cudaTextureDesc_v2::mipmapLevelBias specifies the offset to be applied to the calculated mipmap level. + * + * - ::cudaTextureDesc_v2::minMipmapLevelClamp specifies the lower end of the mipmap level range to clamp access to. + * + * - ::cudaTextureDesc_v2::maxMipmapLevelClamp specifies the upper end of the mipmap level range to clamp access to. + * + * - ::cudaTextureDesc_v2::disableTrilinearOptimization specifies whether the trilinear filtering optimizations will be disabled. + * + * - ::cudaTextureDesc_v2::seamlessCubemap specifies whether seamless cube map filtering is enabled. This flag can only be specified if the + * underlying resource is a CUDA array or a CUDA mipmapped array that was created with the flag ::cudaArrayCubemap. + * When seamless cube map filtering is enabled, texture address modes specified by ::cudaTextureDesc_v2::addressMode are ignored. + * Instead, if the ::cudaTextureDesc_v2::filterMode is set to ::cudaFilterModePoint the address mode ::cudaAddressModeClamp will be applied for all dimensions. + * If the ::cudaTextureDesc_v2::filterMode is set to ::cudaFilterModeLinear seamless cube map filtering will be performed when sampling along the cube face borders. + * + * The ::cudaResourceViewDesc struct is defined as + * \code + struct cudaResourceViewDesc { + enum cudaResourceViewFormat format; + size_t width; + size_t height; + size_t depth; + unsigned int firstMipmapLevel; + unsigned int lastMipmapLevel; + unsigned int firstLayer; + unsigned int lastLayer; + }; + * \endcode + * where: + * - ::cudaResourceViewDesc::format specifies how the data contained in the CUDA array or CUDA mipmapped array should + * be interpreted. Note that this can incur a change in size of the texture data. If the resource view format is a block + * compressed format, then the underlying CUDA array or CUDA mipmapped array has to have a 32-bit unsigned integer format + * with 2 or 4 channels, depending on the block compressed format. For ex., BC1 and BC4 require the underlying CUDA array to have + * a 32-bit unsigned int with 2 channels. The other BC formats require the underlying resource to have the same 32-bit unsigned int + * format but with 4 channels. + * + * - ::cudaResourceViewDesc::width specifies the new width of the texture data. If the resource view format is a block + * compressed format, this value has to be 4 times the original width of the resource. For non block compressed formats, + * this value has to be equal to that of the original resource. + * + * - ::cudaResourceViewDesc::height specifies the new height of the texture data. If the resource view format is a block + * compressed format, this value has to be 4 times the original height of the resource. For non block compressed formats, + * this value has to be equal to that of the original resource. + * + * - ::cudaResourceViewDesc::depth specifies the new depth of the texture data. This value has to be equal to that of the + * original resource. + * + * - ::cudaResourceViewDesc::firstMipmapLevel specifies the most detailed mipmap level. This will be the new mipmap level zero. + * For non-mipmapped resources, this value has to be zero.::cudaTextureDesc_v2::minMipmapLevelClamp and ::cudaTextureDesc_v2::maxMipmapLevelClamp + * will be relative to this value. 
For ex., if the firstMipmapLevel is set to 2, and a minMipmapLevelClamp of 1.2 is specified, + * then the actual minimum mipmap level clamp will be 3.2. + * + * - ::cudaResourceViewDesc::lastMipmapLevel specifies the least detailed mipmap level. For non-mipmapped resources, this value + * has to be zero. + * + * - ::cudaResourceViewDesc::firstLayer specifies the first layer index for layered textures. This will be the new layer zero. + * For non-layered resources, this value has to be zero. + * + * - ::cudaResourceViewDesc::lastLayer specifies the last layer index for layered textures. For non-layered resources, + * this value has to be zero. + * + * + * \param pTexObject - Texture object to create + * \param pResDesc - Resource descriptor + * \param pTexDesc - Texture descriptor + * \param pResViewDesc - Resource view descriptor + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaDestroyTextureObject, + * ::cuTexObjectCreate + */ + +extern __host__ cudaError_t CUDARTAPI cudaCreateTextureObject_v2(cudaTextureObject_t *pTexObject, const struct cudaResourceDesc *pResDesc, const struct cudaTextureDesc_v2 *pTexDesc, const struct cudaResourceViewDesc *pResViewDesc); + +/** + * \brief Destroys a texture object + * + * Destroys the texture object specified by \p texObject. + * + * \param texObject - Texture object to destroy + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * \note_destroy_ub + * + * \sa + * ::cudaCreateTextureObject, + * ::cuTexObjectDestroy + */ +extern __host__ cudaError_t CUDARTAPI cudaDestroyTextureObject(cudaTextureObject_t texObject); + +/** + * \brief Returns a texture object's resource descriptor + * + * Returns the resource descriptor for the texture object specified by \p texObject. + * + * \param pResDesc - Resource descriptor + * \param texObject - Texture object + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaCreateTextureObject, + * ::cuTexObjectGetResourceDesc + */ +extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaTextureObject_t texObject); + +/** + * \brief Returns a texture object's texture descriptor + * + * Returns the texture descriptor for the texture object specified by \p texObject. + * + * \param pTexDesc - Texture descriptor + * \param texObject - Texture object + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaCreateTextureObject, + * ::cuTexObjectGetTextureDesc + */ +extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc(struct cudaTextureDesc *pTexDesc, cudaTextureObject_t texObject); + +/** + * \brief Returns a texture object's texture descriptor + * + * Returns the texture descriptor for the texture object specified by \p texObject. + * + * \param pTexDesc - Texture descriptor + * \param texObject - Texture object + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaCreateTextureObject, + * ::cuTexObjectGetTextureDesc + */ +extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectTextureDesc_v2(struct cudaTextureDesc_v2 *pTexDesc, cudaTextureObject_t texObject); + +/** + * \brief Returns a texture object's resource view descriptor + * + * Returns the resource view descriptor for the texture object specified by \p texObject. 
+ *
+ * If no resource view was specified, ::cudaErrorInvalidValue is returned.
+ *
+ * \param pResViewDesc - Resource view descriptor
+ * \param texObject - Texture object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateTextureObject,
+ * ::cuTexObjectGetResourceViewDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetTextureObjectResourceViewDesc(struct cudaResourceViewDesc *pResViewDesc, cudaTextureObject_t texObject);
+
+/** @} */ /* END CUDART_TEXTURE_OBJECT */
+
+/**
+ * \defgroup CUDART_SURFACE_OBJECT Surface Object Management
+ *
+ * ___MANBRIEF___ surface object management functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the low level surface object management functions
+ * of the CUDA runtime application programming interface. The surface object
+ * API is only supported on devices of compute capability 3.0 or higher.
+ *
+ * @{
+ */
+
+/**
+ * \brief Creates a surface object
+ *
+ * Creates a surface object and returns it in \p pSurfObject. \p pResDesc describes
+ * the data to perform surface load/stores on. ::cudaResourceDesc::resType must be
+ * ::cudaResourceTypeArray and ::cudaResourceDesc::res::array::array
+ * must be set to a valid CUDA array handle.
+ *
+ * Surface objects are only supported on devices of compute capability 3.0 or higher.
+ * Additionally, a surface object is an opaque value, and, as such, should only be
+ * accessed through CUDA API calls.
+ *
+ * \param pSurfObject - Surface object to create
+ * \param pResDesc - Resource descriptor
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidChannelDescriptor,
+ * ::cudaErrorInvalidResourceHandle
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaDestroySurfaceObject,
+ * ::cuSurfObjectCreate
+ */
+
+extern __host__ cudaError_t CUDARTAPI cudaCreateSurfaceObject(cudaSurfaceObject_t *pSurfObject, const struct cudaResourceDesc *pResDesc);
+
+/**
+ * \brief Destroys a surface object
+ *
+ * Destroys the surface object specified by \p surfObject.
+ *
+ * \param surfObject - Surface object to destroy
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ * \note_destroy_ub
+ *
+ * \sa
+ * ::cudaCreateSurfaceObject,
+ * ::cuSurfObjectDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaDestroySurfaceObject(cudaSurfaceObject_t surfObject);
+
+/**
+ * \brief Returns a surface object's resource descriptor
+ *
+ * Returns the resource descriptor for the surface object specified by \p surfObject.
+ *
+ * \param pResDesc - Resource descriptor
+ * \param surfObject - Surface object
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaCreateSurfaceObject,
+ * ::cuSurfObjectGetResourceDesc
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetSurfaceObjectResourceDesc(struct cudaResourceDesc *pResDesc, cudaSurfaceObject_t surfObject);
+
+/** @} */ /* END CUDART_SURFACE_OBJECT */
+
+/**
+ * \defgroup CUDART__VERSION Version Management
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns the latest version of CUDA supported by the driver
+ *
+ * Returns in \p *driverVersion the latest version of CUDA supported by
+ * the driver. The version is returned as (1000 × major + 10 × minor).
+ * For example, CUDA 9.2 would be represented by 9020. If no driver is installed,
+ * then 0 is returned as the driver version.
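+ *
+ * As a minimal illustrative sketch (not normative; error checking omitted), the
+ * encoded value can be decoded like this:
+ * \code
+ * int driverVersion = 0;
+ * int runtimeVersion = 0;
+ * cudaDriverGetVersion(&driverVersion);    // e.g. 11020 for CUDA 11.2
+ * cudaRuntimeGetVersion(&runtimeVersion);
+ * int major = driverVersion / 1000;
+ * int minor = (driverVersion % 1000) / 10;
+ * \endcode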
+ * + * This function automatically returns ::cudaErrorInvalidValue + * if \p driverVersion is NULL. + * + * \param driverVersion - Returns the CUDA driver version. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaRuntimeGetVersion, + * ::cuDriverGetVersion + */ +extern __host__ cudaError_t CUDARTAPI cudaDriverGetVersion(int *driverVersion); + +/** + * \brief Returns the CUDA Runtime version + * + * Returns in \p *runtimeVersion the version number of the current CUDA + * Runtime instance. The version is returned as + * (1000 × major + 10 × minor). For example, + * CUDA 9.2 would be represented by 9020. + * + * This function automatically returns ::cudaErrorInvalidValue if + * the \p runtimeVersion argument is NULL. + * + * \param runtimeVersion - Returns the CUDA Runtime version. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaDriverGetVersion, + * ::cuDriverGetVersion + */ +extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaRuntimeGetVersion(int *runtimeVersion); + +/** @} */ /* END CUDART__VERSION */ + +/** + * \defgroup CUDART_GRAPH Graph Management + * + * ___MANBRIEF___ graph management functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the graph management functions of CUDA + * runtime application programming interface. + * + * @{ + */ + +/** + * \brief Creates a graph + * + * Creates an empty graph, which is returned via \p pGraph. + * + * \param pGraph - Returns newly created graph + * \param flags - Graph creation flags, must be 0 + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemsetNode, + * ::cudaGraphInstantiate, + * ::cudaGraphDestroy, + * ::cudaGraphGetNodes, + * ::cudaGraphGetRootNodes, + * ::cudaGraphGetEdges, + * ::cudaGraphClone + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphCreate(cudaGraph_t *pGraph, unsigned int flags); + +/** + * \brief Creates a kernel execution node and adds it to a graph + * + * Creates a new kernel execution node and adds it to \p graph with \p numDependencies + * dependencies specified via \p pDependencies and arguments specified in \p pNodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p pGraphNode. + * + * The cudaKernelNodeParams structure is defined as: + * + * \code + * struct cudaKernelNodeParams + * { + * void* func; + * dim3 gridDim; + * dim3 blockDim; + * unsigned int sharedMemBytes; + * void **kernelParams; + * void **extra; + * }; + * \endcode + * + * When the graph is launched, the node will invoke kernel \p func on a (\p gridDim.x x + * \p gridDim.y x \p gridDim.z) grid of blocks. Each block contains + * (\p blockDim.x x \p blockDim.y x \p blockDim.z) threads. + * + * \p sharedMem sets the amount of dynamic shared memory that will be + * available to each thread block. + * + * Kernel parameters to \p func can be specified in one of two ways: + * + * 1) Kernel parameters can be specified via \p kernelParams. 
If the kernel has N + * parameters, then \p kernelParams needs to be an array of N pointers. Each pointer, + * from \p kernelParams[0] to \p kernelParams[N-1], points to the region of memory from which the actual + * parameter will be copied. The number of kernel parameters and their offsets and sizes do not need + * to be specified as that information is retrieved directly from the kernel's image. + * + * 2) Kernel parameters can also be packaged by the application into a single buffer that is passed in + * via \p extra. This places the burden on the application of knowing each kernel + * parameter's size and alignment/padding within the buffer. The \p extra parameter exists + * to allow this function to take additional less commonly used arguments. \p extra specifies + * a list of names of extra settings and their corresponding values. Each extra setting name is + * immediately followed by the corresponding value. The list must be terminated with either NULL or + * CU_LAUNCH_PARAM_END. + * + * - ::CU_LAUNCH_PARAM_END, which indicates the end of the \p extra + * array; + * - ::CU_LAUNCH_PARAM_BUFFER_POINTER, which specifies that the next + * value in \p extra will be a pointer to a buffer + * containing all the kernel parameters for launching kernel + * \p func; + * - ::CU_LAUNCH_PARAM_BUFFER_SIZE, which specifies that the next + * value in \p extra will be a pointer to a size_t + * containing the size of the buffer specified with + * ::CU_LAUNCH_PARAM_BUFFER_POINTER; + * + * The error ::cudaErrorInvalidValue will be returned if kernel parameters are specified with both + * \p kernelParams and \p extra (i.e. both \p kernelParams and + * \p extra are non-NULL). + * + * The \p kernelParams or \p extra array, as well as the argument values it points to, + * are copied during this call. + * + * \note Kernels launched using graphs must not use texture and surface references. Reading or + * writing through any texture or surface reference is undefined behavior. + * This restriction does not apply to texture and surface objects. + * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param pNodeParams - Parameters for the GPU execution node + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDeviceFunction + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaLaunchKernel, + * ::cudaGraphKernelNodeGetParams, + * ::cudaGraphKernelNodeSetParams, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemsetNode + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphAddKernelNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaKernelNodeParams *pNodeParams); + +/** + * \brief Returns a kernel node's parameters + * + * Returns the parameters of kernel node \p node in \p pNodeParams. + * The \p kernelParams or \p extra array returned in \p pNodeParams, + * as well as the argument values it points to, are owned by the node. + * This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cudaGraphKernelNodeSetParams to update the + * parameters of this node. 
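+ *
+ * As a rough sketch of the \p kernelParams path (the kernel \c myKernel and the
+ * names \c d_data, \c n, \c graph and \c kernelNode are placeholders, not part of
+ * this API; error checks omitted):
+ * \code
+ * // __global__ void myKernel(float *data, int n);
+ * struct cudaKernelNodeParams params = {0};
+ * void *args[] = { &d_data, &n };
+ * params.func           = (void *)myKernel;
+ * params.gridDim        = dim3(32, 1, 1);
+ * params.blockDim       = dim3(256, 1, 1);
+ * params.sharedMemBytes = 0;
+ * params.kernelParams   = args;
+ * params.extra          = NULL;
+ * cudaGraphNode_t kernelNode;
+ * cudaGraphAddKernelNode(&kernelNode, graph, NULL, 0, &params);
+ * // ... later, after modifying a field of params:
+ * cudaGraphKernelNodeSetParams(kernelNode, &params);
+ * \endcode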
+ * + * The params will contain either \p kernelParams or \p extra, + * according to which of these was most recently set on the node. + * + * \param node - Node to get the parameters for + * \param pNodeParams - Pointer to return the parameters + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDeviceFunction + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaLaunchKernel, + * ::cudaGraphAddKernelNode, + * ::cudaGraphKernelNodeSetParams + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetParams(cudaGraphNode_t node, struct cudaKernelNodeParams *pNodeParams); + +/** + * \brief Sets a kernel node's parameters + * + * Sets the parameters of kernel node \p node to \p pNodeParams. + * + * \param node - Node to set the parameters for + * \param pNodeParams - Parameters to copy + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle, + * ::cudaErrorMemoryAllocation + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaLaunchKernel, + * ::cudaGraphAddKernelNode, + * ::cudaGraphKernelNodeGetParams + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetParams(cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams); + +/** + * \brief Copies attributes from source node to destination node. + * + * Copies attributes from source node \p src to destination node \p dst. + * Both node must have the same context. + * + * \param[out] dst Destination node + * \param[in] src Source node + * For list of attributes see ::cudaKernelNodeAttrID + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidContext + * \notefnerr + * + * \sa + * ::cudaAccessPolicyWindow + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeCopyAttributes( + cudaGraphNode_t hSrc, + cudaGraphNode_t hDst); + +/** + * \brief Queries node attribute. + * + * Queries attribute \p attr from node \p hNode and stores it in corresponding + * member of \p value_out. + * + * \param[in] hNode + * \param[in] attr + * \param[out] value_out + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * + * \sa + * ::cudaAccessPolicyWindow + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeGetAttribute( + cudaGraphNode_t hNode, + cudaKernelNodeAttrID attr, + cudaKernelNodeAttrValue *value_out); + +/** + * \brief Sets node attribute. + * + * Sets attribute \p attr on node \p hNode from corresponding attribute of + * \p value. + * + * \param[out] hNode + * \param[in] attr + * \param[out] value + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidResourceHandle + * \notefnerr + * + * \sa + * ::cudaAccessPolicyWindow + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphKernelNodeSetAttribute( + cudaGraphNode_t hNode, + cudaKernelNodeAttrID attr, + const cudaKernelNodeAttrValue *value); + +/** + * \brief Creates a memcpy node and adds it to a graph + * + * Creates a new memcpy node and adds it to \p graph with \p numDependencies + * dependencies specified via \p pDependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p pGraphNode. + * + * When the graph is launched, the node will perform the memcpy described by \p pCopyParams. 
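+ * As a rough illustration (the pointers \c d_src and \c d_dst, the size \c numBytes
+ * and the handles \c graph and \c memcpyNode are placeholders; error checks omitted),
+ * \p pCopyParams could be filled in for a simple device-to-device copy of linear
+ * memory like this:
+ * \code
+ * struct cudaMemcpy3DParms copyParams = {0};
+ * copyParams.srcPtr = make_cudaPitchedPtr(d_src, numBytes, numBytes, 1);
+ * copyParams.dstPtr = make_cudaPitchedPtr(d_dst, numBytes, numBytes, 1);
+ * copyParams.extent = make_cudaExtent(numBytes, 1, 1);
+ * copyParams.kind   = cudaMemcpyDeviceToDevice;
+ * cudaGraphNode_t memcpyNode;
+ * cudaGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &copyParams);
+ * \endcode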
+ * See ::cudaMemcpy3D() for a description of the structure and its restrictions. + * + * Memcpy nodes have some additional restrictions with regards to managed memory, if the + * system contains at least one device which has a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess. + * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param pCopyParams - Parameters for the memory copy + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpy3D, + * ::cudaGraphAddMemcpyNodeToSymbol, + * ::cudaGraphAddMemcpyNodeFromSymbol, + * ::cudaGraphAddMemcpyNode1D, + * ::cudaGraphMemcpyNodeGetParams, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemsetNode + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemcpy3DParms *pCopyParams); + +/** + * \brief Creates a memcpy node to copy to a symbol on the device and adds it to a graph + * + * Creates a new memcpy node to copy to \p symbol and adds it to \p graph with + * \p numDependencies dependencies specified via \p pDependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p pGraphNode. + * + * When the graph is launched, the node will copy \p count bytes from the memory area + * pointed to by \p src to the memory area pointed to by \p offset bytes from the start + * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of + * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault + * is only allowed on systems that support unified virtual addressing. + * + * Memcpy nodes have some additional restrictions with regards to managed memory, if the + * system contains at least one device which has a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess. 
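+ *
+ * A minimal sketch (the device symbol \c devData, host array \c h_src and handles
+ * \c graph and \c toSymbolNode are placeholders; error checks omitted):
+ * \code
+ * // __device__ float devData[256];
+ * float h_src[256];
+ * cudaGraphNode_t toSymbolNode;
+ * cudaGraphAddMemcpyNodeToSymbol(&toSymbolNode, graph, NULL, 0,
+ *                                devData, h_src, sizeof(h_src), 0,
+ *                                cudaMemcpyHostToDevice);
+ * \endcode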
+ * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param symbol - Device symbol address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpyToSymbol, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemcpyNodeFromSymbol, + * ::cudaGraphMemcpyNodeGetParams, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsToSymbol, + * ::cudaGraphMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemsetNode + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeToSymbol( + cudaGraphNode_t *pGraphNode, + cudaGraph_t graph, + const cudaGraphNode_t *pDependencies, + size_t numDependencies, + const void* symbol, + const void* src, + size_t count, + size_t offset, + enum cudaMemcpyKind kind); +#endif + +/** + * \brief Creates a memcpy node to copy from a symbol on the device and adds it to a graph + * + * Creates a new memcpy node to copy from \p symbol and adds it to \p graph with + * \p numDependencies dependencies specified via \p pDependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p pGraphNode. + * + * When the graph is launched, the node will copy \p count bytes from the memory area + * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area + * pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable + * that resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer + * is inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * Memcpy nodes have some additional restrictions with regards to managed memory, if the + * system contains at least one device which has a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess. 
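+ *
+ * A minimal sketch mirroring the copy-to-symbol case (placeholder names; error
+ * checks omitted):
+ * \code
+ * // __device__ float devData[256];
+ * float h_dst[256];
+ * cudaGraphNode_t fromSymbolNode;
+ * cudaGraphAddMemcpyNodeFromSymbol(&fromSymbolNode, graph, NULL, 0,
+ *                                  h_dst, devData, sizeof(h_dst), 0,
+ *                                  cudaMemcpyDeviceToHost);
+ * \endcode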
+ * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param dst - Destination memory address + * \param symbol - Device symbol address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpyFromSymbol, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemcpyNodeToSymbol, + * ::cudaGraphMemcpyNodeGetParams, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphMemcpyNodeSetParamsToSymbol, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemsetNode + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNodeFromSymbol( + cudaGraphNode_t* pGraphNode, + cudaGraph_t graph, + const cudaGraphNode_t* pDependencies, + size_t numDependencies, + void* dst, + const void* symbol, + size_t count, + size_t offset, + enum cudaMemcpyKind kind); +#endif + +/** + * \brief Creates a 1D memcpy node and adds it to a graph + * + * Creates a new 1D memcpy node and adds it to \p graph with \p numDependencies + * dependencies specified via \p pDependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p pGraphNode. + * + * When the graph is launched, the node will copy \p count bytes from the memory + * area pointed to by \p src to the memory area pointed to by \p dst, where + * \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. Launching a + * memcpy node with dst and src pointers that do not match the direction of + * the copy results in an undefined behavior. + * + * Memcpy nodes have some additional restrictions with regards to managed memory, if the + * system contains at least one device which has a zero value for the device attribute + * ::cudaDevAttrConcurrentManagedAccess. 
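+ *
+ * A minimal sketch (the pointers \c d_dst and \c h_src, the size \c numBytes and
+ * the handles \c graph and \c memcpyNode are placeholders; error checks omitted):
+ * \code
+ * cudaGraphNode_t memcpyNode;
+ * cudaGraphAddMemcpyNode1D(&memcpyNode, graph, NULL, 0,
+ *                          d_dst, h_src, numBytes, cudaMemcpyHostToDevice);
+ * \endcode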
+ * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param dst - Destination memory address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpy, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphMemcpyNodeGetParams, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParams1D, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemsetNode + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemcpyNode1D( + cudaGraphNode_t *pGraphNode, + cudaGraph_t graph, + const cudaGraphNode_t *pDependencies, + size_t numDependencies, + void* dst, + const void* src, + size_t count, + enum cudaMemcpyKind kind); +#endif + +/** + * \brief Returns a memcpy node's parameters + * + * Returns the parameters of memcpy node \p node in \p pNodeParams. + * + * \param node - Node to get the parameters for + * \param pNodeParams - Pointer to return the parameters + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpy3D, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphMemcpyNodeSetParams + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeGetParams(cudaGraphNode_t node, struct cudaMemcpy3DParms *pNodeParams); + +/** + * \brief Sets a memcpy node's parameters + * + * Sets the parameters of memcpy node \p node to \p pNodeParams. + * + * \param node - Node to set the parameters for + * \param pNodeParams - Parameters to copy + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpy3D, + * ::cudaGraphMemcpyNodeSetParamsToSymbol, + * ::cudaGraphMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphMemcpyNodeSetParams1D, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphMemcpyNodeGetParams + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams(cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams); + +/** + * \brief Sets a memcpy node's parameters to copy to a symbol on the device + * + * Sets the parameters of memcpy node \p node to the copy described by the provided parameters. + * + * When the graph is launched, the node will copy \p count bytes from the memory area + * pointed to by \p src to the memory area pointed to by \p offset bytes from the start + * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that + * resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of + * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault + * is only allowed on systems that support unified virtual addressing. 
+ * + * \param node - Node to set the parameters for + * \param symbol - Device symbol address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpyToSymbol, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphMemcpyNodeGetParams + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsToSymbol( + cudaGraphNode_t node, + const void* symbol, + const void* src, + size_t count, + size_t offset, + enum cudaMemcpyKind kind); +#endif + +/** + * \brief Sets a memcpy node's parameters to copy from a symbol on the device + * + * Sets the parameters of memcpy node \p node to the copy described by the provided parameters. + * + * When the graph is launched, the node will copy \p count bytes from the memory area + * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area + * pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable + * that resides in global or constant memory space. \p kind can be either + * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. + * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer + * is inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. + * + * \param node - Node to set the parameters for + * \param dst - Destination memory address + * \param symbol - Device symbol address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpyFromSymbol, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsToSymbol, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphMemcpyNodeGetParams + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParamsFromSymbol( + cudaGraphNode_t node, + void* dst, + const void* symbol, + size_t count, + size_t offset, + enum cudaMemcpyKind kind); +#endif + +/** + * \brief Sets a memcpy node's parameters to perform a 1-dimensional copy + * + * Sets the parameters of memcpy node \p node to the copy described by the provided parameters. + * + * When the graph is launched, the node will copy \p count bytes from the memory + * area pointed to by \p src to the memory area pointed to by \p dst, where + * \p kind specifies the direction of the copy, and must be one of + * ::cudaMemcpyHostToHost, ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToHost, + * ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault. Passing + * ::cudaMemcpyDefault is recommended, in which case the type of transfer is + * inferred from the pointer values. However, ::cudaMemcpyDefault is only + * allowed on systems that support unified virtual addressing. Launching a + * memcpy node with dst and src pointers that do not match the direction of + * the copy results in an undefined behavior. 
+ * + * \param node - Node to set the parameters for + * \param dst - Destination memory address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemcpy, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphMemcpyNodeGetParams + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphMemcpyNodeSetParams1D( + cudaGraphNode_t node, + void* dst, + const void* src, + size_t count, + enum cudaMemcpyKind kind); +#endif + +/** + * \brief Creates a memset node and adds it to a graph + * + * Creates a new memset node and adds it to \p graph with \p numDependencies + * dependencies specified via \p pDependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p pGraphNode. + * + * The element size must be 1, 2, or 4 bytes. + * When the graph is launched, the node will perform the memset described by \p pMemsetParams. + * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param pMemsetParams - Parameters for the memory set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorInvalidDevice + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemset2D, + * ::cudaGraphMemsetNodeGetParams, + * ::cudaGraphMemsetNodeSetParams, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemcpyNode + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemsetNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaMemsetParams *pMemsetParams); + +/** + * \brief Returns a memset node's parameters + * + * Returns the parameters of memset node \p node in \p pNodeParams. + * + * \param node - Node to get the parameters for + * \param pNodeParams - Pointer to return the parameters + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemset2D, + * ::cudaGraphAddMemsetNode, + * ::cudaGraphMemsetNodeSetParams + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeGetParams(cudaGraphNode_t node, struct cudaMemsetParams *pNodeParams); + +/** + * \brief Sets a memset node's parameters + * + * Sets the parameters of memset node \p node to \p pNodeParams. 
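+ *
+ * A minimal sketch (the pointer \c d_buf and the count \c numElems are placeholders;
+ * error checks omitted) that zeroes \c numElems 4-byte elements:
+ * \code
+ * struct cudaMemsetParams memsetParams = {0};
+ * memsetParams.dst         = d_buf;
+ * memsetParams.value       = 0;
+ * memsetParams.elementSize = 4;        // element size must be 1, 2, or 4 bytes
+ * memsetParams.width       = numElems;
+ * memsetParams.height      = 1;
+ * cudaGraphMemsetNodeSetParams(node, &memsetParams);
+ * \endcode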
+ * + * \param node - Node to set the parameters for + * \param pNodeParams - Parameters to copy + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaMemset2D, + * ::cudaGraphAddMemsetNode, + * ::cudaGraphMemsetNodeGetParams + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphMemsetNodeSetParams(cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams); + +/** + * \brief Creates a host execution node and adds it to a graph + * + * Creates a new CPU execution node and adds it to \p graph with \p numDependencies + * dependencies specified via \p pDependencies and arguments specified in \p pNodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p pGraphNode. + * + * When the graph is launched, the node will invoke the specified CPU function. + * Host nodes are not supported under MPS with pre-Volta GPUs. + * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param pNodeParams - Parameters for the host node + * + * \return + * ::cudaSuccess, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaLaunchHostFunc, + * ::cudaGraphHostNodeGetParams, + * ::cudaGraphHostNodeSetParams, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemsetNode + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphAddHostNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaHostNodeParams *pNodeParams); + +/** + * \brief Returns a host node's parameters + * + * Returns the parameters of host node \p node in \p pNodeParams. + * + * \param node - Node to get the parameters for + * \param pNodeParams - Pointer to return the parameters + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaLaunchHostFunc, + * ::cudaGraphAddHostNode, + * ::cudaGraphHostNodeSetParams + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeGetParams(cudaGraphNode_t node, struct cudaHostNodeParams *pNodeParams); + +/** + * \brief Sets a host node's parameters + * + * Sets the parameters of host node \p node to \p nodeParams. + * + * \param node - Node to set the parameters for + * \param pNodeParams - Parameters to copy + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaLaunchHostFunc, + * ::cudaGraphAddHostNode, + * ::cudaGraphHostNodeGetParams + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphHostNodeSetParams(cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams); + +/** + * \brief Creates a child graph node and adds it to a graph + * + * Creates a new node which executes an embedded graph, and adds it to \p graph with + * \p numDependencies dependencies specified via \p pDependencies. 
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p pGraphNode. + * + * If \p hGraph contains allocation or free nodes, this call will return an error. + * + * The node executes an embedded child graph. The child graph is cloned in this call. + * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param childGraph - The graph to clone into this node + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphChildGraphNodeGetGraph, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemsetNode, + * ::cudaGraphClone + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphAddChildGraphNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaGraph_t childGraph); + +/** + * \brief Gets a handle to the embedded graph of a child graph node + * + * Gets a handle to the embedded graph in a child graph node. This call + * does not clone the graph. Changes to the graph will be reflected in + * the node, and the node retains ownership of the graph. + * + * Allocation and free nodes cannot be added to the returned graph. + * Attempting to do so will return an error. + * + * \param node - Node to get the embedded graph for + * \param pGraph - Location to store a handle to the graph + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphNodeFindInClone + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphChildGraphNodeGetGraph(cudaGraphNode_t node, cudaGraph_t *pGraph); + +/** + * \brief Creates an empty node and adds it to a graph + * + * Creates a new node which performs no operation, and adds it to \p graph with + * \p numDependencies dependencies specified via \p pDependencies. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p pGraphNode. + * + * An empty node performs no operation during execution, but can be used for + * transitive ordering. For example, a phased execution graph with 2 groups of n + * nodes with a barrier between them can be represented using an empty node and + * 2*n dependency edges, rather than no empty node and n^2 dependency edges. 
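+ *
+ * A minimal sketch of the barrier pattern described above (the array \c firstGroup,
+ * the count \c n, and \c secondGroupParams are placeholders; error checks omitted):
+ * \code
+ * // firstGroup holds the n nodes of the first phase.
+ * cudaGraphNode_t barrier;
+ * cudaGraphAddEmptyNode(&barrier, graph, firstGroup, n);
+ * // Nodes of the second phase then depend only on the barrier node.
+ * cudaGraphNode_t secondGroupNode;
+ * cudaGraphAddKernelNode(&secondGroupNode, graph, &barrier, 1, &secondGroupParams);
+ * \endcode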
+ * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemsetNode + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphAddEmptyNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies); + +/** + * \brief Creates an event record node and adds it to a graph + * + * Creates a new event record node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and event specified in \p event. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * Each launch of the graph will record \p event to capture execution of the + * node's dependencies. + * + * These nodes may not be used in loops or conditionals. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param event - Event for the node + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddEventWaitNode, + * ::cudaEventRecordWithFlags, + * ::cudaStreamWaitEvent, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemsetNode, + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphAddEventRecordNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaEvent_t event); +#endif + +/** + * \brief Returns the event associated with an event record node + * + * Returns the event of event record node \p hNode in \p event_out. + * + * \param hNode - Node to get the event for + * \param event_out - Pointer to return the event + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddEventRecordNode, + * ::cudaGraphEventRecordNodeSetEvent, + * ::cudaGraphEventWaitNodeGetEvent, + * ::cudaEventRecordWithFlags, + * ::cudaStreamWaitEvent + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphEventRecordNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out); +#endif + +/** + * \brief Sets an event record node's event + * + * Sets the event of event record node \p hNode to \p event. 
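+ *
+ * A minimal sketch (error checks omitted) that swaps the event recorded by an
+ * existing node \p hNode:
+ * \code
+ * cudaEvent_t newEvent;
+ * cudaEventCreate(&newEvent);
+ * cudaGraphEventRecordNodeSetEvent(hNode, newEvent);
+ * \endcode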
+ * + * \param hNode - Node to set the event for + * \param event - Event to use + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddEventRecordNode, + * ::cudaGraphEventRecordNodeGetEvent, + * ::cudaGraphEventWaitNodeSetEvent, + * ::cudaEventRecordWithFlags, + * ::cudaStreamWaitEvent + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphEventRecordNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event); +#endif + +/** + * \brief Creates an event wait node and adds it to a graph + * + * Creates a new event wait node and adds it to \p hGraph with \p numDependencies + * dependencies specified via \p dependencies and event specified in \p event. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. + * A handle to the new node will be returned in \p phGraphNode. + * + * The graph node will wait for all work captured in \p event. See ::cuEventRecord() + * for details on what is captured by an event. The synchronization will be performed + * efficiently on the device when applicable. \p event may be from a different context + * or device than the launch stream. + * + * These nodes may not be used in loops or conditionals. + * + * \param phGraphNode - Returns newly created node + * \param hGraph - Graph to which to add the node + * \param dependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param event - Event for the node + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddEventRecordNode, + * ::cudaEventRecordWithFlags, + * ::cudaStreamWaitEvent, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemsetNode, + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphAddEventWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, cudaEvent_t event); +#endif + +/** + * \brief Returns the event associated with an event wait node + * + * Returns the event of event wait node \p hNode in \p event_out. + * + * \param hNode - Node to get the event for + * \param event_out - Pointer to return the event + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddEventWaitNode, + * ::cudaGraphEventWaitNodeSetEvent, + * ::cudaGraphEventRecordNodeGetEvent, + * ::cudaEventRecordWithFlags, + * ::cudaStreamWaitEvent + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphEventWaitNodeGetEvent(cudaGraphNode_t node, cudaEvent_t *event_out); +#endif + +/** + * \brief Sets an event wait node's event + * + * Sets the event of event wait node \p hNode to \p event. 
+ * + * \param hNode - Node to set the event for + * \param event - Event to use + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddEventWaitNode, + * ::cudaGraphEventWaitNodeGetEvent, + * ::cudaGraphEventRecordNodeSetEvent, + * ::cudaEventRecordWithFlags, + * ::cudaStreamWaitEvent + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphEventWaitNodeSetEvent(cudaGraphNode_t node, cudaEvent_t event); +#endif + +/** + * \brief Creates an external semaphore signal node and adds it to a graph + * + * Creates a new external semaphore signal node and adds it to \p graph with \p + * numDependencies dependencies specified via \p dependencies and arguments specified + * in \p nodeParams. It is possible for \p numDependencies to be 0, in which case the + * node will be placed at the root of the graph. \p dependencies may not have any + * duplicate entries. A handle to the new node will be returned in \p pGraphNode. + * + * Performs a signal operation on a set of externally allocated semaphore objects + * when the node is launched. The operation(s) will occur after all of the node's + * dependencies have completed. + * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the node + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphExternalSemaphoresSignalNodeGetParams, + * ::cudaGraphExternalSemaphoresSignalNodeSetParams, + * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cudaGraphAddExternalSemaphoresWaitNode, + * ::cudaImportExternalSemaphore, + * ::cudaSignalExternalSemaphoresAsync, + * ::cudaWaitExternalSemaphoresAsync, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddEventRecordNode, + * ::cudaGraphAddEventWaitNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemsetNode, + */ +#if __CUDART_API_VERSION >= 11020 +extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresSignalNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams); +#endif + +/** + * \brief Returns an external semaphore signal node's parameters + * + * Returns the parameters of an external semaphore signal node \p hNode in \p params_out. + * The \p extSemArray and \p paramsArray returned in \p params_out, + * are owned by the node. This memory remains valid until the node is destroyed or its + * parameters are modified, and should not be modified + * directly. Use ::cudaGraphExternalSemaphoresSignalNodeSetParams to update the + * parameters of this node. 
+ * + * \param hNode - Node to get the parameters for + * \param params_out - Pointer to return the parameters + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaLaunchKernel, + * ::cudaGraphAddExternalSemaphoresSignalNode, + * ::cudaGraphExternalSemaphoresSignalNodeSetParams, + * ::cudaGraphAddExternalSemaphoresWaitNode, + * ::cudaSignalExternalSemaphoresAsync, + * ::cudaWaitExternalSemaphoresAsync + */ +#if __CUDART_API_VERSION >= 11020 +extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresSignalNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreSignalNodeParams *params_out); +#endif + +/** + * \brief Sets an external semaphore signal node's parameters + * + * Sets the parameters of an external semaphore signal node \p hNode to \p nodeParams. + * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddExternalSemaphoresSignalNode, + * ::cudaGraphExternalSemaphoresSignalNodeSetParams, + * ::cudaGraphAddExternalSemaphoresWaitNode, + * ::cudaSignalExternalSemaphoresAsync, + * ::cudaWaitExternalSemaphoresAsync + */ +#if __CUDART_API_VERSION >= 11020 +extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresSignalNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams); +#endif + +/** + * \brief Creates an external semaphore wait node and adds it to a graph + * + * Creates a new external semaphore wait node and adds it to \p graph with \p numDependencies + * dependencies specified via \p dependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p dependencies may not have any duplicate entries. A handle + * to the new node will be returned in \p pGraphNode. + * + * Performs a wait operation on a set of externally allocated semaphore objects + * when the node is launched. The node's dependencies will not be launched until + * the wait operation has completed. 
+ *
+ * \param pGraphNode - Returns newly created node
+ * \param graph - Graph to which to add the node
+ * \param pDependencies - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param nodeParams - Parameters for the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphExternalSemaphoresWaitNodeGetParams,
+ * ::cudaGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaImportExternalSemaphore,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphAddEventWaitNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemsetNode,
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphAddExternalSemaphoresWaitNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams);
+#endif
+
+/**
+ * \brief Returns an external semaphore wait node's parameters
+ *
+ * Returns the parameters of an external semaphore wait node \p hNode in \p params_out.
+ * The \p extSemArray and \p paramsArray returned in \p params_out
+ * are owned by the node. This memory remains valid until the node is destroyed or its
+ * parameters are modified, and should not be modified
+ * directly. Use ::cudaGraphExternalSemaphoresWaitNodeSetParams to update the
+ * parameters of this node.
+ *
+ * \param hNode - Node to get the parameters for
+ * \param params_out - Pointer to return the parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaLaunchKernel,
+ * ::cudaGraphAddExternalSemaphoresWaitNode,
+ * ::cudaGraphExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphAddExternalSemaphoresSignalNode,
+ * ::cudaSignalExternalSemaphoresAsync,
+ * ::cudaWaitExternalSemaphoresAsync
+ */
+#if __CUDART_API_VERSION >= 11020
+extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresWaitNodeGetParams(cudaGraphNode_t hNode, struct cudaExternalSemaphoreWaitNodeParams *params_out);
+#endif
+
+/**
+ * \brief Sets an external semaphore wait node's parameters
+ *
+ * Sets the parameters of an external semaphore wait node \p hNode to \p nodeParams.
+ * + * \param hNode - Node to set the parameters for + * \param nodeParams - Parameters to copy + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddExternalSemaphoresWaitNode, + * ::cudaGraphExternalSemaphoresWaitNodeSetParams, + * ::cudaGraphAddExternalSemaphoresWaitNode, + * ::cudaSignalExternalSemaphoresAsync, + * ::cudaWaitExternalSemaphoresAsync + */ +#if __CUDART_API_VERSION >= 11020 +extern __host__ cudaError_t CUDARTAPI cudaGraphExternalSemaphoresWaitNodeSetParams(cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams); +#endif + +/** + * \brief Creates an allocation node and adds it to a graph + * + * Creates a new allocation node and adds it to \p graph with \p numDependencies + * dependencies specified via \p pDependencies and arguments specified in \p nodeParams. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. A handle + * to the new node will be returned in \p pGraphNode. + * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param nodeParams - Parameters for the node + * + * When ::cudaGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in + * \p nodeParams.dptr. The allocation's address remains fixed across instantiations and launches. + * + * If the allocation is freed in the same graph, by creating a free node using ::cudaGraphAddMemFreeNode, + * the allocation can be accessed by nodes ordered after the allocation node but before the free node. + * These allocations cannot be freed outside the owning graph, and they can only be freed once in the + * owning graph. + * + * If the allocation is not freed in the same graph, then it can be accessed not only by nodes in the + * graph which are ordered after the allocation node, but also by stream operations ordered after the + * graph's execution but before the allocation is freed. + * + * Allocations which are not freed in the same graph can be freed by: + * - passing the allocation to ::cudaMemFreeAsync or ::cudaMemFree; + * - launching a graph with a free node for that allocation; or + * - specifying ::cudaGraphInstantiateFlagAutoFreeOnLaunch during instantiation, which makes + * each launch behave as though it called ::cudaMemFreeAsync for every unfreed allocation. + * + * It is not possible to free an allocation in both the owning graph and another graph. If the allocation + * is freed in the same graph, a free node cannot be added to another graph. If the allocation is freed + * in another graph, a free node can no longer be added to the owning graph. + * + * The following restrictions apply to graphs which contain allocation and/or memory free nodes: + * - Nodes and edges of the graph cannot be deleted. + * - The graph cannot be used in a child node. + * - Only one instantiation of the graph may exist at any point in time. + * - The graph cannot be cloned. 
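+ *
+ * For illustration, a minimal sketch of an allocation that is freed in the same
+ * graph (assumes \p graph was created with ::cudaGraphCreate, targets device 0,
+ * omits error checking, and fills the pool properties the same way they would be
+ * filled for ::cudaMemPoolCreate):
+ * \code
+ * cudaMemAllocNodeParams allocParams = {};
+ * allocParams.poolProps.allocType     = cudaMemAllocationTypePinned;
+ * allocParams.poolProps.location.type = cudaMemLocationTypeDevice;
+ * allocParams.poolProps.location.id   = 0;        // allocate on device 0
+ * allocParams.bytesize                = 1 << 20;  // 1 MiB
+ *
+ * cudaGraphNode_t allocNode, freeNode;
+ * cudaGraphAddMemAllocNode(&allocNode, graph, NULL, 0, &allocParams);
+ * void *dptr = allocParams.dptr;                  // fixed across instantiations and launches
+ *
+ * // ... add nodes that use dptr, ordered after allocNode ...
+ *
+ * cudaGraphAddMemFreeNode(&freeNode, graph, &allocNode, 1, dptr);
+ * \endcode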
+ * + * \return + * ::cudaSuccess, + * ::cudaErrorCudartUnloading, + * ::cudaErrorInitializationError, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue, + * ::cudaErrorOutOfMemory + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cudaGraphAddMemFreeNode, + * ::cudaGraphMemAllocNodeGetParams, + * ::cudaDeviceGraphMemTrim, + * ::cudaDeviceGetGraphMemAttribute, + * ::cudaDeviceSetGraphMemAttribute, + * ::cudaMallocAsync, + * ::cudaFreeAsync, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddEventRecordNode, + * ::cudaGraphAddEventWaitNode, + * ::cudaGraphAddExternalSemaphoresSignalNode, + * ::cudaGraphAddExternalSemaphoresWaitNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemsetNode + */ +#if __CUDART_API_VERSION >= 11040 +extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemAllocNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, struct cudaMemAllocNodeParams *nodeParams); +#endif + +/** + * \brief Returns a memory alloc node's parameters + * + * Returns the parameters of a memory alloc node \p hNode in \p params_out. + * The \p poolProps and \p accessDescs returned in \p params_out, are owned by the + * node. This memory remains valid until the node is destroyed. The returned + * parameters must not be modified. + * + * \param node - Node to get the parameters for + * \param params_out - Pointer to return the parameters + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddMemAllocNode, + * ::cudaGraphMemFreeNodeGetParams + */ +#if __CUDART_API_VERSION >= 11040 +extern __host__ cudaError_t CUDARTAPI cudaGraphMemAllocNodeGetParams(cudaGraphNode_t node, struct cudaMemAllocNodeParams *params_out); +#endif + +/** + * \brief Creates a memory free node and adds it to a graph + * + * Creates a new memory free node and adds it to \p graph with \p numDependencies + * dependencies specified via \p pDependencies and address specified in \p dptr. + * It is possible for \p numDependencies to be 0, in which case the node will be placed + * at the root of the graph. \p pDependencies may not have any duplicate entries. A handle + * to the new node will be returned in \p pGraphNode. + * + * \param pGraphNode - Returns newly created node + * \param graph - Graph to which to add the node + * \param pDependencies - Dependencies of the node + * \param numDependencies - Number of dependencies + * \param dptr - Address of memory to free + * + * ::cudaGraphAddMemFreeNode will return ::cudaErrorInvalidValue if the user attempts to free: + * - an allocation twice in the same graph. + * - an address that was not returned by an allocation node. + * - an invalid address. + * + * The following restrictions apply to graphs which contain allocation and/or memory free nodes: + * - Nodes and edges of the graph cannot be deleted. + * - The graph cannot be used in a child node. + * - Only one instantiation of the graph may exist at any point in time. + * - The graph cannot be cloned. 
+ * + * \return + * ::cudaSuccess, + * ::cudaErrorCudartUnloading, + * ::cudaErrorInitializationError, + * ::cudaErrorNotSupported, + * ::cudaErrorInvalidValue, + * ::cudaErrorOutOfMemory + * \note_graph_thread_safety + * \notefnerr + * + * \sa + * ::cudaGraphAddMemAllocNode, + * ::cudaGraphMemFreeNodeGetParams, + * ::cudaDeviceGraphMemTrim, + * ::cudaDeviceGetGraphMemAttribute, + * ::cudaDeviceSetGraphMemAttribute, + * ::cudaMallocAsync, + * ::cudaFreeAsync, + * ::cudaGraphCreate, + * ::cudaGraphDestroyNode, + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddEventRecordNode, + * ::cudaGraphAddEventWaitNode, + * ::cudaGraphAddExternalSemaphoresSignalNode, + * ::cudaGraphAddExternalSemaphoresWaitNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemsetNode + */ +#if __CUDART_API_VERSION >= 11040 +extern __host__ cudaError_t CUDARTAPI cudaGraphAddMemFreeNode(cudaGraphNode_t *pGraphNode, cudaGraph_t graph, const cudaGraphNode_t *pDependencies, size_t numDependencies, void *dptr); +#endif + +/** + * \brief Returns a memory free node's parameters + * + * Returns the address of a memory free node \p hNode in \p dptr_out. + * + * \param node - Node to get the parameters for + * \param dptr_out - Pointer to return the device address + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddMemFreeNode, + * ::cudaGraphMemFreeNodeGetParams + */ +#if __CUDART_API_VERSION >= 11040 +extern __host__ cudaError_t CUDARTAPI cudaGraphMemFreeNodeGetParams(cudaGraphNode_t node, void *dptr_out); +#endif + +/** + * \brief Free unused memory that was cached on the specified device for use with graphs back to the OS. + * + * Blocks which are not in use by a graph that is either currently executing or scheduled to execute are + * freed back to the operating system. + * + * \param device - The device for which cached memory should be freed. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddMemAllocNode, + * ::cudaGraphAddMemFreeNode, + * ::cudaDeviceGetGraphMemAttribute, + * ::cudaDeviceSetGraphMemAttribute, + * ::cudaMallocAsync, + * ::cudaFreeAsync, + */ +#if __CUDART_API_VERSION >= 11040 +extern __host__ cudaError_t CUDARTAPI cudaDeviceGraphMemTrim(int device); +#endif + +/** + * \brief Query asynchronous allocation attributes related to graphs + * + * Valid attributes are: + * + * - ::cudaGraphMemAttrUsedMemCurrent: Amount of memory, in bytes, currently associated with graphs + * - ::cudaGraphMemAttrUsedMemHigh: High watermark of memory, in bytes, associated with graphs since the + * last time it was reset. High watermark can only be reset to zero. + * - ::cudaGraphMemAttrReservedMemCurrent: Amount of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + * - ::cudaGraphMemAttrReservedMemHigh: High watermark of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. 
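+ *
+ * For illustration, a minimal sketch that reads the current usage on device 0
+ * and resets the high watermark (assumes the attribute value is a 64-bit byte
+ * count and omits error checking):
+ * \code
+ * unsigned long long usedBytes = 0;
+ * cudaDeviceGetGraphMemAttribute(0, cudaGraphMemAttrUsedMemCurrent, &usedBytes);
+ *
+ * unsigned long long zero = 0;                    // high watermark can only be reset to zero
+ * cudaDeviceSetGraphMemAttribute(0, cudaGraphMemAttrUsedMemHigh, &zero);
+ * \endcode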
+ * + * \param device - Specifies the scope of the query + * \param attr - attribute to get + * \param value - retrieved value + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaDeviceSetGraphMemAttribute, + * ::cudaGraphAddMemAllocNode, + * ::cudaGraphAddMemFreeNode, + * ::cudaDeviceGraphMemTrim, + * ::cudaMallocAsync, + * ::cudaFreeAsync, + */ +#if __CUDART_API_VERSION >= 11040 +extern __host__ cudaError_t CUDARTAPI cudaDeviceGetGraphMemAttribute(int device, enum cudaGraphMemAttributeType attr, void* value); +#endif + +/** + * \brief Set asynchronous allocation attributes related to graphs + * + * Valid attributes are: + * + * - ::cudaGraphMemAttrUsedMemHigh: High watermark of memory, in bytes, associated with graphs since the + * last time it was reset. High watermark can only be reset to zero. + * - ::cudaGraphMemAttrReservedMemHigh: High watermark of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + * + * \param device - Specifies the scope of the query + * \param attr - attribute to get + * \param value - pointer to value to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidDevice + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaDeviceGetGraphMemAttribute, + * ::cudaGraphAddMemAllocNode, + * ::cudaGraphAddMemFreeNode, + * ::cudaDeviceGraphMemTrim, + * ::cudaMallocAsync, + * ::cudaFreeAsync, + */ +#if __CUDART_API_VERSION >= 11040 +extern __host__ cudaError_t CUDARTAPI cudaDeviceSetGraphMemAttribute(int device, enum cudaGraphMemAttributeType attr, void* value); +#endif + +/** + * \brief Clones a graph + * + * This function creates a copy of \p originalGraph and returns it in \p pGraphClone. + * All parameters are copied into the cloned graph. The original graph may be modified + * after this call without affecting the clone. + * + * Child graph nodes in the original graph are recursively copied into the clone. + * + * \param pGraphClone - Returns newly created cloned graph + * \param originalGraph - Graph to clone + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorMemoryAllocation + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphCreate, + * ::cudaGraphNodeFindInClone + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphClone(cudaGraph_t *pGraphClone, cudaGraph_t originalGraph); + +/** + * \brief Finds a cloned version of a node + * + * This function returns the node in \p clonedGraph corresponding to \p originalNode + * in the original graph. + * + * \p clonedGraph must have been cloned from \p originalGraph via ::cudaGraphClone. + * \p originalNode must have been in \p originalGraph at the time of the call to + * ::cudaGraphClone, and the corresponding cloned node in \p clonedGraph must not have + * been removed. The cloned node is then returned via \p pClonedNode. 
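+ *
+ * For illustration, a minimal sketch that clones a graph and locates the clone
+ * of a previously added node (assumes \p originalGraph and \p kernelNode
+ * already exist and omits error checking):
+ * \code
+ * cudaGraph_t clonedGraph;
+ * cudaGraphClone(&clonedGraph, originalGraph);
+ *
+ * cudaGraphNode_t clonedKernelNode;
+ * cudaGraphNodeFindInClone(&clonedKernelNode, kernelNode, clonedGraph);
+ * \endcode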
+ * + * \param pNode - Returns handle to the cloned node + * \param originalNode - Handle to the original node + * \param clonedGraph - Cloned graph to query + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphClone + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphNodeFindInClone(cudaGraphNode_t *pNode, cudaGraphNode_t originalNode, cudaGraph_t clonedGraph); + +/** + * \brief Returns a node's type + * + * Returns the node type of \p node in \p pType. + * + * \param node - Node to query + * \param pType - Pointer to return the node type + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphGetNodes, + * ::cudaGraphGetRootNodes, + * ::cudaGraphChildGraphNodeGetGraph, + * ::cudaGraphKernelNodeGetParams, + * ::cudaGraphKernelNodeSetParams, + * ::cudaGraphHostNodeGetParams, + * ::cudaGraphHostNodeSetParams, + * ::cudaGraphMemcpyNodeGetParams, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemsetNodeGetParams, + * ::cudaGraphMemsetNodeSetParams + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetType(cudaGraphNode_t node, enum cudaGraphNodeType *pType); + +/** + * \brief Returns a graph's nodes + * + * Returns a list of \p graph's nodes. \p nodes may be NULL, in which case this + * function will return the number of nodes in \p numNodes. Otherwise, + * \p numNodes entries will be filled in. If \p numNodes is higher than the actual + * number of nodes, the remaining entries in \p nodes will be set to NULL, and the + * number of nodes actually obtained will be returned in \p numNodes. + * + * \param graph - Graph to query + * \param nodes - Pointer to return the nodes + * \param numNodes - See description + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphCreate, + * ::cudaGraphGetRootNodes, + * ::cudaGraphGetEdges, + * ::cudaGraphNodeGetType, + * ::cudaGraphNodeGetDependencies, + * ::cudaGraphNodeGetDependentNodes + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphGetNodes(cudaGraph_t graph, cudaGraphNode_t *nodes, size_t *numNodes); + +/** + * \brief Returns a graph's root nodes + * + * Returns a list of \p graph's root nodes. \p pRootNodes may be NULL, in which case this + * function will return the number of root nodes in \p pNumRootNodes. Otherwise, + * \p pNumRootNodes entries will be filled in. If \p pNumRootNodes is higher than the actual + * number of root nodes, the remaining entries in \p pRootNodes will be set to NULL, and the + * number of nodes actually obtained will be returned in \p pNumRootNodes. + * + * \param graph - Graph to query + * \param pRootNodes - Pointer to return the root nodes + * \param pNumRootNodes - See description + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphCreate, + * ::cudaGraphGetNodes, + * ::cudaGraphGetEdges, + * ::cudaGraphNodeGetType, + * ::cudaGraphNodeGetDependencies, + * ::cudaGraphNodeGetDependentNodes + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphGetRootNodes(cudaGraph_t graph, cudaGraphNode_t *pRootNodes, size_t *pNumRootNodes); + +/** + * \brief Returns a graph's dependency edges + * + * Returns a list of \p graph's dependency edges. 
Edges are returned via corresponding + * indices in \p from and \p to; that is, the node in \p to[i] has a dependency on the + * node in \p from[i]. \p from and \p to may both be NULL, in which + * case this function only returns the number of edges in \p numEdges. Otherwise, + * \p numEdges entries will be filled in. If \p numEdges is higher than the actual + * number of edges, the remaining entries in \p from and \p to will be set to NULL, and + * the number of edges actually returned will be written to \p numEdges. + * + * \param graph - Graph to get the edges from + * \param from - Location to return edge endpoints + * \param to - Location to return edge endpoints + * \param numEdges - See description + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphGetNodes, + * ::cudaGraphGetRootNodes, + * ::cudaGraphAddDependencies, + * ::cudaGraphRemoveDependencies, + * ::cudaGraphNodeGetDependencies, + * ::cudaGraphNodeGetDependentNodes + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphGetEdges(cudaGraph_t graph, cudaGraphNode_t *from, cudaGraphNode_t *to, size_t *numEdges); + +/** + * \brief Returns a node's dependencies + * + * Returns a list of \p node's dependencies. \p pDependencies may be NULL, in which case this + * function will return the number of dependencies in \p pNumDependencies. Otherwise, + * \p pNumDependencies entries will be filled in. If \p pNumDependencies is higher than the actual + * number of dependencies, the remaining entries in \p pDependencies will be set to NULL, and the + * number of nodes actually obtained will be returned in \p pNumDependencies. + * + * \param node - Node to query + * \param pDependencies - Pointer to return the dependencies + * \param pNumDependencies - See description + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphNodeGetDependentNodes, + * ::cudaGraphGetNodes, + * ::cudaGraphGetRootNodes, + * ::cudaGraphGetEdges, + * ::cudaGraphAddDependencies, + * ::cudaGraphRemoveDependencies + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependencies(cudaGraphNode_t node, cudaGraphNode_t *pDependencies, size_t *pNumDependencies); + +/** + * \brief Returns a node's dependent nodes + * + * Returns a list of \p node's dependent nodes. \p pDependentNodes may be NULL, in which + * case this function will return the number of dependent nodes in \p pNumDependentNodes. + * Otherwise, \p pNumDependentNodes entries will be filled in. If \p pNumDependentNodes is + * higher than the actual number of dependent nodes, the remaining entries in + * \p pDependentNodes will be set to NULL, and the number of nodes actually obtained will + * be returned in \p pNumDependentNodes. 
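+ *
+ * The same call-twice pattern used by ::cudaGraphGetNodes, ::cudaGraphGetRootNodes
+ * and ::cudaGraphNodeGetDependencies applies here: query the count first, then
+ * fill a caller-allocated array. A minimal sketch (error checking omitted):
+ * \code
+ * size_t numDependentNodes = 0;
+ * cudaGraphNodeGetDependentNodes(node, NULL, &numDependentNodes);   // query the count only
+ *
+ * cudaGraphNode_t *dependentNodes =
+ *     (cudaGraphNode_t *)malloc(numDependentNodes * sizeof(cudaGraphNode_t));
+ * cudaGraphNodeGetDependentNodes(node, dependentNodes, &numDependentNodes);
+ * // ... use dependentNodes[0 .. numDependentNodes-1] ...
+ * free(dependentNodes);
+ * \endcode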
+ * + * \param node - Node to query + * \param pDependentNodes - Pointer to return the dependent nodes + * \param pNumDependentNodes - See description + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphNodeGetDependencies, + * ::cudaGraphGetNodes, + * ::cudaGraphGetRootNodes, + * ::cudaGraphGetEdges, + * ::cudaGraphAddDependencies, + * ::cudaGraphRemoveDependencies + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetDependentNodes(cudaGraphNode_t node, cudaGraphNode_t *pDependentNodes, size_t *pNumDependentNodes); + +/** + * \brief Adds dependency edges to a graph. + * + * The number of dependencies to be added is defined by \p numDependencies + * Elements in \p pFrom and \p pTo at corresponding indices define a dependency. + * Each node in \p pFrom and \p pTo must belong to \p graph. + * + * If \p numDependencies is 0, elements in \p pFrom and \p pTo will be ignored. + * Specifying an existing dependency will return an error. + * + * \param graph - Graph to which dependencies are added + * \param from - Array of nodes that provide the dependencies + * \param to - Array of dependent nodes + * \param numDependencies - Number of dependencies to be added + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphRemoveDependencies, + * ::cudaGraphGetEdges, + * ::cudaGraphNodeGetDependencies, + * ::cudaGraphNodeGetDependentNodes + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphAddDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies); + +/** + * \brief Removes dependency edges from a graph. + * + * The number of \p pDependencies to be removed is defined by \p numDependencies. + * Elements in \p pFrom and \p pTo at corresponding indices define a dependency. + * Each node in \p pFrom and \p pTo must belong to \p graph. + * + * If \p numDependencies is 0, elements in \p pFrom and \p pTo will be ignored. + * Specifying a non-existing dependency will return an error. + * + * \param graph - Graph from which to remove dependencies + * \param from - Array of nodes that provide the dependencies + * \param to - Array of dependent nodes + * \param numDependencies - Number of dependencies to be removed + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddDependencies, + * ::cudaGraphGetEdges, + * ::cudaGraphNodeGetDependencies, + * ::cudaGraphNodeGetDependentNodes + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphRemoveDependencies(cudaGraph_t graph, const cudaGraphNode_t *from, const cudaGraphNode_t *to, size_t numDependencies); + +/** + * \brief Remove a node from the graph + * + * Removes \p node from its graph. This operation also severs any dependencies of other nodes + * on \p node and vice versa. + * + * Dependencies cannot be removed from graphs which contain allocation or free nodes. + * Any attempt to do so will return an error. 
+ * + * \param node - Node to remove + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * \note_destroy_ub + * + * \sa + * ::cudaGraphAddChildGraphNode, + * ::cudaGraphAddEmptyNode, + * ::cudaGraphAddKernelNode, + * ::cudaGraphAddHostNode, + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemsetNode + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphDestroyNode(cudaGraphNode_t node); + +/** + * \brief Creates an executable graph from a graph + * + * Instantiates \p graph as an executable graph. The graph is validated for any + * structural constraints or intra-node constraints which were not previously + * validated. If instantiation is successful, a handle to the instantiated graph + * is returned in \p pGraphExec. + * + * If there are any errors, diagnostic information may be returned in \p pErrorNode and + * \p pLogBuffer. This is the primary way to inspect instantiation errors. The output + * will be null terminated unless the diagnostics overflow + * the buffer. In this case, they will be truncated, and the last byte can be + * inspected to determine if truncation occurred. + * + * \param pGraphExec - Returns instantiated graph + * \param graph - Graph to instantiate + * \param pErrorNode - In case of an instantiation error, this may be modified to + * indicate a node contributing to the error + * \param pLogBuffer - A character buffer to store diagnostic messages + * \param bufferSize - Size of the log buffer in bytes + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphInstantiateWithFlags, + * ::cudaGraphCreate, + * ::cudaGraphUpload, + * ::cudaGraphLaunch, + * ::cudaGraphExecDestroy + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiate(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, cudaGraphNode_t *pErrorNode, char *pLogBuffer, size_t bufferSize); + +/** + * \brief Creates an executable graph from a graph + * + * Instantiates \p graph as an executable graph. The graph is validated for any + * structural constraints or intra-node constraints which were not previously + * validated. If instantiation is successful, a handle to the instantiated graph + * is returned in \p pGraphExec. + * + * The \p flags parameter controls the behavior of instantiation and subsequent + * graph launches. Valid flags are: + * + * - ::cudaGraphInstantiateFlagAutoFreeOnLaunch, which configures a + * graph containing memory allocation nodes to automatically free any + * unfreed memory allocations before the graph is relaunched. + * + * - ::cudaGraphInstantiateFlagUseNodePriority, which causes the graph + * to use the priorities from the per-node attributes rather than the priority + * of the launch stream during execution. Note that priorities are only available + * on kernel nodes, and are copied from stream priority during stream capture. + * + * If \p graph contains any allocation or free nodes, there can be at most one + * executable graph in existence for that graph at a time. + * + * An attempt to instantiate a second executable graph before destroying the first + * with ::cudaGraphExecDestroy will result in an error. + * + * \param pGraphExec - Returns instantiated graph + * \param graph - Graph to instantiate + * \param flags - Flags to control instantiation. See ::CUgraphInstantiate_flags. 
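+ *
+ * For illustration, a minimal sketch that instantiates a graph containing
+ * allocation nodes with automatic freeing on launch and then executes it
+ * (assumes \p graph and \p stream already exist and omits error checking):
+ * \code
+ * cudaGraphExec_t graphExec;
+ * cudaGraphInstantiateWithFlags(&graphExec, graph,
+ *                               cudaGraphInstantiateFlagAutoFreeOnLaunch);
+ *
+ * cudaGraphLaunch(graphExec, stream);
+ * cudaStreamSynchronize(stream);
+ * cudaGraphExecDestroy(graphExec);
+ * \endcode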
+ * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphInstantiate, + * ::cudaGraphCreate, + * ::cudaGraphUpload, + * ::cudaGraphLaunch, + * ::cudaGraphExecDestroy + */ +#if __CUDART_API_VERSION >= 11040 +extern __host__ cudaError_t CUDARTAPI cudaGraphInstantiateWithFlags(cudaGraphExec_t *pGraphExec, cudaGraph_t graph, unsigned long long flags); +#endif + +/** + * \brief Sets the parameters for a kernel node in the given graphExec + * + * Sets the parameters of a kernel node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p node in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p node must not have been removed from the original graph. The \p func field + * of \p nodeParams cannot be modified and must match the original value. + * All other values can be modified. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p node is also not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param node - kernel node from the graph from which graphExec was instantiated + * \param pNodeParams - Updated Parameters to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddKernelNode, + * ::cudaGraphKernelNodeSetParams, + * ::cudaGraphExecMemcpyNodeSetParams, + * ::cudaGraphExecMemsetNodeSetParams, + * ::cudaGraphExecHostNodeSetParams, + * ::cudaGraphExecChildGraphNodeSetParams, + * ::cudaGraphExecEventRecordNodeSetEvent, + * ::cudaGraphExecEventWaitNodeSetEvent, + * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cudaGraphExecUpdate, + * ::cudaGraphInstantiate + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphExecKernelNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaKernelNodeParams *pNodeParams); + +/** + * \brief Sets the parameters for a memcpy node in the given graphExec. + * + * Updates the work represented by \p node in \p hGraphExec as though \p node had + * contained \p pNodeParams at instantiation. \p node must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from \p node are ignored. + * + * The source and destination memory in \p pNodeParams must be allocated from the same + * contexts as the original source and destination memory. Both the instantiation-time + * memory operands and the memory operands in \p pNodeParams must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p node is also + * not modified by this call. + * + * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or + * either the original or new memory operands are multidimensional. 
+ * + * \param hGraphExec - The executable graph in which to set the specified node + * \param node - Memcpy node from the graph which was used to instantiate graphExec + * \param pNodeParams - Updated Parameters to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphExecMemcpyNodeSetParamsToSymbol, + * ::cudaGraphExecMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphExecMemcpyNodeSetParams1D, + * ::cudaGraphExecKernelNodeSetParams, + * ::cudaGraphExecMemsetNodeSetParams, + * ::cudaGraphExecHostNodeSetParams, + * ::cudaGraphExecChildGraphNodeSetParams, + * ::cudaGraphExecEventRecordNodeSetEvent, + * ::cudaGraphExecEventWaitNodeSetEvent, + * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cudaGraphExecUpdate, + * ::cudaGraphInstantiate + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemcpy3DParms *pNodeParams); + +/** + * \brief Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the device + * + * Updates the work represented by \p node in \p hGraphExec as though \p node had + * contained the given params at instantiation. \p node must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from \p node are ignored. + * + * \p src and \p symbol must be allocated from the same contexts as the original source and + * destination memory. The instantiation-time memory operands must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p node is also + * not modified by this call. + * + * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or + * the original memory operands are multidimensional. 
+ * + * \param hGraphExec - The executable graph in which to set the specified node + * \param node - Memcpy node from the graph which was used to instantiate graphExec + * \param symbol - Device symbol address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemcpyNodeToSymbol, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsToSymbol, + * ::cudaGraphExecMemcpyNodeSetParams, + * ::cudaGraphExecMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphExecKernelNodeSetParams, + * ::cudaGraphExecMemsetNodeSetParams, + * ::cudaGraphExecHostNodeSetParams, + * ::cudaGraphExecChildGraphNodeSetParams, + * ::cudaGraphExecEventRecordNodeSetEvent, + * ::cudaGraphExecEventWaitNodeSetEvent, + * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cudaGraphExecUpdate, + * ::cudaGraphInstantiate + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParamsToSymbol( + cudaGraphExec_t hGraphExec, + cudaGraphNode_t node, + const void* symbol, + const void* src, + size_t count, + size_t offset, + enum cudaMemcpyKind kind); +#endif + +/** + * \brief Sets the parameters for a memcpy node in the given graphExec to copy from a symbol on the device + * + * Updates the work represented by \p node in \p hGraphExec as though \p node had + * contained the given params at instantiation. \p node must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from \p node are ignored. + * + * \p symbol and \p dst must be allocated from the same contexts as the original source and + * destination memory. The instantiation-time memory operands must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p node is also + * not modified by this call. + * + * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or + * the original memory operands are multidimensional. 
+ * + * \param hGraphExec - The executable graph in which to set the specified node + * \param node - Memcpy node from the graph which was used to instantiate graphExec + * \param dst - Destination memory address + * \param symbol - Device symbol address + * \param count - Size in bytes to copy + * \param offset - Offset from start of symbol in bytes + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemcpyNodeFromSymbol, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParamsFromSymbol, + * ::cudaGraphExecMemcpyNodeSetParams, + * ::cudaGraphExecMemcpyNodeSetParamsToSymbol, + * ::cudaGraphExecKernelNodeSetParams, + * ::cudaGraphExecMemsetNodeSetParams, + * ::cudaGraphExecHostNodeSetParams, + * ::cudaGraphExecChildGraphNodeSetParams, + * ::cudaGraphExecEventRecordNodeSetEvent, + * ::cudaGraphExecEventWaitNodeSetEvent, + * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cudaGraphExecUpdate, + * ::cudaGraphInstantiate + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParamsFromSymbol( + cudaGraphExec_t hGraphExec, + cudaGraphNode_t node, + void* dst, + const void* symbol, + size_t count, + size_t offset, + enum cudaMemcpyKind kind); +#endif + +/** + * \brief Sets the parameters for a memcpy node in the given graphExec to perform a 1-dimensional copy + * + * Updates the work represented by \p node in \p hGraphExec as though \p node had + * contained the given params at instantiation. \p node must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from \p node are ignored. + * + * \p src and \p dst must be allocated from the same contexts as the original source + * and destination memory. The instantiation-time memory operands must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p node is also + * not modified by this call. + * + * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or + * the original memory operands are multidimensional. 
+ * + * \param hGraphExec - The executable graph in which to set the specified node + * \param node - Memcpy node from the graph which was used to instantiate graphExec + * \param dst - Destination memory address + * \param src - Source memory address + * \param count - Size in bytes to copy + * \param kind - Type of transfer + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddMemcpyNode, + * ::cudaGraphAddMemcpyNode1D, + * ::cudaGraphMemcpyNodeSetParams, + * ::cudaGraphMemcpyNodeSetParams1D, + * ::cudaGraphExecMemcpyNodeSetParams, + * ::cudaGraphExecKernelNodeSetParams, + * ::cudaGraphExecMemsetNodeSetParams, + * ::cudaGraphExecHostNodeSetParams, + * ::cudaGraphExecChildGraphNodeSetParams, + * ::cudaGraphExecEventRecordNodeSetEvent, + * ::cudaGraphExecEventWaitNodeSetEvent, + * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cudaGraphExecUpdate, + * ::cudaGraphInstantiate + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemcpyNodeSetParams1D( + cudaGraphExec_t hGraphExec, + cudaGraphNode_t node, + void* dst, + const void* src, + size_t count, + enum cudaMemcpyKind kind); +#endif + +/** + * \brief Sets the parameters for a memset node in the given graphExec. + * + * Updates the work represented by \p node in \p hGraphExec as though \p node had + * contained \p pNodeParams at instantiation. \p node must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from \p node are ignored. + * + * The destination memory in \p pNodeParams must be allocated from the same + * context as the original destination memory. Both the instantiation-time + * memory operand and the memory operand in \p pNodeParams must be 1-dimensional. + * Zero-length operations are not supported. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p node is also + * not modified by this call. + * + * Returns cudaErrorInvalidValue if the memory operand's mappings changed or + * either the original or new memory operand are multidimensional. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param node - Memset node from the graph which was used to instantiate graphExec + * \param pNodeParams - Updated Parameters to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddMemsetNode, + * ::cudaGraphMemsetNodeSetParams, + * ::cudaGraphExecKernelNodeSetParams, + * ::cudaGraphExecMemcpyNodeSetParams, + * ::cudaGraphExecHostNodeSetParams, + * ::cudaGraphExecChildGraphNodeSetParams, + * ::cudaGraphExecEventRecordNodeSetEvent, + * ::cudaGraphExecEventWaitNodeSetEvent, + * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cudaGraphExecUpdate, + * ::cudaGraphInstantiate + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphExecMemsetNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaMemsetParams *pNodeParams); + +/** + * \brief Sets the parameters for a host node in the given graphExec. 
+ * + * Updates the work represented by \p node in \p hGraphExec as though \p node had + * contained \p pNodeParams at instantiation. \p node must remain in the graph which was + * used to instantiate \p hGraphExec. Changed edges to and from \p node are ignored. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p node is also + * not modified by this call. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param node - Host node from the graph which was used to instantiate graphExec + * \param pNodeParams - Updated Parameters to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddHostNode, + * ::cudaGraphHostNodeSetParams, + * ::cudaGraphExecKernelNodeSetParams, + * ::cudaGraphExecMemcpyNodeSetParams, + * ::cudaGraphExecMemsetNodeSetParams, + * ::cudaGraphExecChildGraphNodeSetParams, + * ::cudaGraphExecEventRecordNodeSetEvent, + * ::cudaGraphExecEventWaitNodeSetEvent, + * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cudaGraphExecUpdate, + * ::cudaGraphInstantiate + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphExecHostNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, const struct cudaHostNodeParams *pNodeParams); + +/** + * \brief Updates node parameters in the child graph node in the given graphExec. + * + * Updates the work represented by \p node in \p hGraphExec as though the nodes contained + * in \p node's graph had the parameters contained in \p childGraph's nodes at instantiation. + * \p node must remain in the graph which was used to instantiate \p hGraphExec. + * Changed edges to and from \p node are ignored. + * + * The modifications only affect future launches of \p hGraphExec. Already enqueued + * or running launches of \p hGraphExec are not affected by this call. \p node is also + * not modified by this call. + * + * The topology of \p childGraph, as well as the node insertion order, must match that + * of the graph contained in \p node. See ::cudaGraphExecUpdate() for a list of restrictions + * on what can be updated in an instantiated graph. The update is recursive, so child graph + * nodes contained within the top level child graph will also be updated. 
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param node - Child graph node from the graph which was used to instantiate graphExec
+ * \param childGraph - The graph supplying the updated parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphChildGraphNodeGetGraph,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecEventRecordNodeSetEvent,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecChildGraphNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t node, cudaGraph_t childGraph);
+#endif
+
+/**
+ * \brief Sets the event for an event record node in the given graphExec
+ *
+ * Sets the event of an event record node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode - Event record node from the graph from which graphExec was instantiated
+ * \param event - Updated event to use
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddEventRecordNode,
+ * ::cudaGraphEventRecordNodeGetEvent,
+ * ::cudaGraphEventWaitNodeSetEvent,
+ * ::cudaEventRecordWithFlags,
+ * ::cudaStreamWaitEvent,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams,
+ * ::cudaGraphExecChildGraphNodeSetParams,
+ * ::cudaGraphExecEventWaitNodeSetEvent,
+ * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams,
+ * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventRecordNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event);
+#endif
+
+/**
+ * \brief Sets the event for an event wait node in the given graphExec
+ *
+ * Sets the event of an event wait node in an executable graph \p hGraphExec.
+ * The node is identified by the corresponding node \p hNode in the
+ * non-executable graph, from which the executable graph was instantiated.
+ *
+ * The modifications only affect future launches of \p hGraphExec. Already
+ * enqueued or running launches of \p hGraphExec are not affected by this call.
+ * \p hNode is also not modified by this call.
+ * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Event wait node from the graph from which graphExec was instantiated + * \param event - Updated event to use + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddEventWaitNode, + * ::cudaGraphEventWaitNodeGetEvent, + * ::cudaGraphEventRecordNodeSetEvent, + * ::cudaEventRecordWithFlags, + * ::cudaStreamWaitEvent, + * ::cudaGraphExecKernelNodeSetParams, + * ::cudaGraphExecMemcpyNodeSetParams, + * ::cudaGraphExecMemsetNodeSetParams, + * ::cudaGraphExecHostNodeSetParams, + * ::cudaGraphExecChildGraphNodeSetParams, + * ::cudaGraphExecEventRecordNodeSetEvent, + * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cudaGraphExecUpdate, + * ::cudaGraphInstantiate + */ +#if __CUDART_API_VERSION >= 11010 + extern __host__ cudaError_t CUDARTAPI cudaGraphExecEventWaitNodeSetEvent(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, cudaEvent_t event); +#endif + +/** + * \brief Sets the parameters for an external semaphore signal node in the given graphExec + * + * Sets the parameters of an external semaphore signal node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * Changing \p nodeParams->numExtSems is not supported. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - semaphore signal node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddExternalSemaphoresSignalNode, + * ::cudaImportExternalSemaphore, + * ::cudaSignalExternalSemaphoresAsync, + * ::cudaWaitExternalSemaphoresAsync, + * ::cudaGraphExecKernelNodeSetParams, + * ::cudaGraphExecMemcpyNodeSetParams, + * ::cudaGraphExecMemsetNodeSetParams, + * ::cudaGraphExecHostNodeSetParams, + * ::cudaGraphExecChildGraphNodeSetParams, + * ::cudaGraphExecEventRecordNodeSetEvent, + * ::cudaGraphExecEventWaitNodeSetEvent, + * ::cudaGraphExecExternalSemaphoresWaitNodeSetParams, + * ::cudaGraphExecUpdate, + * ::cudaGraphInstantiate + */ +#if __CUDART_API_VERSION >= 11020 +extern __host__ cudaError_t CUDARTAPI cudaGraphExecExternalSemaphoresSignalNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreSignalNodeParams *nodeParams); +#endif + +/** + * \brief Sets the parameters for an external semaphore wait node in the given graphExec + * + * Sets the parameters of an external semaphore wait node in an executable graph \p hGraphExec. + * The node is identified by the corresponding node \p hNode in the + * non-executable graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * The modifications only affect future launches of \p hGraphExec. 
Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * Changing \p nodeParams->numExtSems is not supported. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - semaphore wait node from the graph from which graphExec was instantiated + * \param nodeParams - Updated Parameters to set + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphAddExternalSemaphoresWaitNode, + * ::cudaImportExternalSemaphore, + * ::cudaSignalExternalSemaphoresAsync, + * ::cudaWaitExternalSemaphoresAsync, + * ::cudaGraphExecKernelNodeSetParams, + * ::cudaGraphExecMemcpyNodeSetParams, + * ::cudaGraphExecMemsetNodeSetParams, + * ::cudaGraphExecHostNodeSetParams, + * ::cudaGraphExecChildGraphNodeSetParams, + * ::cudaGraphExecEventRecordNodeSetEvent, + * ::cudaGraphExecEventWaitNodeSetEvent, + * ::cudaGraphExecExternalSemaphoresSignalNodeSetParams, + * ::cudaGraphExecUpdate, + * ::cudaGraphInstantiate + */ +#if __CUDART_API_VERSION >= 11020 +extern __host__ cudaError_t CUDARTAPI cudaGraphExecExternalSemaphoresWaitNodeSetParams(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, const struct cudaExternalSemaphoreWaitNodeParams *nodeParams); +#endif + +/** + * \brief Enables or disables the specified node in the given graphExec + * + * Sets \p hNode to be either enabled or disabled. Disabled nodes are functionally equivalent + * to empty nodes until they are reenabled. Existing node parameters are not affected by + * disabling/enabling the node. + * + * The node is identified by the corresponding node \p hNode in the non-executable + * graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * The modifications only affect future launches of \p hGraphExec. Already + * enqueued or running launches of \p hGraphExec are not affected by this call. + * \p hNode is also not modified by this call. + * + * \note Currently only kernel, memset and memcpy nodes are supported. + * + * \param hGraphExec - The executable graph in which to set the specified node + * \param hNode - Node from the graph from which graphExec was instantiated + * \param isEnabled - Node is enabled if != 0, otherwise the node is disabled + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphNodeGetEnabled, + * ::cudaGraphExecUpdate, + * ::cudaGraphInstantiate + * ::cudaGraphLaunch + */ +#if __CUDART_API_VERSION >= 11060 +extern __host__ cudaError_t CUDARTAPI cudaGraphNodeSetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int isEnabled); +#endif + +/** + * \brief Query whether a node in the given graphExec is enabled + * + * Sets isEnabled to 1 if \p hNode is enabled, or 0 if \p hNode is disabled. + * + * The node is identified by the corresponding node \p hNode in the non-executable + * graph, from which the executable graph was instantiated. + * + * \p hNode must not have been removed from the original graph. + * + * \note Currently only kernel, memset and memcpy nodes are supported. 
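+ *
+ * For illustration, a minimal sketch that checks a kernel node and disables it
+ * for subsequent launches (assumes \p graphExec, \p kernelNode and \p stream
+ * already exist and omits error checking):
+ * \code
+ * unsigned int isEnabled = 0;
+ * cudaGraphNodeGetEnabled(graphExec, kernelNode, &isEnabled);
+ *
+ * if (isEnabled)
+ *     cudaGraphNodeSetEnabled(graphExec, kernelNode, 0);   // behaves like an empty node from now on
+ *
+ * cudaGraphLaunch(graphExec, stream);
+ * \endcode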
+ *
+ * \param hGraphExec - The executable graph in which to set the specified node
+ * \param hNode - Node from the graph from which graphExec was instantiated
+ * \param isEnabled - Location to return the enabled status of the node
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphNodeSetEnabled,
+ * ::cudaGraphExecUpdate,
+ * ::cudaGraphInstantiate
+ * ::cudaGraphLaunch
+ */
+#if __CUDART_API_VERSION >= 11060
+extern __host__ cudaError_t CUDARTAPI cudaGraphNodeGetEnabled(cudaGraphExec_t hGraphExec, cudaGraphNode_t hNode, unsigned int *isEnabled);
+#endif
+
+/**
+ * \brief Check whether an executable graph can be updated with a graph and perform the update if possible
+ *
+ * Updates the node parameters in the instantiated graph specified by \p hGraphExec with the
+ * node parameters in a topologically identical graph specified by \p hGraph.
+ *
+ * Limitations:
+ *
+ * - Kernel nodes:
+ *   - The owning context of the function cannot change.
+ *   - A node whose function originally did not use CUDA dynamic parallelism cannot be updated
+ *     to a function which uses CDP.
+ *   - A cooperative node cannot be updated to a non-cooperative node, and vice-versa.
+ *   - If the graph was instantiated with cudaGraphInstantiateFlagUseNodePriority, the
+ *     priority attribute cannot change. Equality is checked on the originally requested
+ *     priority values, before they are clamped to the device's supported range.
+ * - Memset and memcpy nodes:
+ *   - The CUDA device(s) to which the operand(s) was allocated/mapped cannot change.
+ *   - The source/destination memory must be allocated from the same contexts as the original
+ *     source/destination memory.
+ *   - Only 1D memsets can be changed.
+ * - Additional memcpy node restrictions:
+ *   - Changing either the source or destination memory type (i.e. CU_MEMORYTYPE_DEVICE,
+ *     CU_MEMORYTYPE_ARRAY, etc.) is not supported.
+ *
+ * Note: The API may add further restrictions in future releases. The return code should always be checked.
+ *
+ * cudaGraphExecUpdate sets \p updateResult_out to cudaGraphExecUpdateErrorTopologyChanged under
+ * the following conditions:
+ *
+ * - The counts of nodes directly in \p hGraphExec and \p hGraph differ, in which case \p hErrorNode_out
+ *   is NULL.
+ * - A node is deleted in \p hGraph but not its pair from \p hGraphExec, in which case \p hErrorNode_out
+ *   is NULL.
+ * - A node is deleted in \p hGraphExec but not its pair from \p hGraph, in which case \p hErrorNode_out is
+ *   the pairless node from \p hGraph.
+ * - The dependent nodes of a pair differ, in which case \p hErrorNode_out is the node from \p hGraph.
+ *
+ * cudaGraphExecUpdate sets \p updateResult_out to:
+ * - cudaGraphExecUpdateError if passed an invalid value.
+ * - cudaGraphExecUpdateErrorTopologyChanged if the graph topology changed
+ * - cudaGraphExecUpdateErrorNodeTypeChanged if the type of a node changed, in which case
+ *   \p hErrorNode_out is set to the node from \p hGraph.
+ * - cudaGraphExecUpdateErrorFunctionChanged if the function of a kernel node changed (CUDA driver < 11.2)
+ * - cudaGraphExecUpdateErrorUnsupportedFunctionChange if the func field of a kernel changed in an
+ *   unsupported way (see note above), in which case \p hErrorNode_out is set to the node from \p hGraph
+ * - cudaGraphExecUpdateErrorParametersChanged if any parameters to a node changed in a way
+ *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph
+ * - cudaGraphExecUpdateErrorAttributesChanged if any attributes of a node changed in a way
+ *   that is not supported, in which case \p hErrorNode_out is set to the node from \p hGraph
+ * - cudaGraphExecUpdateErrorNotSupported if something about a node is unsupported, like
+ *   the node's type or configuration, in which case \p hErrorNode_out is set to the node from \p hGraph
+ *
+ * If \p updateResult_out isn't set in one of the situations described above, the update check passes
+ * and cudaGraphExecUpdate updates \p hGraphExec to match the contents of \p hGraph. If an error happens
+ * during the update, \p updateResult_out will be set to cudaGraphExecUpdateError; otherwise,
+ * \p updateResult_out is set to cudaGraphExecUpdateSuccess.
+ *
+ * cudaGraphExecUpdate returns cudaSuccess when the update was performed successfully. It returns
+ * cudaErrorGraphExecUpdateFailure if the graph update was not performed because it included
+ * changes which violated constraints specific to instantiated graph update.
+ *
+ * \param hGraphExec The instantiated graph to be updated
+ * \param hGraph The graph containing the updated parameters
+ * \param hErrorNode_out The node which caused the permissibility check to forbid the update, if any
+ * \param updateResult_out Whether the graph update was permitted. If it was forbidden, the reason why
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorGraphExecUpdateFailure,
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphExecUpdate(cudaGraphExec_t hGraphExec, cudaGraph_t hGraph, cudaGraphNode_t *hErrorNode_out, enum cudaGraphExecUpdateResult *updateResult_out);
+
+/**
+ * \brief Uploads an executable graph in a stream
+ *
+ * Uploads \p hGraphExec to the device in \p hStream without executing it. Uploads of
+ * the same \p hGraphExec will be serialized. Each upload is ordered behind both any
+ * previous work in \p hStream and any previous launches of \p hGraphExec.
+ * Uses memory cached by \p stream to back the allocations owned by \p graphExec.
+ *
+ * \param hGraphExec - Executable graph to upload
+ * \param hStream - Stream in which to upload the graph
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * \notefnerr
+ * \note_init_rt
+ *
+ * \sa
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphLaunch,
+ * ::cudaGraphExecDestroy
+ */
+#if __CUDART_API_VERSION >= 11010
+ extern __host__ cudaError_t CUDARTAPI cudaGraphUpload(cudaGraphExec_t graphExec, cudaStream_t stream);
+#endif
+
+/**
+ * \brief Launches an executable graph in a stream
+ *
+ * Executes \p graphExec in \p stream. Only one instance of \p graphExec may be executing
+ * at a time. Each launch is ordered behind both any previous work in \p stream
+ * and any previous launches of \p graphExec. To execute a graph concurrently, it must be
+ * instantiated multiple times into multiple executable graphs.
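+ *
+ * A minimal instantiate/upload/launch sequence for illustration; \p graph is
+ * assumed to be a previously constructed ::cudaGraph_t and \p stream a valid
+ * stream, and the ::cudaGraphInstantiate signature shown is the CUDA 11.x form:
+ *
+ * \code
+ * cudaGraphExec_t exec;
+ * cudaGraphInstantiate(&exec, graph, NULL, NULL, 0); // create the executable graph
+ * cudaGraphUpload(exec, stream);                     // optional: prefetch the work to the device
+ * cudaGraphLaunch(exec, stream);                     // ordered behind prior work in stream
+ * cudaStreamSynchronize(stream);
+ * cudaGraphExecDestroy(exec);
+ * \endcode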
+ * + * If any allocations created by \p graphExec remain unfreed (from a previous launch) and + * \p graphExec was not instantiated with ::cudaGraphInstantiateFlagAutoFreeOnLaunch, + * the launch will fail with ::cudaErrorInvalidValue. + * + * \param graphExec - Executable graph to launch + * \param stream - Stream in which to launch the graph + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * + * \sa + * ::cudaGraphInstantiate, + * ::cudaGraphUpload, + * ::cudaGraphExecDestroy + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream); + +/** + * \brief Destroys an executable graph + * + * Destroys the executable graph specified by \p graphExec. + * + * \param graphExec - Executable graph to destroy + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * \note_destroy_ub + * + * \sa + * ::cudaGraphInstantiate, + * ::cudaGraphUpload, + * ::cudaGraphLaunch + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphExecDestroy(cudaGraphExec_t graphExec); + +/** + * \brief Destroys a graph + * + * Destroys the graph specified by \p graph, as well as all of its nodes. + * + * \param graph - Graph to destroy + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * \note_graph_thread_safety + * \notefnerr + * \note_init_rt + * \note_callback + * \note_destroy_ub + * + * \sa + * ::cudaGraphCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphDestroy(cudaGraph_t graph); + +/** + * \brief Write a DOT file describing graph structure + * + * Using the provided \p graph, write to \p path a DOT formatted description of the graph. + * By default this includes the graph topology, node types, node id, kernel names and memcpy direction. + * \p flags can be specified to write more detailed information about each node type such as + * parameter values, kernel attributes, node and function handles. + * + * \param graph - The graph to create a DOT file from + * \param path - The path to write the DOT file to + * \param flags - Flags from cudaGraphDebugDotFlags for specifying which additional node information to write + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue, + * ::cudaErrorOperatingSystem + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphDebugDotPrint(cudaGraph_t graph, const char *path, unsigned int flags); + +/** + * \brief Create a user object + * + * Create a user object with the specified destructor callback and initial reference count. The + * initial references are owned by the caller. + * + * Destructor callbacks cannot make CUDA API calls and should avoid blocking behavior, as they + * are executed by a shared internal thread. Another thread may be signaled to perform such + * actions, if it does not block forward progress of tasks scheduled through CUDA. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param object_out - Location to return the user object handle + * \param ptr - The pointer to pass to the destroy function + * \param destroy - Callback to free the user object when it is no longer in use + * \param initialRefcount - The initial refcount to create the object with, typically 1. The + * initial references are owned by the calling thread. 
+ * \param flags - Currently it is required to pass ::cudaUserObjectNoDestructorSync, + * which is the only defined flag. This indicates that the destroy + * callback cannot be waited on by any CUDA API. Users requiring + * synchronization of the callback should signal its completion + * manually. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * + * \sa + * ::cudaUserObjectRetain, + * ::cudaUserObjectRelease, + * ::cudaGraphRetainUserObject, + * ::cudaGraphReleaseUserObject, + * ::cudaGraphCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaUserObjectCreate(cudaUserObject_t *object_out, void *ptr, cudaHostFn_t destroy, unsigned int initialRefcount, unsigned int flags); + +/** + * \brief Retain a reference to a user object + * + * Retains new references to a user object. The new references are owned by the caller. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param object - The object to retain + * \param count - The number of references to retain, typically 1. Must be nonzero + * and not larger than INT_MAX. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * + * \sa + * ::cudaUserObjectCreate, + * ::cudaUserObjectRelease, + * ::cudaGraphRetainUserObject, + * ::cudaGraphReleaseUserObject, + * ::cudaGraphCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaUserObjectRetain(cudaUserObject_t object, unsigned int count __dv(1)); + +/** + * \brief Release a reference to a user object + * + * Releases user object references owned by the caller. The object's destructor is invoked if + * the reference count reaches zero. + * + * It is undefined behavior to release references not owned by the caller, or to use a user + * object handle after all references are released. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param object - The object to release + * \param count - The number of references to release, typically 1. Must be nonzero + * and not larger than INT_MAX. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * + * \sa + * ::cudaUserObjectCreate, + * ::cudaUserObjectRetain, + * ::cudaGraphRetainUserObject, + * ::cudaGraphReleaseUserObject, + * ::cudaGraphCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaUserObjectRelease(cudaUserObject_t object, unsigned int count __dv(1)); + +/** + * \brief Retain a reference to a user object from a graph + * + * Creates or moves user object references that will be owned by a CUDA graph. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param graph - The graph to associate the reference with + * \param object - The user object to retain a reference for + * \param count - The number of references to add to the graph, typically 1. Must be + * nonzero and not larger than INT_MAX. + * \param flags - The optional flag ::cudaGraphUserObjectMove transfers references + * from the calling thread, rather than create new references. Pass 0 + * to create new references. 
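+ *
+ * A minimal illustrative sketch; \p graph is assumed to be an existing
+ * ::cudaGraph_t, and \p myState / \p myStateDestroy are placeholders for an
+ * application-defined resource and its ::cudaHostFn_t destructor:
+ *
+ * \code
+ * cudaUserObject_t obj;
+ * cudaUserObjectCreate(&obj, myState, myStateDestroy, 1, cudaUserObjectNoDestructorSync);
+ * cudaGraphRetainUserObject(graph, obj, 1, cudaGraphUserObjectMove); // the graph now owns the reference
+ * \endcode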
+ * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * + * \sa + * ::cudaUserObjectCreate + * ::cudaUserObjectRetain, + * ::cudaUserObjectRelease, + * ::cudaGraphReleaseUserObject, + * ::cudaGraphCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphRetainUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1), unsigned int flags __dv(0)); + +/** + * \brief Release a user object reference from a graph + * + * Releases user object references owned by a graph. + * + * See CUDA User Objects in the CUDA C++ Programming Guide for more information on user objects. + * + * \param graph - The graph that will release the reference + * \param object - The user object to release a reference for + * \param count - The number of references to release, typically 1. Must be nonzero + * and not larger than INT_MAX. + * + * \return + * ::cudaSuccess, + * ::cudaErrorInvalidValue + * + * \sa + * ::cudaUserObjectCreate + * ::cudaUserObjectRetain, + * ::cudaUserObjectRelease, + * ::cudaGraphRetainUserObject, + * ::cudaGraphCreate + */ +extern __host__ cudaError_t CUDARTAPI cudaGraphReleaseUserObject(cudaGraph_t graph, cudaUserObject_t object, unsigned int count __dv(1)); + +/** @} */ /* END CUDART_GRAPH */ + +/** + * \defgroup CUDART_DRIVER_ENTRY_POINT Driver Entry Point Access + * + * ___MANBRIEF___ driver entry point access functions of the CUDA runtime API + * (___CURRENT_FILE___) ___ENDMANBRIEF___ + * + * This section describes the driver entry point access functions of CUDA + * runtime application programming interface. + * + * @{ + */ + +/** + * \brief Returns the requested driver API function pointer + * + * Returns in \p **funcPtr the address of the CUDA driver function for the requested flags. + * + * For a requested driver symbol, if the CUDA version in which the driver symbol was + * introduced is less than or equal to the CUDA runtime version, the API will return + * the function pointer to the corresponding versioned driver function. + * + * The pointer returned by the API should be cast to a function pointer matching the + * requested driver function's definition in the API header file. The function pointer + * typedef can be picked up from the corresponding typedefs header file. For example, + * cudaTypedefs.h consists of function pointer typedefs for driver APIs defined in cuda.h. + * + * The API will return ::cudaErrorSymbolNotFound if the requested driver function is not + * supported on the platform, no ABI compatible driver function exists for the CUDA runtime + * version or if the driver symbol is invalid. + * + * The requested flags can be: + * - ::cudaEnableDefault: This is the default mode. This is equivalent to + * ::cudaEnablePerThreadDefaultStream if the code is compiled with + * --default-stream per-thread compilation flag or the macro CUDA_API_PER_THREAD_DEFAULT_STREAM + * is defined; ::cudaEnableLegacyStream otherwise. + * - ::cudaEnableLegacyStream: This will enable the search for all driver symbols + * that match the requested driver symbol name except the corresponding per-thread versions. + * - ::cudaEnablePerThreadDefaultStream: This will enable the search for all + * driver symbols that match the requested driver symbol name including the per-thread + * versions. If a per-thread version is not found, the API will return the legacy version + * of the driver function. + * + * \param symbol - The base name of the driver API function to look for. As an example, + * for the driver API ::cuMemAlloc_v2, \p symbol would be cuMemAlloc. 
+ *                 Note that the API will use the CUDA runtime version to return the
+ *                 address to the most recent ABI compatible driver symbol, ::cuMemAlloc
+ *                 or ::cuMemAlloc_v2.
+ * \param funcPtr - Location to return the function pointer to the requested driver function
+ * \param flags - Flags to specify search options.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorSymbolNotFound
+ * \note_version_mixing
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cuGetProcAddress
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(const char *symbol, void **funcPtr, unsigned long long flags);
+
+/** @} */ /* END CUDART_DRIVER_ENTRY_POINT */
+
+/** \cond impl_private */
+extern __host__ cudaError_t CUDARTAPI cudaGetExportTable(const void **ppExportTable, const cudaUUID_t *pExportTableId);
+/** \endcond impl_private */
+
+/**
+ * \defgroup CUDART_HIGHLEVEL C++ API Routines
+ *
+ * ___MANBRIEF___ C++ high level API functions of the CUDA runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the C++ high level API functions of the CUDA runtime
+ * application programming interface. To use these functions, your
+ * application needs to be compiled with the \p nvcc compiler.
+ *
+ * \brief C++-style interface built on top of CUDA runtime API
+ */
+
+/**
+ * \defgroup CUDART_DRIVER Interactions with the CUDA Driver API
+ *
+ * ___MANBRIEF___ interactions between CUDA Driver API and CUDA Runtime API
+ * (___CURRENT_FILE___) ___ENDMANBRIEF___
+ *
+ * This section describes the interactions between the CUDA Driver API and the CUDA Runtime API.
+ *
+ * @{
+ *
+ * \section CUDART_CUDA_primary Primary Contexts
+ *
+ * There exists a one-to-one relationship between CUDA devices in the CUDA Runtime
+ * API and ::CUcontext s in the CUDA Driver API within a process. The specific
+ * context which the CUDA Runtime API uses for a device is called the device's
+ * primary context. From the perspective of the CUDA Runtime API, a device and
+ * its primary context are synonymous.
+ *
+ * \section CUDART_CUDA_init Initialization and Tear-Down
+ *
+ * CUDA Runtime API calls operate on the CUDA Driver API ::CUcontext which is current
+ * to the calling host thread.
+ *
+ * The function ::cudaSetDevice() makes the primary context for the
+ * specified device current to the calling thread by calling ::cuCtxSetCurrent().
+ *
+ * The CUDA Runtime API will automatically initialize the primary context for
+ * a device at the first CUDA Runtime API call which requires an active context.
+ * If no ::CUcontext is current to the calling thread when a CUDA Runtime API call
+ * which requires an active context is made, then the primary context for a device
+ * will be selected, made current to the calling thread, and initialized.
+ *
+ * The context which the CUDA Runtime API initializes will be initialized using
+ * the parameters specified by the CUDA Runtime API functions
+ * ::cudaSetDeviceFlags(),
+ * ::cudaD3D9SetDirect3DDevice(),
+ * ::cudaD3D10SetDirect3DDevice(),
+ * ::cudaD3D11SetDirect3DDevice(),
+ * ::cudaGLSetGLDevice(), and
+ * ::cudaVDPAUSetVDPAUDevice().
+ * Note that these functions will fail with ::cudaErrorSetOnActiveProcess if they are
+ * called when the primary context for the specified device has already been initialized
+ * (or if the current device has already been initialized, in the case of
+ * ::cudaSetDeviceFlags()).
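+ *
+ * For example (an illustrative sketch only), a host thread with no current driver
+ * context can rely on the runtime to install the device's primary context:
+ *
+ * \code
+ * cudaSetDevice(0);      // device 0's primary context becomes current to this thread
+ * cudaFree(0);           // a commonly used no-op runtime call that forces context initialization
+ * CUcontext ctx;
+ * cuCtxGetCurrent(&ctx); // the driver API now reports that same primary context
+ * \endcode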
+ * + * Primary contexts will remain active until they are explicitly deinitialized + * using ::cudaDeviceReset(). The function ::cudaDeviceReset() will deinitialize the + * primary context for the calling thread's current device immediately. The context + * will remain current to all of the threads that it was current to. The next CUDA + * Runtime API call on any thread which requires an active context will trigger the + * reinitialization of that device's primary context. + * + * Note that primary contexts are shared resources. It is recommended that + * the primary context not be reset except just before exit or to recover from an + * unspecified launch failure. + * + * \section CUDART_CUDA_context Context Interoperability + * + * Note that the use of multiple ::CUcontext s per device within a single process + * will substantially degrade performance and is strongly discouraged. Instead, + * it is highly recommended that the implicit one-to-one device-to-context mapping + * for the process provided by the CUDA Runtime API be used. + * + * If a non-primary ::CUcontext created by the CUDA Driver API is current to a + * thread then the CUDA Runtime API calls to that thread will operate on that + * ::CUcontext, with some exceptions listed below. Interoperability between data + * types is discussed in the following sections. + * + * The function ::cudaPointerGetAttributes() will return the error + * ::cudaErrorIncompatibleDriverContext if the pointer being queried was allocated by a + * non-primary context. The function ::cudaDeviceEnablePeerAccess() and the rest of + * the peer access API may not be called when a non-primary ::CUcontext is current. + * To use the pointer query and peer access APIs with a context created using the + * CUDA Driver API, it is necessary that the CUDA Driver API be used to access + * these features. + * + * All CUDA Runtime API state (e.g, global variables' addresses and values) travels + * with its underlying ::CUcontext. In particular, if a ::CUcontext is moved from one + * thread to another then all CUDA Runtime API state will move to that thread as well. + * + * Please note that attaching to legacy contexts (those with a version of 3010 as returned + * by ::cuCtxGetApiVersion()) is not possible. The CUDA Runtime will return + * ::cudaErrorIncompatibleDriverContext in such cases. + * + * \section CUDART_CUDA_stream Interactions between CUstream and cudaStream_t + * + * The types ::CUstream and ::cudaStream_t are identical and may be used interchangeably. + * + * \section CUDART_CUDA_event Interactions between CUevent and cudaEvent_t + * + * The types ::CUevent and ::cudaEvent_t are identical and may be used interchangeably. + * + * \section CUDART_CUDA_array Interactions between CUarray and cudaArray_t + * + * The types ::CUarray and struct ::cudaArray * represent the same data type and may be used + * interchangeably by casting the two types between each other. + * + * In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray *, + * it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray *. + * + * In order to use a struct ::cudaArray * in a CUDA Driver API function which takes a ::CUarray, + * it is necessary to explicitly cast the struct ::cudaArray * to a ::CUarray . 
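+ *
+ * For example (an illustrative sketch; driver API initialization via ::cuInit and
+ * an active context are assumed to have been set up already):
+ *
+ * \code
+ * CUDA_ARRAY_DESCRIPTOR d = { 0 };
+ * d.Width = 256; d.Height = 256;
+ * d.Format = CU_AD_FORMAT_FLOAT; d.NumChannels = 1;
+ * CUarray drvArray;
+ * cuArrayCreate(&drvArray, &d);                              // allocated with the driver API
+ * struct cudaArray *rtArray = (struct cudaArray *)drvArray;  // explicit cast for runtime API calls
+ * CUarray sameArray = (CUarray)rtArray;                      // explicit cast back for driver API calls
+ * \endcode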
+ * + * \section CUDART_CUDA_graphicsResource Interactions between CUgraphicsResource and cudaGraphicsResource_t + * + * The types ::CUgraphicsResource and ::cudaGraphicsResource_t represent the same data type and may be used + * interchangeably by casting the two types between each other. + * + * In order to use a ::CUgraphicsResource in a CUDA Runtime API function which takes a + * ::cudaGraphicsResource_t, it is necessary to explicitly cast the ::CUgraphicsResource + * to a ::cudaGraphicsResource_t. + * + * In order to use a ::cudaGraphicsResource_t in a CUDA Driver API function which takes a + * ::CUgraphicsResource, it is necessary to explicitly cast the ::cudaGraphicsResource_t + * to a ::CUgraphicsResource. + * + * \section CUDART_CUDA_texture_objects Interactions between CUtexObject and cudaTextureObject_t + * + * The types ::CUtexObject and ::cudaTextureObject_t represent the same data type and may be used + * interchangeably by casting the two types between each other. + * + * In order to use a ::CUtexObject in a CUDA Runtime API function which takes a ::cudaTextureObject_t, + * it is necessary to explicitly cast the ::CUtexObject to a ::cudaTextureObject_t. + * + * In order to use a ::cudaTextureObject_t in a CUDA Driver API function which takes a ::CUtexObject, + * it is necessary to explicitly cast the ::cudaTextureObject_t to a ::CUtexObject. + * + * \section CUDART_CUDA_surface_objects Interactions between CUsurfObject and cudaSurfaceObject_t + * + * The types ::CUsurfObject and ::cudaSurfaceObject_t represent the same data type and may be used + * interchangeably by casting the two types between each other. + * + * In order to use a ::CUsurfObject in a CUDA Runtime API function which takes a ::cudaSurfaceObject_t, + * it is necessary to explicitly cast the ::CUsurfObject to a ::cudaSurfaceObject_t. + * + * In order to use a ::cudaSurfaceObject_t in a CUDA Driver API function which takes a ::CUsurfObject, + * it is necessary to explicitly cast the ::cudaSurfaceObject_t to a ::CUsurfObject. + * + * \section CUDART_CUDA_module Interactions between CUfunction and cudaFunction_t + * + * The types ::CUfunction and ::cudaFunction_t represent the same data type and may be used + * interchangeably by casting the two types between each other. + * + * In order to use a ::cudaFunction_t in a CUDA Driver API function which takes a ::CUfunction, + * it is necessary to explicitly cast the ::cudaFunction_t to a ::CUfunction. + * + */ + + /** + * \brief Get pointer to device entry function that matches entry function \p symbolPtr + * + * Returns in \p functionPtr the device entry function corresponding to the symbol \p symbolPtr. 
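+ *
+ * A minimal illustrative sketch, assuming a hypothetical __global__ kernel
+ * \p myKernel defined in the same translation unit:
+ *
+ * \code
+ * __global__ void myKernel(int *data) { data[threadIdx.x] = threadIdx.x; }
+ *
+ * cudaFunction_t func;
+ * cudaGetFuncBySymbol(&func, (const void *)myKernel); // func can be passed to driver APIs such as ::cuLaunchKernel
+ * \endcode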
+ * + * \param functionPtr - Returns the device entry function + * \param symbolPtr - Pointer to device entry function to search for + * + * \return + * ::cudaSuccess + * + */ +extern __host__ cudaError_t CUDARTAPI_CDECL cudaGetFuncBySymbol(cudaFunction_t* functionPtr, const void* symbolPtr); + +/** @} */ /* END CUDART_DRIVER */ + +#if defined(__CUDA_API_VERSION_INTERNAL) + #undef cudaMemcpy + #undef cudaMemcpyToSymbol + #undef cudaMemcpyFromSymbol + #undef cudaMemcpy2D + #undef cudaMemcpyToArray + #undef cudaMemcpy2DToArray + #undef cudaMemcpyFromArray + #undef cudaMemcpy2DFromArray + #undef cudaMemcpyArrayToArray + #undef cudaMemcpy2DArrayToArray + #undef cudaMemcpy3D + #undef cudaMemcpy3DPeer + #undef cudaMemset + #undef cudaMemset2D + #undef cudaMemset3D + #undef cudaMemcpyAsync + #undef cudaMemcpyToSymbolAsync + #undef cudaMemcpyFromSymbolAsync + #undef cudaMemcpy2DAsync + #undef cudaMemcpyToArrayAsync + #undef cudaMemcpy2DToArrayAsync + #undef cudaMemcpyFromArrayAsync + #undef cudaMemcpy2DFromArrayAsync + #undef cudaMemcpy3DAsync + #undef cudaMemcpy3DPeerAsync + #undef cudaMemsetAsync + #undef cudaMemset2DAsync + #undef cudaMemset3DAsync + #undef cudaStreamQuery + #undef cudaStreamGetFlags + #undef cudaStreamGetPriority + #undef cudaEventRecord + #undef cudaEventRecordWithFlags + #undef cudaStreamWaitEvent + #undef cudaStreamAddCallback + #undef cudaStreamAttachMemAsync + #undef cudaStreamSynchronize + #undef cudaLaunchKernel + #undef cudaLaunchKernelExC + #undef cudaLaunchHostFunc + #undef cudaMemPrefetchAsync + #undef cudaLaunchCooperativeKernel + #undef cudaSignalExternalSemaphoresAsync + #undef cudaWaitExternalSemaphoresAsync + #undef cudaGraphUpload + #undef cudaGraphLaunch + #undef cudaStreamBeginCapture + #undef cudaStreamEndCapture + #undef cudaStreamIsCapturing + #undef cudaStreamGetCaptureInfo + #undef cudaStreamGetCaptureInfo_v2 + #undef cudaStreamCopyAttributes + #undef cudaStreamGetAttribute + #undef cudaStreamSetAttribute + #undef cudaMallocAsync + #undef cudaFreeAsync + #undef cudaMallocFromPoolAsync + #undef cudaGetDriverEntryPoint + + extern __host__ cudaError_t CUDARTAPI cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyHostToDevice)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbol(void *dst, const void *symbol, size_t count, size_t offset __dv(0), enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToHost)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy2D(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArray(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArray(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArray(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind); + extern __host__ cudaError_t CUDARTAPI 
cudaMemcpyArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t count, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DArrayToArray(cudaArray_t dst, size_t wOffsetDst, size_t hOffsetDst, cudaArray_const_t src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, enum cudaMemcpyKind kind __dv(cudaMemcpyDeviceToDevice)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy3D(const struct cudaMemcpy3DParms *p); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeer(const struct cudaMemcpy3DPeerParms *p); + extern __host__ cudaError_t CUDARTAPI cudaMemset(void *devPtr, int value, size_t count); + extern __host__ cudaError_t CUDARTAPI cudaMemset2D(void *devPtr, size_t pitch, int value, size_t width, size_t height); + extern __host__ cudaError_t CUDARTAPI cudaMemset3D(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpyAsync(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyToSymbolAsync(const void *symbol, const void *src, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromSymbolAsync(void *dst, const void *symbol, size_t count, size_t offset, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy2DAsync(void *dst, size_t dpitch, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DToArrayAsync(cudaArray_t dst, size_t wOffset, size_t hOffset, const void *src, size_t spitch, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpyFromArrayAsync(void *dst, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t count, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy2DFromArrayAsync(void *dst, size_t dpitch, cudaArray_const_t src, size_t wOffset, size_t hOffset, size_t width, size_t height, enum cudaMemcpyKind kind, cudaStream_t stream __dv(0)); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemcpy3DAsync(const struct cudaMemcpy3DParms *p, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaMemcpy3DPeerAsync(const struct cudaMemcpy3DPeerParms *p, cudaStream_t stream __dv(0)); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream __dv(0)); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset2DAsync(void *devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream __dv(0)); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaMemset3DAsync(struct cudaPitchedPtr pitchedDevPtr, int value, struct cudaExtent extent, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaStreamQuery(cudaStream_t stream); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI 
cudaStreamGetFlags(cudaStream_t hStream, unsigned int *flags); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamGetPriority(cudaStream_t hStream, int *priority); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecord(cudaEvent_t event, cudaStream_t stream __dv(0)); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaEventRecordWithFlags(cudaEvent_t event, cudaStream_t stream __dv(0), unsigned int flags __dv(0)); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, unsigned int flags); + extern __host__ cudaError_t CUDARTAPI cudaStreamAddCallback(cudaStream_t stream, cudaStreamCallback_t callback, void *userData, unsigned int flags); + extern __host__ __cudart_builtin__ cudaError_t CUDARTAPI cudaStreamAttachMemAsync(cudaStream_t stream, void *devPtr, size_t length, unsigned int flags); + extern __host__ cudaError_t CUDARTAPI cudaStreamSynchronize(cudaStream_t stream); + extern __host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); + extern __host__ cudaError_t CUDARTAPI cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args); + extern __host__ cudaError_t CUDARTAPI cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); + extern __host__ cudaError_t CUDARTAPI cudaLaunchHostFunc(cudaStream_t stream, cudaHostFn_t fn, void *userData); + extern __host__ cudaError_t CUDARTAPI cudaMemPrefetchAsync(const void *devPtr, size_t count, int dstDevice, cudaStream_t stream); + extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync_ptsz(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaSignalExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreSignalParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync_ptsz(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams_v1 *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaWaitExternalSemaphoresAsync_v2(const cudaExternalSemaphore_t *extSemArray, const struct cudaExternalSemaphoreWaitParams *paramsArray, unsigned int numExtSems, cudaStream_t stream __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaGraphUpload(cudaGraphExec_t graphExec, cudaStream_t stream); + extern __host__ cudaError_t CUDARTAPI cudaGraphLaunch(cudaGraphExec_t graphExec, cudaStream_t stream); + extern __host__ cudaError_t CUDARTAPI cudaStreamBeginCapture(cudaStream_t stream, enum cudaStreamCaptureMode mode); + extern __host__ cudaError_t CUDARTAPI cudaStreamEndCapture(cudaStream_t stream, cudaGraph_t *pGraph); + extern 
__host__ cudaError_t CUDARTAPI cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus); + extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out); + extern __host__ cudaError_t CUDARTAPI cudaStreamGetCaptureInfo_v2(cudaStream_t stream, enum cudaStreamCaptureStatus *captureStatus_out, unsigned long long *id_out __dv(0), cudaGraph_t *graph_out __dv(0), const cudaGraphNode_t **dependencies_out __dv(0), size_t *numDependencies_out __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaStreamUpdateCaptureDependencies_ptsz(cudaStream_t stream, cudaGraphNode_t *dependencies, size_t numDependencies, unsigned int flags __dv(0)); + extern __host__ cudaError_t CUDARTAPI cudaStreamCopyAttributes(cudaStream_t dstStream, cudaStream_t srcStream); + extern __host__ cudaError_t CUDARTAPI cudaStreamGetAttribute(cudaStream_t stream, cudaStreamAttrID attr, cudaStreamAttrValue *value); + extern __host__ cudaError_t CUDARTAPI cudaStreamSetAttribute(cudaStream_t stream, cudaStreamAttrID attr, const cudaStreamAttrValue *param); + + extern __host__ cudaError_t CUDARTAPI cudaMallocAsync(void **devPtr, size_t size, cudaStream_t hStream); + extern __host__ cudaError_t CUDARTAPI cudaFreeAsync(void *devPtr, cudaStream_t hStream); + extern __host__ cudaError_t CUDARTAPI cudaMallocFromPoolAsync(void **ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream); + extern __host__ cudaError_t CUDARTAPI cudaGetDriverEntryPoint(const char *symbol, void **funcPtr, unsigned long long flags); + +#elif defined(__CUDART_API_PER_THREAD_DEFAULT_STREAM) + // nvcc stubs reference the 'cudaLaunch'/'cudaLaunchKernel' identifier even if it was defined + // to 'cudaLaunch_ptsz'/'cudaLaunchKernel_ptsz'. Redirect through a static inline function. + #undef cudaLaunchKernel + static __inline__ __host__ cudaError_t cudaLaunchKernel(const void *func, + dim3 gridDim, dim3 blockDim, + void **args, size_t sharedMem, + cudaStream_t stream) + { + return cudaLaunchKernel_ptsz(func, gridDim, blockDim, args, sharedMem, stream); + } + #define cudaLaunchKernel __CUDART_API_PTSZ(cudaLaunchKernel) + #undef cudaLaunchKernelExC + static __inline__ __host__ cudaError_t cudaLaunchKernelExC(const cudaLaunchConfig_t *config, + const void *func, + void **args) + { + return cudaLaunchKernelExC_ptsz(config, func, args); + } + #define cudaLaunchKernelExC __CUDART_API_PTSZ(cudaLaunchKernelExC) +#endif + +#if defined(__cplusplus) +} + +#endif /* __cplusplus */ + +#undef EXCLUDE_FROM_RTC +#endif /* !__CUDACC_RTC__ */ + +#undef __dv +#undef __CUDA_DEPRECATED + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_API_H__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_API_H__ +#endif + +#endif /* !__CUDA_RUNTIME_API_H__ */ diff --git a/ext/cudart/include/cuda_surface_types.h b/ext/cudart/include/cuda_surface_types.h new file mode 100644 index 0000000000000000000000000000000000000000..68e88cfe0e0bbf12eee81fed496c86f39904d4e2 --- /dev/null +++ b/ext/cudart/include/cuda_surface_types.h @@ -0,0 +1,103 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__CUDA_SURFACE_TYPES_H__) +#define __CUDA_SURFACE_TYPES_H__ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#if !defined(__CUDACC_RTC__) +#define EXCLUDE_FROM_RTC +#include "channel_descriptor.h" +#undef EXCLUDE_FROM_RTC +#endif /* !__CUDACC_RTC__ */ +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +template<class T, int dim = 1> +struct __device_builtin_surface_type__ surface : public surfaceReference +{ +#if !defined(__CUDACC_RTC__) + __host__ surface(void) + { + channelDesc = cudaCreateChannelDesc<T>(); + } + + __host__ surface(struct cudaChannelFormatDesc desc) + { + channelDesc = desc; + } +#endif /* !__CUDACC_RTC__ */ +}; + +template<int dim> +struct __device_builtin_surface_type__ surface<void, dim> : public surfaceReference +{ +#if !defined(__CUDACC_RTC__) + __host__ surface(void) + { + channelDesc = cudaCreateChannelDesc<void>(); + } +#endif /* !__CUDACC_RTC__ */ +}; + +#endif /* __cplusplus && __CUDACC__ */ + +#endif /* !__CUDA_SURFACE_TYPES_H__ */ diff --git a/ext/cudart/include/cuda_texture_types.h b/ext/cudart/include/cuda_texture_types.h new file mode 100644 index 0000000000000000000000000000000000000000..21c8d944930456a914ba86e0972dbf6ba1bc5bba --- /dev/null +++ b/ext/cudart/include/cuda_texture_types.h @@ -0,0 +1,109 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_TEXTURE_TYPES_H__) +#define __CUDA_TEXTURE_TYPES_H__ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#if !defined(__CUDACC_RTC__) +#define EXCLUDE_FROM_RTC +#include "channel_descriptor.h" +#undef EXCLUDE_FROM_RTC +#endif /* !__CUDACC_RTC__ */ +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +template<class T, int texType = cudaTextureType1D, enum cudaTextureReadMode mode = cudaReadModeElementType> +struct __device_builtin_texture_type__ texture : public textureReference +{ +#if !defined(__CUDACC_RTC__) + __host__ texture(int norm = 0, + enum cudaTextureFilterMode fMode = cudaFilterModePoint, + enum cudaTextureAddressMode aMode = cudaAddressModeClamp) + { + normalized = norm; + filterMode = fMode; + addressMode[0] = aMode; + addressMode[1] = aMode; + addressMode[2] = aMode; + channelDesc = cudaCreateChannelDesc<T>(); + sRGB = 0; + } + + __host__ texture(int norm, + enum cudaTextureFilterMode fMode, + enum cudaTextureAddressMode aMode, + struct cudaChannelFormatDesc desc) + { + normalized = norm; + filterMode = fMode; + addressMode[0] = aMode; + addressMode[1] = aMode; + addressMode[2] = aMode; + channelDesc = desc; + sRGB = 0; + } +#endif /* !__CUDACC_RTC__ */ +}; + +#endif /* __cplusplus && __CUDACC__ */ + +#endif /* !__CUDA_TEXTURE_TYPES_H__ */ diff --git a/ext/cudart/include/cudart_platform.h b/ext/cudart/include/cudart_platform.h new file mode 100644 index 0000000000000000000000000000000000000000..0f022bbe349eba2219a6b74f1ea315c1ce8551b7 --- /dev/null +++ b/ext/cudart/include/cudart_platform.h @@ -0,0 +1,57 @@ +/* + * Copyright 2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. 
+ * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#ifndef __CUDART_PLATFORM_H__ +#define __CUDART_PLATFORM_H__ + +#if ((defined(__linux__) || defined(__QNX__)) && (defined(__arm__) || defined(__aarch64__) || defined(__x86_64__))) +#define isEglSupported 1 +#endif + +#endif diff --git a/ext/cudart/include/device_atomic_functions.h b/ext/cudart/include/device_atomic_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..c96298f2e2e5d68f71ba6529277dd6ec2bc8daae --- /dev/null +++ b/ext/cudart/include/device_atomic_functions.h @@ -0,0 +1,211 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__DEVICE_ATOMIC_FUNCTIONS_H__) +#define __DEVICE_ATOMIC_FUNCTIONS_H__ + +#if defined(__CUDACC_RTC__) +#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__ +#else /* __CUDACC_RTC__ */ +#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +#ifdef __CUDA_ARCH__ +extern "C" +{ +extern __device__ __device_builtin__ int __iAtomicAdd(int *address, int val); +extern __device__ __device_builtin__ unsigned int __uAtomicAdd(unsigned int *address, unsigned int val); +extern __device__ __device_builtin__ int __iAtomicExch(int *address, int val); +extern __device__ __device_builtin__ unsigned int __uAtomicExch(unsigned int *address, unsigned int val); +extern __device__ __device_builtin__ float __fAtomicExch(float *address, float val); +extern __device__ __device_builtin__ int __iAtomicMin(int *address, int val); +extern __device__ __device_builtin__ unsigned int __uAtomicMin(unsigned int *address, unsigned int val); +extern __device__ __device_builtin__ int __iAtomicMax(int *address, int val); +extern __device__ __device_builtin__ unsigned int __uAtomicMax(unsigned int *address, unsigned int val); +extern __device__ __device_builtin__ unsigned int __uAtomicInc(unsigned int *address, unsigned int val); +extern __device__ __device_builtin__ unsigned int __uAtomicDec(unsigned int *address, unsigned int val); +extern __device__ __device_builtin__ int __iAtomicAnd(int *address, int val); +extern __device__ __device_builtin__ unsigned int __uAtomicAnd(unsigned int *address, unsigned int val); +extern __device__ __device_builtin__ int __iAtomicOr(int *address, int val); +extern __device__ __device_builtin__ unsigned int __uAtomicOr(unsigned int *address, unsigned int val); +extern __device__ __device_builtin__ int __iAtomicXor(int *address, int val); +extern __device__ __device_builtin__ unsigned int 
__uAtomicXor(unsigned int *address, unsigned int val); +extern __device__ __device_builtin__ int __iAtomicCAS(int *address, int compare, int val); +extern __device__ __device_builtin__ unsigned int __uAtomicCAS(unsigned int *address, unsigned int compare, unsigned int val); +} +#endif /* __CUDA_ARCH__ */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) __DEF_IF_HOST + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +#if defined(_WIN32) +# define __DEPRECATED__(msg) __declspec(deprecated(msg)) +#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) +# define __DEPRECATED__(msg) __attribute__((deprecated)) +#else +# define __DEPRECATED__(msg) __attribute__((deprecated(msg))) +#endif + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync()."\ + "To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70." 
+#else +#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)." +#endif + +extern "C" +{ +#ifdef __CUDA_ARCH__ +extern __device__ __device_builtin__ unsigned long long int __ullAtomicAdd(unsigned long long int *address, unsigned long long int val); +extern __device__ __device_builtin__ unsigned long long int __ullAtomicExch(unsigned long long int *address, unsigned long long int val); +extern __device__ __device_builtin__ unsigned long long int __ullAtomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val); +#endif /* __CUDA_ARCH__ */ +extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__any)) int __any(int cond); +extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__all)) int __all(int cond); +} + + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__any)) bool any(bool cond) __DEF_IF_HOST + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__all)) bool all(bool cond) __DEF_IF_HOST + +#undef __DEPRECATED__ +#undef __WSB_DEPRECATION_MESSAGE + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __DEF_IF_HOST +#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__ + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +#include "device_atomic_functions.hpp" +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#endif /* !__DEVICE_ATOMIC_FUNCTIONS_H__ */ diff --git a/ext/cudart/include/device_atomic_functions.hpp b/ext/cudart/include/device_atomic_functions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..50e427c39b164e6e145c3930cd66cc03806175a6 --- /dev/null +++ b/ext/cudart/include/device_atomic_functions.hpp @@ -0,0 +1,224 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. 
IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__) +#define __DEVICE_ATOMIC_FUNCTIONS_HPP__ + +#if defined(__CUDACC_RTC__) +#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__ +#else /* __CUDACC_RTC__ */ +#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val) +{ + return __iAtomicAdd(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val) +{ + return __uAtomicAdd(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val) +{ + return __iAtomicAdd(address, (unsigned int)-(int)val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val) +{ + return __uAtomicAdd(address, (unsigned int)-(int)val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val) +{ + return __iAtomicExch(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val) +{ + return __uAtomicExch(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val) +{ + return __fAtomicExch(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val) +{ + return __iAtomicMin(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val) +{ + return __uAtomicMin(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, 
int val) +{ + return __iAtomicMax(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val) +{ + return __uAtomicMax(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val) +{ + return __uAtomicInc(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val) +{ + return __uAtomicDec(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val) +{ + return __iAtomicAnd(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val) +{ + return __uAtomicAnd(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val) +{ + return __iAtomicOr(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val) +{ + return __uAtomicOr(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val) +{ + return __iAtomicXor(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val) +{ + return __uAtomicXor(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val) +{ + return __iAtomicCAS(address, compare, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val) +{ + return __uAtomicCAS(address, compare, val); +} + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val) +{ + return __ullAtomicAdd(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val) +{ + return __ullAtomicExch(address, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val) +{ + return __ullAtomicCAS(address, compare, val); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond) +{ + return (bool)__any((int)cond); +} + +__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond) +{ + return (bool)__all((int)cond); +} + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__ + +#endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */ + diff --git a/ext/cudart/include/device_double_functions.h b/ext/cudart/include/device_double_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..82b25e59b40aeaf1e475ff3179e49640a44918b8 --- /dev/null +++ b/ext/cudart/include/device_double_functions.h @@ -0,0 +1,65 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#if defined(_MSC_VER) +#pragma message("device_double_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.") +#else +#warning "device_double_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead." 
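The device_atomic_functions.hpp hunk above maps each wrapper onto its builtin, including the 64-bit atomicCAS on unsigned long long. That CAS wrapper is the usual building block for atomics the hardware does not provide directly; the sketch below is illustrative only, not part of the vendored headers, and is only needed on architectures older than sm_60 (which added a native double atomicAdd):

// Emulate atomicAdd on double with a compare-and-swap retry loop.
__device__ double atomicAddDouble(double *address, double val)
{
    unsigned long long *addr = reinterpret_cast<unsigned long long *>(address);
    unsigned long long old = *addr, assumed;
    do {
        assumed = old;
        old = atomicCAS(addr, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);   // another thread updated *address; retry
    return __longlong_as_double(old);
}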
+#endif +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__ +#endif + +#include "crt/device_double_functions.h" + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__ +#endif diff --git a/ext/cudart/include/device_functions.h b/ext/cudart/include/device_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..0094cc9a0a57f53f47421a8ecc400fb84c26babe --- /dev/null +++ b/ext/cudart/include/device_functions.h @@ -0,0 +1,65 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#if defined(_MSC_VER) +#pragma message("device_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. 
Please use cuda_runtime_api.h or cuda_runtime.h instead.") +#else +#warning "device_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead." +#endif +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__ +#endif + +#include "crt/device_functions.h" + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__ +#endif diff --git a/ext/cudart/include/device_launch_parameters.h b/ext/cudart/include/device_launch_parameters.h new file mode 100644 index 0000000000000000000000000000000000000000..8f552db8faab7d21e90e06a1ea2184a5563d3bf2 --- /dev/null +++ b/ext/cudart/include/device_launch_parameters.h @@ -0,0 +1,118 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__DEVICE_LAUNCH_PARAMETERS_H__) +#define __DEVICE_LAUNCH_PARAMETERS_H__ + +#include "vector_types.h" + +#if !defined(__STORAGE__) + +#if defined(__CUDACC_RTC__) +#define __STORAGE__ \ + extern const __device__ +#else /* !__CUDACC_RTC__ */ +#define __STORAGE__ \ + extern const +#endif /* __CUDACC_RTC__ */ + +#endif /* __STORAGE__ */ + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +uint3 __device_builtin__ __STORAGE__ threadIdx; +uint3 __device_builtin__ __STORAGE__ blockIdx; +dim3 __device_builtin__ __STORAGE__ blockDim; +dim3 __device_builtin__ __STORAGE__ gridDim; +int __device_builtin__ __STORAGE__ warpSize; + +#undef __STORAGE__ + +#if defined(__cplusplus) +} +#endif /* __cplusplus */ + +#if !defined(__cudaGet_threadIdx) + +#define __cudaGet_threadIdx() \ + threadIdx + +#endif /* __cudaGet_threadIdx */ + +#if !defined(__cudaGet_blockIdx) + +#define __cudaGet_blockIdx() \ + blockIdx + +#endif /* __cudaGet_blockIdx */ + +#if !defined(__cudaGet_blockDim) + +#define __cudaGet_blockDim() \ + blockDim + +#endif /* __cudaGet_blockDim */ + +#if !defined(__cudaGet_gridDim) + +#define __cudaGet_gridDim() \ + gridDim + +#endif /* __cudaGet_gridDim */ + +#if !defined(__cudaGet_warpSize) + +#define __cudaGet_warpSize() \ + warpSize + +#endif /* __cudaGet_warpSize */ + +#endif /* !__DEVICE_LAUNCH_PARAMETERS_H__ */ diff --git a/ext/cudart/include/device_types.h b/ext/cudart/include/device_types.h new file mode 100644 index 0000000000000000000000000000000000000000..4b575a1014c6cdb9bf2f722c2a67e329186079e6 --- /dev/null +++ b/ext/cudart/include/device_types.h @@ -0,0 +1,81 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__DEVICE_TYPES_H__) +#define __DEVICE_TYPES_H__ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__ +#endif + +#ifndef __DOXYGEN_ONLY__ +#include "crt/host_defines.h" +#endif + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +enum __device_builtin__ cudaRoundMode +{ + cudaRoundNearest, + cudaRoundZero, + cudaRoundPosInf, + cudaRoundMinInf +}; + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_TYPES_H__ +#endif + +#endif /* !__DEVICE_TYPES_H__ */ diff --git a/ext/cudart/include/driver_functions.h b/ext/cudart/include/driver_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..94767974220594550d496cad4d14c45349b27737 --- /dev/null +++ b/ext/cudart/include/driver_functions.h @@ -0,0 +1,145 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
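device_launch_parameters.h, added above, declares the built-in launch variables (threadIdx, blockIdx, blockDim, gridDim, warpSize). A short sketch of the grid-stride idiom they are normally combined into (illustrative only, not part of the diff):

// Process n elements with however many threads the launch provides.
__global__ void scale(float *x, float a, int n)
{
    for (int i = blockIdx.x * blockDim.x + threadIdx.x;   // global thread index
         i < n;
         i += blockDim.x * gridDim.x)                     // total threads in the grid
        x[i] *= a;
}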
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__DRIVER_FUNCTIONS_H__) +#define __DRIVER_FUNCTIONS_H__ + +#include "builtin_types.h" +#include "crt/host_defines.h" +#include "driver_types.h" + +/** + * \addtogroup CUDART_MEMORY + * + * @{ + */ + +/** + * \brief Returns a cudaPitchedPtr based on input parameters + * + * Returns a ::cudaPitchedPtr based on the specified input parameters \p d, + * \p p, \p xsz, and \p ysz. + * + * \param d - Pointer to allocated memory + * \param p - Pitch of allocated memory in bytes + * \param xsz - Logical width of allocation in elements + * \param ysz - Logical height of allocation in elements + * + * \return + * ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz + * + * \sa make_cudaExtent, make_cudaPos + */ +static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz) +{ + struct cudaPitchedPtr s; + + s.ptr = d; + s.pitch = p; + s.xsize = xsz; + s.ysize = ysz; + + return s; +} + +/** + * \brief Returns a cudaPos based on input parameters + * + * Returns a ::cudaPos based on the specified input parameters \p x, + * \p y, and \p z. + * + * \param x - X position + * \param y - Y position + * \param z - Z position + * + * \return + * ::cudaPos specified by \p x, \p y, and \p z + * + * \sa make_cudaExtent, make_cudaPitchedPtr + */ +static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z) +{ + struct cudaPos p; + + p.x = x; + p.y = y; + p.z = z; + + return p; +} + +/** + * \brief Returns a cudaExtent based on input parameters + * + * Returns a ::cudaExtent based on the specified input parameters \p w, + * \p h, and \p d. 
+ * + * \param w - Width in elements when referring to array memory, in bytes when referring to linear memory + * \param h - Height in elements + * \param d - Depth in elements + * + * \return + * ::cudaExtent specified by \p w, \p h, and \p d + * + * \sa make_cudaPitchedPtr, make_cudaPos + */ +static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d) +{ + struct cudaExtent e; + + e.width = w; + e.height = h; + e.depth = d; + + return e; +} + +/** @} */ /* END CUDART_MEMORY */ + +#endif /* !__DRIVER_FUNCTIONS_H__ */ diff --git a/ext/cudart/include/driver_types.h b/ext/cudart/include/driver_types.h new file mode 100644 index 0000000000000000000000000000000000000000..47b54f94d6a1dcd278ddda0dd0fa5a4ca866a5ff --- /dev/null +++ b/ext/cudart/include/driver_types.h @@ -0,0 +1,3093 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
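driver_functions.h, added above, defines the small host-side helpers make_cudaPitchedPtr, make_cudaPos and make_cudaExtent that populate the structures consumed by the 3D memory API. A usage sketch (not part of the vendored header; assumes the CUDA 11.x runtime and a hypothetical alloc_volume helper):

#include <cuda_runtime.h>

// Allocate a pitched float volume; width is given in bytes for linear memory.
cudaError_t alloc_volume(cudaPitchedPtr *vol, size_t w, size_t h, size_t d)
{
    cudaExtent extent = make_cudaExtent(w * sizeof(float), h, d);
    return cudaMalloc3D(vol, extent);   // fills vol->pitch, xsize, ysize
}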
+ */ + +#if !defined(__DRIVER_TYPES_H__) +#define __DRIVER_TYPES_H__ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__ +#endif + +#ifndef __DOXYGEN_ONLY__ +#include "crt/host_defines.h" +#endif +#include "vector_types.h" + + + +/** + * \defgroup CUDART_TYPES Data types used by CUDA Runtime + * \ingroup CUDART + * + * @{ + */ + +/******************************************************************************* +* * +* TYPE DEFINITIONS USED BY RUNTIME API * +* * +*******************************************************************************/ + +#if !defined(__CUDA_INTERNAL_COMPILATION__) + +#if !defined(__CUDACC_RTC__) +#include <limits.h> +#include <stddef.h> +#endif /* !defined(__CUDACC_RTC__) */ + +#define cudaHostAllocDefault 0x00 /**< Default page-locked allocation flag */ +#define cudaHostAllocPortable 0x01 /**< Pinned memory accessible by all CUDA contexts */ +#define cudaHostAllocMapped 0x02 /**< Map allocation into device space */ +#define cudaHostAllocWriteCombined 0x04 /**< Write-combined memory */ + +#define cudaHostRegisterDefault 0x00 /**< Default host memory registration flag */ +#define cudaHostRegisterPortable 0x01 /**< Pinned memory accessible by all CUDA contexts */ +#define cudaHostRegisterMapped 0x02 /**< Map registered memory into device space */ +#define cudaHostRegisterIoMemory 0x04 /**< Memory-mapped I/O space */ +#define cudaHostRegisterReadOnly 0x08 /**< Memory-mapped read-only */ + +#define cudaPeerAccessDefault 0x00 /**< Default peer addressing enable flag */ + +#define cudaStreamDefault 0x00 /**< Default stream flag */ +#define cudaStreamNonBlocking 0x01 /**< Stream does not synchronize with stream 0 (the NULL stream) */ + + /** + * Legacy stream handle + * + * Stream handle that can be passed as a cudaStream_t to use an implicit stream + * with legacy synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define cudaStreamLegacy ((cudaStream_t)0x1) + +/** + * Per-thread stream handle + * + * Stream handle that can be passed as a cudaStream_t to use an implicit stream + * with per-thread synchronization behavior. + * + * See details of the \link_sync_behavior + */ +#define cudaStreamPerThread ((cudaStream_t)0x2) + +#define cudaEventDefault 0x00 /**< Default event flag */ +#define cudaEventBlockingSync 0x01 /**< Event uses blocking synchronization */ +#define cudaEventDisableTiming 0x02 /**< Event will not record timing data */ +#define cudaEventInterprocess 0x04 /**< Event is suitable for interprocess use. 
cudaEventDisableTiming must be set */ + +#define cudaEventRecordDefault 0x00 /**< Default event record flag */ +#define cudaEventRecordExternal 0x01 /**< Event is captured in the graph as an external event node when performing stream capture */ + +#define cudaEventWaitDefault 0x00 /**< Default event wait flag */ +#define cudaEventWaitExternal 0x01 /**< Event is captured in the graph as an external event node when performing stream capture */ + +#define cudaDeviceScheduleAuto 0x00 /**< Device flag - Automatic scheduling */ +#define cudaDeviceScheduleSpin 0x01 /**< Device flag - Spin default scheduling */ +#define cudaDeviceScheduleYield 0x02 /**< Device flag - Yield default scheduling */ +#define cudaDeviceScheduleBlockingSync 0x04 /**< Device flag - Use blocking synchronization */ +#define cudaDeviceBlockingSync 0x04 /**< Device flag - Use blocking synchronization + * \deprecated This flag was deprecated as of CUDA 4.0 and + * replaced with ::cudaDeviceScheduleBlockingSync. */ +#define cudaDeviceScheduleMask 0x07 /**< Device schedule flags mask */ +#define cudaDeviceMapHost 0x08 /**< Device flag - Support mapped pinned allocations */ +#define cudaDeviceLmemResizeToMax 0x10 /**< Device flag - Keep local memory allocation after launch */ +#define cudaDeviceMask 0x1f /**< Device flags mask */ + +#define cudaArrayDefault 0x00 /**< Default CUDA array allocation flag */ +#define cudaArrayLayered 0x01 /**< Must be set in cudaMalloc3DArray to create a layered CUDA array */ +#define cudaArraySurfaceLoadStore 0x02 /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array */ +#define cudaArrayCubemap 0x04 /**< Must be set in cudaMalloc3DArray to create a cubemap CUDA array */ +#define cudaArrayTextureGather 0x08 /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array */ +#define cudaArrayColorAttachment 0x20 /**< Must be set in cudaExternalMemoryGetMappedMipmappedArray if the mipmapped array is used as a color target in a graphics API */ +#define cudaArraySparse 0x40 /**< Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a sparse CUDA array or CUDA mipmapped array */ +#define cudaArrayDeferredMapping 0x80 /**< Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a deferred mapping CUDA array or CUDA mipmapped array */ + +#define cudaIpcMemLazyEnablePeerAccess 0x01 /**< Automatically enable peer access between remote devices as needed */ + +#define cudaMemAttachGlobal 0x01 /**< Memory can be accessed by any stream on any device*/ +#define cudaMemAttachHost 0x02 /**< Memory cannot be accessed by any stream on any device */ +#define cudaMemAttachSingle 0x04 /**< Memory can only be accessed by a single stream on the associated device */ + +#define cudaOccupancyDefault 0x00 /**< Default behavior */ +#define cudaOccupancyDisableCachingOverride 0x01 /**< Assume global caching is enabled and cannot be automatically turned off */ + +#define cudaCpuDeviceId ((int)-1) /**< Device id that represents the CPU */ +#define cudaInvalidDeviceId ((int)-2) /**< Device id that represents an invalid device */ + +/** + * If set, each kernel launched as part of ::cudaLaunchCooperativeKernelMultiDevice only + * waits for prior work in the stream corresponding to that GPU to complete before the + * kernel begins execution. 
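The flag macros defined above (cudaStream*, cudaEvent*, cudaHostAlloc*, cudaDevice*, ...) are plain bitmasks passed to the corresponding *WithFlags and allocation entry points. Two common combinations as a sketch (illustrative only, not part of the vendored header):

#include <cuda_runtime.h>

cudaStream_t make_worker_stream(void)
{
    cudaStream_t s = 0;
    // cudaStreamNonBlocking: no implicit synchronization with the NULL stream.
    cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
    return s;
}

cudaEvent_t make_sync_only_event(void)
{
    cudaEvent_t e = 0;
    // cudaEventDisableTiming: cheaper when the event is only used for ordering.
    cudaEventCreateWithFlags(&e, cudaEventDisableTiming);
    return e;
}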
+ */ +#define cudaCooperativeLaunchMultiDeviceNoPreSync 0x01 + +/** + * If set, any subsequent work pushed in a stream that participated in a call to + * ::cudaLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on + * the GPU corresponding to that stream to complete before it begins execution. + */ +#define cudaCooperativeLaunchMultiDeviceNoPostSync 0x02 + +#endif /* !__CUDA_INTERNAL_COMPILATION__ */ + +/** \cond impl_private */ +#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED) +#define __CUDA_DEPRECATED +#elif defined(_MSC_VER) +#define __CUDA_DEPRECATED __declspec(deprecated) +#elif defined(__GNUC__) +#define __CUDA_DEPRECATED __attribute__((deprecated)) +#else +#define __CUDA_DEPRECATED +#endif +/** \endcond impl_private */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +/** + * CUDA error types + */ +enum __device_builtin__ cudaError +{ + /** + * The API call returned with no errors. In the case of query calls, this + * also means that the operation being queried is complete (see + * ::cudaEventQuery() and ::cudaStreamQuery()). + */ + cudaSuccess = 0, + + /** + * This indicates that one or more of the parameters passed to the API call + * is not within an acceptable range of values. + */ + cudaErrorInvalidValue = 1, + + /** + * The API call failed because it was unable to allocate enough memory to + * perform the requested operation. + */ + cudaErrorMemoryAllocation = 2, + + /** + * The API call failed because the CUDA driver and runtime could not be + * initialized. + */ + cudaErrorInitializationError = 3, + + /** + * This indicates that a CUDA Runtime API call cannot be executed because + * it is being called during process shut down, at a point in time after + * CUDA driver has been unloaded. + */ + cudaErrorCudartUnloading = 4, + + /** + * This indicates profiler is not initialized for this run. This can + * happen when the application is running with external profiling tools + * like visual profiler. + */ + cudaErrorProfilerDisabled = 5, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to attempt to enable/disable the profiling via ::cudaProfilerStart or + * ::cudaProfilerStop without initialization. + */ + cudaErrorProfilerNotInitialized = 6, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cudaProfilerStart() when profiling is already enabled. + */ + cudaErrorProfilerAlreadyStarted = 7, + + /** + * \deprecated + * This error return is deprecated as of CUDA 5.0. It is no longer an error + * to call cudaProfilerStop() when profiling is already disabled. + */ + cudaErrorProfilerAlreadyStopped = 8, + + /** + * This indicates that a kernel launch is requesting resources that can + * never be satisfied by the current device. Requesting more shared memory + * per block than the device supports will trigger this error, as will + * requesting too many threads or blocks. See ::cudaDeviceProp for more + * device limitations. + */ + cudaErrorInvalidConfiguration = 9, + + /** + * This indicates that one or more of the pitch-related parameters passed + * to the API call is not within the acceptable range for pitch. + */ + cudaErrorInvalidPitchValue = 12, + + /** + * This indicates that the symbol name/identifier passed to the API call + * is not a valid name or identifier. 
+ */ + cudaErrorInvalidSymbol = 13, + + /** + * This indicates that at least one host pointer passed to the API call is + * not a valid host pointer. + * \deprecated + * This error return is deprecated as of CUDA 10.1. + */ + cudaErrorInvalidHostPointer = 16, + + /** + * This indicates that at least one device pointer passed to the API call is + * not a valid device pointer. + * \deprecated + * This error return is deprecated as of CUDA 10.1. + */ + cudaErrorInvalidDevicePointer = 17, + + /** + * This indicates that the texture passed to the API call is not a valid + * texture. + */ + cudaErrorInvalidTexture = 18, + + /** + * This indicates that the texture binding is not valid. This occurs if you + * call ::cudaGetTextureAlignmentOffset() with an unbound texture. + */ + cudaErrorInvalidTextureBinding = 19, + + /** + * This indicates that the channel descriptor passed to the API call is not + * valid. This occurs if the format is not one of the formats specified by + * ::cudaChannelFormatKind, or if one of the dimensions is invalid. + */ + cudaErrorInvalidChannelDescriptor = 20, + + /** + * This indicates that the direction of the memcpy passed to the API call is + * not one of the types specified by ::cudaMemcpyKind. + */ + cudaErrorInvalidMemcpyDirection = 21, + + /** + * This indicated that the user has taken the address of a constant variable, + * which was forbidden up until the CUDA 3.1 release. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Variables in constant + * memory may now have their address taken by the runtime via + * ::cudaGetSymbolAddress(). + */ + cudaErrorAddressOfConstant = 22, + + /** + * This indicated that a texture fetch was not able to be performed. + * This was previously used for device emulation of texture operations. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorTextureFetchFailed = 23, + + /** + * This indicated that a texture was not bound for access. + * This was previously used for device emulation of texture operations. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorTextureNotBound = 24, + + /** + * This indicated that a synchronization operation had failed. + * This was previously used for some device emulation functions. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorSynchronizationError = 25, + + /** + * This indicates that a non-float texture was being accessed with linear + * filtering. This is not supported by CUDA. + */ + cudaErrorInvalidFilterSetting = 26, + + /** + * This indicates that an attempt was made to read a non-float texture as a + * normalized float. This is not supported by CUDA. + */ + cudaErrorInvalidNormSetting = 27, + + /** + * Mixing of device and device emulation code was not allowed. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorMixedDeviceExecution = 28, + + /** + * This indicates that the API call is not yet implemented. Production + * releases of CUDA will never return this error. + * \deprecated + * This error return is deprecated as of CUDA 4.1. + */ + cudaErrorNotYetImplemented = 31, + + /** + * This indicated that an emulated device pointer exceeded the 32-bit address + * range. 
+ * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorMemoryValueTooLarge = 32, + + /** + * This indicates that the CUDA driver that the application has loaded is a + * stub library. Applications that run with the stub rather than a real + * driver loaded will result in CUDA API returning this error. + */ + cudaErrorStubLibrary = 34, + + /** + * This indicates that the installed NVIDIA CUDA driver is older than the + * CUDA runtime library. This is not a supported configuration. Users should + * install an updated NVIDIA display driver to allow the application to run. + */ + cudaErrorInsufficientDriver = 35, + + /** + * This indicates that the API call requires a newer CUDA driver than the one + * currently installed. Users should install an updated NVIDIA CUDA driver + * to allow the API call to succeed. + */ + cudaErrorCallRequiresNewerDriver = 36, + + /** + * This indicates that the surface passed to the API call is not a valid + * surface. + */ + cudaErrorInvalidSurface = 37, + + /** + * This indicates that multiple global or constant variables (across separate + * CUDA source files in the application) share the same string name. + */ + cudaErrorDuplicateVariableName = 43, + + /** + * This indicates that multiple textures (across separate CUDA source + * files in the application) share the same string name. + */ + cudaErrorDuplicateTextureName = 44, + + /** + * This indicates that multiple surfaces (across separate CUDA source + * files in the application) share the same string name. + */ + cudaErrorDuplicateSurfaceName = 45, + + /** + * This indicates that all CUDA devices are busy or unavailable at the current + * time. Devices are often busy/unavailable due to use of + * ::cudaComputeModeProhibited, ::cudaComputeModeExclusiveProcess, or when long + * running CUDA kernels have filled up the GPU and are blocking new work + * from starting. They can also be unavailable due to memory constraints + * on a device that already has active CUDA work being performed. + */ + cudaErrorDevicesUnavailable = 46, + + /** + * This indicates that the current context is not compatible with this + * the CUDA Runtime. This can only occur if you are using CUDA + * Runtime/Driver interoperability and have created an existing Driver + * context using the driver API. The Driver context may be incompatible + * either because the Driver context was created using an older version + * of the API, because the Runtime API call expects a primary driver + * context and the Driver context is not primary, or because the Driver + * context has been destroyed. Please see \ref CUDART_DRIVER "Interactions + * with the CUDA Driver API" for more information. + */ + cudaErrorIncompatibleDriverContext = 49, + + /** + * The device function being invoked (usually via ::cudaLaunchKernel()) was not + * previously configured via the ::cudaConfigureCall() function. + */ + cudaErrorMissingConfiguration = 52, + + /** + * This indicated that a previous kernel launch failed. This was previously + * used for device emulation of kernel launches. + * \deprecated + * This error return is deprecated as of CUDA 3.1. Device emulation mode was + * removed with the CUDA 3.1 release. + */ + cudaErrorPriorLaunchFailure = 53, + + /** + * This error indicates that a device runtime grid launch did not occur + * because the depth of the child grid would exceed the maximum supported + * number of nested grid launches. 
+ */ + cudaErrorLaunchMaxDepthExceeded = 65, + + /** + * This error indicates that a grid launch did not occur because the kernel + * uses file-scoped textures which are unsupported by the device runtime. + * Kernels launched via the device runtime only support textures created with + * the Texture Object API's. + */ + cudaErrorLaunchFileScopedTex = 66, + + /** + * This error indicates that a grid launch did not occur because the kernel + * uses file-scoped surfaces which are unsupported by the device runtime. + * Kernels launched via the device runtime only support surfaces created with + * the Surface Object API's. + */ + cudaErrorLaunchFileScopedSurf = 67, + + /** + * This error indicates that a call to ::cudaDeviceSynchronize made from + * the device runtime failed because the call was made at grid depth greater + * than than either the default (2 levels of grids) or user specified device + * limit ::cudaLimitDevRuntimeSyncDepth. To be able to synchronize on + * launched grids at a greater depth successfully, the maximum nested + * depth at which ::cudaDeviceSynchronize will be called must be specified + * with the ::cudaLimitDevRuntimeSyncDepth limit to the ::cudaDeviceSetLimit + * api before the host-side launch of a kernel using the device runtime. + * Keep in mind that additional levels of sync depth require the runtime + * to reserve large amounts of device memory that cannot be used for + * user allocations. + */ + cudaErrorSyncDepthExceeded = 68, + + /** + * This error indicates that a device runtime grid launch failed because + * the launch would exceed the limit ::cudaLimitDevRuntimePendingLaunchCount. + * For this launch to proceed successfully, ::cudaDeviceSetLimit must be + * called to set the ::cudaLimitDevRuntimePendingLaunchCount to be higher + * than the upper bound of outstanding launches that can be issued to the + * device runtime. Keep in mind that raising the limit of pending device + * runtime launches will require the runtime to reserve device memory that + * cannot be used for user allocations. + */ + cudaErrorLaunchPendingCountExceeded = 69, + + /** + * The requested device function does not exist or is not compiled for the + * proper device architecture. + */ + cudaErrorInvalidDeviceFunction = 98, + + /** + * This indicates that no CUDA-capable devices were detected by the installed + * CUDA driver. + */ + cudaErrorNoDevice = 100, + + /** + * This indicates that the device ordinal supplied by the user does not + * correspond to a valid CUDA device or that the action requested is + * invalid for the specified device. + */ + cudaErrorInvalidDevice = 101, + + /** + * This indicates that the device doesn't have a valid Grid License. + */ + cudaErrorDeviceNotLicensed = 102, + + /** + * By default, the CUDA runtime may perform a minimal set of self-tests, + * as well as CUDA driver tests, to establish the validity of both. + * Introduced in CUDA 11.2, this error return indicates that at least one + * of these tests has failed and the validity of either the runtime + * or the driver could not be established. + */ + cudaErrorSoftwareValidityNotEstablished = 103, + + /** + * This indicates an internal startup failure in the CUDA runtime. + */ + cudaErrorStartupFailure = 127, + + /** + * This indicates that the device kernel image is invalid. + */ + cudaErrorInvalidKernelImage = 200, + + /** + * This most frequently indicates that there is no context bound to the + * current thread. 
This can also be returned if the context passed to an + * API call is not a valid handle (such as a context that has had + * ::cuCtxDestroy() invoked on it). This can also be returned if a user + * mixes different API versions (i.e. 3010 context with 3020 API calls). + * See ::cuCtxGetApiVersion() for more details. + */ + cudaErrorDeviceUninitialized = 201, + + /** + * This indicates that the buffer object could not be mapped. + */ + cudaErrorMapBufferObjectFailed = 205, + + /** + * This indicates that the buffer object could not be unmapped. + */ + cudaErrorUnmapBufferObjectFailed = 206, + + /** + * This indicates that the specified array is currently mapped and thus + * cannot be destroyed. + */ + cudaErrorArrayIsMapped = 207, + + /** + * This indicates that the resource is already mapped. + */ + cudaErrorAlreadyMapped = 208, + + /** + * This indicates that there is no kernel image available that is suitable + * for the device. This can occur when a user specifies code generation + * options for a particular CUDA source file that do not include the + * corresponding device configuration. + */ + cudaErrorNoKernelImageForDevice = 209, + + /** + * This indicates that a resource has already been acquired. + */ + cudaErrorAlreadyAcquired = 210, + + /** + * This indicates that a resource is not mapped. + */ + cudaErrorNotMapped = 211, + + /** + * This indicates that a mapped resource is not available for access as an + * array. + */ + cudaErrorNotMappedAsArray = 212, + + /** + * This indicates that a mapped resource is not available for access as a + * pointer. + */ + cudaErrorNotMappedAsPointer = 213, + + /** + * This indicates that an uncorrectable ECC error was detected during + * execution. + */ + cudaErrorECCUncorrectable = 214, + + /** + * This indicates that the ::cudaLimit passed to the API call is not + * supported by the active device. + */ + cudaErrorUnsupportedLimit = 215, + + /** + * This indicates that a call tried to access an exclusive-thread device that + * is already in use by a different thread. + */ + cudaErrorDeviceAlreadyInUse = 216, + + /** + * This error indicates that P2P access is not supported across the given + * devices. + */ + cudaErrorPeerAccessUnsupported = 217, + + /** + * A PTX compilation failed. The runtime may fall back to compiling PTX if + * an application does not contain a suitable binary for the current device. + */ + cudaErrorInvalidPtx = 218, + + /** + * This indicates an error with the OpenGL or DirectX context. + */ + cudaErrorInvalidGraphicsContext = 219, + + /** + * This indicates that an uncorrectable NVLink error was detected during the + * execution. + */ + cudaErrorNvlinkUncorrectable = 220, + + /** + * This indicates that the PTX JIT compiler library was not found. The JIT Compiler + * library is used for PTX compilation. The runtime may fall back to compiling PTX + * if an application does not contain a suitable binary for the current device. + */ + cudaErrorJitCompilerNotFound = 221, + + /** + * This indicates that the provided PTX was compiled with an unsupported toolchain. + * The most common reason for this, is the PTX was generated by a compiler newer + * than what is supported by the CUDA driver and PTX JIT compiler. + */ + cudaErrorUnsupportedPtxVersion = 222, + + /** + * This indicates that the JIT compilation was disabled. The JIT compilation compiles + * PTX. The runtime may fall back to compiling PTX if an application does not contain + * a suitable binary for the current device. 
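The cudaError enumeration spelled out in this hunk is what every runtime entry point returns as cudaError_t. The conventional way to surface these codes is a small checking wrapper around cudaGetErrorName/cudaGetErrorString; a sketch, not part of the vendored header:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Abort with a readable message when a runtime call does not return cudaSuccess.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            std::fprintf(stderr, "%s:%d: %s (%s)\n", __FILE__, __LINE__,      \
                         cudaGetErrorName(err_), cudaGetErrorString(err_));   \
            std::exit(EXIT_FAILURE);                                          \
        }                                                                     \
    } while (0)

// e.g. CUDA_CHECK(cudaMalloc(&devPtr, bytes));  // devPtr/bytes are placeholders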
+ */ + cudaErrorJitCompilationDisabled = 223, + + /** + * This indicates that the provided execution affinity is not supported by the device. + */ + cudaErrorUnsupportedExecAffinity = 224, + + /** + * This indicates that the device kernel source is invalid. + */ + cudaErrorInvalidSource = 300, + + /** + * This indicates that the file specified was not found. + */ + cudaErrorFileNotFound = 301, + + /** + * This indicates that a link to a shared object failed to resolve. + */ + cudaErrorSharedObjectSymbolNotFound = 302, + + /** + * This indicates that initialization of a shared object failed. + */ + cudaErrorSharedObjectInitFailed = 303, + + /** + * This error indicates that an OS call failed. + */ + cudaErrorOperatingSystem = 304, + + /** + * This indicates that a resource handle passed to the API call was not + * valid. Resource handles are opaque types like ::cudaStream_t and + * ::cudaEvent_t. + */ + cudaErrorInvalidResourceHandle = 400, + + /** + * This indicates that a resource required by the API call is not in a + * valid state to perform the requested operation. + */ + cudaErrorIllegalState = 401, + + /** + * This indicates that a named symbol was not found. Examples of symbols + * are global/constant variable names, driver function names, texture names, + * and surface names. + */ + cudaErrorSymbolNotFound = 500, + + /** + * This indicates that asynchronous operations issued previously have not + * completed yet. This result is not actually an error, but must be indicated + * differently than ::cudaSuccess (which indicates completion). Calls that + * may return this value include ::cudaEventQuery() and ::cudaStreamQuery(). + */ + cudaErrorNotReady = 600, + + /** + * The device encountered a load or store instruction on an invalid memory address. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorIllegalAddress = 700, + + /** + * This indicates that a launch did not occur because it did not have + * appropriate resources. Although this error is similar to + * ::cudaErrorInvalidConfiguration, this error usually indicates that the + * user has attempted to pass too many arguments to the device kernel, or the + * kernel launch specifies too many threads for the kernel's register count. + */ + cudaErrorLaunchOutOfResources = 701, + + /** + * This indicates that the device kernel took too long to execute. This can + * only occur if timeouts are enabled - see the device property + * \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled" + * for more information. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorLaunchTimeout = 702, + + /** + * This error indicates a kernel launch that uses an incompatible texturing + * mode. + */ + cudaErrorLaunchIncompatibleTexturing = 703, + + /** + * This error indicates that a call to ::cudaDeviceEnablePeerAccess() is + * trying to re-enable peer addressing on from a context which has already + * had peer addressing enabled. + */ + cudaErrorPeerAccessAlreadyEnabled = 704, + + /** + * This error indicates that ::cudaDeviceDisablePeerAccess() is trying to + * disable peer addressing which has not been enabled yet via + * ::cudaDeviceEnablePeerAccess(). 
+ */ + cudaErrorPeerAccessNotEnabled = 705, + + /** + * This indicates that the user has called ::cudaSetValidDevices(), + * ::cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(), + * ::cudaD3D10SetDirect3DDevice, ::cudaD3D11SetDirect3DDevice(), or + * ::cudaVDPAUSetVDPAUDevice() after initializing the CUDA runtime by + * calling non-device management operations (allocating memory and + * launching kernels are examples of non-device management operations). + * This error can also be returned if using runtime/driver + * interoperability and there is an existing ::CUcontext active on the + * host thread. + */ + cudaErrorSetOnActiveProcess = 708, + + /** + * This error indicates that the context current to the calling thread + * has been destroyed using ::cuCtxDestroy, or is a primary context which + * has not yet been initialized. + */ + cudaErrorContextIsDestroyed = 709, + + /** + * An assert triggered in device code during kernel execution. The device + * cannot be used again. All existing allocations are invalid. To continue + * using CUDA, the process must be terminated and relaunched. + */ + cudaErrorAssert = 710, + + /** + * This error indicates that the hardware resources required to enable + * peer access have been exhausted for one or more of the devices + * passed to ::cudaEnablePeerAccess(). + */ + cudaErrorTooManyPeers = 711, + + /** + * This error indicates that the memory range passed to ::cudaHostRegister() + * has already been registered. + */ + cudaErrorHostMemoryAlreadyRegistered = 712, + + /** + * This error indicates that the pointer passed to ::cudaHostUnregister() + * does not correspond to any currently registered memory region. + */ + cudaErrorHostMemoryNotRegistered = 713, + + /** + * Device encountered an error in the call stack during kernel execution, + * possibly due to stack corruption or exceeding the stack size limit. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorHardwareStackError = 714, + + /** + * The device encountered an illegal instruction during kernel execution + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorIllegalInstruction = 715, + + /** + * The device encountered a load or store instruction + * on a memory address which is not aligned. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorMisalignedAddress = 716, + + /** + * While executing a kernel, the device encountered an instruction + * which can only operate on memory locations in certain address spaces + * (global, shared, or local), but was supplied a memory address not + * belonging to an allowed address space. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorInvalidAddressSpace = 717, + + /** + * The device encountered an invalid program counter. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. 
+ */ + cudaErrorInvalidPc = 718, + + /** + * An exception occurred on the device while executing a kernel. Common + * causes include dereferencing an invalid device pointer and accessing + * out of bounds shared memory. Less common cases can be system specific - more + * information about these cases can be found in the system specific user guide. + * This leaves the process in an inconsistent state and any further CUDA work + * will return the same error. To continue using CUDA, the process must be terminated + * and relaunched. + */ + cudaErrorLaunchFailure = 719, + + /** + * This error indicates that the number of blocks launched per grid for a kernel that was + * launched via either ::cudaLaunchCooperativeKernel or ::cudaLaunchCooperativeKernelMultiDevice + * exceeds the maximum number of blocks as allowed by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor + * or ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors + * as specified by the device attribute ::cudaDevAttrMultiProcessorCount. + */ + cudaErrorCooperativeLaunchTooLarge = 720, + + /** + * This error indicates the attempted operation is not permitted. + */ + cudaErrorNotPermitted = 800, + + /** + * This error indicates the attempted operation is not supported + * on the current system or device. + */ + cudaErrorNotSupported = 801, + + /** + * This error indicates that the system is not yet ready to start any CUDA + * work. To continue using CUDA, verify the system configuration is in a + * valid state and all required driver daemons are actively running. + * More information about this error can be found in the system specific + * user guide. + */ + cudaErrorSystemNotReady = 802, + + /** + * This error indicates that there is a mismatch between the versions of + * the display driver and the CUDA driver. Refer to the compatibility documentation + * for supported versions. + */ + cudaErrorSystemDriverMismatch = 803, + + /** + * This error indicates that the system was upgraded to run with forward compatibility + * but the visible hardware detected by CUDA does not support this configuration. + * Refer to the compatibility documentation for the supported hardware matrix or ensure + * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES + * environment variable. + */ + cudaErrorCompatNotSupportedOnDevice = 804, + + /** + * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server. + */ + cudaErrorMpsConnectionFailed = 805, + + /** + * This error indicates that the remote procedural call between the MPS server and the MPS client failed. + */ + cudaErrorMpsRpcFailure = 806, + + /** + * This error indicates that the MPS server is not ready to accept new MPS client requests. + * This error can be returned when the MPS server is in the process of recovering from a fatal failure. + */ + cudaErrorMpsServerNotReady = 807, + + /** + * This error indicates that the hardware resources required to create MPS client have been exhausted. + */ + cudaErrorMpsMaxClientsReached = 808, + + /** + * This error indicates the the hardware resources required to device connections have been exhausted. + */ + cudaErrorMpsMaxConnectionsReached = 809, + + /** + * This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched. + */ + cudaErrorMpsClientTerminated = 810, + + /** + * The operation is not permitted when the stream is capturing. 
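+ *
+ * For context, a capture sequence looks roughly like the sketch below
+ * (an illustrative addition, not part of the upstream header; \p stream and
+ * the work enqueued on it are assumed):
+ * \code
+ * cudaGraph_t graph;
+ * cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
+ * // ... asynchronous work on stream; operations that are not permitted
+ * //     during capture return this error ...
+ * cudaStreamEndCapture(stream, &graph);
+ * \endcode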
+ */ + cudaErrorStreamCaptureUnsupported = 900, + + /** + * The current capture sequence on the stream has been invalidated due to + * a previous error. + */ + cudaErrorStreamCaptureInvalidated = 901, + + /** + * The operation would have resulted in a merge of two independent capture + * sequences. + */ + cudaErrorStreamCaptureMerge = 902, + + /** + * The capture was not initiated in this stream. + */ + cudaErrorStreamCaptureUnmatched = 903, + + /** + * The capture sequence contains a fork that was not joined to the primary + * stream. + */ + cudaErrorStreamCaptureUnjoined = 904, + + /** + * A dependency would have been created which crosses the capture sequence + * boundary. Only implicit in-stream ordering dependencies are allowed to + * cross the boundary. + */ + cudaErrorStreamCaptureIsolation = 905, + + /** + * The operation would have resulted in a disallowed implicit dependency on + * a current capture sequence from cudaStreamLegacy. + */ + cudaErrorStreamCaptureImplicit = 906, + + /** + * The operation is not permitted on an event which was last recorded in a + * capturing stream. + */ + cudaErrorCapturedEvent = 907, + + /** + * A stream capture sequence not initiated with the ::cudaStreamCaptureModeRelaxed + * argument to ::cudaStreamBeginCapture was passed to ::cudaStreamEndCapture in a + * different thread. + */ + cudaErrorStreamCaptureWrongThread = 908, + + /** + * This indicates that the wait operation has timed out. + */ + cudaErrorTimeout = 909, + + /** + * This error indicates that the graph update was not performed because it included + * changes which violated constraints specific to instantiated graph update. + */ + cudaErrorGraphExecUpdateFailure = 910, + + /** + * This indicates that an async error has occurred in a device outside of CUDA. + * If CUDA was waiting for an external device's signal before consuming shared data, + * the external device signaled an error indicating that the data is not valid for + * consumption. This leaves the process in an inconsistent state and any further CUDA + * work will return the same error. To continue using CUDA, the process must be + * terminated and relaunched. + */ + cudaErrorExternalDevice = 911, + + /** + * This indicates that a kernel launch error has occurred due to cluster + * misconfiguration. + */ + cudaErrorInvalidClusterSize = 912, + + /** + * This indicates that an unknown internal error has occurred. + */ + cudaErrorUnknown = 999, + + /** + * Any unhandled CUDA driver error is added to this value and returned via + * the runtime. Production releases of CUDA should not return such errors. + * \deprecated + * This error return is deprecated as of CUDA 4.1. 
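+ *
+ * Whatever the specific code, runtime API calls report members of this
+ * enumeration through their ::cudaError_t return value. A minimal checking
+ * sketch (an illustrative addition, not part of the upstream header;
+ * \p devPtr and \p bytes are assumed):
+ * \code
+ * cudaError_t err = cudaMalloc(&devPtr, bytes);
+ * if (err != cudaSuccess)
+ *     printf("CUDA error: %s\n", cudaGetErrorString(err));
+ * \endcode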
+ */ + cudaErrorApiFailureBase = 10000 +}; + +/** + * Channel format kind + */ +enum __device_builtin__ cudaChannelFormatKind +{ + cudaChannelFormatKindSigned = 0, /**< Signed channel format */ + cudaChannelFormatKindUnsigned = 1, /**< Unsigned channel format */ + cudaChannelFormatKindFloat = 2, /**< Float channel format */ + cudaChannelFormatKindNone = 3, /**< No channel format */ + cudaChannelFormatKindNV12 = 4, /**< Unsigned 8-bit integers, planar 4:2:0 YUV format */ + cudaChannelFormatKindUnsignedNormalized8X1 = 5, /**< 1 channel unsigned 8-bit normalized integer */ + cudaChannelFormatKindUnsignedNormalized8X2 = 6, /**< 2 channel unsigned 8-bit normalized integer */ + cudaChannelFormatKindUnsignedNormalized8X4 = 7, /**< 4 channel unsigned 8-bit normalized integer */ + cudaChannelFormatKindUnsignedNormalized16X1 = 8, /**< 1 channel unsigned 16-bit normalized integer */ + cudaChannelFormatKindUnsignedNormalized16X2 = 9, /**< 2 channel unsigned 16-bit normalized integer */ + cudaChannelFormatKindUnsignedNormalized16X4 = 10, /**< 4 channel unsigned 16-bit normalized integer */ + cudaChannelFormatKindSignedNormalized8X1 = 11, /**< 1 channel signed 8-bit normalized integer */ + cudaChannelFormatKindSignedNormalized8X2 = 12, /**< 2 channel signed 8-bit normalized integer */ + cudaChannelFormatKindSignedNormalized8X4 = 13, /**< 4 channel signed 8-bit normalized integer */ + cudaChannelFormatKindSignedNormalized16X1 = 14, /**< 1 channel signed 16-bit normalized integer */ + cudaChannelFormatKindSignedNormalized16X2 = 15, /**< 2 channel signed 16-bit normalized integer */ + cudaChannelFormatKindSignedNormalized16X4 = 16, /**< 4 channel signed 16-bit normalized integer */ + cudaChannelFormatKindUnsignedBlockCompressed1 = 17, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed1SRGB = 18, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/ + cudaChannelFormatKindUnsignedBlockCompressed2 = 19, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed2SRGB = 20, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding */ + cudaChannelFormatKindUnsignedBlockCompressed3 = 21, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed3SRGB = 22, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding */ + cudaChannelFormatKindUnsignedBlockCompressed4 = 23, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */ + cudaChannelFormatKindSignedBlockCompressed4 = 24, /**< 1 channel signed normalized block-compressed (BC4 compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed5 = 25, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */ + cudaChannelFormatKindSignedBlockCompressed5 = 26, /**< 2 channel signed normalized block-compressed (BC5 compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed6H = 27, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */ + cudaChannelFormatKindSignedBlockCompressed6H = 28, /**< 3 channel signed half-float block-compressed (BC6H compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed7 = 29, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */ + cudaChannelFormatKindUnsignedBlockCompressed7SRGB = 30 
/**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */ +}; + +/** + * CUDA Channel format descriptor + */ +struct __device_builtin__ cudaChannelFormatDesc +{ + int x; /**< x */ + int y; /**< y */ + int z; /**< z */ + int w; /**< w */ + enum cudaChannelFormatKind f; /**< Channel format kind */ +}; + +/** + * CUDA array + */ +typedef struct cudaArray *cudaArray_t; + +/** + * CUDA array (as source copy argument) + */ +typedef const struct cudaArray *cudaArray_const_t; + +struct cudaArray; + +/** + * CUDA mipmapped array + */ +typedef struct cudaMipmappedArray *cudaMipmappedArray_t; + +/** + * CUDA mipmapped array (as source argument) + */ +typedef const struct cudaMipmappedArray *cudaMipmappedArray_const_t; + +struct cudaMipmappedArray; + +/** + * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers + */ +#define cudaArraySparsePropertiesSingleMipTail 0x1 + +/** + * Sparse CUDA array and CUDA mipmapped array properties + */ +struct __device_builtin__ cudaArraySparseProperties { + struct { + unsigned int width; /**< Tile width in elements */ + unsigned int height; /**< Tile height in elements */ + unsigned int depth; /**< Tile depth in elements */ + } tileExtent; + unsigned int miptailFirstLevel; /**< First mip level at which the mip tail begins */ + unsigned long long miptailSize; /**< Total size of the mip tail. */ + unsigned int flags; /**< Flags will either be zero or ::cudaArraySparsePropertiesSingleMipTail */ + unsigned int reserved[4]; +}; + +/** + * CUDA array and CUDA mipmapped array memory requirements + */ +struct __device_builtin__ cudaArrayMemoryRequirements { + size_t size; /**< Total size of the array. */ + size_t alignment; /**< Alignment necessary for mapping the array. */ + unsigned int reserved[4]; +}; + +/** + * CUDA memory types + */ +enum __device_builtin__ cudaMemoryType +{ + cudaMemoryTypeUnregistered = 0, /**< Unregistered memory */ + cudaMemoryTypeHost = 1, /**< Host memory */ + cudaMemoryTypeDevice = 2, /**< Device memory */ + cudaMemoryTypeManaged = 3 /**< Managed memory */ +}; + +/** + * CUDA memory copy types + */ +enum __device_builtin__ cudaMemcpyKind +{ + cudaMemcpyHostToHost = 0, /**< Host -> Host */ + cudaMemcpyHostToDevice = 1, /**< Host -> Device */ + cudaMemcpyDeviceToHost = 2, /**< Device -> Host */ + cudaMemcpyDeviceToDevice = 3, /**< Device -> Device */ + cudaMemcpyDefault = 4 /**< Direction of the transfer is inferred from the pointer values. 
Requires unified virtual addressing */ +}; + +/** + * CUDA Pitched memory pointer + * + * \sa ::make_cudaPitchedPtr + */ +struct __device_builtin__ cudaPitchedPtr +{ + void *ptr; /**< Pointer to allocated memory */ + size_t pitch; /**< Pitch of allocated memory in bytes */ + size_t xsize; /**< Logical width of allocation in elements */ + size_t ysize; /**< Logical height of allocation in elements */ +}; + +/** + * CUDA extent + * + * \sa ::make_cudaExtent + */ +struct __device_builtin__ cudaExtent +{ + size_t width; /**< Width in elements when referring to array memory, in bytes when referring to linear memory */ + size_t height; /**< Height in elements */ + size_t depth; /**< Depth in elements */ +}; + +/** + * CUDA 3D position + * + * \sa ::make_cudaPos + */ +struct __device_builtin__ cudaPos +{ + size_t x; /**< x */ + size_t y; /**< y */ + size_t z; /**< z */ +}; + +/** + * CUDA 3D memory copying parameters + */ +struct __device_builtin__ cudaMemcpy3DParms +{ + cudaArray_t srcArray; /**< Source memory address */ + struct cudaPos srcPos; /**< Source position offset */ + struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */ + + cudaArray_t dstArray; /**< Destination memory address */ + struct cudaPos dstPos; /**< Destination position offset */ + struct cudaPitchedPtr dstPtr; /**< Pitched destination memory address */ + + struct cudaExtent extent; /**< Requested memory copy size */ + enum cudaMemcpyKind kind; /**< Type of transfer */ +}; + +/** + * CUDA 3D cross-device memory copying parameters + */ +struct __device_builtin__ cudaMemcpy3DPeerParms +{ + cudaArray_t srcArray; /**< Source memory address */ + struct cudaPos srcPos; /**< Source position offset */ + struct cudaPitchedPtr srcPtr; /**< Pitched source memory address */ + int srcDevice; /**< Source device */ + + cudaArray_t dstArray; /**< Destination memory address */ + struct cudaPos dstPos; /**< Destination position offset */ + struct cudaPitchedPtr dstPtr; /**< Pitched destination memory address */ + int dstDevice; /**< Destination device */ + + struct cudaExtent extent; /**< Requested memory copy size */ +}; + +/** + * CUDA Memset node parameters + */ +struct __device_builtin__ cudaMemsetParams { + void *dst; /**< Destination device pointer */ + size_t pitch; /**< Pitch of destination device pointer. Unused if height is 1 */ + unsigned int value; /**< Value to be set */ + unsigned int elementSize; /**< Size of each element in bytes. Must be 1, 2, or 4. */ + size_t width; /**< Width of the row in elements */ + size_t height; /**< Number of rows */ +}; + +/** + * Specifies performance hint with ::cudaAccessPolicyWindow for hitProp and missProp members. + */ +enum __device_builtin__ cudaAccessProperty { + cudaAccessPropertyNormal = 0, /**< Normal cache persistence. */ + cudaAccessPropertyStreaming = 1, /**< Streaming access is less likely to persit from cache. */ + cudaAccessPropertyPersisting = 2 /**< Persisting access is more likely to persist in cache.*/ +}; + +/** + * Specifies an access policy for a window, a contiguous extent of memory + * beginning at base_ptr and ending at base_ptr + num_bytes. + * Partition into many segments and assign segments such that. + * sum of "hit segments" / window == approx. ratio. + * sum of "miss segments" / window == approx 1-ratio. + * Segments and ratio specifications are fitted to the capabilities of + * the architecture. + * Accesses in a hit segment apply the hitProp access policy. + * Accesses in a miss segment apply the missProp access policy. 
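+ *
+ * A minimal population sketch (an illustrative addition, not part of the
+ * upstream header; \p devPtr, \p windowBytes and \p stream are assumed, and
+ * attaching the window via ::cudaStreamSetAttribute assumes the CUDA 11
+ * stream attribute API declared in cuda_runtime_api.h):
+ * \code
+ * cudaAccessPolicyWindow win = {};
+ * win.base_ptr  = devPtr;                        // assumed device allocation
+ * win.num_bytes = windowBytes;                   // assumed window size
+ * win.hitRatio  = 0.6f;                          // ~60% of accesses use hitProp
+ * win.hitProp   = cudaAccessPropertyPersisting;
+ * win.missProp  = cudaAccessPropertyStreaming;
+ *
+ * cudaStreamAttrValue attr = {};
+ * attr.accessPolicyWindow = win;
+ * cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow, &attr);
+ * \endcode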
+ */ +struct __device_builtin__ cudaAccessPolicyWindow { + void *base_ptr; /**< Starting address of the access policy window. CUDA driver may align it. */ + size_t num_bytes; /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */ + float hitRatio; /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */ + enum cudaAccessProperty hitProp; /**< ::CUaccessProperty set for hit. */ + enum cudaAccessProperty missProp; /**< ::CUaccessProperty set for miss. Must be either NORMAL or STREAMING. */ +}; + +#ifdef _WIN32 +#define CUDART_CB __stdcall +#else +#define CUDART_CB +#endif + +/** + * CUDA host function + * \param userData Argument value passed to the function + */ +typedef void (CUDART_CB *cudaHostFn_t)(void *userData); + +/** + * CUDA host node parameters + */ +struct __device_builtin__ cudaHostNodeParams { + cudaHostFn_t fn; /**< The function to call when the node executes */ + void* userData; /**< Argument to pass to the function */ +}; + +/** + * Possible stream capture statuses returned by ::cudaStreamIsCapturing + */ +enum __device_builtin__ cudaStreamCaptureStatus { + cudaStreamCaptureStatusNone = 0, /**< Stream is not capturing */ + cudaStreamCaptureStatusActive = 1, /**< Stream is actively capturing */ + cudaStreamCaptureStatusInvalidated = 2 /**< Stream is part of a capture sequence that + has been invalidated, but not terminated */ +}; + +/** + * Possible modes for stream capture thread interactions. For more details see + * ::cudaStreamBeginCapture and ::cudaThreadExchangeStreamCaptureMode + */ +enum __device_builtin__ cudaStreamCaptureMode { + cudaStreamCaptureModeGlobal = 0, + cudaStreamCaptureModeThreadLocal = 1, + cudaStreamCaptureModeRelaxed = 2 +}; + +enum __device_builtin__ cudaSynchronizationPolicy { + cudaSyncPolicyAuto = 1, + cudaSyncPolicySpin = 2, + cudaSyncPolicyYield = 3, + cudaSyncPolicyBlockingSync = 4 +}; + +/** + * Cluster scheduling policies. These may be passed to ::cudaFuncSetAttribute + */ +enum __device_builtin__ cudaClusterSchedulingPolicy { + cudaClusterSchedulingPolicyDefault = 0, /**< the default policy */ + cudaClusterSchedulingPolicySpread = 1, /**< spread the blocks within a cluster to the SMs */ + cudaClusterSchedulingPolicyLoadBalancing = 2 /**< allow the hardware to load-balance the blocks in a cluster to the SMs */ +}; + +/** + * Flags for ::cudaStreamUpdateCaptureDependencies + */ +enum __device_builtin__ cudaStreamUpdateCaptureDependenciesFlags { + cudaStreamAddCaptureDependencies = 0x0, /**< Add new nodes to the dependency set */ + cudaStreamSetCaptureDependencies = 0x1 /**< Replace the dependency set with the new nodes */ +}; + +/** + * Flags for user objects for graphs + */ +enum __device_builtin__ cudaUserObjectFlags { + cudaUserObjectNoDestructorSync = 0x1 /**< Indicates the destructor execution is not synchronized by any CUDA handle. */ +}; + +/** + * Flags for retaining user object references for graphs + */ +enum __device_builtin__ cudaUserObjectRetainFlags { + cudaGraphUserObjectMove = 0x1 /**< Transfer references from the caller rather than creating new references. 
*/ +}; + +/** + * CUDA graphics interop resource + */ +struct cudaGraphicsResource; + +/** + * CUDA graphics interop register flags + */ +enum __device_builtin__ cudaGraphicsRegisterFlags +{ + cudaGraphicsRegisterFlagsNone = 0, /**< Default */ + cudaGraphicsRegisterFlagsReadOnly = 1, /**< CUDA will not write to this resource */ + cudaGraphicsRegisterFlagsWriteDiscard = 2, /**< CUDA will only write to and will not read from this resource */ + cudaGraphicsRegisterFlagsSurfaceLoadStore = 4, /**< CUDA will bind this resource to a surface reference */ + cudaGraphicsRegisterFlagsTextureGather = 8 /**< CUDA will perform texture gather operations on this resource */ +}; + +/** + * CUDA graphics interop map flags + */ +enum __device_builtin__ cudaGraphicsMapFlags +{ + cudaGraphicsMapFlagsNone = 0, /**< Default; Assume resource can be read/written */ + cudaGraphicsMapFlagsReadOnly = 1, /**< CUDA will not write to this resource */ + cudaGraphicsMapFlagsWriteDiscard = 2 /**< CUDA will only write to and will not read from this resource */ +}; + +/** + * CUDA graphics interop array indices for cube maps + */ +enum __device_builtin__ cudaGraphicsCubeFace +{ + cudaGraphicsCubeFacePositiveX = 0x00, /**< Positive X face of cubemap */ + cudaGraphicsCubeFaceNegativeX = 0x01, /**< Negative X face of cubemap */ + cudaGraphicsCubeFacePositiveY = 0x02, /**< Positive Y face of cubemap */ + cudaGraphicsCubeFaceNegativeY = 0x03, /**< Negative Y face of cubemap */ + cudaGraphicsCubeFacePositiveZ = 0x04, /**< Positive Z face of cubemap */ + cudaGraphicsCubeFaceNegativeZ = 0x05 /**< Negative Z face of cubemap */ +}; + +/** + * CUDA resource types + */ +enum __device_builtin__ cudaResourceType +{ + cudaResourceTypeArray = 0x00, /**< Array resource */ + cudaResourceTypeMipmappedArray = 0x01, /**< Mipmapped array resource */ + cudaResourceTypeLinear = 0x02, /**< Linear resource */ + cudaResourceTypePitch2D = 0x03 /**< Pitch 2D resource */ +}; + +/** + * CUDA texture resource view formats + */ +enum __device_builtin__ cudaResourceViewFormat +{ + cudaResViewFormatNone = 0x00, /**< No resource view format (use underlying resource format) */ + cudaResViewFormatUnsignedChar1 = 0x01, /**< 1 channel unsigned 8-bit integers */ + cudaResViewFormatUnsignedChar2 = 0x02, /**< 2 channel unsigned 8-bit integers */ + cudaResViewFormatUnsignedChar4 = 0x03, /**< 4 channel unsigned 8-bit integers */ + cudaResViewFormatSignedChar1 = 0x04, /**< 1 channel signed 8-bit integers */ + cudaResViewFormatSignedChar2 = 0x05, /**< 2 channel signed 8-bit integers */ + cudaResViewFormatSignedChar4 = 0x06, /**< 4 channel signed 8-bit integers */ + cudaResViewFormatUnsignedShort1 = 0x07, /**< 1 channel unsigned 16-bit integers */ + cudaResViewFormatUnsignedShort2 = 0x08, /**< 2 channel unsigned 16-bit integers */ + cudaResViewFormatUnsignedShort4 = 0x09, /**< 4 channel unsigned 16-bit integers */ + cudaResViewFormatSignedShort1 = 0x0a, /**< 1 channel signed 16-bit integers */ + cudaResViewFormatSignedShort2 = 0x0b, /**< 2 channel signed 16-bit integers */ + cudaResViewFormatSignedShort4 = 0x0c, /**< 4 channel signed 16-bit integers */ + cudaResViewFormatUnsignedInt1 = 0x0d, /**< 1 channel unsigned 32-bit integers */ + cudaResViewFormatUnsignedInt2 = 0x0e, /**< 2 channel unsigned 32-bit integers */ + cudaResViewFormatUnsignedInt4 = 0x0f, /**< 4 channel unsigned 32-bit integers */ + cudaResViewFormatSignedInt1 = 0x10, /**< 1 channel signed 32-bit integers */ + cudaResViewFormatSignedInt2 = 0x11, /**< 2 channel signed 32-bit integers */ + 
cudaResViewFormatSignedInt4 = 0x12, /**< 4 channel signed 32-bit integers */ + cudaResViewFormatHalf1 = 0x13, /**< 1 channel 16-bit floating point */ + cudaResViewFormatHalf2 = 0x14, /**< 2 channel 16-bit floating point */ + cudaResViewFormatHalf4 = 0x15, /**< 4 channel 16-bit floating point */ + cudaResViewFormatFloat1 = 0x16, /**< 1 channel 32-bit floating point */ + cudaResViewFormatFloat2 = 0x17, /**< 2 channel 32-bit floating point */ + cudaResViewFormatFloat4 = 0x18, /**< 4 channel 32-bit floating point */ + cudaResViewFormatUnsignedBlockCompressed1 = 0x19, /**< Block compressed 1 */ + cudaResViewFormatUnsignedBlockCompressed2 = 0x1a, /**< Block compressed 2 */ + cudaResViewFormatUnsignedBlockCompressed3 = 0x1b, /**< Block compressed 3 */ + cudaResViewFormatUnsignedBlockCompressed4 = 0x1c, /**< Block compressed 4 unsigned */ + cudaResViewFormatSignedBlockCompressed4 = 0x1d, /**< Block compressed 4 signed */ + cudaResViewFormatUnsignedBlockCompressed5 = 0x1e, /**< Block compressed 5 unsigned */ + cudaResViewFormatSignedBlockCompressed5 = 0x1f, /**< Block compressed 5 signed */ + cudaResViewFormatUnsignedBlockCompressed6H = 0x20, /**< Block compressed 6 unsigned half-float */ + cudaResViewFormatSignedBlockCompressed6H = 0x21, /**< Block compressed 6 signed half-float */ + cudaResViewFormatUnsignedBlockCompressed7 = 0x22 /**< Block compressed 7 */ +}; + +/** + * CUDA resource descriptor + */ +struct __device_builtin__ cudaResourceDesc { + enum cudaResourceType resType; /**< Resource type */ + + union { + struct { + cudaArray_t array; /**< CUDA array */ + } array; + struct { + cudaMipmappedArray_t mipmap; /**< CUDA mipmapped array */ + } mipmap; + struct { + void *devPtr; /**< Device pointer */ + struct cudaChannelFormatDesc desc; /**< Channel descriptor */ + size_t sizeInBytes; /**< Size in bytes */ + } linear; + struct { + void *devPtr; /**< Device pointer */ + struct cudaChannelFormatDesc desc; /**< Channel descriptor */ + size_t width; /**< Width of the array in elements */ + size_t height; /**< Height of the array in elements */ + size_t pitchInBytes; /**< Pitch between two rows in bytes */ + } pitch2D; + } res; +}; + +/** + * CUDA resource view descriptor + */ +struct __device_builtin__ cudaResourceViewDesc +{ + enum cudaResourceViewFormat format; /**< Resource view format */ + size_t width; /**< Width of the resource view */ + size_t height; /**< Height of the resource view */ + size_t depth; /**< Depth of the resource view */ + unsigned int firstMipmapLevel; /**< First defined mipmap level */ + unsigned int lastMipmapLevel; /**< Last defined mipmap level */ + unsigned int firstLayer; /**< First layer index */ + unsigned int lastLayer; /**< Last layer index */ +}; + +/** + * CUDA pointer attributes + */ +struct __device_builtin__ cudaPointerAttributes +{ + /** + * The type of memory - ::cudaMemoryTypeUnregistered, ::cudaMemoryTypeHost, + * ::cudaMemoryTypeDevice or ::cudaMemoryTypeManaged. + */ + enum cudaMemoryType type; + + /** + * The device against which the memory was allocated or registered. + * If the memory type is ::cudaMemoryTypeDevice then this identifies + * the device on which the memory referred physically resides. If + * the memory type is ::cudaMemoryTypeHost or::cudaMemoryTypeManaged then + * this identifies the device which was current when the memory was allocated + * or registered (and if that device is deinitialized then this allocation + * will vanish with that device's state). 
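+ *
+ * A typical way to fill this structure (an illustrative addition, not part
+ * of the upstream header; \p ptr is an assumed host or device pointer):
+ * \code
+ * cudaPointerAttributes attr;
+ * if (cudaPointerGetAttributes(&attr, ptr) == cudaSuccess &&
+ *     attr.type == cudaMemoryTypeDevice)
+ *     printf("device allocation on ordinal %d\n", attr.device);
+ * \endcode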
+ */ + int device; + + /** + * The address which may be dereferenced on the current device to access + * the memory or NULL if no such address exists. + */ + void *devicePointer; + + /** + * The address which may be dereferenced on the host to access the + * memory or NULL if no such address exists. + * + * \note CUDA doesn't check if unregistered memory is allocated so this field + * may contain invalid pointer if an invalid pointer has been passed to CUDA. + */ + void *hostPointer; +}; + +/** + * CUDA function attributes + */ +struct __device_builtin__ cudaFuncAttributes +{ + /** + * The size in bytes of statically-allocated shared memory per block + * required by this function. This does not include dynamically-allocated + * shared memory requested by the user at runtime. + */ + size_t sharedSizeBytes; + + /** + * The size in bytes of user-allocated constant memory required by this + * function. + */ + size_t constSizeBytes; + + /** + * The size in bytes of local memory used by each thread of this function. + */ + size_t localSizeBytes; + + /** + * The maximum number of threads per block, beyond which a launch of the + * function would fail. This number depends on both the function and the + * device on which the function is currently loaded. + */ + int maxThreadsPerBlock; + + /** + * The number of registers used by each thread of this function. + */ + int numRegs; + + /** + * The PTX virtual architecture version for which the function was + * compiled. This value is the major PTX version * 10 + the minor PTX + * version, so a PTX version 1.3 function would return the value 13. + */ + int ptxVersion; + + /** + * The binary architecture version for which the function was compiled. + * This value is the major binary version * 10 + the minor binary version, + * so a binary version 1.3 function would return the value 13. + */ + int binaryVersion; + + /** + * The attribute to indicate whether the function has been compiled with + * user specified option "-Xptxas --dlcm=ca" set. + */ + int cacheModeCA; + + /** + * The maximum size in bytes of dynamic shared memory per block for + * this function. Any launch must have a dynamic shared memory size + * smaller than this value. + */ + int maxDynamicSharedSizeBytes; + + /** + * On devices where the L1 cache and shared memory use the same hardware resources, + * this sets the shared memory carveout preference, in percent of the maximum shared memory. + * Refer to ::cudaDevAttrMaxSharedMemoryPerMultiprocessor. + * This is only a hint, and the driver can choose a different ratio if required to execute the function. 
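+ *
+ * For example, a 50 percent carveout request could be expressed as follows
+ * (an illustrative addition, not part of the upstream header; \p myKernel is
+ * an assumed __global__ function):
+ * \code
+ * cudaFuncSetAttribute(myKernel,
+ *                      cudaFuncAttributePreferredSharedMemoryCarveout, 50);
+ * \endcode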
+ * See ::cudaFuncSetAttribute + */ + int preferredShmemCarveout; +}; + +/** + * CUDA function attributes that can be set using ::cudaFuncSetAttribute + */ +enum __device_builtin__ cudaFuncAttribute +{ + cudaFuncAttributeMaxDynamicSharedMemorySize = 8, /**< Maximum dynamic shared memory size */ + cudaFuncAttributePreferredSharedMemoryCarveout = 9, /**< Preferred shared memory-L1 cache split */ + cudaFuncAttributeClusterDimMustBeSet = 10, /**< Indicator to enforce valid cluster dimension specification on kernel launch */ + cudaFuncAttributeRequiredClusterWidth = 11, /**< Required cluster width */ + cudaFuncAttributeRequiredClusterHeight = 12, /**< Required cluster height */ + cudaFuncAttributeRequiredClusterDepth = 13, /**< Required cluster depth */ + cudaFuncAttributeNonPortableClusterSizeAllowed = 14, /**< Whether non-portable cluster scheduling policy is supported */ + cudaFuncAttributeClusterSchedulingPolicyPreference = 15, /**< Required cluster scheduling policy preference */ + cudaFuncAttributeMax +}; + +/** + * CUDA function cache configurations + */ +enum __device_builtin__ cudaFuncCache +{ + cudaFuncCachePreferNone = 0, /**< Default function cache configuration, no preference */ + cudaFuncCachePreferShared = 1, /**< Prefer larger shared memory and smaller L1 cache */ + cudaFuncCachePreferL1 = 2, /**< Prefer larger L1 cache and smaller shared memory */ + cudaFuncCachePreferEqual = 3 /**< Prefer equal size L1 cache and shared memory */ +}; + +/** + * CUDA shared memory configuration + */ + +enum __device_builtin__ cudaSharedMemConfig +{ + cudaSharedMemBankSizeDefault = 0, + cudaSharedMemBankSizeFourByte = 1, + cudaSharedMemBankSizeEightByte = 2 +}; + +/** + * Shared memory carveout configurations. These may be passed to cudaFuncSetAttribute + */ +enum __device_builtin__ cudaSharedCarveout { + cudaSharedmemCarveoutDefault = -1, /**< No preference for shared memory or L1 (default) */ + cudaSharedmemCarveoutMaxShared = 100, /**< Prefer maximum available shared memory, minimum L1 cache */ + cudaSharedmemCarveoutMaxL1 = 0 /**< Prefer maximum available L1 cache, minimum shared memory */ +}; + +/** + * CUDA device compute modes + */ +enum __device_builtin__ cudaComputeMode +{ + cudaComputeModeDefault = 0, /**< Default compute mode (Multiple threads can use ::cudaSetDevice() with this device) */ + cudaComputeModeExclusive = 1, /**< Compute-exclusive-thread mode (Only one thread in one process will be able to use ::cudaSetDevice() with this device) */ + cudaComputeModeProhibited = 2, /**< Compute-prohibited mode (No threads can use ::cudaSetDevice() with this device) */ + cudaComputeModeExclusiveProcess = 3 /**< Compute-exclusive-process mode (Many threads in one process will be able to use ::cudaSetDevice() with this device) */ +}; + +/** + * CUDA Limits + */ +enum __device_builtin__ cudaLimit +{ + cudaLimitStackSize = 0x00, /**< GPU thread stack size */ + cudaLimitPrintfFifoSize = 0x01, /**< GPU printf FIFO size */ + cudaLimitMallocHeapSize = 0x02, /**< GPU malloc heap size */ + cudaLimitDevRuntimeSyncDepth = 0x03, /**< GPU device runtime synchronize depth */ + cudaLimitDevRuntimePendingLaunchCount = 0x04, /**< GPU device runtime pending launch count */ + cudaLimitMaxL2FetchGranularity = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). 
This is a hint */
+    cudaLimitPersistingL2CacheSize            = 0x06  /**< A size in bytes for L2 persisting lines cache size */
+};
+
+/**
+ * CUDA Memory Advise values
+ */
+enum __device_builtin__ cudaMemoryAdvise
+{
+    cudaMemAdviseSetReadMostly          = 1, /**< Data will mostly be read and only occasionally be written to */
+    cudaMemAdviseUnsetReadMostly        = 2, /**< Undo the effect of ::cudaMemAdviseSetReadMostly */
+    cudaMemAdviseSetPreferredLocation   = 3, /**< Set the preferred location for the data as the specified device */
+    cudaMemAdviseUnsetPreferredLocation = 4, /**< Clear the preferred location for the data */
+    cudaMemAdviseSetAccessedBy          = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */
+    cudaMemAdviseUnsetAccessedBy        = 6  /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */
+};
+
+/**
+ * CUDA range attributes
+ */
+enum __device_builtin__ cudaMemRangeAttribute
+{
+    cudaMemRangeAttributeReadMostly           = 1, /**< Whether the range will mostly be read and only occasionally be written to */
+    cudaMemRangeAttributePreferredLocation    = 2, /**< The preferred location of the range */
+    cudaMemRangeAttributeAccessedBy           = 3, /**< Memory range has ::cudaMemAdviseSetAccessedBy set for specified device */
+    cudaMemRangeAttributeLastPrefetchLocation = 4  /**< The last location to which the range was prefetched */
+};
+
+/**
+ * CUDA Profiler Output modes
+ */
+enum __device_builtin__ cudaOutputMode
+{
+    cudaKeyValuePair    = 0x00, /**< Output mode Key-Value pair format. */
+    cudaCSV             = 0x01  /**< Output mode Comma separated values format. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes APIs supported on the device
+ */
+enum __device_builtin__ cudaFlushGPUDirectRDMAWritesOptions {
+    cudaFlushGPUDirectRDMAWritesOptionHost   = 1<<0, /**< ::cudaDeviceFlushGPUDirectRDMAWrites() and its CUDA Driver API counterpart are supported on the device. */
+    cudaFlushGPUDirectRDMAWritesOptionMemOps = 1<<1  /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the CUDA device. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes ordering features of the device
+ */
+enum __device_builtin__ cudaGPUDirectRDMAWritesOrdering {
+    cudaGPUDirectRDMAWritesOrderingNone       = 0,   /**< The device does not natively support ordering of GPUDirect RDMA writes. ::cudaFlushGPUDirectRDMAWrites() can be leveraged if supported. */
+    cudaGPUDirectRDMAWritesOrderingOwner      = 100, /**< Natively, the device can consistently consume GPUDirect RDMA writes, although other CUDA devices may not. */
+    cudaGPUDirectRDMAWritesOrderingAllDevices = 200  /**< Any CUDA device in the system can consistently consume GPUDirect RDMA writes to this device. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes scopes
+ */
+enum __device_builtin__ cudaFlushGPUDirectRDMAWritesScope {
+    cudaFlushGPUDirectRDMAWritesToOwner      = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */
+    cudaFlushGPUDirectRDMAWritesToAllDevices = 200  /**< Blocks until remote writes are visible to all CUDA device contexts. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes targets
+ */
+enum __device_builtin__ cudaFlushGPUDirectRDMAWritesTarget {
+    cudaFlushGPUDirectRDMAWritesTargetCurrentDevice  /**< Sets the target for ::cudaDeviceFlushGPUDirectRDMAWrites() to the currently active CUDA device context.
*/ +}; + + +/** + * CUDA device attributes + */ +enum __device_builtin__ cudaDeviceAttr +{ + cudaDevAttrMaxThreadsPerBlock = 1, /**< Maximum number of threads per block */ + cudaDevAttrMaxBlockDimX = 2, /**< Maximum block dimension X */ + cudaDevAttrMaxBlockDimY = 3, /**< Maximum block dimension Y */ + cudaDevAttrMaxBlockDimZ = 4, /**< Maximum block dimension Z */ + cudaDevAttrMaxGridDimX = 5, /**< Maximum grid dimension X */ + cudaDevAttrMaxGridDimY = 6, /**< Maximum grid dimension Y */ + cudaDevAttrMaxGridDimZ = 7, /**< Maximum grid dimension Z */ + cudaDevAttrMaxSharedMemoryPerBlock = 8, /**< Maximum shared memory available per block in bytes */ + cudaDevAttrTotalConstantMemory = 9, /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */ + cudaDevAttrWarpSize = 10, /**< Warp size in threads */ + cudaDevAttrMaxPitch = 11, /**< Maximum pitch in bytes allowed by memory copies */ + cudaDevAttrMaxRegistersPerBlock = 12, /**< Maximum number of 32-bit registers available per block */ + cudaDevAttrClockRate = 13, /**< Peak clock frequency in kilohertz */ + cudaDevAttrTextureAlignment = 14, /**< Alignment requirement for textures */ + cudaDevAttrGpuOverlap = 15, /**< Device can possibly copy memory and execute a kernel concurrently */ + cudaDevAttrMultiProcessorCount = 16, /**< Number of multiprocessors on device */ + cudaDevAttrKernelExecTimeout = 17, /**< Specifies whether there is a run time limit on kernels */ + cudaDevAttrIntegrated = 18, /**< Device is integrated with host memory */ + cudaDevAttrCanMapHostMemory = 19, /**< Device can map host memory into CUDA address space */ + cudaDevAttrComputeMode = 20, /**< Compute mode (See ::cudaComputeMode for details) */ + cudaDevAttrMaxTexture1DWidth = 21, /**< Maximum 1D texture width */ + cudaDevAttrMaxTexture2DWidth = 22, /**< Maximum 2D texture width */ + cudaDevAttrMaxTexture2DHeight = 23, /**< Maximum 2D texture height */ + cudaDevAttrMaxTexture3DWidth = 24, /**< Maximum 3D texture width */ + cudaDevAttrMaxTexture3DHeight = 25, /**< Maximum 3D texture height */ + cudaDevAttrMaxTexture3DDepth = 26, /**< Maximum 3D texture depth */ + cudaDevAttrMaxTexture2DLayeredWidth = 27, /**< Maximum 2D layered texture width */ + cudaDevAttrMaxTexture2DLayeredHeight = 28, /**< Maximum 2D layered texture height */ + cudaDevAttrMaxTexture2DLayeredLayers = 29, /**< Maximum layers in a 2D layered texture */ + cudaDevAttrSurfaceAlignment = 30, /**< Alignment requirement for surfaces */ + cudaDevAttrConcurrentKernels = 31, /**< Device can possibly execute multiple kernels concurrently */ + cudaDevAttrEccEnabled = 32, /**< Device has ECC support enabled */ + cudaDevAttrPciBusId = 33, /**< PCI bus ID of the device */ + cudaDevAttrPciDeviceId = 34, /**< PCI device ID of the device */ + cudaDevAttrTccDriver = 35, /**< Device is using TCC driver model */ + cudaDevAttrMemoryClockRate = 36, /**< Peak memory clock frequency in kilohertz */ + cudaDevAttrGlobalMemoryBusWidth = 37, /**< Global memory bus width in bits */ + cudaDevAttrL2CacheSize = 38, /**< Size of L2 cache in bytes */ + cudaDevAttrMaxThreadsPerMultiProcessor = 39, /**< Maximum resident threads per multiprocessor */ + cudaDevAttrAsyncEngineCount = 40, /**< Number of asynchronous engines */ + cudaDevAttrUnifiedAddressing = 41, /**< Device shares a unified address space with the host */ + cudaDevAttrMaxTexture1DLayeredWidth = 42, /**< Maximum 1D layered texture width */ + cudaDevAttrMaxTexture1DLayeredLayers = 43, /**< Maximum layers in a 1D layered texture */ + 
cudaDevAttrMaxTexture2DGatherWidth = 45, /**< Maximum 2D texture width if cudaArrayTextureGather is set */ + cudaDevAttrMaxTexture2DGatherHeight = 46, /**< Maximum 2D texture height if cudaArrayTextureGather is set */ + cudaDevAttrMaxTexture3DWidthAlt = 47, /**< Alternate maximum 3D texture width */ + cudaDevAttrMaxTexture3DHeightAlt = 48, /**< Alternate maximum 3D texture height */ + cudaDevAttrMaxTexture3DDepthAlt = 49, /**< Alternate maximum 3D texture depth */ + cudaDevAttrPciDomainId = 50, /**< PCI domain ID of the device */ + cudaDevAttrTexturePitchAlignment = 51, /**< Pitch alignment requirement for textures */ + cudaDevAttrMaxTextureCubemapWidth = 52, /**< Maximum cubemap texture width/height */ + cudaDevAttrMaxTextureCubemapLayeredWidth = 53, /**< Maximum cubemap layered texture width/height */ + cudaDevAttrMaxTextureCubemapLayeredLayers = 54, /**< Maximum layers in a cubemap layered texture */ + cudaDevAttrMaxSurface1DWidth = 55, /**< Maximum 1D surface width */ + cudaDevAttrMaxSurface2DWidth = 56, /**< Maximum 2D surface width */ + cudaDevAttrMaxSurface2DHeight = 57, /**< Maximum 2D surface height */ + cudaDevAttrMaxSurface3DWidth = 58, /**< Maximum 3D surface width */ + cudaDevAttrMaxSurface3DHeight = 59, /**< Maximum 3D surface height */ + cudaDevAttrMaxSurface3DDepth = 60, /**< Maximum 3D surface depth */ + cudaDevAttrMaxSurface1DLayeredWidth = 61, /**< Maximum 1D layered surface width */ + cudaDevAttrMaxSurface1DLayeredLayers = 62, /**< Maximum layers in a 1D layered surface */ + cudaDevAttrMaxSurface2DLayeredWidth = 63, /**< Maximum 2D layered surface width */ + cudaDevAttrMaxSurface2DLayeredHeight = 64, /**< Maximum 2D layered surface height */ + cudaDevAttrMaxSurface2DLayeredLayers = 65, /**< Maximum layers in a 2D layered surface */ + cudaDevAttrMaxSurfaceCubemapWidth = 66, /**< Maximum cubemap surface width */ + cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67, /**< Maximum cubemap layered surface width */ + cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68, /**< Maximum layers in a cubemap layered surface */ + cudaDevAttrMaxTexture1DLinearWidth = 69, /**< Maximum 1D linear texture width */ + cudaDevAttrMaxTexture2DLinearWidth = 70, /**< Maximum 2D linear texture width */ + cudaDevAttrMaxTexture2DLinearHeight = 71, /**< Maximum 2D linear texture height */ + cudaDevAttrMaxTexture2DLinearPitch = 72, /**< Maximum 2D linear texture pitch in bytes */ + cudaDevAttrMaxTexture2DMipmappedWidth = 73, /**< Maximum mipmapped 2D texture width */ + cudaDevAttrMaxTexture2DMipmappedHeight = 74, /**< Maximum mipmapped 2D texture height */ + cudaDevAttrComputeCapabilityMajor = 75, /**< Major compute capability version number */ + cudaDevAttrComputeCapabilityMinor = 76, /**< Minor compute capability version number */ + cudaDevAttrMaxTexture1DMipmappedWidth = 77, /**< Maximum mipmapped 1D texture width */ + cudaDevAttrStreamPrioritiesSupported = 78, /**< Device supports stream priorities */ + cudaDevAttrGlobalL1CacheSupported = 79, /**< Device supports caching globals in L1 */ + cudaDevAttrLocalL1CacheSupported = 80, /**< Device supports caching locals in L1 */ + cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81, /**< Maximum shared memory available per multiprocessor in bytes */ + cudaDevAttrMaxRegistersPerMultiprocessor = 82, /**< Maximum number of 32-bit registers available per multiprocessor */ + cudaDevAttrManagedMemory = 83, /**< Device can allocate managed memory on this system */ + cudaDevAttrIsMultiGpuBoard = 84, /**< Device is on a multi-GPU board */ + cudaDevAttrMultiGpuBoardGroupID = 
85, /**< Unique identifier for a group of devices on the same multi-GPU board */ + cudaDevAttrHostNativeAtomicSupported = 86, /**< Link between the device and the host supports native atomic operations */ + cudaDevAttrSingleToDoublePrecisionPerfRatio = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + cudaDevAttrPageableMemoryAccess = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ + cudaDevAttrConcurrentManagedAccess = 89, /**< Device can coherently access managed memory concurrently with the CPU */ + cudaDevAttrComputePreemptionSupported = 90, /**< Device supports Compute Preemption */ + cudaDevAttrCanUseHostPointerForRegisteredMem = 91, /**< Device can access host registered memory at the same virtual address as the CPU */ + cudaDevAttrReserved92 = 92, + cudaDevAttrReserved93 = 93, + cudaDevAttrReserved94 = 94, + cudaDevAttrCooperativeLaunch = 95, /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel*/ + cudaDevAttrCooperativeMultiDeviceLaunch = 96, /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */ + cudaDevAttrMaxSharedMemoryPerBlockOptin = 97, /**< The maximum optin shared memory per block. This value may vary by chip. See ::cudaFuncSetAttribute */ + cudaDevAttrCanFlushRemoteWrites = 98, /**< Device supports flushing of outstanding remote writes. */ + cudaDevAttrHostRegisterSupported = 99, /**< Device supports host memory registration via ::cudaHostRegister. */ + cudaDevAttrPageableMemoryAccessUsesHostPageTables = 100, /**< Device accesses pageable memory via the host's page tables. */ + cudaDevAttrDirectManagedMemAccessFromHost = 101, /**< Host can directly access managed memory on the device without migration. */ + cudaDevAttrMaxBlocksPerMultiprocessor = 106, /**< Maximum number of blocks per multiprocessor */ + cudaDevAttrMaxPersistingL2CacheSize = 108, /**< Maximum L2 persisting lines capacity setting in bytes. */ + cudaDevAttrMaxAccessPolicyWindowSize = 109, /**< Maximum value of cudaAccessPolicyWindow::num_bytes. */ + cudaDevAttrReservedSharedMemoryPerBlock = 111, /**< Shared memory reserved by CUDA driver per block in bytes */ + cudaDevAttrSparseCudaArraySupported = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */ + cudaDevAttrHostRegisterReadOnlySupported = 113, /**< Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU */ + cudaDevAttrTimelineSemaphoreInteropSupported = 114, /**< External timeline semaphore interop is supported on the device */ + cudaDevAttrMaxTimelineSemaphoreInteropSupported = 114, /**< Deprecated, External timeline semaphore interop is supported on the device */ + cudaDevAttrMemoryPoolsSupported = 115, /**< Device supports using the ::cudaMallocAsync and ::cudaMemPool family of APIs */ + cudaDevAttrGPUDirectRDMASupported = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */ + cudaDevAttrGPUDirectRDMAFlushWritesOptions = 117, /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are listed in the ::cudaFlushGPUDirectRDMAWritesOptions enum */ + cudaDevAttrGPUDirectRDMAWritesOrdering = 118, /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. 
See ::cudaGPUDirectRDMAWritesOrdering for the numerical values returned here. */ + cudaDevAttrMemoryPoolSupportedHandleTypes = 119, /**< Handle types supported with mempool based IPC */ + cudaDevAttrClusterLaunch = 120, /**< Indicates device supports cluster launch */ + cudaDevAttrDeferredMappingCudaArraySupported = 121, /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */ + cudaDevAttrMax +}; + +/** + * CUDA memory pool attributes + */ +enum __device_builtin__ cudaMemPoolAttr +{ + /** + * (value type = int) + * Allow cuMemAllocAsync to use memory asynchronously freed + * in another streams as long as a stream ordering dependency + * of the allocating stream on the free action exists. + * Cuda events and null stream interactions can create the required + * stream ordered dependencies. (default enabled) + */ + cudaMemPoolReuseFollowEventDependencies = 0x1, + + /** + * (value type = int) + * Allow reuse of already completed frees when there is no dependency + * between the free and allocation. (default enabled) + */ + cudaMemPoolReuseAllowOpportunistic = 0x2, + + /** + * (value type = int) + * Allow cuMemAllocAsync to insert new stream dependencies + * in order to establish the stream ordering required to reuse + * a piece of memory released by cuFreeAsync (default enabled). + */ + cudaMemPoolReuseAllowInternalDependencies = 0x3, + + + /** + * (value type = cuuint64_t) + * Amount of reserved memory in bytes to hold onto before trying + * to release memory back to the OS. When more than the release + * threshold bytes of memory are held by the memory pool, the + * allocator will try to release memory back to the OS on the + * next call to stream, event or context synchronize. (default 0) + */ + cudaMemPoolAttrReleaseThreshold = 0x4, + + /** + * (value type = cuuint64_t) + * Amount of backing memory currently allocated for the mempool. + */ + cudaMemPoolAttrReservedMemCurrent = 0x5, + + /** + * (value type = cuuint64_t) + * High watermark of backing memory allocated for the mempool since the + * last time it was reset. High watermark can only be reset to zero. + */ + cudaMemPoolAttrReservedMemHigh = 0x6, + + /** + * (value type = cuuint64_t) + * Amount of memory from the pool that is currently in use by the application. + */ + cudaMemPoolAttrUsedMemCurrent = 0x7, + + /** + * (value type = cuuint64_t) + * High watermark of the amount of memory from the pool that was in use by the application since + * the last time it was reset. High watermark can only be reset to zero. + */ + cudaMemPoolAttrUsedMemHigh = 0x8 +}; + +/** + * Specifies the type of location + */ +enum __device_builtin__ cudaMemLocationType { + cudaMemLocationTypeInvalid = 0, + cudaMemLocationTypeDevice = 1 /**< Location is a device location, thus id is a device ordinal */ +}; + +/** + * Specifies a memory location. + * + * To specify a gpu, set type = ::cudaMemLocationTypeDevice and set id = the gpu's device ordinal. + */ +struct __device_builtin__ cudaMemLocation { + enum cudaMemLocationType type; /**< Specifies the location type, which modifies the meaning of id. */ + int id; /**< identifier for a given this location's ::CUmemLocationType. */ +}; + +/** + * Specifies the memory protection flags for mapping. 
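+ *
+ * These flags are consumed through ::cudaMemAccessDesc, for example when
+ * granting a peer device access to a memory pool (an illustrative addition,
+ * not part of the upstream header; \p pool and the peer ordinal 1 are
+ * assumed, as is ::cudaMemPoolSetAccess from the same CUDA 11 toolkit):
+ * \code
+ * cudaMemAccessDesc desc = {};
+ * desc.location.type = cudaMemLocationTypeDevice;
+ * desc.location.id   = 1;                       // assumed peer device ordinal
+ * desc.flags         = cudaMemAccessFlagsProtReadWrite;
+ * cudaMemPoolSetAccess(pool, &desc, 1);
+ * \endcode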
+ */ +enum __device_builtin__ cudaMemAccessFlags { + cudaMemAccessFlagsProtNone = 0, /**< Default, make the address range not accessible */ + cudaMemAccessFlagsProtRead = 1, /**< Make the address range read accessible */ + cudaMemAccessFlagsProtReadWrite = 3 /**< Make the address range read-write accessible */ +}; + +/** + * Memory access descriptor + */ +struct __device_builtin__ cudaMemAccessDesc { + struct cudaMemLocation location; /**< Location on which the request is to change it's accessibility */ + enum cudaMemAccessFlags flags; /**< ::CUmemProt accessibility flags to set on the request */ +}; + +/** + * Defines the allocation types available + */ +enum __device_builtin__ cudaMemAllocationType { + cudaMemAllocationTypeInvalid = 0x0, + /** This allocation type is 'pinned', i.e. cannot migrate from its current + * location while the application is actively using it + */ + cudaMemAllocationTypePinned = 0x1, + cudaMemAllocationTypeMax = 0x7FFFFFFF +}; + +/** + * Flags for specifying particular handle types + */ +enum __device_builtin__ cudaMemAllocationHandleType { + cudaMemHandleTypeNone = 0x0, /**< Does not allow any export mechanism. > */ + cudaMemHandleTypePosixFileDescriptor = 0x1, /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */ + cudaMemHandleTypeWin32 = 0x2, /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */ + cudaMemHandleTypeWin32Kmt = 0x4 /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */ +}; + +/** + * Specifies the properties of allocations made from the pool. + */ +struct __device_builtin__ cudaMemPoolProps { + enum cudaMemAllocationType allocType; /**< Allocation type. Currently must be specified as cudaMemAllocationTypePinned */ + enum cudaMemAllocationHandleType handleTypes; /**< Handle types that will be supported by allocations from the pool. */ + struct cudaMemLocation location; /**< Location allocations should reside. */ + /** + * Windows-specific LPSECURITYATTRIBUTES required when + * ::cudaMemHandleTypeWin32 is specified. This security attribute defines + * the scope of which exported allocations may be tranferred to other + * processes. In all other cases, this field is required to be zero. + */ + void *win32SecurityAttributes; + unsigned char reserved[64]; /**< reserved for future use, must be 0 */ +}; + +/** + * Opaque data for exporting a pool allocation + */ +struct __device_builtin__ cudaMemPoolPtrExportData { + unsigned char reserved[64]; +}; + +/** + * Memory allocation node parameters + */ +struct __device_builtin__ cudaMemAllocNodeParams { + /** + * in: location where the allocation should reside (specified in ::location). + * ::handleTypes must be ::cudaMemHandleTypeNone. IPC is not supported. + */ + struct cudaMemPoolProps poolProps; /**< in: array of memory access descriptors. Used to describe peer GPU access */ + const struct cudaMemAccessDesc *accessDescs; /**< in: number of memory access descriptors. Must not exceed the number of GPUs. */ + size_t accessDescCount; /**< in: Number of `accessDescs`s */ + size_t bytesize; /**< in: size in bytes of the requested allocation */ + void *dptr; /**< out: address of the allocation returned by CUDA */ +}; + +/** + * Graph memory attributes + */ +enum __device_builtin__ cudaGraphMemAttributeType { + /** + * (value type = cuuint64_t) + * Amount of memory, in bytes, currently associated with graphs. 
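+ *
+ * A query sketch (an illustrative addition, not part of the upstream header;
+ * it assumes device ordinal 0 and ::cudaDeviceGetGraphMemAttribute from the
+ * same CUDA 11 toolkit; the value type here is cuuint64_t):
+ * \code
+ * unsigned long long used = 0;  // cuuint64_t-sized storage
+ * cudaDeviceGetGraphMemAttribute(0, cudaGraphMemAttrUsedMemCurrent, &used);
+ * \endcode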
+ */ + cudaGraphMemAttrUsedMemCurrent = 0x0, + + /** + * (value type = cuuint64_t) + * High watermark of memory, in bytes, associated with graphs since the + * last time it was reset. High watermark can only be reset to zero. + */ + cudaGraphMemAttrUsedMemHigh = 0x1, + + /** + * (value type = cuuint64_t) + * Amount of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + */ + cudaGraphMemAttrReservedMemCurrent = 0x2, + + /** + * (value type = cuuint64_t) + * High watermark of memory, in bytes, currently allocated for use by + * the CUDA graphs asynchronous allocator. + */ + cudaGraphMemAttrReservedMemHigh = 0x3 +}; + +/** + * CUDA device P2P attributes + */ + +enum __device_builtin__ cudaDeviceP2PAttr { + cudaDevP2PAttrPerformanceRank = 1, /**< A relative value indicating the performance of the link between two devices */ + cudaDevP2PAttrAccessSupported = 2, /**< Peer access is enabled */ + cudaDevP2PAttrNativeAtomicSupported = 3, /**< Native atomic operation over the link supported */ + cudaDevP2PAttrCudaArrayAccessSupported = 4 /**< Accessing CUDA arrays over the link supported */ +}; + +/** + * CUDA UUID types + */ +#ifndef CU_UUID_HAS_BEEN_DEFINED +#define CU_UUID_HAS_BEEN_DEFINED +struct __device_builtin__ CUuuid_st { /**< CUDA definition of UUID */ + char bytes[16]; +}; +typedef __device_builtin__ struct CUuuid_st CUuuid; +#endif +typedef __device_builtin__ struct CUuuid_st cudaUUID_t; + +/** + * CUDA device properties + */ +struct __device_builtin__ cudaDeviceProp +{ + char name[256]; /**< ASCII string identifying device */ + cudaUUID_t uuid; /**< 16-byte unique identifier */ + char luid[8]; /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */ + unsigned int luidDeviceNodeMask; /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */ + size_t totalGlobalMem; /**< Global memory available on device in bytes */ + size_t sharedMemPerBlock; /**< Shared memory available per block in bytes */ + int regsPerBlock; /**< 32-bit registers available per block */ + int warpSize; /**< Warp size in threads */ + size_t memPitch; /**< Maximum pitch in bytes allowed by memory copies */ + int maxThreadsPerBlock; /**< Maximum number of threads per block */ + int maxThreadsDim[3]; /**< Maximum size of each dimension of a block */ + int maxGridSize[3]; /**< Maximum size of each dimension of a grid */ + int clockRate; /**< Clock frequency in kilohertz */ + size_t totalConstMem; /**< Constant memory available on device in bytes */ + int major; /**< Major compute capability */ + int minor; /**< Minor compute capability */ + size_t textureAlignment; /**< Alignment requirement for textures */ + size_t texturePitchAlignment; /**< Pitch alignment requirement for texture references bound to pitched memory */ + int deviceOverlap; /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */ + int multiProcessorCount; /**< Number of multiprocessors on device */ + int kernelExecTimeoutEnabled; /**< Specified whether there is a run time limit on kernels */ + int integrated; /**< Device is integrated as opposed to discrete */ + int canMapHostMemory; /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */ + int computeMode; /**< Compute mode (See ::cudaComputeMode) */ + int maxTexture1D; /**< Maximum 1D texture size */ + int maxTexture1DMipmap; /**< Maximum 1D mipmapped texture size */ + int maxTexture1DLinear; /**< Deprecated, do not use. 
Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */ + int maxTexture2D[2]; /**< Maximum 2D texture dimensions */ + int maxTexture2DMipmap[2]; /**< Maximum 2D mipmapped texture dimensions */ + int maxTexture2DLinear[3]; /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */ + int maxTexture2DGather[2]; /**< Maximum 2D texture dimensions if texture gather operations have to be performed */ + int maxTexture3D[3]; /**< Maximum 3D texture dimensions */ + int maxTexture3DAlt[3]; /**< Maximum alternate 3D texture dimensions */ + int maxTextureCubemap; /**< Maximum Cubemap texture dimensions */ + int maxTexture1DLayered[2]; /**< Maximum 1D layered texture dimensions */ + int maxTexture2DLayered[3]; /**< Maximum 2D layered texture dimensions */ + int maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */ + int maxSurface1D; /**< Maximum 1D surface size */ + int maxSurface2D[2]; /**< Maximum 2D surface dimensions */ + int maxSurface3D[3]; /**< Maximum 3D surface dimensions */ + int maxSurface1DLayered[2]; /**< Maximum 1D layered surface dimensions */ + int maxSurface2DLayered[3]; /**< Maximum 2D layered surface dimensions */ + int maxSurfaceCubemap; /**< Maximum Cubemap surface dimensions */ + int maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */ + size_t surfaceAlignment; /**< Alignment requirements for surfaces */ + int concurrentKernels; /**< Device can possibly execute multiple kernels concurrently */ + int ECCEnabled; /**< Device has ECC support enabled */ + int pciBusID; /**< PCI bus ID of the device */ + int pciDeviceID; /**< PCI device ID of the device */ + int pciDomainID; /**< PCI domain ID of the device */ + int tccDriver; /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */ + int asyncEngineCount; /**< Number of asynchronous engines */ + int unifiedAddressing; /**< Device shares a unified address space with the host */ + int memoryClockRate; /**< Peak memory clock frequency in kilohertz */ + int memoryBusWidth; /**< Global memory bus width in bits */ + int l2CacheSize; /**< Size of L2 cache in bytes */ + int persistingL2CacheMaxSize; /**< Device's maximum l2 persisting lines capacity setting in bytes */ + int maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */ + int streamPrioritiesSupported; /**< Device supports stream priorities */ + int globalL1CacheSupported; /**< Device supports caching globals in L1 */ + int localL1CacheSupported; /**< Device supports caching locals in L1 */ + size_t sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */ + int regsPerMultiprocessor; /**< 32-bit registers available per multiprocessor */ + int managedMemory; /**< Device supports allocating managed memory on this system */ + int isMultiGpuBoard; /**< Device is on a multi-GPU board */ + int multiGpuBoardGroupID; /**< Unique identifier for a group of devices on the same multi-GPU board */ + int hostNativeAtomicSupported; /**< Link between the device and the host supports native atomic operations */ + int singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */ + int pageableMemoryAccess; /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */ + int concurrentManagedAccess; /**< Device can coherently access managed memory concurrently with the CPU */ + int 
computePreemptionSupported; /**< Device supports Compute Preemption */ + int canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */ + int cooperativeLaunch; /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */ + int cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */ + size_t sharedMemPerBlockOptin; /**< Per device maximum shared memory per block usable by special opt in */ + int pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */ + int directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */ + int maxBlocksPerMultiProcessor; /**< Maximum number of resident blocks per multiprocessor */ + int accessPolicyMaxWindowSize; /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */ + size_t reservedSharedMemPerBlock; /**< Shared memory reserved by CUDA driver per block in bytes */ +}; + +#define cudaDevicePropDontCare \ + { \ + {'\0'}, /* char name[256]; */ \ + {{0}}, /* cudaUUID_t uuid; */ \ + {'\0'}, /* char luid[8]; */ \ + 0, /* unsigned int luidDeviceNodeMask */ \ + 0, /* size_t totalGlobalMem; */ \ + 0, /* size_t sharedMemPerBlock; */ \ + 0, /* int regsPerBlock; */ \ + 0, /* int warpSize; */ \ + 0, /* size_t memPitch; */ \ + 0, /* int maxThreadsPerBlock; */ \ + {0, 0, 0}, /* int maxThreadsDim[3]; */ \ + {0, 0, 0}, /* int maxGridSize[3]; */ \ + 0, /* int clockRate; */ \ + 0, /* size_t totalConstMem; */ \ + -1, /* int major; */ \ + -1, /* int minor; */ \ + 0, /* size_t textureAlignment; */ \ + 0, /* size_t texturePitchAlignment */ \ + -1, /* int deviceOverlap; */ \ + 0, /* int multiProcessorCount; */ \ + 0, /* int kernelExecTimeoutEnabled */ \ + 0, /* int integrated */ \ + 0, /* int canMapHostMemory */ \ + 0, /* int computeMode */ \ + 0, /* int maxTexture1D */ \ + 0, /* int maxTexture1DMipmap */ \ + 0, /* int maxTexture1DLinear */ \ + {0, 0}, /* int maxTexture2D[2] */ \ + {0, 0}, /* int maxTexture2DMipmap[2] */ \ + {0, 0, 0}, /* int maxTexture2DLinear[3] */ \ + {0, 0}, /* int maxTexture2DGather[2] */ \ + {0, 0, 0}, /* int maxTexture3D[3] */ \ + {0, 0, 0}, /* int maxTexture3DAlt[3] */ \ + 0, /* int maxTextureCubemap */ \ + {0, 0}, /* int maxTexture1DLayered[2] */ \ + {0, 0, 0}, /* int maxTexture2DLayered[3] */ \ + {0, 0}, /* int maxTextureCubemapLayered[2] */ \ + 0, /* int maxSurface1D */ \ + {0, 0}, /* int maxSurface2D[2] */ \ + {0, 0, 0}, /* int maxSurface3D[3] */ \ + {0, 0}, /* int maxSurface1DLayered[2] */ \ + {0, 0, 0}, /* int maxSurface2DLayered[3] */ \ + 0, /* int maxSurfaceCubemap */ \ + {0, 0}, /* int maxSurfaceCubemapLayered[2] */ \ + 0, /* size_t surfaceAlignment */ \ + 0, /* int concurrentKernels */ \ + 0, /* int ECCEnabled */ \ + 0, /* int pciBusID */ \ + 0, /* int pciDeviceID */ \ + 0, /* int pciDomainID */ \ + 0, /* int tccDriver */ \ + 0, /* int asyncEngineCount */ \ + 0, /* int unifiedAddressing */ \ + 0, /* int memoryClockRate */ \ + 0, /* int memoryBusWidth */ \ + 0, /* int l2CacheSize */ \ + 0, /* int persistingL2CacheMaxSize */ \ + 0, /* int maxThreadsPerMultiProcessor */ \ + 0, /* int streamPrioritiesSupported */ \ + 0, /* int globalL1CacheSupported */ \ + 0, /* int localL1CacheSupported */ \ + 0, /* size_t sharedMemPerMultiprocessor; */ \ + 0, /* int regsPerMultiprocessor; */ \ + 0, /* int managedMemory */ \ + 0, /* int isMultiGpuBoard */ \ + 0, /* int multiGpuBoardGroupID */ \ + 0, /* int 
hostNativeAtomicSupported */ \ + 0, /* int singleToDoublePrecisionPerfRatio */ \ + 0, /* int pageableMemoryAccess */ \ + 0, /* int concurrentManagedAccess */ \ + 0, /* int computePreemptionSupported */ \ + 0, /* int canUseHostPointerForRegisteredMem */ \ + 0, /* int cooperativeLaunch */ \ + 0, /* int cooperativeMultiDeviceLaunch */ \ + 0, /* size_t sharedMemPerBlockOptin */ \ + 0, /* int pageableMemoryAccessUsesHostPageTables */ \ + 0, /* int directManagedMemAccessFromHost */ \ + 0, /* int accessPolicyMaxWindowSize */ \ + 0, /* size_t reservedSharedMemPerBlock */ \ + } /**< Empty device properties */ + +/** + * CUDA IPC Handle Size + */ +#define CUDA_IPC_HANDLE_SIZE 64 + +/** + * CUDA IPC event handle + */ +typedef __device_builtin__ struct __device_builtin__ cudaIpcEventHandle_st +{ + char reserved[CUDA_IPC_HANDLE_SIZE]; +}cudaIpcEventHandle_t; + +/** + * CUDA IPC memory handle + */ +typedef __device_builtin__ struct __device_builtin__ cudaIpcMemHandle_st +{ + char reserved[CUDA_IPC_HANDLE_SIZE]; +}cudaIpcMemHandle_t; + +/** + * External memory handle types + */ +enum __device_builtin__ cudaExternalMemoryHandleType { + /** + * Handle is an opaque file descriptor + */ + cudaExternalMemoryHandleTypeOpaqueFd = 1, + /** + * Handle is an opaque shared NT handle + */ + cudaExternalMemoryHandleTypeOpaqueWin32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + cudaExternalMemoryHandleTypeOpaqueWin32Kmt = 3, + /** + * Handle is a D3D12 heap object + */ + cudaExternalMemoryHandleTypeD3D12Heap = 4, + /** + * Handle is a D3D12 committed resource + */ + cudaExternalMemoryHandleTypeD3D12Resource = 5, + /** + * Handle is a shared NT handle to a D3D11 resource + */ + cudaExternalMemoryHandleTypeD3D11Resource = 6, + /** + * Handle is a globally shared handle to a D3D11 resource + */ + cudaExternalMemoryHandleTypeD3D11ResourceKmt = 7, + /** + * Handle is an NvSciBuf object + */ + cudaExternalMemoryHandleTypeNvSciBuf = 8 +}; + +/** + * Indicates that the external memory object is a dedicated resource + */ +#define cudaExternalMemoryDedicated 0x1 + +/** When the /p flags parameter of ::cudaExternalSemaphoreSignalParams + * contains this flag, it indicates that signaling an external semaphore object + * should skip performing appropriate memory synchronization operations over all + * the external memory objects that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf, + * which otherwise are performed by default to ensure data coherency with other + * importers of the same NvSciBuf memory objects. + */ +#define cudaExternalSemaphoreSignalSkipNvSciBufMemSync 0x01 + +/** When the /p flags parameter of ::cudaExternalSemaphoreWaitParams + * contains this flag, it indicates that waiting an external semaphore object + * should skip performing appropriate memory synchronization operations over all + * the external memory objects that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf, + * which otherwise are performed by default to ensure data coherency with other + * importers of the same NvSciBuf memory objects. + */ +#define cudaExternalSemaphoreWaitSkipNvSciBufMemSync 0x02 + +/** + * When /p flags of ::cudaDeviceGetNvSciSyncAttributes is set to this, + * it indicates that application need signaler specific NvSciSyncAttr + * to be filled by ::cudaDeviceGetNvSciSyncAttributes. 
+ */ +#define cudaNvSciSyncAttrSignal 0x1 + +/** + * When /p flags of ::cudaDeviceGetNvSciSyncAttributes is set to this, + * it indicates that application need waiter specific NvSciSyncAttr + * to be filled by ::cudaDeviceGetNvSciSyncAttributes. + */ +#define cudaNvSciSyncAttrWait 0x2 + +/** + * External memory handle descriptor + */ +struct __device_builtin__ cudaExternalMemoryHandleDesc { + /** + * Type of the handle + */ + enum cudaExternalMemoryHandleType type; + union { + /** + * File descriptor referencing the memory object. Valid + * when type is + * ::cudaExternalMemoryHandleTypeOpaqueFd + */ + int fd; + /** + * Win32 handle referencing the semaphore object. Valid when + * type is one of the following: + * - ::cudaExternalMemoryHandleTypeOpaqueWin32 + * - ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt + * - ::cudaExternalMemoryHandleTypeD3D12Heap + * - ::cudaExternalMemoryHandleTypeD3D12Resource + * - ::cudaExternalMemoryHandleTypeD3D11Resource + * - ::cudaExternalMemoryHandleTypeD3D11ResourceKmt + * Exactly one of 'handle' and 'name' must be non-NULL. If + * type is one of the following: + * ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt + * ::cudaExternalMemoryHandleTypeD3D11ResourceKmt + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid memory object. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + /** + * A handle representing NvSciBuf Object. Valid when type + * is ::cudaExternalMemoryHandleTypeNvSciBuf + */ + const void *nvSciBufObject; + } handle; + /** + * Size of the memory allocation + */ + unsigned long long size; + /** + * Flags must either be zero or ::cudaExternalMemoryDedicated + */ + unsigned int flags; +}; + +/** + * External memory buffer descriptor + */ +struct __device_builtin__ cudaExternalMemoryBufferDesc { + /** + * Offset into the memory object where the buffer's base is + */ + unsigned long long offset; + /** + * Size of the buffer + */ + unsigned long long size; + /** + * Flags reserved for future use. Must be zero. + */ + unsigned int flags; +}; + +/** + * External memory mipmap descriptor + */ +struct __device_builtin__ cudaExternalMemoryMipmappedArrayDesc { + /** + * Offset into the memory object where the base level of the + * mipmap chain is. + */ + unsigned long long offset; + /** + * Format of base level of the mipmap chain + */ + struct cudaChannelFormatDesc formatDesc; + /** + * Dimensions of base level of the mipmap chain + */ + struct cudaExtent extent; + /** + * Flags associated with CUDA mipmapped arrays. 
+ * See ::cudaMallocMipmappedArray + */ + unsigned int flags; + /** + * Total number of levels in the mipmap chain + */ + unsigned int numLevels; +}; + +/** + * External semaphore handle types + */ +enum __device_builtin__ cudaExternalSemaphoreHandleType { + /** + * Handle is an opaque file descriptor + */ + cudaExternalSemaphoreHandleTypeOpaqueFd = 1, + /** + * Handle is an opaque shared NT handle + */ + cudaExternalSemaphoreHandleTypeOpaqueWin32 = 2, + /** + * Handle is an opaque, globally shared handle + */ + cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt = 3, + /** + * Handle is a shared NT handle referencing a D3D12 fence object + */ + cudaExternalSemaphoreHandleTypeD3D12Fence = 4, + /** + * Handle is a shared NT handle referencing a D3D11 fence object + */ + cudaExternalSemaphoreHandleTypeD3D11Fence = 5, + /** + * Opaque handle to NvSciSync Object + */ + cudaExternalSemaphoreHandleTypeNvSciSync = 6, + /** + * Handle is a shared NT handle referencing a D3D11 keyed mutex object + */ + cudaExternalSemaphoreHandleTypeKeyedMutex = 7, + /** + * Handle is a shared KMT handle referencing a D3D11 keyed mutex object + */ + cudaExternalSemaphoreHandleTypeKeyedMutexKmt = 8, + /** + * Handle is an opaque handle file descriptor referencing a timeline semaphore + */ + cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd = 9, + /** + * Handle is an opaque handle file descriptor referencing a timeline semaphore + */ + cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32 = 10 +}; + +/** + * External semaphore handle descriptor + */ +struct __device_builtin__ cudaExternalSemaphoreHandleDesc { + /** + * Type of the handle + */ + enum cudaExternalSemaphoreHandleType type; + union { + /** + * File descriptor referencing the semaphore object. Valid when + * type is one of the following: + * - ::cudaExternalSemaphoreHandleTypeOpaqueFd + * - ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd + */ + int fd; + /** + * Win32 handle referencing the semaphore object. Valid when + * type is one of the following: + * - ::cudaExternalSemaphoreHandleTypeOpaqueWin32 + * - ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt + * - ::cudaExternalSemaphoreHandleTypeD3D12Fence + * - ::cudaExternalSemaphoreHandleTypeD3D11Fence + * - ::cudaExternalSemaphoreHandleTypeKeyedMutex + * - ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32 + * Exactly one of 'handle' and 'name' must be non-NULL. If + * type is one of the following: + * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt + * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt + * then 'name' must be NULL. + */ + struct { + /** + * Valid NT handle. Must be NULL if 'name' is non-NULL + */ + void *handle; + /** + * Name of a valid synchronization primitive. + * Must be NULL if 'handle' is non-NULL. + */ + const void *name; + } win32; + /** + * Valid NvSciSyncObj. Must be non NULL + */ + const void* nvSciSyncObj; + } handle; + /** + * Flags reserved for the future. Must be zero. + */ + unsigned int flags; +}; + +/** + * External semaphore signal parameters(deprecated) + */ +struct __device_builtin__ cudaExternalSemaphoreSignalParams_v1 { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be signaled + */ + unsigned long long value; + } fence; + union { + /** + * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType + * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync. 
+ */ + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /* + * Value of key to release the mutex with + */ + unsigned long long key; + } keyedMutex; + } params; + /** + * Only when ::cudaExternalSemaphoreSignalParams is used to + * signal a ::cudaExternalSemaphore_t of type + * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is + * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates + * that while signaling the ::cudaExternalSemaphore_t, no memory + * synchronization operations should be performed for any external memory + * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf. + * For all other types of ::cudaExternalSemaphore_t, flags must be zero. + */ + unsigned int flags; +}; + +/** +* External semaphore wait parameters(deprecated) +*/ +struct __device_builtin__ cudaExternalSemaphoreWaitParams_v1 { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be waited on + */ + unsigned long long value; + } fence; + union { + /** + * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType + * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync. + */ + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /** + * Value of key to acquire the mutex with + */ + unsigned long long key; + /** + * Timeout in milliseconds to wait to acquire the mutex + */ + unsigned int timeoutMs; + } keyedMutex; + } params; + /** + * Only when ::cudaExternalSemaphoreSignalParams is used to + * signal a ::cudaExternalSemaphore_t of type + * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is + * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates + * that while waiting for the ::cudaExternalSemaphore_t, no memory + * synchronization operations should be performed for any external memory + * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf. + * For all other types of ::cudaExternalSemaphore_t, flags must be zero. + */ + unsigned int flags; +}; + +/** + * External semaphore signal parameters, compatible with driver type + */ +struct __device_builtin__ cudaExternalSemaphoreSignalParams{ + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be signaled + */ + unsigned long long value; + } fence; + union { + /** + * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType + * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync. + */ + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /* + * Value of key to release the mutex with + */ + unsigned long long key; + } keyedMutex; + unsigned int reserved[12]; + } params; + /** + * Only when ::cudaExternalSemaphoreSignalParams is used to + * signal a ::cudaExternalSemaphore_t of type + * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is + * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates + * that while signaling the ::cudaExternalSemaphore_t, no memory + * synchronization operations should be performed for any external memory + * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf. + * For all other types of ::cudaExternalSemaphore_t, flags must be zero. 
+ */ + unsigned int flags; + unsigned int reserved[16]; +}; + +/** + * External semaphore wait parameters, compatible with driver type + */ +struct __device_builtin__ cudaExternalSemaphoreWaitParams { + struct { + /** + * Parameters for fence objects + */ + struct { + /** + * Value of fence to be waited on + */ + unsigned long long value; + } fence; + union { + /** + * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType + * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync. + */ + void *fence; + unsigned long long reserved; + } nvSciSync; + /** + * Parameters for keyed mutex objects + */ + struct { + /** + * Value of key to acquire the mutex with + */ + unsigned long long key; + /** + * Timeout in milliseconds to wait to acquire the mutex + */ + unsigned int timeoutMs; + } keyedMutex; + unsigned int reserved[10]; + } params; + /** + * Only when ::cudaExternalSemaphoreSignalParams is used to + * signal a ::cudaExternalSemaphore_t of type + * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is + * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates + * that while waiting for the ::cudaExternalSemaphore_t, no memory + * synchronization operations should be performed for any external memory + * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf. + * For all other types of ::cudaExternalSemaphore_t, flags must be zero. + */ + unsigned int flags; + unsigned int reserved[16]; +}; + +/******************************************************************************* +* * +* SHORTHAND TYPE DEFINITION USED BY RUNTIME API * +* * +*******************************************************************************/ + +/** + * CUDA Error types + */ +typedef __device_builtin__ enum cudaError cudaError_t; + +/** + * CUDA stream + */ +typedef __device_builtin__ struct CUstream_st *cudaStream_t; + +/** + * CUDA event types + */ +typedef __device_builtin__ struct CUevent_st *cudaEvent_t; + +/** + * CUDA graphics resource types + */ +typedef __device_builtin__ struct cudaGraphicsResource *cudaGraphicsResource_t; + +/** + * CUDA output file modes + */ +typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t; + +/** + * CUDA external memory + */ +typedef __device_builtin__ struct CUexternalMemory_st *cudaExternalMemory_t; + +/** + * CUDA external semaphore + */ +typedef __device_builtin__ struct CUexternalSemaphore_st *cudaExternalSemaphore_t; + +/** + * CUDA graph + */ +typedef __device_builtin__ struct CUgraph_st *cudaGraph_t; + +/** + * CUDA graph node. 
+ */ +typedef __device_builtin__ struct CUgraphNode_st *cudaGraphNode_t; + +/** + * CUDA user object for graphs + */ +typedef __device_builtin__ struct CUuserObject_st *cudaUserObject_t; + +/** + * CUDA function + */ +typedef __device_builtin__ struct CUfunc_st *cudaFunction_t; + +/** + * CUDA memory pool + */ +typedef __device_builtin__ struct CUmemPoolHandle_st *cudaMemPool_t; + +/** + * CUDA cooperative group scope + */ +enum __device_builtin__ cudaCGScope { + cudaCGScopeInvalid = 0, /**< Invalid cooperative group scope */ + cudaCGScopeGrid = 1, /**< Scope represented by a grid_group */ + cudaCGScopeMultiGrid = 2 /**< Scope represented by a multi_grid_group */ +}; + +/** + * CUDA launch parameters + */ +struct __device_builtin__ cudaLaunchParams +{ + void *func; /**< Device function symbol */ + dim3 gridDim; /**< Grid dimentions */ + dim3 blockDim; /**< Block dimentions */ + void **args; /**< Arguments */ + size_t sharedMem; /**< Shared memory */ + cudaStream_t stream; /**< Stream identifier */ +}; + +/** + * CUDA GPU kernel node parameters + */ +struct __device_builtin__ cudaKernelNodeParams { + void* func; /**< Kernel to launch */ + dim3 gridDim; /**< Grid dimensions */ + dim3 blockDim; /**< Block dimensions */ + unsigned int sharedMemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + void **kernelParams; /**< Array of pointers to individual kernel arguments*/ + void **extra; /**< Pointer to kernel arguments in the "extra" format */ +}; + +/** + * External semaphore signal node parameters + */ +struct __device_builtin__ cudaExternalSemaphoreSignalNodeParams { + cudaExternalSemaphore_t* extSemArray; /**< Array of external semaphore handles. */ + const struct cudaExternalSemaphoreSignalParams* paramsArray; /**< Array of external semaphore signal parameters. */ + unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. */ +}; + +/** + * External semaphore wait node parameters + */ +struct __device_builtin__ cudaExternalSemaphoreWaitNodeParams { + cudaExternalSemaphore_t* extSemArray; /**< Array of external semaphore handles. */ + const struct cudaExternalSemaphoreWaitParams* paramsArray; /**< Array of external semaphore wait parameters. */ + unsigned int numExtSems; /**< Number of handles and parameters supplied in extSemArray and paramsArray. 
*/ +}; + +/** +* CUDA Graph node types +*/ +enum __device_builtin__ cudaGraphNodeType { + cudaGraphNodeTypeKernel = 0x00, /**< GPU kernel node */ + cudaGraphNodeTypeMemcpy = 0x01, /**< Memcpy node */ + cudaGraphNodeTypeMemset = 0x02, /**< Memset node */ + cudaGraphNodeTypeHost = 0x03, /**< Host (executable) node */ + cudaGraphNodeTypeGraph = 0x04, /**< Node which executes an embedded graph */ + cudaGraphNodeTypeEmpty = 0x05, /**< Empty (no-op) node */ + cudaGraphNodeTypeWaitEvent = 0x06, /**< External event wait node */ + cudaGraphNodeTypeEventRecord = 0x07, /**< External event record node */ + cudaGraphNodeTypeExtSemaphoreSignal = 0x08, /**< External semaphore signal node */ + cudaGraphNodeTypeExtSemaphoreWait = 0x09, /**< External semaphore wait node */ + cudaGraphNodeTypeMemAlloc = 0x0a, /**< Memory allocation node */ + cudaGraphNodeTypeMemFree = 0x0b, /**< Memory free node */ + cudaGraphNodeTypeCount +}; + +/** + * CUDA executable (launchable) graph + */ +typedef struct CUgraphExec_st* cudaGraphExec_t; + +/** +* CUDA Graph Update error types +*/ +enum __device_builtin__ cudaGraphExecUpdateResult { + cudaGraphExecUpdateSuccess = 0x0, /**< The update succeeded */ + cudaGraphExecUpdateError = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */ + cudaGraphExecUpdateErrorTopologyChanged = 0x2, /**< The update failed because the topology changed */ + cudaGraphExecUpdateErrorNodeTypeChanged = 0x3, /**< The update failed because a node type changed */ + cudaGraphExecUpdateErrorFunctionChanged = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */ + cudaGraphExecUpdateErrorParametersChanged = 0x5, /**< The update failed because the parameters changed in a way that is not supported */ + cudaGraphExecUpdateErrorNotSupported = 0x6, /**< The update failed because something about the node is not supported */ + cudaGraphExecUpdateErrorUnsupportedFunctionChange = 0x7, /**< The update failed because the function of a kernel node changed in an unsupported way */ + cudaGraphExecUpdateErrorAttributesChanged = 0x8 /**< The update failed because the node attributes changed in a way that is not supported */ +}; + +/** + * Flags to specify search options to be used with ::cudaGetDriverEntryPoint + * For more details see ::cuGetProcAddress + */ +enum __device_builtin__ cudaGetDriverEntryPointFlags { + cudaEnableDefault = 0x0, /**< Default search mode for driver symbols. */ + cudaEnableLegacyStream = 0x1, /**< Search for legacy versions of driver symbols. */ + cudaEnablePerThreadDefaultStream = 0x2 /**< Search for per-thread versions of driver symbols. 
*/ +}; + +/** + * CUDA Graph debug write options + */ +enum __device_builtin__ cudaGraphDebugDotFlags { + cudaGraphDebugDotFlagsVerbose = 1<<0, /**< Output all debug data as if every debug flag is enabled */ + cudaGraphDebugDotFlagsKernelNodeParams = 1<<2, /**< Adds cudaKernelNodeParams to output */ + cudaGraphDebugDotFlagsMemcpyNodeParams = 1<<3, /**< Adds cudaMemcpy3DParms to output */ + cudaGraphDebugDotFlagsMemsetNodeParams = 1<<4, /**< Adds cudaMemsetParams to output */ + cudaGraphDebugDotFlagsHostNodeParams = 1<<5, /**< Adds cudaHostNodeParams to output */ + cudaGraphDebugDotFlagsEventNodeParams = 1<<6, /**< Adds cudaEvent_t handle from record and wait nodes to output */ + cudaGraphDebugDotFlagsExtSemasSignalNodeParams = 1<<7, /**< Adds cudaExternalSemaphoreSignalNodeParams values to output */ + cudaGraphDebugDotFlagsExtSemasWaitNodeParams = 1<<8, /**< Adds cudaExternalSemaphoreWaitNodeParams to output */ + cudaGraphDebugDotFlagsKernelNodeAttributes = 1<<9, /**< Adds cudaKernelNodeAttrID values to output */ + cudaGraphDebugDotFlagsHandles = 1<<10 /**< Adds node handles and every kernel function handle to output */ +}; + +/** + * Flags for instantiating a graph + */ +enum __device_builtin__ cudaGraphInstantiateFlags { + cudaGraphInstantiateFlagAutoFreeOnLaunch = 1 /**< Automatically free memory allocated in a graph before relaunching. */ + , cudaGraphInstantiateFlagUseNodePriority = 8 /**< Run the graph using the per-node priority attributes rather than the + priority of the stream it is launched into. */ +}; + +/** + * Launch attributes enum; used as id field of ::cudaLaunchAttribute + */ +typedef __device_builtin__ enum cudaLaunchAttributeID { + cudaLaunchAttributeIgnore = 0 /**< Ignored entry, for convenient composition */ + , cudaLaunchAttributeAccessPolicyWindow = 1 /**< Valid for streams, graph nodes, launches. */ + , cudaLaunchAttributeCooperative = 2 /**< Valid for graph nodes, launches. */ + , cudaLaunchAttributeSynchronizationPolicy = 3 /**< Valid for streams. */ + , cudaLaunchAttributeClusterDimension = 4 /**< Valid for graph nodes, launches. */ + , cudaLaunchAttributeClusterSchedulingPolicyPreference = 5 /**< Valid for graph nodes, launches. */ + , cudaLaunchAttributeProgrammaticStreamSerialization = 6 /**< Valid for launches. Setting + programmaticStreamSerializationAllowed to non-0 + signals that the kernel will use programmatic + means to resolve its stream dependency, so that + the CUDA runtime should opportunistically allow + the grid's execution to overlap with the previous + kernel in the stream, if that kernel requests the + overlap. */ + , cudaLaunchAttributeProgrammaticEvent = 7 /**< Valid for launches. Event recorded through this launch + attribute is guaranteed to only trigger after all + block in the associated kernel trigger the event. A + block can trigger the event through PTX + griddepcontrol.launch_dependents. A trigger can also + be inserted at the beginning of each block's execution + if triggerAtBlockStart is set to non-0. Note that + dependents (including the CPU thread calling + cudaEventSynchronize()) are not guaranteed to observe + the release precisely when it is released. For + example, cudaEventSynchronize() may only observe the + event trigger long after the associated kernel has + completed. This recording type is primarily meant for + establishing programmatic dependency between device + tasks. The event supplied must not be an interprocess + or interop event. The event must disable timing + (i.e. 
created with ::cudaEventDisableTiming flag + set). */ + , cudaLaunchAttributePriority = 8 /**< Valid for graph nodes. */ +} cudaLaunchAttributeID; + +/** + * Launch attributes union; used as value field of ::cudaLaunchAttribute + */ +typedef __device_builtin__ union cudaLaunchAttributeValue { + char pad[64]; /* Pad to 64 bytes */ + struct cudaAccessPolicyWindow accessPolicyWindow; + int cooperative; + enum cudaSynchronizationPolicy syncPolicy; + struct { + unsigned int x; + unsigned int y; + unsigned int z; + } clusterDim; + enum cudaClusterSchedulingPolicy clusterSchedulingPolicyPreference; + int programmaticStreamSerializationAllowed; + struct { + cudaEvent_t event; + int flags; + int triggerAtBlockStart; + } programmaticEvent; + int priority; +} cudaLaunchAttributeValue; + +/** + * Launch attribute + */ +typedef __device_builtin__ struct cudaLaunchAttribute_st { + cudaLaunchAttributeID id; + char pad[8 - sizeof(cudaLaunchAttributeID)]; + cudaLaunchAttributeValue val; +} cudaLaunchAttribute; + +/** + * CUDA extensible launch configuration + */ +typedef __device_builtin__ struct cudaLaunchConfig_st { + dim3 gridDim; /**< Grid dimentions */ + dim3 blockDim; /**< Block dimentions */ + size_t dynamicSmemBytes; /**< Dynamic shared-memory size per thread block in bytes */ + cudaStream_t stream; /**< Stream identifier */ + cudaLaunchAttribute *attrs; /**< nullable if numAttrs == 0 */ + unsigned int numAttrs; /**< Number of attributes populated in attrs */ +} cudaLaunchConfig_t; + +/** + * Stream Attributes + */ +#define cudaStreamAttrID cudaLaunchAttributeID +#define cudaStreamAttributeAccessPolicyWindow cudaLaunchAttributeAccessPolicyWindow +#define cudaStreamAttributeSynchronizationPolicy cudaLaunchAttributeSynchronizationPolicy + +/** + * Stream attributes union used with ::cudaStreamSetAttribute/::cudaStreamGetAttribute + */ +#define cudaStreamAttrValue cudaLaunchAttributeValue + +/** + * Graph kernel node Attributes + */ +#define cudaKernelNodeAttrID cudaLaunchAttributeID +#define cudaKernelNodeAttributeAccessPolicyWindow cudaLaunchAttributeAccessPolicyWindow +#define cudaKernelNodeAttributeCooperative cudaLaunchAttributeCooperative +#define cudaKernelNodeAttributePriority cudaLaunchAttributePriority +#define cudaKernelNodeAttributeClusterDimension cudaLaunchAttributeClusterDimension +#define cudaKernelNodeAttributeClusterSchedulingPolicyPreference cudaLaunchAttributeClusterSchedulingPolicyPreference + +/** + * Graph kernel node attributes union, used with ::cudaGraphKernelNodeSetAttribute/::cudaGraphKernelNodeGetAttribute + */ +#define cudaKernelNodeAttrValue cudaLaunchAttributeValue + +/** @} */ +/** @} */ /* END CUDART_TYPES */ + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__ +#endif + +#undef __CUDA_DEPRECATED + +#endif /* !__DRIVER_TYPES_H__ */ diff --git a/ext/cudart/include/host_config.h b/ext/cudart/include/host_config.h new file mode 100644 index 0000000000000000000000000000000000000000..785bec4e5c0652f9605ccf9341b7f761a85471ab --- /dev/null +++ b/ext/cudart/include/host_config.h @@ -0,0 +1,65 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#if defined(_MSC_VER) +#pragma message("host_config.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.") +#else +#warning "host_config.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead." +#endif +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__ +#endif + +#include "crt/host_config.h" + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__ +#endif diff --git a/ext/cudart/include/host_defines.h b/ext/cudart/include/host_defines.h new file mode 100644 index 0000000000000000000000000000000000000000..98a9c98a957e8f60e872b94fde762516c5523367 --- /dev/null +++ b/ext/cudart/include/host_defines.h @@ -0,0 +1,65 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. 
+ * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#if defined(_MSC_VER) +#pragma message("host_defines.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.") +#else +#warning "host_defines.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead." 
+#endif +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__ +#endif + +#include "crt/host_defines.h" + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__ +#endif diff --git a/ext/cudart/include/library_types.h b/ext/cudart/include/library_types.h new file mode 100644 index 0000000000000000000000000000000000000000..4a7e42c6b89ba4b446d4cf3d52c8bacd74e73b0d --- /dev/null +++ b/ext/cudart/include/library_types.h @@ -0,0 +1,103 @@ +/* + * Copyright 1993-2015 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__LIBRARY_TYPES_H__) +#define __LIBRARY_TYPES_H__ + + + +typedef enum cudaDataType_t +{ + CUDA_R_16F = 2, /* real as a half */ + CUDA_C_16F = 6, /* complex as a pair of half numbers */ + CUDA_R_16BF = 14, /* real as a nv_bfloat16 */ + CUDA_C_16BF = 15, /* complex as a pair of nv_bfloat16 numbers */ + CUDA_R_32F = 0, /* real as a float */ + CUDA_C_32F = 4, /* complex as a pair of float numbers */ + CUDA_R_64F = 1, /* real as a double */ + CUDA_C_64F = 5, /* complex as a pair of double numbers */ + CUDA_R_4I = 16, /* real as a signed 4-bit int */ + CUDA_C_4I = 17, /* complex as a pair of signed 4-bit int numbers */ + CUDA_R_4U = 18, /* real as a unsigned 4-bit int */ + CUDA_C_4U = 19, /* complex as a pair of unsigned 4-bit int numbers */ + CUDA_R_8I = 3, /* real as a signed 8-bit int */ + CUDA_C_8I = 7, /* complex as a pair of signed 8-bit int numbers */ + CUDA_R_8U = 8, /* real as a unsigned 8-bit int */ + CUDA_C_8U = 9, /* complex as a pair of unsigned 8-bit int numbers */ + CUDA_R_16I = 20, /* real as a signed 16-bit int */ + CUDA_C_16I = 21, /* complex as a pair of signed 16-bit int numbers */ + CUDA_R_16U = 22, /* real as a unsigned 16-bit int */ + CUDA_C_16U = 23, /* complex as a pair of unsigned 16-bit int numbers */ + CUDA_R_32I = 10, /* real as a signed 32-bit int */ + CUDA_C_32I = 11, /* complex as a pair of signed 32-bit int numbers */ + CUDA_R_32U = 12, /* real as a unsigned 32-bit int */ + CUDA_C_32U = 13, /* complex as a pair of unsigned 32-bit int numbers */ + CUDA_R_64I = 24, /* real as a signed 64-bit int */ + CUDA_C_64I = 25, /* complex as a pair of signed 64-bit int numbers */ + CUDA_R_64U = 26, /* real as a unsigned 64-bit int */ + CUDA_C_64U = 27, /* complex as a pair of unsigned 64-bit int numbers */ + CUDA_R_8F_E4M3 = 28, /* real as a nv_fp8_e4m3 */ + CUDA_R_8F_E5M2 = 29, /* real as a nv_fp8_e5m2 */ +} cudaDataType; + + +typedef enum libraryPropertyType_t +{ + MAJOR_VERSION, + MINOR_VERSION, + PATCH_LEVEL +} libraryPropertyType; + + +#ifndef __cplusplus +typedef enum cudaDataType_t cudaDataType_t; +typedef enum libraryPropertyType_t libraryPropertyType_t; +#endif + +#endif /* !__LIBRARY_TYPES_H__ */ diff --git a/ext/cudart/include/math_constants.h b/ext/cudart/include/math_constants.h new file mode 100644 index 0000000000000000000000000000000000000000..39937e980f88a614d847154f9e4364bd9ba95cbd --- /dev/null +++ b/ext/cudart/include/math_constants.h @@ -0,0 +1,152 @@ +/* + * Copyright 1993-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__MATH_CONSTANTS_H__) +#define __MATH_CONSTANTS_H__ + +/* single precision constants */ +#define CUDART_INF_F __int_as_float(0x7f800000U) +#define CUDART_NAN_F __int_as_float(0x7fffffffU) +#define CUDART_MIN_DENORM_F __int_as_float(0x00000001U) +#define CUDART_MAX_NORMAL_F __int_as_float(0x7f7fffffU) +#define CUDART_NEG_ZERO_F __int_as_float(0x80000000U) +#define CUDART_ZERO_F 0.0F +#define CUDART_ONE_F 1.0F +#define CUDART_SQRT_HALF_F 0.707106781F +#define CUDART_SQRT_HALF_HI_F 0.707106781F +#define CUDART_SQRT_HALF_LO_F 1.210161749e-08F +#define CUDART_SQRT_TWO_F 1.414213562F +#define CUDART_THIRD_F 0.333333333F +#define CUDART_PIO4_F 0.785398163F +#define CUDART_PIO2_F 1.570796327F +#define CUDART_3PIO4_F 2.356194490F +#define CUDART_2_OVER_PI_F 0.636619772F +#define CUDART_SQRT_2_OVER_PI_F 0.797884561F +#define CUDART_PI_F 3.141592654F +#define CUDART_L2E_F 1.442695041F +#define CUDART_L2T_F 3.321928094F +#define CUDART_LG2_F 0.301029996F +#define CUDART_LGE_F 0.434294482F +#define CUDART_LN2_F 0.693147181F +#define CUDART_LNT_F 2.302585093F +#define CUDART_LNPI_F 1.144729886F +#define CUDART_TWO_TO_M126_F 1.175494351e-38F +#define CUDART_TWO_TO_126_F 8.507059173e37F +#define CUDART_NORM_HUGE_F 3.402823466e38F +#define CUDART_TWO_TO_23_F 8388608.0F +#define CUDART_TWO_TO_24_F 16777216.0F +#define CUDART_TWO_TO_31_F 2147483648.0F +#define CUDART_TWO_TO_32_F 4294967296.0F +#define CUDART_REMQUO_BITS_F 3U +#define CUDART_REMQUO_MASK_F (~((~0U)<<CUDART_REMQUO_BITS_F)) +#define CUDART_TRIG_PLOSS_F 105615.0F + +/* double precision constants */ +#define CUDART_INF __longlong_as_double(0x7ff0000000000000ULL) +#define CUDART_NAN __longlong_as_double(0xfff8000000000000ULL) +#define CUDART_NEG_ZERO __longlong_as_double(0x8000000000000000ULL) +#define CUDART_MIN_DENORM __longlong_as_double(0x0000000000000001ULL) +#define CUDART_ZERO 0.0 +#define CUDART_ONE 1.0 +#define CUDART_SQRT_TWO 1.4142135623730951e+0 +#define CUDART_SQRT_HALF 7.0710678118654757e-1 +#define CUDART_SQRT_HALF_HI 7.0710678118654757e-1 +#define CUDART_SQRT_HALF_LO 
(-4.8336466567264567e-17) +#define CUDART_THIRD 3.3333333333333333e-1 +#define CUDART_TWOTHIRD 6.6666666666666667e-1 +#define CUDART_PIO4 7.8539816339744828e-1 +#define CUDART_PIO4_HI 7.8539816339744828e-1 +#define CUDART_PIO4_LO 3.0616169978683830e-17 +#define CUDART_PIO2 1.5707963267948966e+0 +#define CUDART_PIO2_HI 1.5707963267948966e+0 +#define CUDART_PIO2_LO 6.1232339957367660e-17 +#define CUDART_3PIO4 2.3561944901923448e+0 +#define CUDART_2_OVER_PI 6.3661977236758138e-1 +#define CUDART_PI 3.1415926535897931e+0 +#define CUDART_PI_HI 3.1415926535897931e+0 +#define CUDART_PI_LO 1.2246467991473532e-16 +#define CUDART_SQRT_2PI 2.5066282746310007e+0 +#define CUDART_SQRT_2PI_HI 2.5066282746310007e+0 +#define CUDART_SQRT_2PI_LO (-1.8328579980459167e-16) +#define CUDART_SQRT_PIO2 1.2533141373155003e+0 +#define CUDART_SQRT_PIO2_HI 1.2533141373155003e+0 +#define CUDART_SQRT_PIO2_LO (-9.1642899902295834e-17) +#define CUDART_SQRT_2OPI 7.9788456080286536e-1 +#define CUDART_L2E 1.4426950408889634e+0 +#define CUDART_L2E_HI 1.4426950408889634e+0 +#define CUDART_L2E_LO 2.0355273740931033e-17 +#define CUDART_L2T 3.3219280948873622e+0 +#define CUDART_LG2 3.0102999566398120e-1 +#define CUDART_LG2_HI 3.0102999566398120e-1 +#define CUDART_LG2_LO (-2.8037281277851704e-18) +#define CUDART_LGE 4.3429448190325182e-1 +#define CUDART_LGE_HI 4.3429448190325182e-1 +#define CUDART_LGE_LO 1.09831965021676510e-17 +#define CUDART_LN2 6.9314718055994529e-1 +#define CUDART_LN2_HI 6.9314718055994529e-1 +#define CUDART_LN2_LO 2.3190468138462996e-17 +#define CUDART_LNT 2.3025850929940459e+0 +#define CUDART_LNT_HI 2.3025850929940459e+0 +#define CUDART_LNT_LO (-2.1707562233822494e-16) +#define CUDART_LNPI 1.1447298858494002e+0 +#define CUDART_LN2_X_1024 7.0978271289338397e+2 +#define CUDART_LN2_X_1025 7.1047586007394398e+2 +#define CUDART_LN2_X_1075 7.4513321910194122e+2 +#define CUDART_LG2_X_1024 3.0825471555991675e+2 +#define CUDART_LG2_X_1075 3.2360724533877976e+2 +#define CUDART_TWO_TO_23 8388608.0 +#define CUDART_TWO_TO_52 4503599627370496.0 +#define CUDART_TWO_TO_53 9007199254740992.0 +#define CUDART_TWO_TO_54 18014398509481984.0 +#define CUDART_TWO_TO_M54 5.5511151231257827e-17 +#define CUDART_TWO_TO_M1022 2.22507385850720140e-308 +#define CUDART_TRIG_PLOSS 2147483648.0 +#define CUDART_DBL2INT_CVT 6755399441055744.0 + +#endif /* !__MATH_CONSTANTS_H__ */ diff --git a/ext/cudart/include/math_functions.h b/ext/cudart/include/math_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..bc806976784e494edc905d8b8bd9ad138054bbea --- /dev/null +++ b/ext/cudart/include/math_functions.h @@ -0,0 +1,65 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. 
+ * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#if defined(_MSC_VER) +#pragma message("math_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.") +#else +#warning "math_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead." +#endif +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__ +#endif + +#include "crt/math_functions.h" + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__ +#endif diff --git a/ext/cudart/include/mma.h b/ext/cudart/include/mma.h new file mode 100644 index 0000000000000000000000000000000000000000..9f36f671c0b3a4e95cbb7bddbe41e75ac783b722 --- /dev/null +++ b/ext/cudart/include/mma.h @@ -0,0 +1,60 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__ +#endif + +#include "crt/mma.h" + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__ +#endif diff --git a/ext/cudart/include/nvfunctional b/ext/cudart/include/nvfunctional new file mode 100644 index 0000000000000000000000000000000000000000..4fdeeecf6b63f92c5d684a03bb461cd935c0fd35 --- /dev/null +++ b/ext/cudart/include/nvfunctional @@ -0,0 +1,60 @@ +/* + * Copyright 2014-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. 
+ * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H_WRAPPER__ +#endif + +#include "crt/nvfunctional" + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H_WRAPPER__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_NV_LIBCXX_FUNCTIONAL_H_WRAPPER__ +#endif diff --git a/ext/cudart/include/sm_20_atomic_functions.h b/ext/cudart/include/sm_20_atomic_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..12b74c94deef2bdea5bd14c9247814427308870b --- /dev/null +++ b/ext/cudart/include/sm_20_atomic_functions.h @@ -0,0 +1,100 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. 
IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__SM_20_ATOMIC_FUNCTIONS_H__) +#define __SM_20_ATOMIC_FUNCTIONS_H__ + +#if defined(__CUDACC_RTC__) +#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__ +#else /* __CUDACC_RTC__ */ +#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + + +#ifdef __CUDA_ARCH__ +extern "C" +{ +extern __device__ __device_builtin__ float __fAtomicAdd(float *address, float val); +} +#endif /* __CUDA_ARCH__ */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val) __DEF_IF_HOST + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __DEF_IF_HOST +#undef __SM_20_ATOMIC_FUNCTIONS_DECL__ + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +#include "sm_20_atomic_functions.hpp" +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#endif /* !__SM_20_ATOMIC_FUNCTIONS_H__ */ diff --git a/ext/cudart/include/sm_20_atomic_functions.hpp b/ext/cudart/include/sm_20_atomic_functions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ac4aa9bfc6b8d5d4d240e05a2fd557889f30c47f --- /dev/null +++ b/ext/cudart/include/sm_20_atomic_functions.hpp @@ -0,0 +1,85 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. 
+ * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
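+ *
+ * Usage sketch for the single-precision atomicAdd() overload defined below
+ * (illustrative only, not part of the original header): a hypothetical
+ * accumulation kernel in which many threads may update the same bin.
+ * \code
+ * __global__ void accumulate(const float *w, const int *bin, float *hist, int n)
+ * {
+ *     int i = blockIdx.x * blockDim.x + threadIdx.x;
+ *     if (i < n)
+ *         atomicAdd(&hist[bin[i]], w[i]);   // safe under concurrent updates
+ * }
+ * \endcode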
+ */ + +#if !defined(__SM_20_ATOMIC_FUNCTIONS_HPP__) +#define __SM_20_ATOMIC_FUNCTIONS_HPP__ + +#if defined(__CUDACC_RTC__) +#define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__ +#else /* __CUDACC_RTC__ */ +#define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val) +{ + return __fAtomicAdd(address, val); +} + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __SM_20_ATOMIC_FUNCTIONS_DECL__ + +#endif /* !__SM_20_ATOMIC_FUNCTIONS_HPP__ */ + diff --git a/ext/cudart/include/sm_20_intrinsics.h b/ext/cudart/include/sm_20_intrinsics.h new file mode 100644 index 0000000000000000000000000000000000000000..6965e3ef357042574a411a88fa4f88ae72487618 --- /dev/null +++ b/ext/cudart/include/sm_20_intrinsics.h @@ -0,0 +1,1551 @@ +/* + * Copyright 1993-2021 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. 
Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__SM_20_INTRINSICS_H__) +#define __SM_20_INTRINSICS_H__ + +#if defined(__CUDACC_RTC__) +#define __SM_20_INTRINSICS_DECL__ __device__ +#else /* __CUDACC_RTC__ */ +#define __SM_20_INTRINSICS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +#if defined(_WIN32) +# define __DEPRECATED__(msg) __declspec(deprecated(msg)) +#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) +# define __DEPRECATED__(msg) __attribute__((deprecated)) +#else +# define __DEPRECATED__(msg) __attribute__((deprecated(msg))) +#endif + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +#define __WSB_DEPRECATION_MESSAGE(x) #x"() is not valid on compute_70 and above, and should be replaced with "#x"_sync()."\ + "To continue using "#x"(), specify virtual architecture compute_60 when targeting sm_70 and above, for example, using the pair of compiler options: -arch=compute_60 -code=sm_70." +#else +#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)." +#endif + +extern "C" +{ +extern __device__ __device_builtin__ void __threadfence_system(void); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Divide two floating-point values in round-to-nearest-even mode. + * + * Divides two floating-point values \p x by \p y in round-to-nearest-even mode. + * + * \return Returns \p x / \p y. + * + * \note_accuracy_double + * \note_requires_fermi + */ +extern __device__ __device_builtin__ double __ddiv_rn(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Divide two floating-point values in round-towards-zero mode. + * + * Divides two floating-point values \p x by \p y in round-towards-zero mode. + * + * \return Returns \p x / \p y. + * + * \note_accuracy_double + * \note_requires_fermi + */ +extern __device__ __device_builtin__ double __ddiv_rz(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Divide two floating-point values in round-up mode. + * + * Divides two floating-point values \p x by \p y in round-up (to positive infinity) mode. + * + * \return Returns \p x / \p y. + * + * \note_accuracy_double + * \note_requires_fermi + */ +extern __device__ __device_builtin__ double __ddiv_ru(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Divide two floating-point values in round-down mode. + * + * Divides two floating-point values \p x by \p y in round-down (to negative infinity) mode. + * + * \return Returns \p x / \p y. 
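+ *
+ * Usage sketch (illustrative only, not part of the original header): the
+ * directed-rounding variants bracket the exact quotient from both sides.
+ * Kernel name and launch layout are hypothetical.
+ * \code
+ * __global__ void bracket_quotient(const double *x, const double *y,
+ *                                  double *lo, double *hi, int n)
+ * {
+ *     int i = blockIdx.x * blockDim.x + threadIdx.x;
+ *     if (i < n) {
+ *         lo[i] = __ddiv_rd(x[i], y[i]);   // rounded toward -infinity
+ *         hi[i] = __ddiv_ru(x[i], y[i]);   // rounded toward +infinity
+ *     }
+ * }
+ * \endcode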
+ * + * \note_accuracy_double + * \note_requires_fermi + */ +extern __device__ __device_builtin__ double __ddiv_rd(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Compute + * \latexonly $\frac{1}{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mfrac> + * <m:mn>1</m:mn> + * <m:mi>x</m:mi> + * </m:mfrac> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * in round-to-nearest-even mode. + * + * Compute the reciprocal of \p x in round-to-nearest-even mode. + * + * \return Returns + * \latexonly $\frac{1}{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mfrac> + * <m:mn>1</m:mn> + * <m:mi>x</m:mi> + * </m:mfrac> + * </m:math> + * </d4p_MathML>\endxmlonly. + * + * \note_accuracy_double + * \note_requires_fermi + */ +extern __device__ __device_builtin__ double __drcp_rn(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Compute + * \latexonly $\frac{1}{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mfrac> + * <m:mn>1</m:mn> + * <m:mi>x</m:mi> + * </m:mfrac> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * in round-towards-zero mode. + * + * Compute the reciprocal of \p x in round-towards-zero mode. + * + * \return Returns + * \latexonly $\frac{1}{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mfrac> + * <m:mn>1</m:mn> + * <m:mi>x</m:mi> + * </m:mfrac> + * </m:math> + * </d4p_MathML>\endxmlonly. + * + * \note_accuracy_double + * \note_requires_fermi + */ +extern __device__ __device_builtin__ double __drcp_rz(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Compute + * \latexonly $\frac{1}{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mfrac> + * <m:mn>1</m:mn> + * <m:mi>x</m:mi> + * </m:mfrac> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * in round-up mode. + * + * Compute the reciprocal of \p x in round-up (to positive infinity) mode. + * + * \return Returns + * \latexonly $\frac{1}{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mfrac> + * <m:mn>1</m:mn> + * <m:mi>x</m:mi> + * </m:mfrac> + * </m:math> + * </d4p_MathML>\endxmlonly. + * + * \note_accuracy_double + * \note_requires_fermi + */ +extern __device__ __device_builtin__ double __drcp_ru(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Compute + * \latexonly $\frac{1}{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mfrac> + * <m:mn>1</m:mn> + * <m:mi>x</m:mi> + * </m:mfrac> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * in round-down mode. + * + * Compute the reciprocal of \p x in round-down (to negative infinity) mode. + * + * \return Returns + * \latexonly $\frac{1}{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mfrac> + * <m:mn>1</m:mn> + * <m:mi>x</m:mi> + * </m:mfrac> + * </m:math> + * </d4p_MathML>\endxmlonly. 
+ * + * \note_accuracy_double + * \note_requires_fermi + */ +extern __device__ __device_builtin__ double __drcp_rd(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Compute + * \latexonly $\sqrt{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:msqrt> + * <m:mi>x</m:mi> + * </m:msqrt> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * in round-to-nearest-even mode. + * + * Compute the square root of \p x in round-to-nearest-even mode. + * + * \return Returns + * \latexonly $\sqrt{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:msqrt> + * <m:mi>x</m:mi> + * </m:msqrt> + * </m:math> + * </d4p_MathML>\endxmlonly. + * + * \note_accuracy_double + * \note_requires_fermi + */ +extern __device__ __device_builtin__ double __dsqrt_rn(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Compute + * \latexonly $\sqrt{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:msqrt> + * <m:mi>x</m:mi> + * </m:msqrt> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * in round-towards-zero mode. + * + * Compute the square root of \p x in round-towards-zero mode. + * + * \return Returns + * \latexonly $\sqrt{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:msqrt> + * <m:mi>x</m:mi> + * </m:msqrt> + * </m:math> + * </d4p_MathML>\endxmlonly. + * + * \note_accuracy_double + * \note_requires_fermi + */ +extern __device__ __device_builtin__ double __dsqrt_rz(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Compute + * \latexonly $\sqrt{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:msqrt> + * <m:mi>x</m:mi> + * </m:msqrt> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * in round-up mode. + * + * Compute the square root of \p x in round-up (to positive infinity) mode. + * + * \return Returns + * \latexonly $\sqrt{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:msqrt> + * <m:mi>x</m:mi> + * </m:msqrt> + * </m:math> + * </d4p_MathML>\endxmlonly. + * + * \note_accuracy_double + * \note_requires_fermi + */ +extern __device__ __device_builtin__ double __dsqrt_ru(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Compute + * \latexonly $\sqrt{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:msqrt> + * <m:mi>x</m:mi> + * </m:msqrt> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * in round-down mode. + * + * Compute the square root of \p x in round-down (to negative infinity) mode. + * + * \return Returns + * \latexonly $\sqrt{x}$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:msqrt> + * <m:mi>x</m:mi> + * </m:msqrt> + * </m:math> + * </d4p_MathML>\endxmlonly. 
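+ *
+ * Usage sketch (illustrative only, not part of the original header): chaining
+ * the round-up variants yields a guaranteed upper bound on a Euclidean norm,
+ * barring overflow. The helper name is hypothetical.
+ * \code
+ * __device__ double norm2_upper_bound(double a, double b)
+ * {
+ *     double aa = __dmul_ru(a, a);              // each step rounds toward +infinity,
+ *     double bb = __dmul_ru(b, b);              // so the result can only overestimate
+ *     return __dsqrt_ru(__dadd_ru(aa, bb));     // sqrt(a*a + b*b)
+ * }
+ * \endcode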
+ * + * \note_accuracy_double + * \note_requires_fermi + */ +extern __device__ __device_builtin__ double __dsqrt_rd(double x); +extern __device__ __device_builtin__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__ballot)) unsigned int __ballot(int); +extern __device__ __device_builtin__ int __syncthreads_count(int); +extern __device__ __device_builtin__ int __syncthreads_and(int); +extern __device__ __device_builtin__ int __syncthreads_or(int); +extern __device__ __device_builtin__ long long int clock64(void); + + +/** + * \ingroup CUDA_MATH_INTRINSIC_SINGLE + * \brief Compute fused multiply-add operation in round-to-nearest-even mode, ignore \p -ftz=true compiler flag + * + * Behavior is the same as ::__fmaf_rn(\p x, \p y, \p z), the difference is in + * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect. + */ +extern __device__ __device_builtin__ float __fmaf_ieee_rn(float x, float y, float z); + +/** + * \ingroup CUDA_MATH_INTRINSIC_SINGLE + * \brief Compute fused multiply-add operation in round-down mode, ignore \p -ftz=true compiler flag + * + * Behavior is the same as ::__fmaf_rd(\p x, \p y, \p z), the difference is in + * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect. + */ +extern __device__ __device_builtin__ float __fmaf_ieee_rd(float x, float y, float z); + +/** + * \ingroup CUDA_MATH_INTRINSIC_SINGLE + * \brief Compute fused multiply-add operation in round-up mode, ignore \p -ftz=true compiler flag + * + * Behavior is the same as ::__fmaf_ru(\p x, \p y, \p z), the difference is in + * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect. + */ +extern __device__ __device_builtin__ float __fmaf_ieee_ru(float x, float y, float z); + +/** + * \ingroup CUDA_MATH_INTRINSIC_SINGLE + * \brief Compute fused multiply-add operation in round-towards-zero mode, ignore \p -ftz=true compiler flag + * + * Behavior is the same as ::__fmaf_rz(\p x, \p y, \p z), the difference is in + * handling denormalized inputs and outputs: \p -ftz compiler flag has no effect. + */ +extern __device__ __device_builtin__ float __fmaf_ieee_rz(float x, float y, float z); + + +// SM_13 intrinsics + +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Reinterpret bits in a double as a 64-bit signed integer. + * + * Reinterpret the bits in the double-precision floating-point value \p x + * as a signed 64-bit integer. + * \return Returns reinterpreted value. + */ +extern __device__ __device_builtin__ long long int __double_as_longlong(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Reinterpret bits in a 64-bit signed integer as a double. + * + * Reinterpret the bits in the 64-bit signed integer value \p x as + * a double-precision floating-point value. + * \return Returns reinterpreted value. + */ +extern __device__ __device_builtin__ double __longlong_as_double(long long int x); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Compute + * \latexonly $x \times y + z$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * <m:mo>+</m:mo> + * <m:mi>z</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * as a single operation in round-to-nearest-even mode. 
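+ *
+ * Usage sketch (illustrative only, not part of the original header): because
+ * the product is rounded only once, __fma_rn() can recover the rounding error
+ * of a plain multiplication (the classic two-product transformation), barring
+ * overflow and underflow. The helper name is hypothetical.
+ * \code
+ * __device__ void two_prod(double a, double b, double *p, double *err)
+ * {
+ *     *p   = __dmul_rn(a, b);          // rounded product
+ *     *err = __fma_rn(a, b, -(*p));    // residual a*b - *p, computed exactly
+ * }
+ * \endcode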
+ * + * Computes the value of + * \latexonly $x \times y + z$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * <m:mo>+</m:mo> + * <m:mi>z</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * as a single ternary operation, rounding the + * result once in round-to-nearest-even mode. + * + * \return Returns the rounded value of + * \latexonly $x \times y + z$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * <m:mo>+</m:mo> + * <m:mi>z</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * as a single operation. + * - fmaf( + * \latexonly $\pm \infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , + * \latexonly $\pm 0$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>0</m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , \p z) returns NaN. + * - fmaf( + * \latexonly $\pm 0$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>0</m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , + * \latexonly $\pm \infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , \p z) returns NaN. + * - fmaf(\p x, \p y, + * \latexonly $-\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>-</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * ) returns NaN if + * \latexonly $x \times y$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * is an exact + * \latexonly $+\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>+</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * . 
+ * - fmaf(\p x, \p y, + * \latexonly $+\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>+</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * ) returns NaN if + * \latexonly $x \times y$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * is an exact + * \latexonly $-\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>-</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * . + * + * \note_accuracy_double + */ +extern __device__ __device_builtin__ double __fma_rn(double x, double y, double z); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Compute + * \latexonly $x \times y + z$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * <m:mo>+</m:mo> + * <m:mi>z</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * as a single operation in round-towards-zero mode. + * + * Computes the value of + * \latexonly $x \times y + z$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * <m:mo>+</m:mo> + * <m:mi>z</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * as a single ternary operation, rounding the + * result once in round-towards-zero mode. + * + * \return Returns the rounded value of + * \latexonly $x \times y + z$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * <m:mo>+</m:mo> + * <m:mi>z</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * as a single operation. + * - fmaf( + * \latexonly $\pm \infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , + * \latexonly $\pm 0$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>0</m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , \p z) returns NaN. + * - fmaf( + * \latexonly $\pm 0$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>0</m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , + * \latexonly $\pm \infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , \p z) returns NaN. 
+ * - fmaf(\p x, \p y, + * \latexonly $-\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>-</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * ) returns NaN if + * \latexonly $x \times y$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * is an exact + * \latexonly $+\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>+</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * . + * - fmaf(\p x, \p y, + * \latexonly $+\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>+</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * ) returns NaN if + * \latexonly $x \times y$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * is an exact + * \latexonly $-\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>-</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * . + * + * \note_accuracy_double + */ +extern __device__ __device_builtin__ double __fma_rz(double x, double y, double z); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Compute + * \latexonly $x \times y + z$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * <m:mo>+</m:mo> + * <m:mi>z</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * as a single operation in round-up mode. + * + * Computes the value of + * \latexonly $x \times y + z$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * <m:mo>+</m:mo> + * <m:mi>z</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * as a single ternary operation, rounding the + * result once in round-up (to positive infinity) mode. + * + * \return Returns the rounded value of + * \latexonly $x \times y + z$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * <m:mo>+</m:mo> + * <m:mi>z</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * as a single operation. 
+ * - fmaf( + * \latexonly $\pm \infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , + * \latexonly $\pm 0$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>0</m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , \p z) returns NaN. + * - fmaf( + * \latexonly $\pm 0$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>0</m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , + * \latexonly $\pm \infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , \p z) returns NaN. + * - fmaf(\p x, \p y, + * \latexonly $-\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>-</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * ) returns NaN if + * \latexonly $x \times y$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * is an exact + * \latexonly $+\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>+</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * . + * - fmaf(\p x, \p y, + * \latexonly $+\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>+</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * ) returns NaN if + * \latexonly $x \times y$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * is an exact + * \latexonly $-\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>-</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * . + * + * \note_accuracy_double + */ +extern __device__ __device_builtin__ double __fma_ru(double x, double y, double z); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Compute + * \latexonly $x \times y + z$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * <m:mo>+</m:mo> + * <m:mi>z</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * as a single operation in round-down mode. 
+ * + * Computes the value of + * \latexonly $x \times y + z$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * <m:mo>+</m:mo> + * <m:mi>z</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * as a single ternary operation, rounding the + * result once in round-down (to negative infinity) mode. + * + * \return Returns the rounded value of + * \latexonly $x \times y + z$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * <m:mo>+</m:mo> + * <m:mi>z</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * as a single operation. + * - fmaf( + * \latexonly $\pm \infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , + * \latexonly $\pm 0$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>0</m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , \p z) returns NaN. + * - fmaf( + * \latexonly $\pm 0$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>0</m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , + * \latexonly $\pm \infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>±<!-- ± --></m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * , \p z) returns NaN. + * - fmaf(\p x, \p y, + * \latexonly $-\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>-</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * ) returns NaN if + * \latexonly $x \times y$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * is an exact + * \latexonly $+\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>+</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * . 
+ * - fmaf(\p x, \p y, + * \latexonly $+\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>+</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * ) returns NaN if + * \latexonly $x \times y$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mi>x</m:mi> + * <m:mo>×<!-- &Multiply; --></m:mo> + * <m:mi>y</m:mi> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * is an exact + * \latexonly $-\infty$ \endlatexonly + * \xmlonly + * <d4p_MathML outputclass="xmlonly"> + * <m:math xmlns:m="http://www.w3.org/1998/Math/MathML"> + * <m:mo>-</m:mo> + * <m:mn>∞<!-- &Infinity; --></m:mn> + * </m:math> + * </d4p_MathML> + * \endxmlonly + * . + * + * \note_accuracy_double + */ +extern __device__ __device_builtin__ double __fma_rd(double x, double y, double z); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Add two floating-point values in round-to-nearest-even mode. + * + * Adds two floating-point values \p x and \p y in round-to-nearest-even mode. + * + * \return Returns \p x + \p y. + * + * \note_accuracy_double + * \note_nofma + */ +extern __device__ __device_builtin__ double __dadd_rn(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Add two floating-point values in round-towards-zero mode. + * + * Adds two floating-point values \p x and \p y in round-towards-zero mode. + * + * \return Returns \p x + \p y. + * + * \note_accuracy_double + * \note_nofma + */ +extern __device__ __device_builtin__ double __dadd_rz(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Add two floating-point values in round-up mode. + * + * Adds two floating-point values \p x and \p y in round-up (to positive infinity) mode. + * + * \return Returns \p x + \p y. + * + * \note_accuracy_double + * \note_nofma + */ +extern __device__ __device_builtin__ double __dadd_ru(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Add two floating-point values in round-down mode. + * + * Adds two floating-point values \p x and \p y in round-down (to negative infinity) mode. + * + * \return Returns \p x + \p y. + * + * \note_accuracy_double + * \note_nofma + */ +extern __device__ __device_builtin__ double __dadd_rd(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Subtract two floating-point values in round-to-nearest-even mode. + * + * Subtracts two floating-point values \p x and \p y in round-to-nearest-even mode. + * + * \return Returns \p x - \p y. + * + * \note_accuracy_double + * \note_nofma + */ +extern __device__ __device_builtin__ double __dsub_rn(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Subtract two floating-point values in round-towards-zero mode. + * + * Subtracts two floating-point values \p x and \p y in round-towards-zero mode. + * + * \return Returns \p x - \p y. + * + * \note_accuracy_double + * \note_nofma + */ +extern __device__ __device_builtin__ double __dsub_rz(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Subtract two floating-point values in round-up mode. + * + * Subtracts two floating-point values \p x and \p y in round-up (to positive infinity) mode. + * + * \return Returns \p x - \p y. 
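+ *
+ * Usage sketch (illustrative only, not part of the original header): per the
+ * \note_nofma remark, the __dadd_*/__dsub_*/__dmul_* intrinsics also keep a
+ * multiply and an add or subtract as two separately rounded operations, where
+ * the equivalent plain expression could be contracted into a single fma.
+ * \code
+ * __device__ double diff_of_products(double a, double b, double c, double d)
+ * {
+ *     // Evaluates round(a*b) - round(c*d); never contracted into fma.
+ *     return __dsub_rn(__dmul_rn(a, b), __dmul_rn(c, d));
+ * }
+ * \endcode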
+ * + * \note_accuracy_double + * \note_nofma + */ +extern __device__ __device_builtin__ double __dsub_ru(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Subtract two floating-point values in round-down mode. + * + * Subtracts two floating-point values \p x and \p y in round-down (to negative infinity) mode. + * + * \return Returns \p x - \p y. + * + * \note_accuracy_double + * \note_nofma + */ +extern __device__ __device_builtin__ double __dsub_rd(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Multiply two floating-point values in round-to-nearest-even mode. + * + * Multiplies two floating-point values \p x and \p y in round-to-nearest-even mode. + * + * \return Returns \p x * \p y. + * + * \note_accuracy_double + * \note_nofma + */ +extern __device__ __device_builtin__ double __dmul_rn(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Multiply two floating-point values in round-towards-zero mode. + * + * Multiplies two floating-point values \p x and \p y in round-towards-zero mode. + * + * \return Returns \p x * \p y. + * + * \note_accuracy_double + * \note_nofma + */ +extern __device__ __device_builtin__ double __dmul_rz(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Multiply two floating-point values in round-up mode. + * + * Multiplies two floating-point values \p x and \p y in round-up (to positive infinity) mode. + * + * \return Returns \p x * \p y. + * + * \note_accuracy_double + * \note_nofma + */ +extern __device__ __device_builtin__ double __dmul_ru(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_DOUBLE + * \brief Multiply two floating-point values in round-down mode. + * + * Multiplies two floating-point values \p x and \p y in round-down (to negative infinity) mode. + * + * \return Returns \p x * \p y. + * + * \note_accuracy_double + * \note_nofma + */ +extern __device__ __device_builtin__ double __dmul_rd(double x, double y); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to a float in round-to-nearest-even mode. + * + * Convert the double-precision floating-point value \p x to a single-precision + * floating-point value in round-to-nearest-even mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ float __double2float_rn(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to a float in round-towards-zero mode. + * + * Convert the double-precision floating-point value \p x to a single-precision + * floating-point value in round-towards-zero mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ float __double2float_rz(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to a float in round-up mode. + * + * Convert the double-precision floating-point value \p x to a single-precision + * floating-point value in round-up (to positive infinity) mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ float __double2float_ru(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to a float in round-down mode. + * + * Convert the double-precision floating-point value \p x to a single-precision + * floating-point value in round-down (to negative infinity) mode. + * \return Returns converted value. 
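+ *
+ * Usage sketch (illustrative only, not part of the original header): if the
+ * round-down and round-up conversions agree, \p x is exactly representable as
+ * a float. The helper name is hypothetical; NaN inputs report false here.
+ * \code
+ * __device__ bool fits_float_exactly(double x)
+ * {
+ *     return __double2float_rd(x) == __double2float_ru(x);
+ * }
+ * \endcode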
+ */ +extern __device__ __device_builtin__ float __double2float_rd(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to a signed int in round-to-nearest-even mode. + * + * Convert the double-precision floating-point value \p x to a + * signed integer value in round-to-nearest-even mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ int __double2int_rn(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to a signed int in round-up mode. + * + * Convert the double-precision floating-point value \p x to a + * signed integer value in round-up (to positive infinity) mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ int __double2int_ru(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to a signed int in round-down mode. + * + * Convert the double-precision floating-point value \p x to a + * signed integer value in round-down (to negative infinity) mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ int __double2int_rd(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to an unsigned int in round-to-nearest-even mode. + * + * Convert the double-precision floating-point value \p x to an + * unsigned integer value in round-to-nearest-even mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ unsigned int __double2uint_rn(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to an unsigned int in round-up mode. + * + * Convert the double-precision floating-point value \p x to an + * unsigned integer value in round-up (to positive infinity) mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ unsigned int __double2uint_ru(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to an unsigned int in round-down mode. + * + * Convert the double-precision floating-point value \p x to an + * unsigned integer value in round-down (to negative infinity) mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ unsigned int __double2uint_rd(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to a signed 64-bit int in round-to-nearest-even mode. + * + * Convert the double-precision floating-point value \p x to a + * signed 64-bit integer value in round-to-nearest-even mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ long long int __double2ll_rn(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to a signed 64-bit int in round-up mode. + * + * Convert the double-precision floating-point value \p x to a + * signed 64-bit integer value in round-up (to positive infinity) mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ long long int __double2ll_ru(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to a signed 64-bit int in round-down mode. + * + * Convert the double-precision floating-point value \p x to a + * signed 64-bit integer value in round-down (to negative infinity) mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ long long int __double2ll_rd(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to an unsigned 64-bit int in round-to-nearest-even mode. 
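+ *
+ * Usage sketch for the __double2int_* family above (illustrative only, not
+ * part of the original header): for values within int range, the round-down
+ * and round-up variants behave like floor() and ceil() fused with the cast.
+ * The helper name is hypothetical.
+ * \code
+ * __device__ void floor_ceil_to_int(double x, int *lo, int *hi)
+ * {
+ *     *lo = __double2int_rd(x);   // largest int <= x,  like (int)floor(x)
+ *     *hi = __double2int_ru(x);   // smallest int >= x, like (int)ceil(x)
+ * }
+ * \endcode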
+ * + * Convert the double-precision floating-point value \p x to an + * unsigned 64-bit integer value in round-to-nearest-even mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ unsigned long long int __double2ull_rn(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to an unsigned 64-bit int in round-up mode. + * + * Convert the double-precision floating-point value \p x to an + * unsigned 64-bit integer value in round-up (to positive infinity) mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ unsigned long long int __double2ull_ru(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a double to an unsigned 64-bit int in round-down mode. + * + * Convert the double-precision floating-point value \p x to an + * unsigned 64-bit integer value in round-down (to negative infinity) mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ unsigned long long int __double2ull_rd(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a signed int to a double. + * + * Convert the signed integer value \p x to a double-precision floating-point value. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ double __int2double_rn(int x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert an unsigned int to a double. + * + * Convert the unsigned integer value \p x to a double-precision floating-point value. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ double __uint2double_rn(unsigned int x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a signed 64-bit int to a double in round-to-nearest-even mode. + * + * Convert the signed 64-bit integer value \p x to a double-precision floating-point + * value in round-to-nearest-even mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ double __ll2double_rn(long long int x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a signed 64-bit int to a double in round-towards-zero mode. + * + * Convert the signed 64-bit integer value \p x to a double-precision floating-point + * value in round-towards-zero mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ double __ll2double_rz(long long int x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a signed 64-bit int to a double in round-up mode. + * + * Convert the signed 64-bit integer value \p x to a double-precision floating-point + * value in round-up (to positive infinity) mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ double __ll2double_ru(long long int x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert a signed 64-bit int to a double in round-down mode. + * + * Convert the signed 64-bit integer value \p x to a double-precision floating-point + * value in round-down (to negative infinity) mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ double __ll2double_rd(long long int x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert an unsigned 64-bit int to a double in round-to-nearest-even mode. + * + * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point + * value in round-to-nearest-even mode. + * \return Returns converted value. 
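+ *
+ * Usage sketch (illustrative only, not part of the original header): 64-bit
+ * integers above 2^53 are not always exactly representable in double, so the
+ * rounding mode of the conversion matters; the values in the comments were
+ * worked out by hand for this example.
+ * \code
+ * __device__ void convert_large_u64(double *down, double *up)
+ * {
+ *     unsigned long long u = (1ULL << 53) + 1ULL;  // 9007199254740993, not representable
+ *     *down = __ull2double_rd(u);                  // 9007199254740992.0
+ *     *up   = __ull2double_ru(u);                  // 9007199254740994.0
+ * }
+ * \endcode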
+ */ +extern __device__ __device_builtin__ double __ull2double_rn(unsigned long long int x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert an unsigned 64-bit int to a double in round-towards-zero mode. + * + * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point + * value in round-towards-zero mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ double __ull2double_rz(unsigned long long int x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert an unsigned 64-bit int to a double in round-up mode. + * + * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point + * value in round-up (to positive infinity) mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ double __ull2double_ru(unsigned long long int x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Convert an unsigned 64-bit int to a double in round-down mode. + * + * Convert the unsigned 64-bit integer value \p x to a double-precision floating-point + * value in round-down (to negative infinity) mode. + * \return Returns converted value. + */ +extern __device__ __device_builtin__ double __ull2double_rd(unsigned long long int x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Reinterpret high 32 bits in a double as a signed integer. + * + * Reinterpret the high 32 bits in the double-precision floating-point value \p x + * as a signed integer. + * \return Returns reinterpreted value. + */ +extern __device__ __device_builtin__ int __double2hiint(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Reinterpret low 32 bits in a double as a signed integer. + * + * Reinterpret the low 32 bits in the double-precision floating-point value \p x + * as a signed integer. + * \return Returns reinterpreted value. + */ +extern __device__ __device_builtin__ int __double2loint(double x); +/** + * \ingroup CUDA_MATH_INTRINSIC_CAST + * \brief Reinterpret high and low 32-bit integer values as a double. + * + * Reinterpret the integer value of \p hi as the high 32 bits of a + * double-precision floating-point value and the integer value of \p lo + * as the low 32 bits of the same double-precision floating-point value. + * \return Returns reinterpreted value. 
+ */ +extern __device__ __device_builtin__ double __hiloint2double(int hi, int lo); + + +} + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ +__SM_20_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__ballot)) unsigned int ballot(bool pred) __DEF_IF_HOST + +__SM_20_INTRINSICS_DECL__ int syncthreads_count(bool pred) __DEF_IF_HOST + +__SM_20_INTRINSICS_DECL__ bool syncthreads_and(bool pred) __DEF_IF_HOST + +__SM_20_INTRINSICS_DECL__ bool syncthreads_or(bool pred) __DEF_IF_HOST + +#undef __DEPRECATED__ +#undef __WSB_DEPRECATION_MESSAGE + +__SM_20_INTRINSICS_DECL__ unsigned int __isGlobal(const void *ptr) __DEF_IF_HOST +__SM_20_INTRINSICS_DECL__ unsigned int __isShared(const void *ptr) __DEF_IF_HOST +__SM_20_INTRINSICS_DECL__ unsigned int __isConstant(const void *ptr) __DEF_IF_HOST +__SM_20_INTRINSICS_DECL__ unsigned int __isLocal(const void *ptr) __DEF_IF_HOST +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) +__SM_20_INTRINSICS_DECL__ unsigned int __isGridConstant(const void *ptr) __DEF_IF_HOST +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */ +__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_global(const void *ptr) __DEF_IF_HOST +__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_shared(const void *ptr) __DEF_IF_HOST +__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_constant(const void *ptr) __DEF_IF_HOST +__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_local(const void *ptr) __DEF_IF_HOST +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) +__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_grid_constant(const void *ptr) __DEF_IF_HOST +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */ + +__SM_20_INTRINSICS_DECL__ void * __cvta_global_to_generic(size_t rawbits) __DEF_IF_HOST +__SM_20_INTRINSICS_DECL__ void * __cvta_shared_to_generic(size_t rawbits) __DEF_IF_HOST +__SM_20_INTRINSICS_DECL__ void * __cvta_constant_to_generic(size_t rawbits) __DEF_IF_HOST +__SM_20_INTRINSICS_DECL__ void * __cvta_local_to_generic(size_t rawbits) __DEF_IF_HOST +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) +__SM_20_INTRINSICS_DECL__ void * __cvta_grid_constant_to_generic(size_t rawbits) __DEF_IF_HOST +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */ +#endif /* __cplusplus && __CUDACC__ */ + +#undef __DEF_IF_HOST +#undef __SM_20_INTRINSICS_DECL__ + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +#include "sm_20_intrinsics.hpp" +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ +#endif /* !__SM_20_INTRINSICS_H__ */ diff --git a/ext/cudart/include/sm_20_intrinsics.hpp b/ext/cudart/include/sm_20_intrinsics.hpp new file mode 100644 index 0000000000000000000000000000000000000000..30c1ab99e0d66ebbceb8fe88b1122443cbf5f998 --- /dev/null +++ b/ext/cudart/include/sm_20_intrinsics.hpp @@ -0,0 +1,221 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee.
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__SM_20_INTRINSICS_HPP__) +#define __SM_20_INTRINSICS_HPP__ + +#if defined(__CUDACC_RTC__) +#define __SM_20_INTRINSICS_DECL__ __device__ +#else /* __CUDACC_RTC__ */ +#define __SM_20_INTRINSICS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__SM_20_INTRINSICS_DECL__ unsigned int ballot(bool pred) +{ + return __ballot((int)pred); +} + +__SM_20_INTRINSICS_DECL__ int syncthreads_count(bool pred) +{ + return __syncthreads_count((int)pred); +} + +__SM_20_INTRINSICS_DECL__ bool syncthreads_and(bool pred) +{ + return (bool)__syncthreads_and((int)pred); +} + +__SM_20_INTRINSICS_DECL__ bool syncthreads_or(bool pred) +{ + return (bool)__syncthreads_or((int)pred); +} + + +extern "C" { + __device__ unsigned __nv_isGlobal_impl(const void *); + __device__ unsigned __nv_isShared_impl(const void *); + __device__ unsigned __nv_isConstant_impl(const void *); + __device__ unsigned __nv_isLocal_impl(const void *); + __device__ unsigned __nv_isGridConstant_impl(const void *); +} + +__SM_20_INTRINSICS_DECL__ unsigned int __isGlobal(const void *ptr) +{ + return __nv_isGlobal_impl(ptr); +} + +__SM_20_INTRINSICS_DECL__ unsigned int __isShared(const void *ptr) +{ + return __nv_isShared_impl(ptr); +} + +__SM_20_INTRINSICS_DECL__ unsigned int __isConstant(const void *ptr) +{ + return __nv_isConstant_impl(ptr); +} + +__SM_20_INTRINSICS_DECL__ unsigned int __isLocal(const void *ptr) +{ + return __nv_isLocal_impl(ptr); +} + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) +__SM_20_INTRINSICS_DECL__ unsigned int __isGridConstant(const void *ptr) +{ + return __nv_isGridConstant_impl(ptr); +} +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */ + +extern "C" { + __device__ size_t __nv_cvta_generic_to_global_impl(const void *); + __device__ size_t __nv_cvta_generic_to_shared_impl(const void *); + __device__ size_t __nv_cvta_generic_to_constant_impl(const void *); + __device__ size_t __nv_cvta_generic_to_local_impl(const void *); + __device__ void * __nv_cvta_global_to_generic_impl(size_t); + __device__ void * __nv_cvta_shared_to_generic_impl(size_t); + __device__ void * __nv_cvta_constant_to_generic_impl(size_t); + __device__ void * __nv_cvta_local_to_generic_impl(size_t); +} + +__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_global(const void *p) +{ + return __nv_cvta_generic_to_global_impl(p); +} + +__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_shared(const void *p) +{ + return __nv_cvta_generic_to_shared_impl(p); +} + +__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_constant(const void *p) +{ + return __nv_cvta_generic_to_constant_impl(p); +} + +__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_local(const void *p) +{ + return __nv_cvta_generic_to_local_impl(p); +} + +__SM_20_INTRINSICS_DECL__ void * __cvta_global_to_generic(size_t rawbits) +{ + return __nv_cvta_global_to_generic_impl(rawbits); +} + +__SM_20_INTRINSICS_DECL__ void * __cvta_shared_to_generic(size_t rawbits) +{ + return __nv_cvta_shared_to_generic_impl(rawbits); +} + +__SM_20_INTRINSICS_DECL__ void * __cvta_constant_to_generic(size_t rawbits) +{ + 
return __nv_cvta_constant_to_generic_impl(rawbits); +} + +__SM_20_INTRINSICS_DECL__ void * __cvta_local_to_generic(size_t rawbits) +{ + return __nv_cvta_local_to_generic_impl(rawbits); +} + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __CVTA_PTR_64 1 +#endif + +__SM_20_INTRINSICS_DECL__ size_t __cvta_generic_to_grid_constant(const void *ptr) +{ +#if __CVTA_PTR_64 + unsigned long long ret; + asm("cvta.to.param.u64 %0, %1;" : "=l"(ret) : "l"(ptr)); +#else /* !__CVTA_PTR_64 */ + unsigned ret; + asm("cvta.to.param.u32 %0, %1;" : "=r"(ret) : "r"(ptr)); +#endif /* __CVTA_PTR_64 */ + return (size_t)ret; + +} + +__SM_20_INTRINSICS_DECL__ void * __cvta_grid_constant_to_generic(size_t rawbits) +{ + void *ret; +#if __CVTA_PTR_64 + unsigned long long in = rawbits; + asm("cvta.param.u64 %0, %1;" : "=l"(ret) : "l"(in)); +#else /* !__CVTA_PTR_64 */ + unsigned in = rawbits; + asm("cvta.param.u32 %0, %1;" : "=r"(ret) : "r"(in)); +#endif /* __CVTA_PTR_64 */ + return ret; +} +#undef __CVTA_PTR_64 +#endif /* !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 700) */ + + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __SM_20_INTRINSICS_DECL__ + +#endif /* !__SM_20_INTRINSICS_HPP__ */ + diff --git a/ext/cudart/include/sm_30_intrinsics.h b/ext/cudart/include/sm_30_intrinsics.h new file mode 100644 index 0000000000000000000000000000000000000000..393ddfcb38e0bc21631affe3dca370b01761a464 --- /dev/null +++ b/ext/cudart/include/sm_30_intrinsics.h @@ -0,0 +1,215 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 
2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__SM_30_INTRINSICS_H__) +#define __SM_30_INTRINSICS_H__ + +#if defined(__CUDACC_RTC__) +#define __SM_30_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_30_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + + +/******************************************************************************* +* * +* Below are declarations of SM-3.0 intrinsics which are included as * +* source (instead of being built in to the compiler) * +* * +*******************************************************************************/ + +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +#if defined(_WIN32) +# define __DEPRECATED__(msg) __declspec(deprecated(msg)) +#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__)))) +# define __DEPRECATED__(msg) __attribute__((deprecated)) +#else +# define __DEPRECATED__(msg) __attribute__((deprecated(msg))) +#endif + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 +#define __WSB_DEPRECATION_MESSAGE(x) #x"() is deprecated in favor of "#x"_sync() and may be removed in a future release (Use -Wno-deprecated-declarations to suppress this warning)." +#endif + +__SM_30_INTRINSICS_DECL__ unsigned __fns(unsigned mask, unsigned base, int offset) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ void __barrier_sync(unsigned id) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ void __barrier_sync_count(unsigned id, unsigned cnt) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ void __syncwarp(unsigned mask=0xFFFFFFFF) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ int __all_sync(unsigned mask, int pred) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ int __any_sync(unsigned mask, int pred) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ int __uni_sync(unsigned mask, int pred) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned __ballot_sync(unsigned mask, int pred) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned __activemask() __DEF_IF_HOST + +// Warp register exchange (shuffle) intrinsics. +// Notes: +// a) Warp size is hardcoded to 32 here, because the compiler does not know +// the "warpSize" constant at this time +// b) we cannot map the float __shfl to the int __shfl because it'll mess with +// the register number (especially if you're doing two shfls to move a double). 
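Editor's note (not part of the NVIDIA header): the shuffle declarations below are easier to read alongside a small usage sketch. The following CUDA kernel is a hedged illustration of the common butterfly warp reduction built on __shfl_xor_sync with a full-warp mask; the names warp_reduce_sum and sum_kernel are hypothetical, and it assumes blockDim.x is a multiple of 32 and that *out has been zero-initialized.

// Editorial usage sketch only -- not part of sm_30_intrinsics.h.
#include <cuda_runtime.h>

__device__ __forceinline__ int warp_reduce_sum(int val)
{
    // Butterfly exchange: XOR-ing the lane index with offsets 16, 8, 4, 2, 1
    // leaves every lane of a full warp holding the warp-wide sum.
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_xor_sync(0xffffffffu, val, offset);
    return val;
}

__global__ void sum_kernel(const int *in, int *out)
{
    int v = warp_reduce_sum(in[blockIdx.x * blockDim.x + threadIdx.x]);
    if ((threadIdx.x & 31) == 0)      // lane 0 of each warp publishes its total
        atomicAdd(out, v);
}

The same pattern extends to the float, double and 64-bit overloads declared further down in this header.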
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) int __shfl(int var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned int __shfl(unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) int __shfl_up(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned int __shfl_up(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) int __shfl_down(int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned int __shfl_down(unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) int __shfl_xor(int var, int laneMask, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned int __shfl_xor(unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) float __shfl(float var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) float __shfl_up(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) float __shfl_down(float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) float __shfl_xor(float var, int laneMask, int width=warpSize) __DEF_IF_HOST +#endif + +__SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width=warpSize) __DEF_IF_HOST + +// 64-bits SHFL 
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long long __shfl(unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long long __shfl(long long var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long long __shfl_up(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long long __shfl_down(long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long long __shfl_xor(long long var, int laneMask, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) double __shfl(double var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) double __shfl_up(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) double __shfl_down(double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) double __shfl_xor(double var, int laneMask, int width=warpSize) __DEF_IF_HOST +#endif + +__SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, 
double var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width=warpSize) __DEF_IF_HOST + +// long needs some help to choose between 32-bits and 64-bits +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) long __shfl(long var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl)) unsigned long __shfl(unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) long __shfl_up(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_up)) unsigned long __shfl_up(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) long __shfl_down(long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_down)) unsigned long __shfl_down(unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) long __shfl_xor(long var, int laneMask, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ __DEPRECATED__(__WSB_DEPRECATION_MESSAGE(__shfl_xor)) unsigned long __shfl_xor(unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST +#endif + +__SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width=warpSize) __DEF_IF_HOST +__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width=warpSize) __DEF_IF_HOST + +#undef __DEPRECATED__ +#undef __WSB_DEPRECATION_MESSAGE + +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif + +#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */ + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __DEF_IF_HOST +#undef __SM_30_INTRINSICS_DECL__ + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +#include "sm_30_intrinsics.hpp" +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#endif /* !__SM_30_INTRINSICS_H__ */ diff --git a/ext/cudart/include/sm_30_intrinsics.hpp b/ext/cudart/include/sm_30_intrinsics.hpp new file mode 100644 index 0000000000000000000000000000000000000000..c5d484255e85ce1e7faa660347d29b8c17d43639 --- /dev/null +++ b/ext/cudart/include/sm_30_intrinsics.hpp @@ -0,0 +1,604 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. 
+ * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__SM_30_INTRINSICS_HPP__) +#define __SM_30_INTRINSICS_HPP__ + +#if defined(__CUDACC_RTC__) +#define __SM_30_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_30_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +// In here are intrinsics which are built in to the compiler. These may be +// referenced by intrinsic implementations from this file. 
+extern "C" +{ +} + +/******************************************************************************* +* * +* Below are implementations of SM-3.0 intrinsics which are included as * +* source (instead of being built in to the compiler) * +* * +*******************************************************************************/ + +#if !defined warpSize && !defined __local_warpSize +#define warpSize 32 +#define __local_warpSize +#endif + +__SM_30_INTRINSICS_DECL__ +unsigned __fns(unsigned mask, unsigned base, int offset) { + extern __device__ __device_builtin__ unsigned int __nvvm_fns(unsigned int mask, unsigned int base, int offset); + return __nvvm_fns(mask, base, offset); +} + +__SM_30_INTRINSICS_DECL__ +void __barrier_sync(unsigned id) { + extern __device__ __device_builtin__ void __nvvm_barrier_sync(unsigned id); + return __nvvm_barrier_sync(id); +} + +__SM_30_INTRINSICS_DECL__ +void __barrier_sync_count(unsigned id, unsigned cnt) { + extern __device__ __device_builtin__ void __nvvm_barrier_sync_cnt(unsigned id, unsigned cnt); + return __nvvm_barrier_sync_cnt(id, cnt); +} + +__SM_30_INTRINSICS_DECL__ +void __syncwarp(unsigned mask) { + extern __device__ __device_builtin__ void __nvvm_bar_warp_sync(unsigned mask); + return __nvvm_bar_warp_sync(mask); +} + +__SM_30_INTRINSICS_DECL__ +int __all_sync(unsigned mask, int pred) { + extern __device__ __device_builtin__ int __nvvm_vote_all_sync(unsigned int mask, int pred); + return __nvvm_vote_all_sync(mask, pred); +} + +__SM_30_INTRINSICS_DECL__ +int __any_sync(unsigned mask, int pred) { + extern __device__ __device_builtin__ int __nvvm_vote_any_sync(unsigned int mask, int pred); + return __nvvm_vote_any_sync(mask, pred); +} + +__SM_30_INTRINSICS_DECL__ +int __uni_sync(unsigned mask, int pred) { + extern __device__ __device_builtin__ int __nvvm_vote_uni_sync(unsigned int mask, int pred); + return __nvvm_vote_uni_sync(mask, pred); +} + +__SM_30_INTRINSICS_DECL__ +unsigned __ballot_sync(unsigned mask, int pred) { + extern __device__ __device_builtin__ unsigned int __nvvm_vote_ballot_sync(unsigned int mask, int pred); + return __nvvm_vote_ballot_sync(mask, pred); +} + +__SM_30_INTRINSICS_DECL__ +unsigned __activemask() { + unsigned ret; + asm volatile ("activemask.b32 %0;" : "=r"(ret)); + return ret; +} + +// These are removed starting with compute_70 and onwards +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 + +__SM_30_INTRINSICS_DECL__ int __shfl(int var, int srcLane, int width) { + int ret; + int c = ((warpSize-width) << 8) | 0x1f; + asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(srcLane), "r"(c)); + return ret; +} + +__SM_30_INTRINSICS_DECL__ unsigned int __shfl(unsigned int var, int srcLane, int width) { + return (unsigned int) __shfl((int)var, srcLane, width); +} + +__SM_30_INTRINSICS_DECL__ int __shfl_up(int var, unsigned int delta, int width) { + int ret; + int c = (warpSize-width) << 8; + asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(delta), "r"(c)); + return ret; +} + +__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up(unsigned int var, unsigned int delta, int width) { + return (unsigned int) __shfl_up((int)var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ int __shfl_down(int var, unsigned int delta, int width) { + int ret; + int c = ((warpSize-width) << 8) | 0x1f; + asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(delta), "r"(c)); + return ret; +} + +__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down(unsigned int var, unsigned int delta, int width) { + return 
(unsigned int) __shfl_down((int)var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ int __shfl_xor(int var, int laneMask, int width) { + int ret; + int c = ((warpSize-width) << 8) | 0x1f; + asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(var), "r"(laneMask), "r"(c)); + return ret; +} + +__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor(unsigned int var, int laneMask, int width) { + return (unsigned int) __shfl_xor((int)var, laneMask, width); +} + +__SM_30_INTRINSICS_DECL__ float __shfl(float var, int srcLane, int width) { + float ret; + int c; + c = ((warpSize-width) << 8) | 0x1f; + asm volatile ("shfl.idx.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(srcLane), "r"(c)); + return ret; +} + +__SM_30_INTRINSICS_DECL__ float __shfl_up(float var, unsigned int delta, int width) { + float ret; + int c; + c = (warpSize-width) << 8; + asm volatile ("shfl.up.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c)); + return ret; +} + +__SM_30_INTRINSICS_DECL__ float __shfl_down(float var, unsigned int delta, int width) { + float ret; + int c; + c = ((warpSize-width) << 8) | 0x1f; + asm volatile ("shfl.down.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c)); + return ret; +} + +__SM_30_INTRINSICS_DECL__ float __shfl_xor(float var, int laneMask, int width) { + float ret; + int c; + c = ((warpSize-width) << 8) | 0x1f; + asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c)); + return ret; +} + +// 64-bits SHFL + +__SM_30_INTRINSICS_DECL__ long long __shfl(long long var, int srcLane, int width) { + int lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); + hi = __shfl(hi, srcLane, width); + lo = __shfl(lo, srcLane, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ unsigned long long __shfl(unsigned long long var, int srcLane, int width) { + return (unsigned long long) __shfl((long long) var, srcLane, width); +} + +__SM_30_INTRINSICS_DECL__ long long __shfl_up(long long var, unsigned int delta, int width) { + int lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); + hi = __shfl_up(hi, delta, width); + lo = __shfl_up(lo, delta, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up(unsigned long long var, unsigned int delta, int width) { + return (unsigned long long) __shfl_up((long long) var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ long long __shfl_down(long long var, unsigned int delta, int width) { + int lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); + hi = __shfl_down(hi, delta, width); + lo = __shfl_down(lo, delta, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down(unsigned long long var, unsigned int delta, int width) { + return (unsigned long long) __shfl_down((long long) var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ long long __shfl_xor(long long var, int laneMask, int width) { + int lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); + hi = __shfl_xor(hi, laneMask, width); + lo = __shfl_xor(lo, laneMask, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor(unsigned long long var, int laneMask, int width) { + return 
(unsigned long long) __shfl_xor((long long) var, laneMask, width); +} + +__SM_30_INTRINSICS_DECL__ double __shfl(double var, int srcLane, int width) { + unsigned lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); + hi = __shfl(hi, srcLane, width); + lo = __shfl(lo, srcLane, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ double __shfl_up(double var, unsigned int delta, int width) { + unsigned lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); + hi = __shfl_up(hi, delta, width); + lo = __shfl_up(lo, delta, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ double __shfl_down(double var, unsigned int delta, int width) { + unsigned lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); + hi = __shfl_down(hi, delta, width); + lo = __shfl_down(lo, delta, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ double __shfl_xor(double var, int laneMask, int width) { + unsigned lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); + hi = __shfl_xor(hi, laneMask, width); + lo = __shfl_xor(lo, laneMask, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ long __shfl(long var, int srcLane, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl((long long) var, srcLane, width) : + __shfl((int) var, srcLane, width); +} + +__SM_30_INTRINSICS_DECL__ unsigned long __shfl(unsigned long var, int srcLane, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl((unsigned long long) var, srcLane, width) : + __shfl((unsigned int) var, srcLane, width); +} + +__SM_30_INTRINSICS_DECL__ long __shfl_up(long var, unsigned int delta, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl_up((long long) var, delta, width) : + __shfl_up((int) var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up(unsigned long var, unsigned int delta, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl_up((unsigned long long) var, delta, width) : + __shfl_up((unsigned int) var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ long __shfl_down(long var, unsigned int delta, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl_down((long long) var, delta, width) : + __shfl_down((int) var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down(unsigned long var, unsigned int delta, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl_down((unsigned long long) var, delta, width) : + __shfl_down((unsigned int) var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ long __shfl_xor(long var, int laneMask, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl_xor((long long) var, laneMask, width) : + __shfl_xor((int) var, laneMask, width); +} + +__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor(unsigned long var, int laneMask, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl_xor((unsigned long long) var, laneMask, width) : + __shfl_xor((unsigned int) var, laneMask, width); +} + +#endif /* !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700 */ + +// Warp register exchange (shuffle) intrinsics. 
+// Notes: +// a) Warp size is hardcoded to 32 here, because the compiler does not know +// the "warpSize" constant at this time +// b) we cannot map the float __shfl to the int __shfl because it'll mess with +// the register number (especially if you're doing two shfls to move a double). +__SM_30_INTRINSICS_DECL__ int __shfl_sync(unsigned mask, int var, int srcLane, int width) { + extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c); + int ret; + int c = ((warpSize-width) << 8) | 0x1f; + ret = __nvvm_shfl_idx_sync(mask, var, srcLane, c); + return ret; +} + +__SM_30_INTRINSICS_DECL__ unsigned int __shfl_sync(unsigned mask, unsigned int var, int srcLane, int width) { + return (unsigned int) __shfl_sync(mask, (int)var, srcLane, width); +} + +__SM_30_INTRINSICS_DECL__ int __shfl_up_sync(unsigned mask, int var, unsigned int delta, int width) { + extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c); + int ret; + int c = (warpSize-width) << 8; + ret = __nvvm_shfl_up_sync(mask, var, delta, c); + return ret; +} + +__SM_30_INTRINSICS_DECL__ unsigned int __shfl_up_sync(unsigned mask, unsigned int var, unsigned int delta, int width) { + return (unsigned int) __shfl_up_sync(mask, (int)var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ int __shfl_down_sync(unsigned mask, int var, unsigned int delta, int width) { + extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c); + int ret; + int c = ((warpSize-width) << 8) | 0x1f; + ret = __nvvm_shfl_down_sync(mask, var, delta, c); + return ret; +} + +__SM_30_INTRINSICS_DECL__ unsigned int __shfl_down_sync(unsigned mask, unsigned int var, unsigned int delta, int width) { + return (unsigned int) __shfl_down_sync(mask, (int)var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ int __shfl_xor_sync(unsigned mask, int var, int laneMask, int width) { + extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c); + int ret; + int c = ((warpSize-width) << 8) | 0x1f; + ret = __nvvm_shfl_bfly_sync(mask, var, laneMask, c); + return ret; +} + +__SM_30_INTRINSICS_DECL__ unsigned int __shfl_xor_sync(unsigned mask, unsigned int var, int laneMask, int width) { + return (unsigned int) __shfl_xor_sync(mask, (int)var, laneMask, width); +} + +__SM_30_INTRINSICS_DECL__ float __shfl_sync(unsigned mask, float var, int srcLane, int width) { + extern __device__ __device_builtin__ unsigned __nvvm_shfl_idx_sync(unsigned mask, unsigned a, unsigned b, unsigned c); + int ret; + int c; + c = ((warpSize-width) << 8) | 0x1f; + ret = __nvvm_shfl_idx_sync(mask, __float_as_int(var), srcLane, c); + return __int_as_float(ret); +} + +__SM_30_INTRINSICS_DECL__ float __shfl_up_sync(unsigned mask, float var, unsigned int delta, int width) { + extern __device__ __device_builtin__ unsigned __nvvm_shfl_up_sync(unsigned mask, unsigned a, unsigned b, unsigned c); + int ret; + int c; + c = (warpSize-width) << 8; + ret = __nvvm_shfl_up_sync(mask, __float_as_int(var), delta, c); + return __int_as_float(ret); +} + +__SM_30_INTRINSICS_DECL__ float __shfl_down_sync(unsigned mask, float var, unsigned int delta, int width) { + extern __device__ __device_builtin__ unsigned __nvvm_shfl_down_sync(unsigned mask, unsigned a, unsigned b, unsigned c); + int ret; + int c; + c = ((warpSize-width) << 8) | 0x1f; + ret = __nvvm_shfl_down_sync(mask, __float_as_int(var), delta, c); + 
return __int_as_float(ret); +} + +__SM_30_INTRINSICS_DECL__ float __shfl_xor_sync(unsigned mask, float var, int laneMask, int width) { + extern __device__ __device_builtin__ unsigned __nvvm_shfl_bfly_sync(unsigned mask, unsigned a, unsigned b, unsigned c); + int ret; + int c; + c = ((warpSize-width) << 8) | 0x1f; + ret = __nvvm_shfl_bfly_sync(mask, __float_as_int(var), laneMask, c); + return __int_as_float(ret); +} + +// 64-bits SHFL +__SM_30_INTRINSICS_DECL__ long long __shfl_sync(unsigned mask, long long var, int srcLane, int width) { + int lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); + hi = __shfl_sync(mask, hi, srcLane, width); + lo = __shfl_sync(mask, lo, srcLane, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_sync(unsigned mask, unsigned long long var, int srcLane, int width) { + return (unsigned long long) __shfl_sync(mask, (long long) var, srcLane, width); +} + +__SM_30_INTRINSICS_DECL__ long long __shfl_up_sync(unsigned mask, long long var, unsigned int delta, int width) { + int lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); + hi = __shfl_up_sync(mask, hi, delta, width); + lo = __shfl_up_sync(mask, lo, delta, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_up_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) { + return (unsigned long long) __shfl_up_sync(mask, (long long) var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ long long __shfl_down_sync(unsigned mask, long long var, unsigned int delta, int width) { + int lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); + hi = __shfl_down_sync(mask, hi, delta, width); + lo = __shfl_down_sync(mask, lo, delta, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_down_sync(unsigned mask, unsigned long long var, unsigned int delta, int width) { + return (unsigned long long) __shfl_down_sync(mask, (long long) var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ long long __shfl_xor_sync(unsigned mask, long long var, int laneMask, int width) { + int lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(var)); + hi = __shfl_xor_sync(mask, hi, laneMask, width); + lo = __shfl_xor_sync(mask, lo, laneMask, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ unsigned long long __shfl_xor_sync(unsigned mask, unsigned long long var, int laneMask, int width) { + return (unsigned long long) __shfl_xor_sync(mask, (long long) var, laneMask, width); +} + +__SM_30_INTRINSICS_DECL__ double __shfl_sync(unsigned mask, double var, int srcLane, int width) { + unsigned lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); + hi = __shfl_sync(mask, hi, srcLane, width); + lo = __shfl_sync(mask, lo, srcLane, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ double __shfl_up_sync(unsigned mask, double var, unsigned int delta, int width) { + unsigned lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); + hi = __shfl_up_sync(mask, hi, delta, width); + lo = __shfl_up_sync(mask, lo, delta, width); + asm volatile("mov.b64 %0, {%1,%2};" 
: "=d"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ double __shfl_down_sync(unsigned mask, double var, unsigned int delta, int width) { + unsigned lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); + hi = __shfl_down_sync(mask, hi, delta, width); + lo = __shfl_down_sync(mask, lo, delta, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); + return var; +} + +__SM_30_INTRINSICS_DECL__ double __shfl_xor_sync(unsigned mask, double var, int laneMask, int width) { + unsigned lo, hi; + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "d"(var)); + hi = __shfl_xor_sync(mask, hi, laneMask, width); + lo = __shfl_xor_sync(mask, lo, laneMask, width); + asm volatile("mov.b64 %0, {%1,%2};" : "=d"(var) : "r"(lo), "r"(hi)); + return var; +} + +// long needs some help to choose between 32-bits and 64-bits + +__SM_30_INTRINSICS_DECL__ long __shfl_sync(unsigned mask, long var, int srcLane, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl_sync(mask, (long long) var, srcLane, width) : + __shfl_sync(mask, (int) var, srcLane, width); +} + +__SM_30_INTRINSICS_DECL__ unsigned long __shfl_sync(unsigned mask, unsigned long var, int srcLane, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl_sync(mask, (unsigned long long) var, srcLane, width) : + __shfl_sync(mask, (unsigned int) var, srcLane, width); +} + +__SM_30_INTRINSICS_DECL__ long __shfl_up_sync(unsigned mask, long var, unsigned int delta, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl_up_sync(mask, (long long) var, delta, width) : + __shfl_up_sync(mask, (int) var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ unsigned long __shfl_up_sync(unsigned mask, unsigned long var, unsigned int delta, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl_up_sync(mask, (unsigned long long) var, delta, width) : + __shfl_up_sync(mask, (unsigned int) var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ long __shfl_down_sync(unsigned mask, long var, unsigned int delta, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl_down_sync(mask, (long long) var, delta, width) : + __shfl_down_sync(mask, (int) var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ unsigned long __shfl_down_sync(unsigned mask, unsigned long var, unsigned int delta, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl_down_sync(mask, (unsigned long long) var, delta, width) : + __shfl_down_sync(mask, (unsigned int) var, delta, width); +} + +__SM_30_INTRINSICS_DECL__ long __shfl_xor_sync(unsigned mask, long var, int laneMask, int width) { + return (sizeof(long) == sizeof(long long)) ? + __shfl_xor_sync(mask, (long long) var, laneMask, width) : + __shfl_xor_sync(mask, (int) var, laneMask, width); +} + +__SM_30_INTRINSICS_DECL__ unsigned long __shfl_xor_sync(unsigned mask, unsigned long var, int laneMask, int width) { + return (sizeof(long) == sizeof(long long)) ? 
+ __shfl_xor_sync(mask, (unsigned long long) var, laneMask, width) : + __shfl_xor_sync(mask, (unsigned int) var, laneMask, width); +} + +#if defined(__local_warpSize) +#undef warpSize +#undef __local_warpSize +#endif + +#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 300 */ + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __SM_30_INTRINSICS_DECL__ + +#endif /* !__SM_30_INTRINSICS_HPP__ */ + diff --git a/ext/cudart/include/sm_32_atomic_functions.h b/ext/cudart/include/sm_32_atomic_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..905732018c9a8ab1866aed932bd292ffa7ba2ac4 --- /dev/null +++ b/ext/cudart/include/sm_32_atomic_functions.h @@ -0,0 +1,131 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice.
+ */ + +#if !defined(__SM_32_ATOMIC_FUNCTIONS_H__) +#define __SM_32_ATOMIC_FUNCTIONS_H__ + +#if defined(__CUDACC_RTC__) +#define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + + +#ifdef __CUDA_ARCH__ +extern "C" +{ +extern __device__ __device_builtin__ long long __illAtomicMin(long long *address, long long val); +extern __device__ __device_builtin__ long long __illAtomicMax(long long *address, long long val); +extern __device__ __device_builtin__ long long __llAtomicAnd(long long *address, long long val); +extern __device__ __device_builtin__ long long __llAtomicOr(long long *address, long long val); +extern __device__ __device_builtin__ long long __llAtomicXor(long long *address, long long val); +extern __device__ __device_builtin__ unsigned long long __ullAtomicMin(unsigned long long *address, unsigned long long val); +extern __device__ __device_builtin__ unsigned long long __ullAtomicMax(unsigned long long *address, unsigned long long val); +extern __device__ __device_builtin__ unsigned long long __ullAtomicAnd(unsigned long long *address, unsigned long long val); +extern __device__ __device_builtin__ unsigned long long __ullAtomicOr (unsigned long long *address, unsigned long long val); +extern __device__ __device_builtin__ unsigned long long __ullAtomicXor(unsigned long long *address, unsigned long long val); +} +#endif /* __CUDA_ARCH__ */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val) __DEF_IF_HOST + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val) __DEF_IF_HOST + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val) __DEF_IF_HOST + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val) __DEF_IF_HOST + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val) __DEF_IF_HOST + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */ + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __DEF_IF_HOST +#undef __SM_32_ATOMIC_FUNCTIONS_DECL__ + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +#include 
"sm_32_atomic_functions.hpp" +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#endif /* !__SM_32_ATOMIC_FUNCTIONS_H__ */ diff --git a/ext/cudart/include/sm_32_atomic_functions.hpp b/ext/cudart/include/sm_32_atomic_functions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ebe60b8ca83666f07464b06ff04e6fc432c31b7b --- /dev/null +++ b/ext/cudart/include/sm_32_atomic_functions.hpp @@ -0,0 +1,134 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 35.235 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.35.235 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__SM_32_ATOMIC_FUNCTIONS_HPP__) +#define __SM_32_ATOMIC_FUNCTIONS_HPP__ + +#if defined(__CUDACC_RTC__) +#define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val) +{ + return __illAtomicMin(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val) +{ + return __illAtomicMax(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val) +{ + return __llAtomicAnd(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val) +{ + return __llAtomicOr(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val) +{ + return __llAtomicXor(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicMin(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicMax(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicAnd(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicOr(address, val); +} + +__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicXor(address, val); +} + +#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */ + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __SM_32_ATOMIC_FUNCTIONS_DECL__ + +#endif /* !__SM_32_ATOMIC_FUNCTIONS_HPP__ */ + diff --git a/ext/cudart/include/sm_32_intrinsics.h b/ext/cudart/include/sm_32_intrinsics.h new file mode 100644 index 0000000000000000000000000000000000000000..33a805f765400883ddabf405578abb241755aa88 --- /dev/null +++ b/ext/cudart/include/sm_32_intrinsics.h @@ -0,0 +1,510 @@ +/* + * Copyright 1993-2020 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__SM_32_INTRINSICS_H__) +#define __SM_32_INTRINSICS_H__ + +#if defined(__CUDACC_RTC__) +#define __SM_32_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_32_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + + +/******************************************************************************* +* * +* Below are declarations of SM-3.5 intrinsics which are included as * +* source (instead of being built in to the compiler) * +* * +*******************************************************************************/ +/****************************************************************************** + * __ldg * + ******************************************************************************/ +__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ char __ldg(const char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ signed char __ldg(const signed char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short __ldg(const short *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int __ldg(const int *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ long long __ldg(const long long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ char2 __ldg(const char2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ char4 __ldg(const char4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short2 __ldg(const short2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short4 __ldg(const short4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int2 __ldg(const int2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int4 __ldg(const int4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ longlong2 __ldg(const longlong2 *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ unsigned char __ldg(const unsigned char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned short __ldg(const unsigned short *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned int __ldg(const unsigned int *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned long long __ldg(const unsigned long long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uchar2 __ldg(const uchar2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uchar4 __ldg(const uchar4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ushort2 __ldg(const ushort2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ushort4 __ldg(const ushort4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uint2 __ldg(const uint2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uint4 __ldg(const uint4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ulonglong2 __ldg(const ulonglong2 *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ float __ldg(const float *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ double __ldg(const double *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ float2 __ldg(const float2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ float4 __ldg(const float4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ double2 __ldg(const double2 *ptr) __DEF_IF_HOST 
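[Editor's illustration — not part of the vendored headers.] The files added above declare the sm_32+ 64-bit integer overloads of atomicMin/atomicMax/atomicAnd/atomicOr/atomicXor (sm_32_atomic_functions.h) and the __ldg family of loads that go through the read-only data cache (sm_32_intrinsics.h). A minimal sketch of how device code typically combines the two; the kernel name is hypothetical, and it assumes *out was initialized to LLONG_MAX on the host before launch:

    __global__ void min_reduce_ll(const long long *__restrict__ in, long long *out, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) {
            long long v = __ldg(&in[i]); // non-coherent (read-only cache) load, sm_32+ overload declared above
            atomicMin(out, v);           // 64-bit atomicMin overload from sm_32_atomic_functions.h
        }
    }

This only compiles for sm_32 or newer targets (e.g. nvcc -arch=sm_52), since both overload sets are guarded by __CUDA_ARCH__ >= 320 in the headers above.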
+/****************************************************************************** + * __ldcg * + ******************************************************************************/ +__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ char __ldcg(const char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ signed char __ldcg(const signed char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short __ldcg(const short *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int __ldcg(const int *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ long long __ldcg(const long long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ char2 __ldcg(const char2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ char4 __ldcg(const char4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short2 __ldcg(const short2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short4 __ldcg(const short4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int2 __ldcg(const int2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int4 __ldcg(const int4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ longlong2 __ldcg(const longlong2 *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ unsigned char __ldcg(const unsigned char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned short __ldcg(const unsigned short *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned int __ldcg(const unsigned int *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned long long __ldcg(const unsigned long long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uchar2 __ldcg(const uchar2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uchar4 __ldcg(const uchar4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ushort2 __ldcg(const ushort2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ushort4 __ldcg(const ushort4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uint2 __ldcg(const uint2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uint4 __ldcg(const uint4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcg(const ulonglong2 *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ float __ldcg(const float *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ double __ldcg(const double *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ float2 __ldcg(const float2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ float4 __ldcg(const float4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ double2 __ldcg(const double2 *ptr) __DEF_IF_HOST +/****************************************************************************** + * __ldca * + ******************************************************************************/ +__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ char __ldca(const char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ signed char __ldca(const signed char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short __ldca(const short *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int __ldca(const int *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ long long __ldca(const long long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ char2 __ldca(const char2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ char4 __ldca(const char4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short2 __ldca(const short2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short4 __ldca(const short4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int2 __ldca(const int2 *ptr) 
__DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int4 __ldca(const int4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ longlong2 __ldca(const longlong2 *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ unsigned char __ldca(const unsigned char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned short __ldca(const unsigned short *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned int __ldca(const unsigned int *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned long long __ldca(const unsigned long long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uchar2 __ldca(const uchar2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uchar4 __ldca(const uchar4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ushort2 __ldca(const ushort2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ushort4 __ldca(const ushort4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uint2 __ldca(const uint2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uint4 __ldca(const uint4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ulonglong2 __ldca(const ulonglong2 *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ float __ldca(const float *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ double __ldca(const double *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ float2 __ldca(const float2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ float4 __ldca(const float4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ double2 __ldca(const double2 *ptr) __DEF_IF_HOST +/****************************************************************************** + * __ldcs * + ******************************************************************************/ +__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ char __ldcs(const char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ signed char __ldcs(const signed char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short __ldcs(const short *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int __ldcs(const int *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ long long __ldcs(const long long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ char2 __ldcs(const char2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ char4 __ldcs(const char4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short2 __ldcs(const short2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short4 __ldcs(const short4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int2 __ldcs(const int2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int4 __ldcs(const int4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ longlong2 __ldcs(const longlong2 *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ unsigned char __ldcs(const unsigned char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned short __ldcs(const unsigned short *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned int __ldcs(const unsigned int *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned long long __ldcs(const unsigned long long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uchar2 __ldcs(const uchar2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uchar4 __ldcs(const uchar4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ushort2 __ldcs(const ushort2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ushort4 __ldcs(const ushort4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uint2 __ldcs(const uint2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uint4 __ldcs(const uint4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcs(const ulonglong2 *ptr) __DEF_IF_HOST + 
+__SM_32_INTRINSICS_DECL__ float __ldcs(const float *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ double __ldcs(const double *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ float2 __ldcs(const float2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ float4 __ldcs(const float4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ double2 __ldcs(const double2 *ptr) __DEF_IF_HOST +/****************************************************************************** + * __ldlu * + ******************************************************************************/ +__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ char __ldlu(const char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ signed char __ldlu(const signed char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short __ldlu(const short *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int __ldlu(const int *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ long long __ldlu(const long long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ char2 __ldlu(const char2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ char4 __ldlu(const char4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short2 __ldlu(const short2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short4 __ldlu(const short4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int2 __ldlu(const int2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int4 __ldlu(const int4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ longlong2 __ldlu(const longlong2 *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ unsigned char __ldlu(const unsigned char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned short __ldlu(const unsigned short *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned int __ldlu(const unsigned int *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned long long __ldlu(const unsigned long long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uchar2 __ldlu(const uchar2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uchar4 __ldlu(const uchar4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ushort2 __ldlu(const ushort2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ushort4 __ldlu(const ushort4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uint2 __ldlu(const uint2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uint4 __ldlu(const uint4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ulonglong2 __ldlu(const ulonglong2 *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ float __ldlu(const float *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ double __ldlu(const double *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ float2 __ldlu(const float2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ float4 __ldlu(const float4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ double2 __ldlu(const double2 *ptr) __DEF_IF_HOST +/****************************************************************************** + * __ldcv * + ******************************************************************************/ +__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ char __ldcv(const char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ signed char __ldcv(const signed char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short __ldcv(const short *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int __ldcv(const int *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ long long __ldcv(const long long *ptr) 
__DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ char2 __ldcv(const char2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ char4 __ldcv(const char4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short2 __ldcv(const short2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ short4 __ldcv(const short4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int2 __ldcv(const int2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ int4 __ldcv(const int4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ longlong2 __ldcv(const longlong2 *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ unsigned char __ldcv(const unsigned char *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned short __ldcv(const unsigned short *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned int __ldcv(const unsigned int *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ unsigned long long __ldcv(const unsigned long long *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uchar2 __ldcv(const uchar2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uchar4 __ldcv(const uchar4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ushort2 __ldcv(const ushort2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ushort4 __ldcv(const ushort4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uint2 __ldcv(const uint2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ uint4 __ldcv(const uint4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcv(const ulonglong2 *ptr) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ float __ldcv(const float *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ double __ldcv(const double *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ float2 __ldcv(const float2 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ float4 __ldcv(const float4 *ptr) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ double2 __ldcv(const double2 *ptr) __DEF_IF_HOST +/****************************************************************************** + * __stwb * + ******************************************************************************/ +__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ void __stwb(char *ptr, char value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(signed char *ptr, signed char value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(short *ptr, short value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(int *ptr, int value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(long long *ptr, long long value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(char2 *ptr, char2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(char4 *ptr, char4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(short2 *ptr, short2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(short4 *ptr, short4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(int2 *ptr, int2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(int4 *ptr, int4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(longlong2 *ptr, longlong2 value) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ void __stwb(unsigned char *ptr, unsigned char value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(unsigned short *ptr, unsigned short value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(unsigned int *ptr, unsigned int value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(uchar2 *ptr, uchar2 value) 
__DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(uchar4 *ptr, uchar4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(ushort2 *ptr, ushort2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(ushort4 *ptr, ushort4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(uint2 *ptr, uint2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(uint4 *ptr, uint4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ void __stwb(float *ptr, float value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(double *ptr, double value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(float2 *ptr, float2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(float4 *ptr, float4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwb(double2 *ptr, double2 value) __DEF_IF_HOST +/****************************************************************************** + * __stcg * + ******************************************************************************/ +__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ void __stcg(char *ptr, char value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(signed char *ptr, signed char value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(short *ptr, short value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(int *ptr, int value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(long long *ptr, long long value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(char2 *ptr, char2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(char4 *ptr, char4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(short2 *ptr, short2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(short4 *ptr, short4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(int2 *ptr, int2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(int4 *ptr, int4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(longlong2 *ptr, longlong2 value) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ void __stcg(unsigned char *ptr, unsigned char value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(unsigned short *ptr, unsigned short value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(unsigned int *ptr, unsigned int value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(uchar2 *ptr, uchar2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(uchar4 *ptr, uchar4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(ushort2 *ptr, ushort2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(ushort4 *ptr, ushort4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(uint2 *ptr, uint2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(uint4 *ptr, uint4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ void __stcg(float *ptr, float value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(double *ptr, double value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(float2 *ptr, float2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(float4 *ptr, float4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcg(double2 *ptr, double2 value) 
__DEF_IF_HOST +/****************************************************************************** + * __stcs * + ******************************************************************************/ +__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ void __stcs(char *ptr, char value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(signed char *ptr, signed char value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(short *ptr, short value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(int *ptr, int value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(long long *ptr, long long value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(char2 *ptr, char2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(char4 *ptr, char4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(short2 *ptr, short2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(short4 *ptr, short4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(int2 *ptr, int2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(int4 *ptr, int4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(longlong2 *ptr, longlong2 value) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ void __stcs(unsigned char *ptr, unsigned char value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(unsigned short *ptr, unsigned short value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(unsigned int *ptr, unsigned int value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(uchar2 *ptr, uchar2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(uchar4 *ptr, uchar4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(ushort2 *ptr, ushort2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(ushort4 *ptr, ushort4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(uint2 *ptr, uint2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(uint4 *ptr, uint4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ void __stcs(float *ptr, float value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(double *ptr, double value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(float2 *ptr, float2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(float4 *ptr, float4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stcs(double2 *ptr, double2 value) __DEF_IF_HOST +/****************************************************************************** + * __stwt * + ******************************************************************************/ +__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ void __stwt(char *ptr, char value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(signed char *ptr, signed char value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(short *ptr, short value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(int *ptr, int value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(long long *ptr, long long value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(char2 *ptr, char2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(char4 
*ptr, char4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(short2 *ptr, short2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(short4 *ptr, short4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(int2 *ptr, int2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(int4 *ptr, int4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(longlong2 *ptr, longlong2 value) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ void __stwt(unsigned char *ptr, unsigned char value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(unsigned short *ptr, unsigned short value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(unsigned int *ptr, unsigned int value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long long *ptr, unsigned long long value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(uchar2 *ptr, uchar2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(uchar4 *ptr, uchar4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(ushort2 *ptr, ushort2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(ushort4 *ptr, ushort4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(uint2 *ptr, uint2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(uint4 *ptr, uint4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(ulonglong2 *ptr, ulonglong2 value) __DEF_IF_HOST + +__SM_32_INTRINSICS_DECL__ void __stwt(float *ptr, float value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(double *ptr, double value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(float2 *ptr, float2 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(float4 *ptr, float4 value) __DEF_IF_HOST +__SM_32_INTRINSICS_DECL__ void __stwt(double2 *ptr, double2 value) __DEF_IF_HOST + + +// SHF is the "funnel shift" operation - an accelerated left/right shift with carry +// operating on 64-bit quantities, which are concatenations of two 32-bit registers. + +/** + * \ingroup CUDA_MATH_INTRINSIC_INT + * \brief Concatenate \p hi : \p lo, shift left by \p shift & 31 bits, return the most significant 32 bits. + * + * Shift the 64-bit value formed by concatenating argument \p lo and \p hi left by the amount specified by the argument \p shift. + * Argument \p lo holds bits 31:0 and argument \p hi holds bits 63:32 of the 64-bit source value. + * The source is shifted left by the wrapped value of \p shift (\p shift & 31). + * The most significant 32-bits of the result are returned. + * + * \return Returns the most significant 32 bits of the shifted 64-bit value. + */ +__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST +/** + * \ingroup CUDA_MATH_INTRINSIC_INT + * \brief Concatenate \p hi : \p lo, shift left by min(\p shift, 32) bits, return the most significant 32 bits. + * + * Shift the 64-bit value formed by concatenating argument \p lo and \p hi left by the amount specified by the argument \p shift. + * Argument \p lo holds bits 31:0 and argument \p hi holds bits 63:32 of the 64-bit source value. + * The source is shifted left by the clamped value of \p shift (min(\p shift, 32)). + * The most significant 32-bits of the result are returned. + * + * \return Returns the most significant 32 bits of the shifted 64-bit value. 
+ */ +__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST + +/** + * \ingroup CUDA_MATH_INTRINSIC_INT + * \brief Concatenate \p hi : \p lo, shift right by \p shift & 31 bits, return the least significant 32 bits. + * + * Shift the 64-bit value formed by concatenating argument \p lo and \p hi right by the amount specified by the argument \p shift. + * Argument \p lo holds bits 31:0 and argument \p hi holds bits 63:32 of the 64-bit source value. + * The source is shifted right by the wrapped value of \p shift (\p shift & 31). + * The least significant 32-bits of the result are returned. + * + * \return Returns the least significant 32 bits of the shifted 64-bit value. + */ +__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST +/** + * \ingroup CUDA_MATH_INTRINSIC_INT + * \brief Concatenate \p hi : \p lo, shift right by min(\p shift, 32) bits, return the least significant 32 bits. + * + * Shift the 64-bit value formed by concatenating argument \p lo and \p hi right by the amount specified by the argument \p shift. + * Argument \p lo holds bits 31:0 and argument \p hi holds bits 63:32 of the 64-bit source value. + * The source is shifted right by the clamped value of \p shift (min(\p shift, 32)). + * The least significant 32-bits of the result are returned. + * + * \return Returns the least significant 32 bits of the shifted 64-bit value. + */ +__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift) __DEF_IF_HOST + + +#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */ + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __SM_32_INTRINSICS_DECL__ + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +#include "sm_32_intrinsics.hpp" +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#endif /* !__SM_32_INTRINSICS_H__ */ diff --git a/ext/cudart/include/sm_32_intrinsics.hpp b/ext/cudart/include/sm_32_intrinsics.hpp new file mode 100644 index 0000000000000000000000000000000000000000..af5f6634434ff690d9d07a8bdcb7a44702b6fe48 --- /dev/null +++ b/ext/cudart/include/sm_32_intrinsics.hpp @@ -0,0 +1,588 @@ +/* + * Copyright 1993-2020 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__SM_32_INTRINSICS_HPP__) +#define __SM_32_INTRINSICS_HPP__ + +#if defined(__CUDACC_RTC__) +#define __SM_32_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_32_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +// In here are intrinsics which are built in to the compiler. These may be +// referenced by intrinsic implementations from this file. +extern "C" +{ + // There are no intrinsics built in to the compiler for SM-3.5, + // all intrinsics are now implemented as inline PTX below. +} + +/******************************************************************************* +* * +* Below are implementations of SM-3.5 intrinsics which are included as * +* source (instead of being built in to the compiler) * +* * +*******************************************************************************/ + +// LDG is a "load from global via texture path" command which can exhibit higher +// bandwidth on GK110 than a regular LD. +// Define a different pointer storage size for 64 and 32 bit +#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) +#define __LDG_PTR "l" +#else +#define __LDG_PTR "r" +#endif + +/****************************************************************************** + * __ldg * + ******************************************************************************/ + +// Size of long is architecture and OS specific. 
+#if defined(__LP64__) // 64 bits +__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; } +__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; } +#else // 32 bits +__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; } +__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +#endif + + +__SM_32_INTRINSICS_DECL__ char __ldg(const char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; } +__SM_32_INTRINSICS_DECL__ signed char __ldg(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; } +__SM_32_INTRINSICS_DECL__ short __ldg(const short *ptr) { unsigned short ret; asm volatile ("ld.global.nc.s16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; } +__SM_32_INTRINSICS_DECL__ int __ldg(const int *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; } +__SM_32_INTRINSICS_DECL__ long long __ldg(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.nc.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; } +__SM_32_INTRINSICS_DECL__ char2 __ldg(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.nc.v2.s8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; } +__SM_32_INTRINSICS_DECL__ char4 __ldg(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.nc.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; } +__SM_32_INTRINSICS_DECL__ short2 __ldg(const short2 *ptr) { short2 ret; asm volatile ("ld.global.nc.v2.s16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ short4 __ldg(const short4 *ptr) { short4 ret; asm volatile ("ld.global.nc.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ int2 __ldg(const int2 *ptr) { int2 ret; asm volatile ("ld.global.nc.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ int4 __ldg(const int4 *ptr) { int4 ret; asm volatile ("ld.global.nc.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ longlong2 __ldg(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.nc.v2.s64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; } + +__SM_32_INTRINSICS_DECL__ unsigned char __ldg(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.u8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (unsigned char)ret; } +__SM_32_INTRINSICS_DECL__ unsigned short __ldg(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.nc.u16 %0, [%1];" : "=h"(ret) : __LDG_PTR 
(ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ unsigned int __ldg(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.nc.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ unsigned long long __ldg(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.nc.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ uchar2 __ldg(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.nc.v2.u8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; } +__SM_32_INTRINSICS_DECL__ uchar4 __ldg(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.nc.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; } +__SM_32_INTRINSICS_DECL__ ushort2 __ldg(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.nc.v2.u16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ ushort4 __ldg(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.nc.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ uint2 __ldg(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.nc.v2.u32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ uint4 __ldg(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ ulonglong2 __ldg(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; } + +__SM_32_INTRINSICS_DECL__ float __ldg(const float *ptr) { float ret; asm volatile ("ld.global.nc.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ double __ldg(const double *ptr) { double ret; asm volatile ("ld.global.nc.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ float2 __ldg(const float2 *ptr) { float2 ret; asm volatile ("ld.global.nc.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ float4 __ldg(const float4 *ptr) { float4 ret; asm volatile ("ld.global.nc.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ double2 __ldg(const double2 *ptr) { double2 ret; asm volatile ("ld.global.nc.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } + + +/****************************************************************************** + * __ldcg * + ******************************************************************************/ + +// Size of long is architecture and OS specific. 
+#if defined(__LP64__) // 64 bits +__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; } +__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; } +#else // 32 bits +__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; } +__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +#endif + + +__SM_32_INTRINSICS_DECL__ char __ldcg(const char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; } +__SM_32_INTRINSICS_DECL__ signed char __ldcg(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; } +__SM_32_INTRINSICS_DECL__ short __ldcg(const short *ptr) { unsigned short ret; asm volatile ("ld.global.cg.s16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; } +__SM_32_INTRINSICS_DECL__ int __ldcg(const int *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; } +__SM_32_INTRINSICS_DECL__ long long __ldcg(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cg.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; } +__SM_32_INTRINSICS_DECL__ char2 __ldcg(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.cg.v2.s8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; } +__SM_32_INTRINSICS_DECL__ char4 __ldcg(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.cg.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; } +__SM_32_INTRINSICS_DECL__ short2 __ldcg(const short2 *ptr) { short2 ret; asm volatile ("ld.global.cg.v2.s16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ short4 __ldcg(const short4 *ptr) { short4 ret; asm volatile ("ld.global.cg.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ int2 __ldcg(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cg.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ int4 __ldcg(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cg.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ longlong2 __ldcg(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.cg.v2.s64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; } + +__SM_32_INTRINSICS_DECL__ unsigned char __ldcg(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.u8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (unsigned char)ret; } +__SM_32_INTRINSICS_DECL__ unsigned short __ldcg(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.cg.u16 %0, [%1];" : 
"=h"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ unsigned int __ldcg(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.cg.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ unsigned long long __ldcg(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cg.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ uchar2 __ldcg(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.cg.v2.u8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; } +__SM_32_INTRINSICS_DECL__ uchar4 __ldcg(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.cg.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; } +__SM_32_INTRINSICS_DECL__ ushort2 __ldcg(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.cg.v2.u16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ ushort4 __ldcg(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.cg.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ uint2 __ldcg(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.cg.v2.u32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ uint4 __ldcg(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.cg.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcg(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.cg.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; } + +__SM_32_INTRINSICS_DECL__ float __ldcg(const float *ptr) { float ret; asm volatile ("ld.global.cg.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ double __ldcg(const double *ptr) { double ret; asm volatile ("ld.global.cg.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ float2 __ldcg(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cg.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ float4 __ldcg(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cg.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ double2 __ldcg(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cg.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } + +/****************************************************************************** + * __ldca * + ******************************************************************************/ + +// Size of long is architecture and OS specific. 
+#if defined(__LP64__) // 64 bits +__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; } +__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; } +#else // 32 bits +__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; } +__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +#endif + + +__SM_32_INTRINSICS_DECL__ char __ldca(const char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; } +__SM_32_INTRINSICS_DECL__ signed char __ldca(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; } +__SM_32_INTRINSICS_DECL__ short __ldca(const short *ptr) { unsigned short ret; asm volatile ("ld.global.ca.s16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; } +__SM_32_INTRINSICS_DECL__ int __ldca(const int *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; } +__SM_32_INTRINSICS_DECL__ long long __ldca(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.ca.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; } +__SM_32_INTRINSICS_DECL__ char2 __ldca(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.ca.v2.s8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; } +__SM_32_INTRINSICS_DECL__ char4 __ldca(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.ca.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; } +__SM_32_INTRINSICS_DECL__ short2 __ldca(const short2 *ptr) { short2 ret; asm volatile ("ld.global.ca.v2.s16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ short4 __ldca(const short4 *ptr) { short4 ret; asm volatile ("ld.global.ca.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ int2 __ldca(const int2 *ptr) { int2 ret; asm volatile ("ld.global.ca.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ int4 __ldca(const int4 *ptr) { int4 ret; asm volatile ("ld.global.ca.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ longlong2 __ldca(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.ca.v2.s64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; } + +__SM_32_INTRINSICS_DECL__ unsigned char __ldca(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.u8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (unsigned char)ret; } +__SM_32_INTRINSICS_DECL__ unsigned short __ldca(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.ca.u16 %0, [%1];" : 
"=h"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ unsigned int __ldca(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.ca.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ unsigned long long __ldca(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.ca.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ uchar2 __ldca(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.ca.v2.u8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; } +__SM_32_INTRINSICS_DECL__ uchar4 __ldca(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.ca.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; } +__SM_32_INTRINSICS_DECL__ ushort2 __ldca(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.ca.v2.u16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ ushort4 __ldca(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.ca.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ uint2 __ldca(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.ca.v2.u32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ uint4 __ldca(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.ca.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ ulonglong2 __ldca(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.ca.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; } + +__SM_32_INTRINSICS_DECL__ float __ldca(const float *ptr) { float ret; asm volatile ("ld.global.ca.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ double __ldca(const double *ptr) { double ret; asm volatile ("ld.global.ca.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ float2 __ldca(const float2 *ptr) { float2 ret; asm volatile ("ld.global.ca.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ float4 __ldca(const float4 *ptr) { float4 ret; asm volatile ("ld.global.ca.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ double2 __ldca(const double2 *ptr) { double2 ret; asm volatile ("ld.global.ca.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } + +/****************************************************************************** + * __ldcs * + ******************************************************************************/ + +// Size of long is architecture and OS specific. 
+#if defined(__LP64__) // 64 bits +__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; } +__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; } +#else // 32 bits +__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; } +__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +#endif + + +__SM_32_INTRINSICS_DECL__ char __ldcs(const char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; } +__SM_32_INTRINSICS_DECL__ signed char __ldcs(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; } +__SM_32_INTRINSICS_DECL__ short __ldcs(const short *ptr) { unsigned short ret; asm volatile ("ld.global.cs.s16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; } +__SM_32_INTRINSICS_DECL__ int __ldcs(const int *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; } +__SM_32_INTRINSICS_DECL__ long long __ldcs(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cs.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; } +__SM_32_INTRINSICS_DECL__ char2 __ldcs(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.cs.v2.s8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; } +__SM_32_INTRINSICS_DECL__ char4 __ldcs(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.cs.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; } +__SM_32_INTRINSICS_DECL__ short2 __ldcs(const short2 *ptr) { short2 ret; asm volatile ("ld.global.cs.v2.s16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ short4 __ldcs(const short4 *ptr) { short4 ret; asm volatile ("ld.global.cs.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ int2 __ldcs(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cs.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ int4 __ldcs(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cs.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ longlong2 __ldcs(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.cs.v2.s64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; } + +__SM_32_INTRINSICS_DECL__ unsigned char __ldcs(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.u8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (unsigned char)ret; } +__SM_32_INTRINSICS_DECL__ unsigned short __ldcs(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.cs.u16 %0, [%1];" : 
"=h"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ unsigned int __ldcs(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.cs.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ unsigned long long __ldcs(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cs.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ uchar2 __ldcs(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.cs.v2.u8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; } +__SM_32_INTRINSICS_DECL__ uchar4 __ldcs(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.cs.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; } +__SM_32_INTRINSICS_DECL__ ushort2 __ldcs(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.cs.v2.u16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ ushort4 __ldcs(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.cs.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ uint2 __ldcs(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.cs.v2.u32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ uint4 __ldcs(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcs(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.cs.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; } + +__SM_32_INTRINSICS_DECL__ float __ldcs(const float *ptr) { float ret; asm volatile ("ld.global.cs.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ double __ldcs(const double *ptr) { double ret; asm volatile ("ld.global.cs.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ float2 __ldcs(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cs.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ float4 __ldcs(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cs.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; } +__SM_32_INTRINSICS_DECL__ double2 __ldcs(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cs.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; } + +/****************************************************************************** + * __ldlu * + ******************************************************************************/ + +// Size of long is architecture and OS specific. 
+#if defined(__LP64__) // 64 bits +__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) { unsigned long ret; asm ("ld.global.lu.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; } +__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.lu.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +#else // 32 bits +__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) { unsigned long ret; asm ("ld.global.lu.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; } +__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.lu.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +#endif + + +__SM_32_INTRINSICS_DECL__ char __ldlu(const char *ptr) { unsigned int ret; asm ("ld.global.lu.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (char)ret; } +__SM_32_INTRINSICS_DECL__ signed char __ldlu(const signed char *ptr) { unsigned int ret; asm ("ld.global.lu.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (signed char)ret; } +__SM_32_INTRINSICS_DECL__ short __ldlu(const short *ptr) { unsigned short ret; asm ("ld.global.lu.s16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return (short)ret; } +__SM_32_INTRINSICS_DECL__ int __ldlu(const int *ptr) { unsigned int ret; asm ("ld.global.lu.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (int)ret; } +__SM_32_INTRINSICS_DECL__ long long __ldlu(const long long *ptr) { unsigned long long ret; asm ("ld.global.lu.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long long)ret; } +__SM_32_INTRINSICS_DECL__ char2 __ldlu(const char2 *ptr) { char2 ret; int2 tmp; asm ("ld.global.lu.v2.s8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; } +__SM_32_INTRINSICS_DECL__ char4 __ldlu(const char4 *ptr) { char4 ret; int4 tmp; asm ("ld.global.lu.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; } +__SM_32_INTRINSICS_DECL__ short2 __ldlu(const short2 *ptr) { short2 ret; asm ("ld.global.lu.v2.s16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ short4 __ldlu(const short4 *ptr) { short4 ret; asm ("ld.global.lu.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ int2 __ldlu(const int2 *ptr) { int2 ret; asm ("ld.global.lu.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ int4 __ldlu(const int4 *ptr) { int4 ret; asm ("ld.global.lu.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ longlong2 __ldlu(const longlong2 *ptr) { longlong2 ret; asm ("ld.global.lu.v2.s64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } + +__SM_32_INTRINSICS_DECL__ unsigned char __ldlu(const unsigned char *ptr) { unsigned int ret; asm ("ld.global.lu.u8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (unsigned char)ret; } +__SM_32_INTRINSICS_DECL__ unsigned short __ldlu(const unsigned short *ptr) { unsigned short ret; asm 
("ld.global.lu.u16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ unsigned int __ldlu(const unsigned int *ptr) { unsigned int ret; asm ("ld.global.lu.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ unsigned long long __ldlu(const unsigned long long *ptr) { unsigned long long ret; asm ("ld.global.lu.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ uchar2 __ldlu(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm ("ld.global.lu.v2.u8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; } +__SM_32_INTRINSICS_DECL__ uchar4 __ldlu(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm ("ld.global.lu.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; } +__SM_32_INTRINSICS_DECL__ ushort2 __ldlu(const ushort2 *ptr) { ushort2 ret; asm ("ld.global.lu.v2.u16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ ushort4 __ldlu(const ushort4 *ptr) { ushort4 ret; asm ("ld.global.lu.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ uint2 __ldlu(const uint2 *ptr) { uint2 ret; asm ("ld.global.lu.v2.u32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ uint4 __ldlu(const uint4 *ptr) { uint4 ret; asm ("ld.global.lu.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ ulonglong2 __ldlu(const ulonglong2 *ptr) { ulonglong2 ret; asm ("ld.global.lu.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } + +__SM_32_INTRINSICS_DECL__ float __ldlu(const float *ptr) { float ret; asm ("ld.global.lu.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ double __ldlu(const double *ptr) { double ret; asm ("ld.global.lu.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ float2 __ldlu(const float2 *ptr) { float2 ret; asm ("ld.global.lu.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ float4 __ldlu(const float4 *ptr) { float4 ret; asm ("ld.global.lu.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ double2 __ldlu(const double2 *ptr) { double2 ret; asm ("ld.global.lu.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } + +/****************************************************************************** + * __ldcv * + ******************************************************************************/ + +// Size of long is architecture and OS specific. 
+#if defined(__LP64__) // 64 bits +__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) { unsigned long ret; asm ("ld.global.cv.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; } +__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.cv.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +#else // 32 bits +__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) { unsigned long ret; asm ("ld.global.cv.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; } +__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.cv.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +#endif + + +__SM_32_INTRINSICS_DECL__ char __ldcv(const char *ptr) { unsigned int ret; asm ("ld.global.cv.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (char)ret; } +__SM_32_INTRINSICS_DECL__ signed char __ldcv(const signed char *ptr) { unsigned int ret; asm ("ld.global.cv.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (signed char)ret; } +__SM_32_INTRINSICS_DECL__ short __ldcv(const short *ptr) { unsigned short ret; asm ("ld.global.cv.s16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return (short)ret; } +__SM_32_INTRINSICS_DECL__ int __ldcv(const int *ptr) { unsigned int ret; asm ("ld.global.cv.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (int)ret; } +__SM_32_INTRINSICS_DECL__ long long __ldcv(const long long *ptr) { unsigned long long ret; asm ("ld.global.cv.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long long)ret; } +__SM_32_INTRINSICS_DECL__ char2 __ldcv(const char2 *ptr) { char2 ret; int2 tmp; asm ("ld.global.cv.v2.s8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; } +__SM_32_INTRINSICS_DECL__ char4 __ldcv(const char4 *ptr) { char4 ret; int4 tmp; asm ("ld.global.cv.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; } +__SM_32_INTRINSICS_DECL__ short2 __ldcv(const short2 *ptr) { short2 ret; asm ("ld.global.cv.v2.s16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ short4 __ldcv(const short4 *ptr) { short4 ret; asm ("ld.global.cv.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ int2 __ldcv(const int2 *ptr) { int2 ret; asm ("ld.global.cv.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ int4 __ldcv(const int4 *ptr) { int4 ret; asm ("ld.global.cv.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ longlong2 __ldcv(const longlong2 *ptr) { longlong2 ret; asm ("ld.global.cv.v2.s64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } + +__SM_32_INTRINSICS_DECL__ unsigned char __ldcv(const unsigned char *ptr) { unsigned int ret; asm ("ld.global.cv.u8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (unsigned char)ret; } +__SM_32_INTRINSICS_DECL__ unsigned short __ldcv(const unsigned short *ptr) { unsigned short ret; asm 
("ld.global.cv.u16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ unsigned int __ldcv(const unsigned int *ptr) { unsigned int ret; asm ("ld.global.cv.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ unsigned long long __ldcv(const unsigned long long *ptr) { unsigned long long ret; asm ("ld.global.cv.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ uchar2 __ldcv(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm ("ld.global.cv.v2.u8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; } +__SM_32_INTRINSICS_DECL__ uchar4 __ldcv(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm ("ld.global.cv.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; } +__SM_32_INTRINSICS_DECL__ ushort2 __ldcv(const ushort2 *ptr) { ushort2 ret; asm ("ld.global.cv.v2.u16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ ushort4 __ldcv(const ushort4 *ptr) { ushort4 ret; asm ("ld.global.cv.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ uint2 __ldcv(const uint2 *ptr) { uint2 ret; asm ("ld.global.cv.v2.u32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ uint4 __ldcv(const uint4 *ptr) { uint4 ret; asm ("ld.global.cv.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcv(const ulonglong2 *ptr) { ulonglong2 ret; asm ("ld.global.cv.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } + +__SM_32_INTRINSICS_DECL__ float __ldcv(const float *ptr) { float ret; asm ("ld.global.cv.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ double __ldcv(const double *ptr) { double ret; asm ("ld.global.cv.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ float2 __ldcv(const float2 *ptr) { float2 ret; asm ("ld.global.cv.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ float4 __ldcv(const float4 *ptr) { float4 ret; asm ("ld.global.cv.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; } +__SM_32_INTRINSICS_DECL__ double2 __ldcv(const double2 *ptr) { double2 ret; asm ("ld.global.cv.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; } + +/****************************************************************************** + * __stwb * + ******************************************************************************/ + +// Size of long is architecture and OS specific. 
+#if defined(__LP64__) // 64 bits +__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) { asm ("st.global.wb.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) { asm ("st.global.wb.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +#else // 32 bits +__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) { asm ("st.global.wb.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) { asm ("st.global.wb.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +#endif + + +__SM_32_INTRINSICS_DECL__ void __stwb(char *ptr, char value) { asm ("st.global.wb.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(signed char *ptr, signed char value) { asm ("st.global.wb.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(short *ptr, short value) { asm ("st.global.wb.s16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(int *ptr, int value) { asm ("st.global.wb.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(long long *ptr, long long value) { asm ("st.global.wb.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.wb.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wb.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(short2 *ptr, short2 value) { asm ("st.global.wb.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(short4 *ptr, short4 value) { asm ("st.global.wb.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(int2 *ptr, int2 value) { asm ("st.global.wb.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(int4 *ptr, int4 value) { asm ("st.global.wb.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(longlong2 *ptr, longlong2 value) { asm ("st.global.wb.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); } + +__SM_32_INTRINSICS_DECL__ void __stwb(unsigned char *ptr, unsigned char value) { asm ("st.global.wb.u8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(unsigned short *ptr, unsigned short value) { asm ("st.global.wb.u16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(unsigned int *ptr, unsigned int value) { asm ("st.global.wb.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long long *ptr, unsigned long long value) { asm ("st.global.wb.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(uchar2 *ptr, uchar2 value) { const int x = value.x, y = 
value.y; asm ("st.global.wb.v2.u8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wb.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(ushort2 *ptr, ushort2 value) { asm ("st.global.wb.v2.u16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(ushort4 *ptr, ushort4 value) { asm ("st.global.wb.v4.u16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(uint2 *ptr, uint2 value) { asm ("st.global.wb.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(uint4 *ptr, uint4 value) { asm ("st.global.wb.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.wb.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); } + +__SM_32_INTRINSICS_DECL__ void __stwb(float *ptr, float value) { asm ("st.global.wb.f32 [%0], %1;" :: __LDG_PTR (ptr), "f"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(double *ptr, double value) { asm ("st.global.wb.f64 [%0], %1;" :: __LDG_PTR (ptr), "d"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(float2 *ptr, float2 value) { asm ("st.global.wb.v2.f32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(float4 *ptr, float4 value) { asm ("st.global.wb.v4.f32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwb(double2 *ptr, double2 value) { asm ("st.global.wb.v2.f64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); } + +/****************************************************************************** + * __stcg * + ******************************************************************************/ + +// Size of long is architecture and OS specific. 
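[Editor's aside — illustrative sketch, not part of the NVIDIA header] The __LP64__ branches used throughout these sections exist because the width of long is ABI-dependent: it is 64-bit on LP64 platforms (Linux and macOS x86-64/aarch64) but 32-bit on LLP64 Windows and on 32-bit targets, so each long overload has to pick the matching PTX width. Assuming a C++11 host compiler, a sanity check along these lines captures the split:

    #if defined(__LP64__)
    static_assert(sizeof(long) == 8, "LP64 ABI: long overloads map to the 64-bit PTX variants");
    #else
    static_assert(sizeof(long) == 4, "LLP64/32-bit ABI: long overloads map to the 32-bit PTX variants");
    #endif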
+#if defined(__LP64__) // 64 bits +__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) { asm ("st.global.cg.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) { asm ("st.global.cg.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +#else // 32 bits +__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) { asm ("st.global.cg.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) { asm ("st.global.cg.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +#endif + + +__SM_32_INTRINSICS_DECL__ void __stcg(char *ptr, char value) { asm ("st.global.cg.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(signed char *ptr, signed char value) { asm ("st.global.cg.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(short *ptr, short value) { asm ("st.global.cg.s16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(int *ptr, int value) { asm ("st.global.cg.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(long long *ptr, long long value) { asm ("st.global.cg.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.cg.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cg.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(short2 *ptr, short2 value) { asm ("st.global.cg.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(short4 *ptr, short4 value) { asm ("st.global.cg.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(int2 *ptr, int2 value) { asm ("st.global.cg.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(int4 *ptr, int4 value) { asm ("st.global.cg.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(longlong2 *ptr, longlong2 value) { asm ("st.global.cg.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); } + +__SM_32_INTRINSICS_DECL__ void __stcg(unsigned char *ptr, unsigned char value) { asm ("st.global.cg.u8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(unsigned short *ptr, unsigned short value) { asm ("st.global.cg.u16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(unsigned int *ptr, unsigned int value) { asm ("st.global.cg.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long long *ptr, unsigned long long value) { asm ("st.global.cg.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(uchar2 *ptr, uchar2 value) { const int x = value.x, y = 
value.y; asm ("st.global.cg.v2.u8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cg.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(ushort2 *ptr, ushort2 value) { asm ("st.global.cg.v2.u16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(ushort4 *ptr, ushort4 value) { asm ("st.global.cg.v4.u16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(uint2 *ptr, uint2 value) { asm ("st.global.cg.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(uint4 *ptr, uint4 value) { asm ("st.global.cg.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.cg.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); } + +__SM_32_INTRINSICS_DECL__ void __stcg(float *ptr, float value) { asm ("st.global.cg.f32 [%0], %1;" :: __LDG_PTR (ptr), "f"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(double *ptr, double value) { asm ("st.global.cg.f64 [%0], %1;" :: __LDG_PTR (ptr), "d"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(float2 *ptr, float2 value) { asm ("st.global.cg.v2.f32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(float4 *ptr, float4 value) { asm ("st.global.cg.v4.f32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcg(double2 *ptr, double2 value) { asm ("st.global.cg.v2.f64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); } + +/****************************************************************************** + * __stcs * + ******************************************************************************/ + +// Size of long is architecture and OS specific. 
+#if defined(__LP64__) // 64 bits +__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) { asm ("st.global.cs.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) { asm ("st.global.cs.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +#else // 32 bits +__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) { asm ("st.global.cs.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) { asm ("st.global.cs.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +#endif + + +__SM_32_INTRINSICS_DECL__ void __stcs(char *ptr, char value) { asm ("st.global.cs.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(signed char *ptr, signed char value) { asm ("st.global.cs.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(short *ptr, short value) { asm ("st.global.cs.s16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(int *ptr, int value) { asm ("st.global.cs.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(long long *ptr, long long value) { asm ("st.global.cs.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.cs.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cs.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(short2 *ptr, short2 value) { asm ("st.global.cs.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(short4 *ptr, short4 value) { asm ("st.global.cs.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(int2 *ptr, int2 value) { asm ("st.global.cs.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(int4 *ptr, int4 value) { asm ("st.global.cs.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(longlong2 *ptr, longlong2 value) { asm ("st.global.cs.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); } + +__SM_32_INTRINSICS_DECL__ void __stcs(unsigned char *ptr, unsigned char value) { asm ("st.global.cs.u8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(unsigned short *ptr, unsigned short value) { asm ("st.global.cs.u16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(unsigned int *ptr, unsigned int value) { asm ("st.global.cs.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long long *ptr, unsigned long long value) { asm ("st.global.cs.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(uchar2 *ptr, uchar2 value) { const int x = value.x, y = 
value.y; asm ("st.global.cs.v2.u8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cs.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(ushort2 *ptr, ushort2 value) { asm ("st.global.cs.v2.u16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(ushort4 *ptr, ushort4 value) { asm ("st.global.cs.v4.u16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(uint2 *ptr, uint2 value) { asm ("st.global.cs.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(uint4 *ptr, uint4 value) { asm ("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.cs.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); } + +__SM_32_INTRINSICS_DECL__ void __stcs(float *ptr, float value) { asm ("st.global.cs.f32 [%0], %1;" :: __LDG_PTR (ptr), "f"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(double *ptr, double value) { asm ("st.global.cs.f64 [%0], %1;" :: __LDG_PTR (ptr), "d"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(float2 *ptr, float2 value) { asm ("st.global.cs.v2.f32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(float4 *ptr, float4 value) { asm ("st.global.cs.v4.f32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stcs(double2 *ptr, double2 value) { asm ("st.global.cs.v2.f64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); } + +/****************************************************************************** + * __stwt * + ******************************************************************************/ + +// Size of long is architecture and OS specific. 
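[Editor's aside — illustrative sketch, not part of the NVIDIA header] The .cs ("cache streaming") operator used by the __ldcs and __stcs overloads above hints that the data is likely touched only once, so it can be evicted early instead of displacing reusable cache lines. A minimal sketch of how the pair is typically combined in a copy-through kernel (kernel name and launch pattern are assumptions, not from this header):

    __global__ void stream_copy(const float4 *in, float4 *out, size_t n)
    {
        // Grid-stride loop: every element is read and written exactly once,
        // so the streaming load/store keep it from lingering in cache.
        for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
             i += (size_t)gridDim.x * blockDim.x) {
            float4 v = __ldcs(&in[i]);   // ld.global.cs.v4.f32
            __stcs(&out[i], v);          // st.global.cs.v4.f32
        }
    }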
+#if defined(__LP64__) // 64 bits +__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) { asm ("st.global.wt.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) { asm ("st.global.wt.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +#else // 32 bits +__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) { asm ("st.global.wt.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) { asm ("st.global.wt.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +#endif + + +__SM_32_INTRINSICS_DECL__ void __stwt(char *ptr, char value) { asm ("st.global.wt.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(signed char *ptr, signed char value) { asm ("st.global.wt.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(short *ptr, short value) { asm ("st.global.wt.s16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(int *ptr, int value) { asm ("st.global.wt.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(long long *ptr, long long value) { asm ("st.global.wt.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.wt.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wt.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(short2 *ptr, short2 value) { asm ("st.global.wt.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(short4 *ptr, short4 value) { asm ("st.global.wt.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(int2 *ptr, int2 value) { asm ("st.global.wt.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(int4 *ptr, int4 value) { asm ("st.global.wt.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(longlong2 *ptr, longlong2 value) { asm ("st.global.wt.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); } + +__SM_32_INTRINSICS_DECL__ void __stwt(unsigned char *ptr, unsigned char value) { asm ("st.global.wt.u8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(unsigned short *ptr, unsigned short value) { asm ("st.global.wt.u16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(unsigned int *ptr, unsigned int value) { asm ("st.global.wt.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long long *ptr, unsigned long long value) { asm ("st.global.wt.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(uchar2 *ptr, uchar2 value) { const int x = value.x, y = 
value.y; asm ("st.global.wt.v2.u8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wt.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(ushort2 *ptr, ushort2 value) { asm ("st.global.wt.v2.u16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(ushort4 *ptr, ushort4 value) { asm ("st.global.wt.v4.u16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(uint2 *ptr, uint2 value) { asm ("st.global.wt.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(uint4 *ptr, uint4 value) { asm ("st.global.wt.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.wt.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); } + +__SM_32_INTRINSICS_DECL__ void __stwt(float *ptr, float value) { asm ("st.global.wt.f32 [%0], %1;" :: __LDG_PTR (ptr), "f"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(double *ptr, double value) { asm ("st.global.wt.f64 [%0], %1;" :: __LDG_PTR (ptr), "d"(value) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(float2 *ptr, float2 value) { asm ("st.global.wt.v2.f32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(float4 *ptr, float4 value) { asm ("st.global.wt.v4.f32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); } +__SM_32_INTRINSICS_DECL__ void __stwt(double2 *ptr, double2 value) { asm ("st.global.wt.v2.f64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); } + +#undef __LDG_PTR + + +// SHF is the "funnel shift" operation - an accelerated left/right shift with carry +// operating on 64-bit quantities, which are concatenations of two 32-bit registers. + +// This shifts [b:a] left by "shift" bits, returning the most significant bits of the result. +__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift) +{ + unsigned int ret; + asm volatile ("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); + return ret; +} +__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift) +{ + unsigned int ret; + asm volatile ("shf.l.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); + return ret; +} + +// This shifts [b:a] right by "shift" bits, returning the least significant bits of the result. 
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift) +{ + unsigned int ret; + asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); + return ret; +} +__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift) +{ + unsigned int ret; + asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); + return ret; +} + + +#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */ + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __SM_32_INTRINSICS_DECL__ + +#endif /* !__SM_32_INTRINSICS_HPP__ */ + diff --git a/ext/cudart/include/sm_35_atomic_functions.h b/ext/cudart/include/sm_35_atomic_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..c8961079aeac4c9e73a7c2825cf9ea10b171af09 --- /dev/null +++ b/ext/cudart/include/sm_35_atomic_functions.h @@ -0,0 +1,58 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice.
+ */ + +#if !defined(__SM_35_ATOMIC_FUNCTIONS_H__) +#define __SM_35_ATOMIC_FUNCTIONS_H__ + +/******************************************************************************* +* All sm_35 atomics are supported by sm_32 so simply include its header file * +*******************************************************************************/ +#include "sm_32_atomic_functions.h" + +#endif /* !__SM_35_ATOMIC_FUNCTIONS_H__ */ diff --git a/ext/cudart/include/sm_35_intrinsics.h b/ext/cudart/include/sm_35_intrinsics.h new file mode 100644 index 0000000000000000000000000000000000000000..da1e823a24171ed1ca9414955c6c68159a4411f5 --- /dev/null +++ b/ext/cudart/include/sm_35_intrinsics.h @@ -0,0 +1,116 @@ +/* + + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + + * + + * NOTICE TO LICENSEE: + + * + + * This source code and/or documentation ("Licensed Deliverables") are + + * subject to NVIDIA intellectual property rights under U.S. and + + * international Copyright laws. + + * + + * These Licensed Deliverables contained herein is PROPRIETARY and + + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + + * conditions of a form of NVIDIA software license agreement by and + + * between NVIDIA and Licensee ("License Agreement") or electronically + + * accepted by Licensee. Notwithstanding any terms or conditions to + + * the contrary in the License Agreement, reproduction or disclosure + + * of the Licensed Deliverables to any third party without the express + + * written consent of NVIDIA is prohibited. + + * + + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + + * OF THESE LICENSED DELIVERABLES. + + * + + * U.S. Government End Users. These Licensed Deliverables are a + + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + + * 1995), consisting of "commercial computer software" and "commercial + + * computer software documentation" as such terms are used in 48 + + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + + * U.S. Government End Users acquire the Licensed Deliverables with + + * only those rights set forth herein. + + * + + * Any use of the Licensed Deliverables in individual and commercial + + * software must include, in the user documentation and internal + + * comments to the code, the above Disclaimer and U.S. Government End + + * Users Notice. 
+ + */ + + + +#if !defined(__SM_35_INTRINSICS_H__) + +#define __SM_35_INTRINSICS_H__ + + + +/********************************************************************************** + +* All sm_35 intrinsics are supported by sm_32 so simply include its header file * + +**********************************************************************************/ + +#include "sm_32_intrinsics.h" + + + +#endif /* !__SM_35_INTRINSICS_H__ */ + diff --git a/ext/cudart/include/sm_60_atomic_functions.h b/ext/cudart/include/sm_60_atomic_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..4eae20ab38c6efe8af1fb20a1a3c7a3783ec6834 --- /dev/null +++ b/ext/cudart/include/sm_60_atomic_functions.h @@ -0,0 +1,539 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__SM_60_ATOMIC_FUNCTIONS_H__) +#define __SM_60_ATOMIC_FUNCTIONS_H__ + + +#if defined(__CUDACC_RTC__) +#define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__ +#else /* __CUDACC_RTC__ */ +#define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + + + +#ifdef __CUDA_ARCH__ +extern "C" +{ +extern __device__ __device_builtin__ double __dAtomicAdd(double *address, double val); + +extern __device__ __device_builtin__ +int __iAtomicAdd_block(int *address, int val); + +extern __device__ __device_builtin__ +int __iAtomicAdd_system(int *address, int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicAdd_block(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicAdd_system(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicAdd_block(unsigned long long *address, unsigned long long val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicAdd_system(unsigned long long *address, unsigned long long val); + +extern __device__ __device_builtin__ +float __fAtomicAdd_block(float *address, float val); + +extern __device__ __device_builtin__ +float __fAtomicAdd_system(float *address, float val); + +extern __device__ __device_builtin__ +double __dAtomicAdd_block(double *address, double val); + +extern __device__ __device_builtin__ +double __dAtomicAdd_system(double *address, double val); + +extern __device__ __device_builtin__ +int __iAtomicExch_block(int *address, int val); + +extern __device__ __device_builtin__ +int __iAtomicExch_system(int *address, int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicExch_block(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicExch_system(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicExch_block(unsigned long long *address, unsigned long long val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicExch_system(unsigned long long *address, unsigned long long val); + +extern __device__ __device_builtin__ +float __fAtomicExch_block(float *address, float val); + +extern __device__ __device_builtin__ +float __fAtomicExch_system(float *address, float val); + +extern __device__ __device_builtin__ +int __iAtomicMin_block(int *address, int val); + +extern __device__ __device_builtin__ +int __iAtomicMin_system(int *address, int val); + +extern __device__ __device_builtin__ +long long __illAtomicMin_block(long long *address, long long val); + +extern __device__ __device_builtin__ +long long __illAtomicMin_system(long long *address, long long val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicMin_block(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicMin_system(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicMin_block(unsigned long long 
*address, unsigned long long val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicMin_system(unsigned long long *address, unsigned long long val); + +extern __device__ __device_builtin__ +int __iAtomicMax_block(int *address, int val); + +extern __device__ __device_builtin__ +int __iAtomicMax_system(int *address, int val); + +extern __device__ __device_builtin__ +long long __illAtomicMax_block(long long *address, long long val); + +extern __device__ __device_builtin__ +long long __illAtomicMax_system(long long *address, long long val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicMax_block(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicMax_system(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicMax_block(unsigned long long *address, unsigned long long val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicMax_system(unsigned long long *address, unsigned long long val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicInc_block(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicInc_system(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicDec_block(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicDec_system(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +int __iAtomicCAS_block(int *address, int compare, int val); + +extern __device__ __device_builtin__ +int __iAtomicCAS_system(int *address, int compare, int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicCAS_block(unsigned int *address, unsigned int compare, + unsigned int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicCAS_system(unsigned int *address, unsigned int compare, + unsigned int val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicCAS_block(unsigned long long int *address, + unsigned long long int compare, + unsigned long long int val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicCAS_system(unsigned long long int *address, + unsigned long long int compare, + unsigned long long int val); + +extern __device__ __device_builtin__ +int __iAtomicAnd_block(int *address, int val); + +extern __device__ __device_builtin__ +int __iAtomicAnd_system(int *address, int val); + +extern __device__ __device_builtin__ +long long __llAtomicAnd_block(long long *address, long long val); + +extern __device__ __device_builtin__ +long long __llAtomicAnd_system(long long *address, long long val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicAnd_block(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicAnd_system(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicAnd_block(unsigned long long *address, unsigned long long val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicAnd_system(unsigned long long *address, unsigned long long val); + +extern __device__ __device_builtin__ +int __iAtomicOr_block(int *address, int val); + +extern __device__ __device_builtin__ +int __iAtomicOr_system(int *address, int val); + +extern __device__ __device_builtin__ +long long __llAtomicOr_block(long long *address, long long val); + +extern 
__device__ __device_builtin__ +long long __llAtomicOr_system(long long *address, long long val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicOr_block(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicOr_system(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicOr_block(unsigned long long *address, unsigned long long val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicOr_system(unsigned long long *address, unsigned long long val); + +extern __device__ __device_builtin__ +int __iAtomicXor_block(int *address, int val); + +extern __device__ __device_builtin__ +int __iAtomicXor_system(int *address, int val); + +extern __device__ __device_builtin__ +long long __llAtomicXor_block(long long *address, long long val); + +extern __device__ __device_builtin__ +long long __llAtomicXor_system(long long *address, long long val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicXor_block(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned int __uAtomicXor_system(unsigned int *address, unsigned int val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicXor_block(unsigned long long *address, unsigned long long val); + +extern __device__ __device_builtin__ +unsigned long long __ullAtomicXor_system(unsigned long long *address, unsigned long long val); +} +#endif /* __CUDA_ARCH__ */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicAdd_block(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicAdd_system(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicAdd_block(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicAdd_system(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +float atomicAdd_block(float *address, float val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +float atomicAdd_system(float *address, float val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +double atomicAdd_block(double *address, double val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +double atomicAdd_system(double *address, double val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicSub_block(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicSub_system(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicSub_block(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicSub_system(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicExch_block(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicExch_system(int *address, int val) __DEF_IF_HOST + 
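/*
 * [Editorial aside - illustrative sketch, not part of the NVIDIA header or of this patch.]
 * The _block/_system suffixed atomics declared in this file (sm_60 and newer) differ only in
 * scope: atomic*_block is ordered only with respect to threads in the same thread block, while
 * atomic*_system is ordered with respect to the whole system (other GPUs and the host, e.g.
 * through managed memory). A minimal, hedged usage sketch follows; the kernel and parameter
 * names (count_hits_sketch, per_block_hits, system_total) are hypothetical.
 */
__global__ void count_hits_sketch(const int *keys, int n,
                                  unsigned int *per_block_hits,  /* one counter per block */
                                  unsigned int *system_total)    /* e.g. cudaMallocManaged memory */
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n || keys[i] == 0)
        return;

    /* Block scope: cheaper ordering, sufficient because only this block updates its own counter. */
    atomicAdd_block(&per_block_hits[blockIdx.x], 1u);

    /* System scope: the host or another GPU may read the running total concurrently. */
    atomicAdd_system(system_total, 1u);
}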
+__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicExch_block(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicExch_system(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +float atomicExch_block(float *address, float val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +float atomicExch_system(float *address, float val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicMin_block(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicMin_system(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicMin_block(long long *address, long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicMin_system(long long *address, long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicMin_block(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicMin_system(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicMax_block(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicMax_system(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicMax_block(long long *address, long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicMax_system(long long *address, long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicMax_block(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicMax_system(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicInc_block(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicInc_system(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicDec_block(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicDec_system(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicCAS_block(int *address, int compare, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicCAS_system(int *address, int compare, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicCAS_block(unsigned int *address, unsigned int compare, + unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicCAS_system(unsigned int *address, unsigned int compare, + unsigned int val) 
__DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long int atomicCAS_block(unsigned long long int *address, + unsigned long long int compare, + unsigned long long int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long int atomicCAS_system(unsigned long long int *address, + unsigned long long int compare, + unsigned long long int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicAnd_block(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicAnd_system(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicAnd_block(long long *address, long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicAnd_system(long long *address, long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicAnd_block(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicAnd_system(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicOr_block(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicOr_system(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicOr_block(long long *address, long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicOr_system(long long *address, long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicOr_block(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicOr_system(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicXor_block(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicXor_system(int *address, int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicXor_block(long long *address, long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicXor_system(long long *address, long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicXor_block(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicXor_system(unsigned int *address, unsigned int val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST + +#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */ + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __SM_60_ATOMIC_FUNCTIONS_DECL__ +#undef __DEF_IF_HOST + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +#include "sm_60_atomic_functions.hpp" +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#endif /* !__SM_60_ATOMIC_FUNCTIONS_H__ */ + diff --git 
a/ext/cudart/include/sm_60_atomic_functions.hpp b/ext/cudart/include/sm_60_atomic_functions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..b4d5227023221116868e8446fdac23efb96e94ae --- /dev/null +++ b/ext/cudart/include/sm_60_atomic_functions.hpp @@ -0,0 +1,527 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__SM_60_ATOMIC_FUNCTIONS_HPP__) +#define __SM_60_ATOMIC_FUNCTIONS_HPP__ + +#if defined(__CUDACC_RTC__) +#define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__ +#else /* __CUDACC_RTC__ */ +#define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val) +{ + return __dAtomicAdd(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicAdd_block(int *address, int val) +{ + return __iAtomicAdd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicAdd_system(int *address, int val) +{ + return __iAtomicAdd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicAdd_block(unsigned int *address, unsigned int val) +{ + return __uAtomicAdd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicAdd_system(unsigned int *address, unsigned int val) +{ + return __uAtomicAdd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicAdd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicAdd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +float atomicAdd_block(float *address, float val) +{ + return __fAtomicAdd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +float atomicAdd_system(float *address, float val) +{ + return __fAtomicAdd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +double atomicAdd_block(double *address, double val) +{ + return __dAtomicAdd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +double atomicAdd_system(double *address, double val) +{ + return __dAtomicAdd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicSub_block(int *address, int val) +{ + return __iAtomicAdd_block(address, (unsigned int)-(int)val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicSub_system(int *address, int val) +{ + return __iAtomicAdd_system(address, (unsigned int)-(int)val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicSub_block(unsigned int *address, unsigned int val) +{ + return __uAtomicAdd_block(address, (unsigned int)-(int)val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicSub_system(unsigned int *address, unsigned int val) +{ + return __uAtomicAdd_system(address, (unsigned int)-(int)val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicExch_block(int *address, int val) +{ + return __iAtomicExch_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicExch_system(int *address, int val) +{ + return __iAtomicExch_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicExch_block(unsigned int *address, unsigned int val) +{ + return __uAtomicExch_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicExch_system(unsigned int *address, 
unsigned int val) +{ + return __uAtomicExch_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicExch_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicExch_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +float atomicExch_block(float *address, float val) +{ + return __fAtomicExch_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +float atomicExch_system(float *address, float val) +{ + return __fAtomicExch_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicMin_block(int *address, int val) +{ + return __iAtomicMin_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicMin_system(int *address, int val) +{ + return __iAtomicMin_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicMin_block(long long *address, long long val) +{ + return __illAtomicMin_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicMin_system(long long *address, long long val) +{ + return __illAtomicMin_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicMin_block(unsigned int *address, unsigned int val) +{ + return __uAtomicMin_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicMin_system(unsigned int *address, unsigned int val) +{ + return __uAtomicMin_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicMin_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicMin_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicMax_block(int *address, int val) +{ + return __iAtomicMax_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicMax_system(int *address, int val) +{ + return __iAtomicMax_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicMax_block(long long *address, long long val) +{ + return __illAtomicMax_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicMax_system(long long *address, long long val) +{ + return __illAtomicMax_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicMax_block(unsigned int *address, unsigned int val) +{ + return __uAtomicMax_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicMax_system(unsigned int *address, unsigned int val) +{ + return __uAtomicMax_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicMax_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicMax_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicInc_block(unsigned int *address, unsigned int val) +{ + return __uAtomicInc_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicInc_system(unsigned int *address, unsigned int val) +{ + return __uAtomicInc_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicDec_block(unsigned int *address, unsigned int 
val) +{ + return __uAtomicDec_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicDec_system(unsigned int *address, unsigned int val) +{ + return __uAtomicDec_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicCAS_block(int *address, int compare, int val) +{ + return __iAtomicCAS_block(address, compare, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicCAS_system(int *address, int compare, int val) +{ + return __iAtomicCAS_system(address, compare, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicCAS_block(unsigned int *address, unsigned int compare, + unsigned int val) +{ + return __uAtomicCAS_block(address, compare, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicCAS_system(unsigned int *address, unsigned int compare, + unsigned int val) +{ + return __uAtomicCAS_system(address, compare, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long int atomicCAS_block(unsigned long long int *address, + unsigned long long int compare, + unsigned long long int val) +{ + return __ullAtomicCAS_block(address, compare, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long int atomicCAS_system(unsigned long long int *address, + unsigned long long int compare, + unsigned long long int val) +{ + return __ullAtomicCAS_system(address, compare, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicAnd_block(int *address, int val) +{ + return __iAtomicAnd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicAnd_system(int *address, int val) +{ + return __iAtomicAnd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicAnd_block(long long *address, long long val) +{ + return __llAtomicAnd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicAnd_system(long long *address, long long val) +{ + return __llAtomicAnd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicAnd_block(unsigned int *address, unsigned int val) +{ + return __uAtomicAnd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicAnd_system(unsigned int *address, unsigned int val) +{ + return __uAtomicAnd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicAnd_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicAnd_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicOr_block(int *address, int val) +{ + return __iAtomicOr_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicOr_system(int *address, int val) +{ + return __iAtomicOr_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicOr_block(long long *address, long long val) +{ + return __llAtomicOr_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicOr_system(long long *address, long long val) +{ + return __llAtomicOr_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicOr_block(unsigned int *address, unsigned int val) +{ + return __uAtomicOr_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicOr_system(unsigned int *address, unsigned int val) +{ + return __uAtomicOr_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val) +{ 
+ return __ullAtomicOr_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicOr_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicXor_block(int *address, int val) +{ + return __iAtomicXor_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +int atomicXor_system(int *address, int val) +{ + return __iAtomicXor_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicXor_block(long long *address, long long val) +{ + return __llAtomicXor_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +long long atomicXor_system(long long *address, long long val) +{ + return __llAtomicXor_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicXor_block(unsigned int *address, unsigned int val) +{ + return __uAtomicXor_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned int atomicXor_system(unsigned int *address, unsigned int val) +{ + return __uAtomicXor_system(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicXor_block(address, val); +} + +__SM_60_ATOMIC_FUNCTIONS_DECL__ +unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val) +{ + return __ullAtomicXor_system(address, val); +} + +#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */ + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __SM_60_ATOMIC_FUNCTIONS_DECL__ + +#endif /* !__SM_60_ATOMIC_FUNCTIONS_HPP__ */ + diff --git a/ext/cudart/include/sm_61_intrinsics.h b/ext/cudart/include/sm_61_intrinsics.h new file mode 100644 index 0000000000000000000000000000000000000000..cf2dce47bb458fc3c093d710b78a72e582bc0fdd --- /dev/null +++ b/ext/cudart/include/sm_61_intrinsics.h @@ -0,0 +1,123 @@ +/* + * Copyright 2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. 
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__SM_61_INTRINSICS_H__) +#define __SM_61_INTRINSICS_H__ + +#if defined(__CUDACC_RTC__) +#define __SM_61_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610 + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +#ifndef __CUDA_ARCH__ +#define __DEF_IF_HOST { } +#else /* !__CUDA_ARCH__ */ +#define __DEF_IF_HOST ; +#endif /* __CUDA_ARCH__ */ + +/******************************************************************************* +* * +* Below are declarations of SM-6.1 intrinsics which are included as * +* source (instead of being built in to the compiler) * +* * +*******************************************************************************/ + + +/****************************************************************************** + * __dp2a * + ******************************************************************************/ +// Generic [_lo] +__SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST +// Vector-style [_lo] +__SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST +// Generic [_hi] +__SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST +// Vector-style [_hi] +__SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST + + +/****************************************************************************** + * __dp4a * + ******************************************************************************/ +// Generic 
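/*
 * [Editorial aside - illustrative sketch, not part of the NVIDIA header or of this patch.]
 * A hedged usage sketch of the vector-style __dp2a overloads declared just above (sm_61 and
 * newer): __dp2a_lo multiplies the two 16-bit lanes of srcA with the low two 8-bit lanes of
 * srcB and adds both products to the 32-bit accumulator; __dp2a_hi does the same with the
 * high two lanes of srcB. The generic and vector-style __dp4a overloads declared below work
 * analogously with four 8-bit lanes from each operand. Kernel and parameter names here are
 * hypothetical.
 */
__global__ void mixed_dot_sketch(const short2 *a_lo, const short2 *a_hi,
                                 const char4 *b, int n, int *result)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        /* acc = a_lo.x*b.x + a_lo.y*b.y + a_hi.x*b.z + a_hi.y*b.w  (int16 x int8 -> int32) */
        int acc = __dp2a_lo(a_lo[i], b[i], 0);
        acc     = __dp2a_hi(a_hi[i], b[i], acc);
        atomicAdd(result, acc);  /* plain atomicAdd(int*, int) is a CUDA built-in */
    }
}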
+__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) __DEF_IF_HOST +// Vector-style +__SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) __DEF_IF_HOST +__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) __DEF_IF_HOST + +#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */ + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __DEF_IF_HOST +#undef __SM_61_INTRINSICS_DECL__ + +#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__) +#include "sm_61_intrinsics.hpp" +#endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */ + +#endif /* !__SM_61_INTRINSICS_H__ */ diff --git a/ext/cudart/include/sm_61_intrinsics.hpp b/ext/cudart/include/sm_61_intrinsics.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f603d04ca4aa7bfef97539fb404150b81e490d67 --- /dev/null +++ b/ext/cudart/include/sm_61_intrinsics.hpp @@ -0,0 +1,161 @@ +/* + * Copyright 2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. 
Government End + * Users Notice. + */ + +#if !defined(__SM_61_INTRINSICS_HPP__) +#define __SM_61_INTRINSICS_HPP__ + +#if defined(__CUDACC_RTC__) +#define __SM_61_INTRINSICS_DECL__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__ +#endif /* __CUDACC_RTC__ */ + +#if defined(__cplusplus) && defined(__CUDACC__) + +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610 + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +/******************************************************************************* +* * +* Below are implementations of SM-6.1 intrinsics which are included as * +* source (instead of being built in to the compiler) * +* * +*******************************************************************************/ + +// 4a +__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) { + int ret; + asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) { + unsigned int ret; + asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) { + int ret; + asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) { + unsigned int ret; + asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c)); + return ret; +} + +// 2a.lo +__SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) { + int ret; + asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) { + unsigned int ret; + asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) { + int ret; + asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) { + unsigned int ret; + asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c)); + return ret; +} + +// 2a.hi +__SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) { + int ret; + asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) { + unsigned int ret; + asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) { + int ret; + asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c)); + return ret; +} + +__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) { + unsigned int ret; + asm 
volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c)); + return ret; +} + + +#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */ + +#endif /* __cplusplus && __CUDACC__ */ + +#undef __SM_61_INTRINSICS_DECL__ + +#endif /* !__SM_61_INTRINSICS_HPP__ */ + diff --git a/ext/cudart/include/surface_functions.h b/ext/cudart/include/surface_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..587a995d0ea8a697b028706e824f9437276401e6 --- /dev/null +++ b/ext/cudart/include/surface_functions.h @@ -0,0 +1,439 @@ +/* + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__SURFACE_FUNCTIONS_H__) +#define __SURFACE_FUNCTIONS_H__ + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" +#include "cuda_surface_types.h" + +#if defined(_WIN32) +# define __DEPRECATED__ __declspec(deprecated) +#else +# define __DEPRECATED__ __attribute__((deprecated)) +#endif + + + +#ifdef __CUDA_ARCH__ +template <typename T> struct __nv_surf_trait { typedef void * cast_type; }; + +template<> struct __nv_surf_trait<char> { typedef char * cast_type; }; +template<> struct __nv_surf_trait<signed char> { typedef signed char * cast_type; }; +template<> struct __nv_surf_trait<unsigned char> { typedef unsigned char * cast_type; }; +template<> struct __nv_surf_trait<char1> { typedef char1 * cast_type; }; +template<> struct __nv_surf_trait<uchar1> { typedef uchar1 * cast_type; }; +template<> struct __nv_surf_trait<char2> { typedef char2 * cast_type; }; +template<> struct __nv_surf_trait<uchar2> { typedef uchar2 * cast_type; }; +template<> struct __nv_surf_trait<char4> { typedef char4 * cast_type; }; +template<> struct __nv_surf_trait<uchar4> { typedef uchar4 * cast_type; }; +template<> struct __nv_surf_trait<short> { typedef short * cast_type; }; +template<> struct __nv_surf_trait<unsigned short> { typedef unsigned short * cast_type; }; +template<> struct __nv_surf_trait<short1> { typedef short1 * cast_type; }; +template<> struct __nv_surf_trait<ushort1> { typedef ushort1 * cast_type; }; +template<> struct __nv_surf_trait<short2> { typedef short2 * cast_type; }; +template<> struct __nv_surf_trait<ushort2> { typedef ushort2 * cast_type; }; +template<> struct __nv_surf_trait<short4> { typedef short4 * cast_type; }; +template<> struct __nv_surf_trait<ushort4> { typedef ushort4 * cast_type; }; +template<> struct __nv_surf_trait<int> { typedef int * cast_type; }; +template<> struct __nv_surf_trait<unsigned int> { typedef unsigned int * cast_type; }; +template<> struct __nv_surf_trait<int1> { typedef int1 * cast_type; }; +template<> struct __nv_surf_trait<uint1> { typedef uint1 * cast_type; }; +template<> struct __nv_surf_trait<int2> { typedef int2 * cast_type; }; +template<> struct __nv_surf_trait<uint2> { typedef uint2 * cast_type; }; +template<> struct __nv_surf_trait<int4> { typedef int4 * cast_type; }; +template<> struct __nv_surf_trait<uint4> { typedef uint4 * cast_type; }; +template<> struct __nv_surf_trait<long long> { typedef long long * cast_type; }; +template<> struct __nv_surf_trait<unsigned long long> { typedef unsigned long long * cast_type; }; +template<> struct __nv_surf_trait<longlong1> { typedef longlong1 * cast_type; }; +template<> struct __nv_surf_trait<ulonglong1> { typedef ulonglong1 * cast_type; }; +template<> struct __nv_surf_trait<longlong2> { typedef longlong2 * cast_type; }; +template<> struct __nv_surf_trait<ulonglong2> { typedef ulonglong2 * cast_type; }; +#if !defined(__LP64__) +template<> struct __nv_surf_trait<long> { typedef int * cast_type; }; +template<> struct __nv_surf_trait<unsigned long> { typedef unsigned int * cast_type; }; +template<> struct __nv_surf_trait<long1> { typedef int1 * cast_type; }; +template<> struct __nv_surf_trait<ulong1> { typedef uint1 * cast_type; }; +template<> struct __nv_surf_trait<long2> { typedef int2 * cast_type; }; +template<> struct __nv_surf_trait<ulong2> { typedef uint2 * cast_type; }; +template<> 
struct __nv_surf_trait<long4> { typedef uint4 * cast_type; }; +template<> struct __nv_surf_trait<ulong4> { typedef int4 * cast_type; }; +#endif +template<> struct __nv_surf_trait<float> { typedef float * cast_type; }; +template<> struct __nv_surf_trait<float1> { typedef float1 * cast_type; }; +template<> struct __nv_surf_trait<float2> { typedef float2 * cast_type; }; +template<> struct __nv_surf_trait<float4> { typedef float4 * cast_type; }; +#endif /* defined(__CUDA_ARCH__) */ + +template <typename T> +static __DEPRECATED__ __device__ __forceinline__ void surf1Dread(T *res, surface<void, cudaSurfaceType1D> surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf1Dread_v2", (void *)res, s, surf, x, mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ T surf1Dread(surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + __nv_tex_surf_handler("__surf1Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, mode); + return temp; +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf1Dread(T *res, surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surf1Dread<T>(surf, x, mode); +#endif /* __CUDA_ARCH__ */ +} + + +template <typename T> +static __DEPRECATED__ __device__ __forceinline__ void surf2Dread(T *res, surface<void, cudaSurfaceType2D> surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf2Dread_v2", (void *)res, s, surf, x, y, mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ T surf2Dread(surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + __nv_tex_surf_handler("__surf2Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, mode); + return temp; +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf2Dread(T *res, surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surf2Dread<T>(surf, x, y, mode); +#endif /* __CUDA_ARCH__ */ +} + + +template <typename T> +static __DEPRECATED__ __device__ __forceinline__ void surf3Dread(T *res, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf3Dread_v2", (void *)res, s, surf, x, y, z, mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ T surf3Dread(surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + __nv_tex_surf_handler("__surf3Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, z, mode); + return temp; +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf3Dread(T *res, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surf3Dread<T>(surf, x, y, z, mode); +#endif /* __CUDA_ARCH__ */ +} + + + +template 
<typename T> +static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredread(T *res, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf1DLayeredread_v2", (void *)res, s, surf, x, layer, mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ T surf1DLayeredread(surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + __nv_tex_surf_handler("__surf1DLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, layer, mode); + return temp; +#endif +} + + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredread(T *res, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surf1DLayeredread<T>(surf, x, layer, mode); +#endif /* __CUDA_ARCH__ */ +} + + +template <typename T> +static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredread(T *res, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf2DLayeredread_v2", (void *)res, s, surf, x, y, layer, mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ T surf2DLayeredread(surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + __nv_tex_surf_handler("__surf2DLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, layer, mode); + return temp; +#endif +} + + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredread(T *res, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surf2DLayeredread<T>(surf, x, y, layer, mode); +#endif /* __CUDA_ARCH__ */ +} + + +template <typename T> +static __device__ __forceinline__ void surfCubemapread(T *res, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surfCubemapread_v2", (void *)res, s, surf, x, y, face, mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ T surfCubemapread(surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + + __nv_tex_surf_handler("__surfCubemapread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, face, mode); + return temp; +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapread(T *res, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surfCubemapread<T>(surf, x, y, face, mode); +#endif /* __CUDA_ARCH__ */ +} + + +template <typename T> +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredread(T *res, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, int s, enum 
cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surfCubemapLayeredread_v2", (void *)res, s, surf, x, y, layerFace, mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ T surfCubemapLayeredread(surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T temp; + __nv_tex_surf_handler("__surfCubemapLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, layerFace, mode); + return temp; +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredread(T *res, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + *res = surfCubemapLayeredread<T>(surf, x, y, layerFace, mode); +#endif /* __CUDA_ARCH__ */ +} + +//surf1Dwrite +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface<void, cudaSurfaceType1D> surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf1Dwrite_v2", (void *)&val, s, surf, x, mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf1Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, mode); +#endif /* __CUDA_ARCH__ */ +} + + +//surf2Dwrite +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface<void, cudaSurfaceType2D> surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf2Dwrite_v2", (void *)&val, s, surf, x, y, mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf2Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, mode); +#endif /* __CUDA_ARCH__ */ +} + +//surf3Dwrite +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf3Dwrite_v2", (void *)&val, s, surf, x, y, z,mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf3Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, z, mode); +#endif /* __CUDA_ARCH__ */ +} + +//surf1DLayeredwrite +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf1DLayeredwrite_v2", (void *)&val, s, surf, x, 
layer,mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf1DLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, layer, mode); +#endif /* __CUDA_ARCH__ */ +} + +//surf2DLayeredwrite +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf2DLayeredwrite_v2", (void *)&val, s, surf, x, y, layer,mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surf2DLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, layer, mode); +#endif /* __CUDA_ARCH__ */ +} + +//surfCubemapwrite +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surfCubemapwrite_v2", (void *)&val, s, surf, x, y, face, mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surfCubemapwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, face, mode); +#endif /* __CUDA_ARCH__ */ +} + + +//surfCubemapLayeredwrite +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (void *)&val, s, surf, x, y, layerFace, mode); +#endif +} + +template<class T> +static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, layerFace, mode); +#endif /* __CUDA_ARCH__ */ +} + +#undef __DEPRECATED__ + + +#endif /* __cplusplus && __CUDACC__ */ +#endif /* !__SURFACE_FUNCTIONS_H__ */ diff --git a/ext/cudart/include/surface_indirect_functions.h b/ext/cudart/include/surface_indirect_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..7d5c4b641e32fbff81ba639460cee3c9d517a0c5 --- /dev/null +++ b/ext/cudart/include/surface_indirect_functions.h @@ -0,0 +1,286 @@ +/* + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. 
+ * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + + +#ifndef __SURFACE_INDIRECT_FUNCTIONS_H__ +#define __SURFACE_INDIRECT_FUNCTIONS_H__ + + +#if defined(__cplusplus) && defined(__CUDACC__) + +#include "cuda_runtime_api.h" + +template<typename T> struct __nv_isurf_trait { }; +template<> struct __nv_isurf_trait<char> { typedef void type; }; +template<> struct __nv_isurf_trait<signed char> { typedef void type; }; +template<> struct __nv_isurf_trait<char1> { typedef void type; }; +template<> struct __nv_isurf_trait<unsigned char> { typedef void type; }; +template<> struct __nv_isurf_trait<uchar1> { typedef void type; }; +template<> struct __nv_isurf_trait<short> { typedef void type; }; +template<> struct __nv_isurf_trait<short1> { typedef void type; }; +template<> struct __nv_isurf_trait<unsigned short> { typedef void type; }; +template<> struct __nv_isurf_trait<ushort1> { typedef void type; }; +template<> struct __nv_isurf_trait<int> { typedef void type; }; +template<> struct __nv_isurf_trait<int1> { typedef void type; }; +template<> struct __nv_isurf_trait<unsigned int> { typedef void type; }; +template<> struct __nv_isurf_trait<uint1> { typedef void type; }; +template<> struct __nv_isurf_trait<long long> { typedef void type; }; +template<> struct __nv_isurf_trait<longlong1> { typedef void type; }; +template<> struct __nv_isurf_trait<unsigned long long> { typedef void type; }; +template<> struct __nv_isurf_trait<ulonglong1> { typedef void type; }; +template<> struct __nv_isurf_trait<float> { typedef void type; }; +template<> struct __nv_isurf_trait<float1> { typedef void type; }; + +template<> struct __nv_isurf_trait<char2> { typedef void type; }; +template<> struct __nv_isurf_trait<uchar2> { typedef void type; }; +template<> struct __nv_isurf_trait<short2> { typedef void type; }; +template<> struct __nv_isurf_trait<ushort2> { typedef void type; }; +template<> struct __nv_isurf_trait<int2> { typedef void type; }; +template<> struct __nv_isurf_trait<uint2> { typedef void type; }; +template<> struct __nv_isurf_trait<longlong2> { typedef void type; }; +template<> struct __nv_isurf_trait<ulonglong2> { typedef void type; }; +template<> struct __nv_isurf_trait<float2> { typedef void type; }; + +template<> struct __nv_isurf_trait<char4> { typedef void type; }; +template<> struct __nv_isurf_trait<uchar4> { typedef void type; }; +template<> struct __nv_isurf_trait<short4> { typedef void type; }; +template<> struct __nv_isurf_trait<ushort4> { typedef void type; }; +template<> struct __nv_isurf_trait<int4> { typedef void type; }; +template<> struct __nv_isurf_trait<uint4> { typedef void type; }; +template<> struct __nv_isurf_trait<float4> { typedef void type; }; + + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surf1Dread(T *ptr, cudaSurfaceObject_t obj, int x, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurf1Dread", ptr, obj, x, mode); +#endif /* __CUDA_ARCH__ */ +} + +template <class T> +static __device__ T surf1Dread(cudaSurfaceObject_t surfObject, int x, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T ret; + surf1Dread(&ret, surfObject, x, boundaryMode); + return ret; +#endif /* __CUDA_ARCH__ */ +} + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surf2Dread(T *ptr, cudaSurfaceObject_t obj, int x, int y, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurf2Dread", ptr, obj, x, y, mode); +#endif /* __CUDA_ARCH__ */ +} + 
+template <class T> +static __device__ T surf2Dread(cudaSurfaceObject_t surfObject, int x, int y, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T ret; + surf2Dread(&ret, surfObject, x, y, boundaryMode); + return ret; +#endif /* __CUDA_ARCH__ */ +} + + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surf3Dread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int z, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurf3Dread", ptr, obj, x, y, z, mode); +#endif /* __CUDA_ARCH__ */ +} + +template <class T> +static __device__ T surf3Dread(cudaSurfaceObject_t surfObject, int x, int y, int z, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T ret; + surf3Dread(&ret, surfObject, x, y, z, boundaryMode); + return ret; +#endif /* __CUDA_ARCH__ */ +} + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surf1DLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurf1DLayeredread", ptr, obj, x, layer, mode); +#endif /* __CUDA_ARCH__ */ +} + +template <class T> +static __device__ T surf1DLayeredread(cudaSurfaceObject_t surfObject, int x, int layer, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T ret; + surf1DLayeredread(&ret, surfObject, x, layer, boundaryMode); + return ret; +#endif /* __CUDA_ARCH__ */ +} + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surf2DLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurf2DLayeredread", ptr, obj, x, y, layer, mode); +#endif /* __CUDA_ARCH__ */ +} + +template <class T> +static __device__ T surf2DLayeredread(cudaSurfaceObject_t surfObject, int x, int y, int layer, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T ret; + surf2DLayeredread(&ret, surfObject, x, y, layer, boundaryMode); + return ret; +#endif /* __CUDA_ARCH__ */ +} + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surfCubemapread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int face, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurfCubemapread", ptr, obj, x, y, face, mode); +#endif /* __CUDA_ARCH__ */ +} + +template <class T> +static __device__ T surfCubemapread(cudaSurfaceObject_t surfObject, int x, int y, int face, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T ret; + surfCubemapread(&ret, surfObject, x, y, face, boundaryMode); + return ret; +#endif /* __CUDA_ARCH__ */ +} + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surfCubemapLayeredread(T *ptr, cudaSurfaceObject_t obj, int x, int y, int layerface, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurfCubemapLayeredread", ptr, obj, x, y, layerface, mode); +#endif /* __CUDA_ARCH__ */ +} + +template <class T> +static __device__ T surfCubemapLayeredread(cudaSurfaceObject_t surfObject, int x, int y, int layerface, cudaSurfaceBoundaryMode boundaryMode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + T ret; + surfCubemapLayeredread(&ret, surfObject, x, y, layerface, boundaryMode); + return ret; 
+#endif /* __CUDA_ARCH__ */ +} + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surf1Dwrite(T val, cudaSurfaceObject_t obj, int x, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurf1Dwrite_v2", &val, obj, x, mode); +#endif /* __CUDA_ARCH__ */ +} + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surf2Dwrite(T val, cudaSurfaceObject_t obj, int x, int y, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurf2Dwrite_v2", &val, obj, x, y, mode); +#endif /* __CUDA_ARCH__ */ +} + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surf3Dwrite(T val, cudaSurfaceObject_t obj, int x, int y, int z, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurf3Dwrite_v2", &val, obj, x, y, z, mode); +#endif /* __CUDA_ARCH__ */ +} + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surf1DLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurf1DLayeredwrite_v2", &val, obj, x, layer, mode); +#endif /* __CUDA_ARCH__ */ +} + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surf2DLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int y, int layer, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurf2DLayeredwrite_v2", &val, obj, x, y, layer, mode); +#endif /* __CUDA_ARCH__ */ +} + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surfCubemapwrite(T val, cudaSurfaceObject_t obj, int x, int y, int face, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurfCubemapwrite_v2", &val, obj, x, y, face, mode); +#endif /* __CUDA_ARCH__ */ +} + +template <typename T> +static __device__ typename __nv_isurf_trait<T>::type surfCubemapLayeredwrite(T val, cudaSurfaceObject_t obj, int x, int y, int layerface, cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__isurfCubemapLayeredwrite_v2", &val, obj, x, y, layerface, mode); +#endif /* __CUDA_ARCH__ */ +} + +#endif // __cplusplus && __CUDACC__ + +#endif // __SURFACE_INDIRECT_FUNCTIONS_H__ + + diff --git a/ext/cudart/include/surface_types.h b/ext/cudart/include/surface_types.h new file mode 100644 index 0000000000000000000000000000000000000000..95ff57ca1bbaa517ade86424a9c5dbe8a2a4b8ee --- /dev/null +++ b/ext/cudart/include/surface_types.h @@ -0,0 +1,119 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. 
+ * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__SURFACE_TYPES_H__) +#define __SURFACE_TYPES_H__ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "driver_types.h" + +/** + * \addtogroup CUDART_TYPES + * + * @{ + */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#define cudaSurfaceType1D 0x01 +#define cudaSurfaceType2D 0x02 +#define cudaSurfaceType3D 0x03 +#define cudaSurfaceTypeCubemap 0x0C +#define cudaSurfaceType1DLayered 0xF1 +#define cudaSurfaceType2DLayered 0xF2 +#define cudaSurfaceTypeCubemapLayered 0xFC + +/** + * CUDA Surface boundary modes + */ +enum __device_builtin__ cudaSurfaceBoundaryMode +{ + cudaBoundaryModeZero = 0, /**< Zero boundary mode */ + cudaBoundaryModeClamp = 1, /**< Clamp boundary mode */ + cudaBoundaryModeTrap = 2 /**< Trap boundary mode */ +}; + +/** + * CUDA Surface format modes + */ +enum __device_builtin__ cudaSurfaceFormatMode +{ + cudaFormatModeForced = 0, /**< Forced format mode */ + cudaFormatModeAuto = 1 /**< Auto format mode */ +}; + +/** + * CUDA Surface reference + */ +struct __device_builtin__ surfaceReference +{ + /** + * Channel descriptor for surface reference + */ + struct cudaChannelFormatDesc channelDesc; +}; + +/** + * An opaque value that represents a CUDA Surface object + */ +typedef __device_builtin__ unsigned long long cudaSurfaceObject_t; + +/** @} */ +/** @} */ /* END CUDART_TYPES */ + +#endif /* !__SURFACE_TYPES_H__ */ diff --git a/ext/cudart/include/texture_fetch_functions.h b/ext/cudart/include/texture_fetch_functions.h new file mode 100644 index 
0000000000000000000000000000000000000000..ad970aea7a04023822ba01515a10c6e83c4d7def --- /dev/null +++ b/ext/cudart/include/texture_fetch_functions.h @@ -0,0 +1,739 @@ +/* + * Copyright 1993-2017 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__TEXTURE_FETCH_FUNCTIONS_H__) +#define __TEXTURE_FETCH_FUNCTIONS_H__ + + +#if defined(__cplusplus) && defined(__CUDACC__) + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" +#include "cuda_texture_types.h" + +#if defined(_WIN32) +# define __DEPRECATED__ __declspec(deprecated) +#else +# define __DEPRECATED__ __attribute__((deprecated)) +#endif + + +template <typename T> +struct __nv_tex_rmet_ret { }; + +template<> struct __nv_tex_rmet_ret<char> { typedef char type; }; +template<> struct __nv_tex_rmet_ret<signed char> { typedef signed char type; }; +template<> struct __nv_tex_rmet_ret<unsigned char> { typedef unsigned char type; }; +template<> struct __nv_tex_rmet_ret<char1> { typedef char1 type; }; +template<> struct __nv_tex_rmet_ret<uchar1> { typedef uchar1 type; }; +template<> struct __nv_tex_rmet_ret<char2> { typedef char2 type; }; +template<> struct __nv_tex_rmet_ret<uchar2> { typedef uchar2 type; }; +template<> struct __nv_tex_rmet_ret<char4> { typedef char4 type; }; +template<> struct __nv_tex_rmet_ret<uchar4> { typedef uchar4 type; }; + +template<> struct __nv_tex_rmet_ret<short> { typedef short type; }; +template<> struct __nv_tex_rmet_ret<unsigned short> { typedef unsigned short type; }; +template<> struct __nv_tex_rmet_ret<short1> { typedef short1 type; }; +template<> struct __nv_tex_rmet_ret<ushort1> { typedef ushort1 type; }; +template<> struct __nv_tex_rmet_ret<short2> { typedef short2 type; }; +template<> struct __nv_tex_rmet_ret<ushort2> { typedef ushort2 type; }; +template<> struct __nv_tex_rmet_ret<short4> { typedef short4 type; }; +template<> struct __nv_tex_rmet_ret<ushort4> { typedef ushort4 type; }; + +template<> struct __nv_tex_rmet_ret<int> { typedef int type; }; +template<> struct __nv_tex_rmet_ret<unsigned int> { typedef unsigned int type; }; +template<> struct __nv_tex_rmet_ret<int1> { typedef int1 type; }; +template<> struct __nv_tex_rmet_ret<uint1> { typedef uint1 type; }; +template<> struct __nv_tex_rmet_ret<int2> { typedef int2 type; }; +template<> struct __nv_tex_rmet_ret<uint2> { typedef uint2 type; }; +template<> struct __nv_tex_rmet_ret<int4> { typedef int4 type; }; +template<> struct __nv_tex_rmet_ret<uint4> { typedef uint4 type; }; + +#if !defined(__LP64__) +template<> struct __nv_tex_rmet_ret<long> { typedef long type; }; +template<> struct __nv_tex_rmet_ret<unsigned long> { typedef unsigned long type; }; +template<> struct __nv_tex_rmet_ret<long1> { typedef long1 type; }; +template<> struct __nv_tex_rmet_ret<ulong1> { typedef ulong1 type; }; +template<> struct __nv_tex_rmet_ret<long2> { typedef long2 type; }; +template<> struct __nv_tex_rmet_ret<ulong2> { typedef ulong2 type; }; +template<> struct __nv_tex_rmet_ret<long4> { typedef long4 type; }; +template<> struct __nv_tex_rmet_ret<ulong4> { typedef ulong4 type; }; +#endif /* !__LP64__ */ +template<> struct __nv_tex_rmet_ret<float> { typedef float type; }; +template<> struct __nv_tex_rmet_ret<float1> { typedef float1 type; }; +template<> struct __nv_tex_rmet_ret<float2> { typedef float2 type; }; +template<> struct __nv_tex_rmet_ret<float4> { typedef float4 type; }; + + +template <typename T> struct __nv_tex_rmet_cast { typedef T* type; }; +#if !defined(__LP64__) +template<> struct __nv_tex_rmet_cast<long> { typedef int *type; }; +template<> struct __nv_tex_rmet_cast<unsigned long> { typedef unsigned int *type; }; 
+template<> struct __nv_tex_rmet_cast<long1> { typedef int1 *type; }; +template<> struct __nv_tex_rmet_cast<ulong1> { typedef uint1 *type; }; +template<> struct __nv_tex_rmet_cast<long2> { typedef int2 *type; }; +template<> struct __nv_tex_rmet_cast<ulong2> { typedef uint2 *type; }; +template<> struct __nv_tex_rmet_cast<long4> { typedef int4 *type; }; +template<> struct __nv_tex_rmet_cast<ulong4> { typedef uint4 *type; }; +#endif /* !__LP64__ */ + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1Dfetch(texture<T, cudaTextureType1D, cudaReadModeElementType> t, int x) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex1Dfetch_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x); + return temp; +#endif +} + +template <typename T> +struct __nv_tex_rmnf_ret { }; + +template <> struct __nv_tex_rmnf_ret<char> { typedef float type; }; +template <> struct __nv_tex_rmnf_ret<signed char> { typedef float type; }; +template <> struct __nv_tex_rmnf_ret<unsigned char> { typedef float type; }; +template <> struct __nv_tex_rmnf_ret<short> { typedef float type; }; +template <> struct __nv_tex_rmnf_ret<unsigned short> { typedef float type; }; +template <> struct __nv_tex_rmnf_ret<char1> { typedef float1 type; }; +template <> struct __nv_tex_rmnf_ret<uchar1> { typedef float1 type; }; +template <> struct __nv_tex_rmnf_ret<short1> { typedef float1 type; }; +template <> struct __nv_tex_rmnf_ret<ushort1> { typedef float1 type; }; +template <> struct __nv_tex_rmnf_ret<char2> { typedef float2 type; }; +template <> struct __nv_tex_rmnf_ret<uchar2> { typedef float2 type; }; +template <> struct __nv_tex_rmnf_ret<short2> { typedef float2 type; }; +template <> struct __nv_tex_rmnf_ret<ushort2> { typedef float2 type; }; +template <> struct __nv_tex_rmnf_ret<char4> { typedef float4 type; }; +template <> struct __nv_tex_rmnf_ret<uchar4> { typedef float4 type; }; +template <> struct __nv_tex_rmnf_ret<short4> { typedef float4 type; }; +template <> struct __nv_tex_rmnf_ret<ushort4> { typedef float4 type; }; + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1Dfetch(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, int x) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex1Dfetch_rmnf_v2", &type_dummy, &retval, t, x); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex1D +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1D(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex1D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1D(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex1D_rmnf_v2", &type_dummy, &retval, t, x); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +//tex2D +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2D(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y) +{ +#ifdef __CUDA_ARCH__ + typename 
__nv_tex_rmet_ret<T>::type temp; + + __nv_tex_surf_handler("__tex2D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2D(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex2D_rmnf_v2", &type_dummy, &retval, t, x, y); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +//tex1DLayered +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayered(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex1DLayered_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, layer); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayered(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex1DLayered_rmnf_v2", &type_dummy, &retval, t, x, layer); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +//tex2DLayered +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayered(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex2DLayered_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, layer); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayered(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex2DLayered_rmnf_v2", &type_dummy, &retval, t, x, y, layer); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex3D +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3D(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex3D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, z); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3D(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex3D_rmnf_v2", &type_dummy, &retval, t, x, y, z); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// texCubemap +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemap(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__texCubemap_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, z); + return temp; +#endif +} 
+ +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemap(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__texCubemap_rmnf_v2", &type_dummy, &retval, t, x, y, z); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +template <typename T> +struct __nv_tex2dgather_ret { }; +template <> struct __nv_tex2dgather_ret<char> { typedef char4 type; }; +template <> struct __nv_tex2dgather_ret<signed char> { typedef char4 type; }; +template <> struct __nv_tex2dgather_ret<char1> { typedef char4 type; }; +template <> struct __nv_tex2dgather_ret<char2> { typedef char4 type; }; +template <> struct __nv_tex2dgather_ret<char3> { typedef char4 type; }; +template <> struct __nv_tex2dgather_ret<char4> { typedef char4 type; }; +template <> struct __nv_tex2dgather_ret<unsigned char> { typedef uchar4 type; }; +template <> struct __nv_tex2dgather_ret<uchar1> { typedef uchar4 type; }; +template <> struct __nv_tex2dgather_ret<uchar2> { typedef uchar4 type; }; +template <> struct __nv_tex2dgather_ret<uchar3> { typedef uchar4 type; }; +template <> struct __nv_tex2dgather_ret<uchar4> { typedef uchar4 type; }; + +template <> struct __nv_tex2dgather_ret<short> { typedef short4 type; }; +template <> struct __nv_tex2dgather_ret<short1> { typedef short4 type; }; +template <> struct __nv_tex2dgather_ret<short2> { typedef short4 type; }; +template <> struct __nv_tex2dgather_ret<short3> { typedef short4 type; }; +template <> struct __nv_tex2dgather_ret<short4> { typedef short4 type; }; +template <> struct __nv_tex2dgather_ret<unsigned short> { typedef ushort4 type; }; +template <> struct __nv_tex2dgather_ret<ushort1> { typedef ushort4 type; }; +template <> struct __nv_tex2dgather_ret<ushort2> { typedef ushort4 type; }; +template <> struct __nv_tex2dgather_ret<ushort3> { typedef ushort4 type; }; +template <> struct __nv_tex2dgather_ret<ushort4> { typedef ushort4 type; }; + +template <> struct __nv_tex2dgather_ret<int> { typedef int4 type; }; +template <> struct __nv_tex2dgather_ret<int1> { typedef int4 type; }; +template <> struct __nv_tex2dgather_ret<int2> { typedef int4 type; }; +template <> struct __nv_tex2dgather_ret<int3> { typedef int4 type; }; +template <> struct __nv_tex2dgather_ret<int4> { typedef int4 type; }; +template <> struct __nv_tex2dgather_ret<unsigned int> { typedef uint4 type; }; +template <> struct __nv_tex2dgather_ret<uint1> { typedef uint4 type; }; +template <> struct __nv_tex2dgather_ret<uint2> { typedef uint4 type; }; +template <> struct __nv_tex2dgather_ret<uint3> { typedef uint4 type; }; +template <> struct __nv_tex2dgather_ret<uint4> { typedef uint4 type; }; + +template <> struct __nv_tex2dgather_ret<float> { typedef float4 type; }; +template <> struct __nv_tex2dgather_ret<float1> { typedef float4 type; }; +template <> struct __nv_tex2dgather_ret<float2> { typedef float4 type; }; +template <> struct __nv_tex2dgather_ret<float3> { typedef float4 type; }; +template <> struct __nv_tex2dgather_ret<float4> { typedef float4 type; }; + +template <typename T> +static __device__ __forceinline__ typename __nv_tex2dgather_ret<T>::type tex2Dgather(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, int comp=0) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex2dgather_ret<T>::type retval; + __nv_tex_surf_handler("__tex2Dgather_v2", &type_dummy, &retval, t, x, 
y, comp); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +template<typename T> struct __nv_tex2dgather_rmnf_ret { }; +template<> struct __nv_tex2dgather_rmnf_ret<char> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<signed char> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<unsigned char> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<char1> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<uchar1> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<char2> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<uchar2> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<char3> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<uchar3> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<char4> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<uchar4> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<signed short> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<unsigned short> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<short1> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<ushort1> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<short2> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<ushort2> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<short3> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<ushort3> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<short4> { typedef float4 type; }; +template<> struct __nv_tex2dgather_rmnf_ret<ushort4> { typedef float4 type; }; + +template <typename T> +static __device__ __forceinline__ typename __nv_tex2dgather_rmnf_ret<T>::type tex2Dgather(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, int comp = 0) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex2dgather_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex2Dgather_rmnf_v2", &type_dummy, &retval, t, x, y, comp); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// tex1DLod +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLod(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex1DLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, level); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLod(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x, float level) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex1DLod_rmnf_v2", &type_dummy, &retval, t, x, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex2DLod +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLod(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex2DLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, level); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ 
__forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLod(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, float level) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex2DLod_rmnf_v2", &type_dummy, &retval, t, x, y, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex1DLayeredLod +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayeredLod(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex1DLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, layer, level); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayeredLod(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex1DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, layer, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex2DLayeredLod +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayeredLod(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex2DLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, layer, level); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayeredLod(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex2DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, layer, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex3DLod +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3DLod(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex3DLod_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, level); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3DLod(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z, float level) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex3DLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// texCubemapLod +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLod(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__texCubemapLod_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, level); + return temp; 
+#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLod(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z, float level) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__texCubemapLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// texCubemapLayered +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayered(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__texCubemapLayered_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayered(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__texCubemapLayered_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// texCubemapLayeredLod +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayeredLod(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__texCubemapLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer, level); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayeredLod(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__texCubemapLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer, level); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// texCubemapGrad +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapGrad(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__texCubemapGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, &dPdx, &dPdy); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapGrad(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__texCubemapGrad_rmnf_v2", &type_dummy, &retval, t, x, y, z, &dPdx, &dPdy); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// texCubemapLayeredGrad +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayeredGrad(texture<T, 
cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__texCubemapLayeredGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer, &dPdx, &dPdy); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayeredGrad(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__texCubemapLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, layer, &dPdx, &dPdy); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// tex1DGrad +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DGrad(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x, float dPdx, float dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex1DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, dPdx, dPdy); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DGrad(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x, float dPdx, float dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex1DGrad_rmnf_v2", &type_dummy, &retval,t, x,dPdx, dPdy); + return retval; +#endif /* __CUDA_ARCH__ */ +} + + +// tex2DGrad +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DGrad(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, float2 dPdx, float2 dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex2DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, &dPdx, &dPdy); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DGrad(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, float2 dPdx, float2 dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex2DGrad_rmnf_v2", &type_dummy, &retval,t, x, y, &dPdx, &dPdy); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex1DLayeredGrad +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayeredGrad(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer, float dPdx, float dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex1DLayeredGrad_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, layer, dPdx, dPdy); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayeredGrad(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer, float dPdx, float dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex1DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, layer, dPdx, dPdy); + return retval; 
+#endif /* __CUDA_ARCH__ */ +} + +// tex2DLayeredGrad +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayeredGrad(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer, float2 dPdx, float2 dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex2DLayeredGrad_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, layer, &dPdx, &dPdy); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayeredGrad(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer, float2 dPdx, float2 dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex2DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, layer, &dPdx, &dPdy); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +// tex3DGrad +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3DGrad(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + typename __nv_tex_rmet_ret<T>::type temp; + __nv_tex_surf_handler("__tex3DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, &dPdx, &dPdy); + return temp; +#endif +} + +template <typename T> +static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3DGrad(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + T type_dummy; + typename __nv_tex_rmnf_ret<T>::type retval; + __nv_tex_surf_handler("__tex3DGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, &dPdx, &dPdy); + return retval; +#endif /* __CUDA_ARCH__ */ +} + +#undef __DEPRECATED__ + +#endif /* __cplusplus && __CUDACC__ */ + +#endif /* !__TEXTURE_FETCH_FUNCTIONS_H__ */ diff --git a/ext/cudart/include/texture_indirect_functions.h b/ext/cudart/include/texture_indirect_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..3a70d7ee355195bc2e2d8833234f80bf4797741e --- /dev/null +++ b/ext/cudart/include/texture_indirect_functions.h @@ -0,0 +1,771 @@ +/* + * Copyright 1993-2020 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. 
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + + +#ifndef __TEXTURE_INDIRECT_FUNCTIONS_H__ +#define __TEXTURE_INDIRECT_FUNCTIONS_H__ + + +#if defined(__cplusplus) && defined(__CUDACC__) + +#include "cuda_runtime_api.h" + + +#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600) +#define __NV_TEX_SPARSE 1 +#endif /* endif */ + +template <typename T> struct __nv_itex_trait { }; +template<> struct __nv_itex_trait<char> { typedef void type; }; +template<> struct __nv_itex_trait<signed char> { typedef void type; }; +template<> struct __nv_itex_trait<char1> { typedef void type; }; +template<> struct __nv_itex_trait<char2> { typedef void type; }; +template<> struct __nv_itex_trait<char4> { typedef void type; }; +template<> struct __nv_itex_trait<unsigned char> { typedef void type; }; +template<> struct __nv_itex_trait<uchar1> { typedef void type; }; +template<> struct __nv_itex_trait<uchar2> { typedef void type; }; +template<> struct __nv_itex_trait<uchar4> { typedef void type; }; +template<> struct __nv_itex_trait<short> { typedef void type; }; +template<> struct __nv_itex_trait<short1> { typedef void type; }; +template<> struct __nv_itex_trait<short2> { typedef void type; }; +template<> struct __nv_itex_trait<short4> { typedef void type; }; +template<> struct __nv_itex_trait<unsigned short> { typedef void type; }; +template<> struct __nv_itex_trait<ushort1> { typedef void type; }; +template<> struct __nv_itex_trait<ushort2> { typedef void type; }; +template<> struct __nv_itex_trait<ushort4> { typedef void type; }; +template<> struct __nv_itex_trait<int> { typedef void type; }; +template<> struct __nv_itex_trait<int1> { typedef void type; }; +template<> struct __nv_itex_trait<int2> { typedef void type; }; +template<> struct __nv_itex_trait<int4> { typedef void type; }; +template<> struct __nv_itex_trait<unsigned int> { typedef void type; }; +template<> struct __nv_itex_trait<uint1> { typedef void type; }; +template<> struct __nv_itex_trait<uint2> { typedef void type; }; +template<> struct __nv_itex_trait<uint4> { typedef void type; }; +#if !defined(__LP64__) +template<> struct __nv_itex_trait<long> { typedef 
void type; }; +template<> struct __nv_itex_trait<long1> { typedef void type; }; +template<> struct __nv_itex_trait<long2> { typedef void type; }; +template<> struct __nv_itex_trait<long4> { typedef void type; }; +template<> struct __nv_itex_trait<unsigned long> { typedef void type; }; +template<> struct __nv_itex_trait<ulong1> { typedef void type; }; +template<> struct __nv_itex_trait<ulong2> { typedef void type; }; +template<> struct __nv_itex_trait<ulong4> { typedef void type; }; +#endif /* !__LP64__ */ +template<> struct __nv_itex_trait<float> { typedef void type; }; +template<> struct __nv_itex_trait<float1> { typedef void type; }; +template<> struct __nv_itex_trait<float2> { typedef void type; }; +template<> struct __nv_itex_trait<float4> { typedef void type; }; + + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex1Dfetch(T *ptr, cudaTextureObject_t obj, int x) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex1Dfetch", ptr, obj, x); +#endif +} + +template <class T> +static __device__ T tex1Dfetch(cudaTextureObject_t texObject, int x) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex1Dfetch(&ret, texObject, x); + return ret; +#endif +} + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex1D(T *ptr, cudaTextureObject_t obj, float x) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex1D", ptr, obj, x); +#endif +} + + +template <class T> +static __device__ T tex1D(cudaTextureObject_t texObject, float x) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex1D(&ret, texObject, x); + return ret; +#endif +} + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex2D", ptr, obj, x, y); +#endif +} + +template <class T> +static __device__ T tex2D(cudaTextureObject_t texObject, float x, float y) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex2D(&ret, texObject, x, y); + return ret; +#endif +} + +#if __NV_TEX_SPARSE +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2D(T *ptr, cudaTextureObject_t obj, float x, float y, + bool* isResident) +{ +#ifdef __CUDA_ARCH__ + unsigned char res; + __nv_tex_surf_handler("__itex2D_sparse", ptr, obj, x, y, &res); + *isResident = (res != 0); +#endif +} + +template <class T> +static __device__ T tex2D(cudaTextureObject_t texObject, float x, float y, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex2D(&ret, texObject, x, y, isResident); + return ret; +#endif +} + +#endif /* __NV_TEX_SPARSE */ + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex3D", ptr, obj, x, y, z); +#endif +} + +template <class T> +static __device__ T tex3D(cudaTextureObject_t texObject, float x, float y, float z) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex3D(&ret, texObject, x, y, z); + return ret; +#endif +} + +#if __NV_TEX_SPARSE +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex3D(T *ptr, cudaTextureObject_t obj, float x, float y, float z, + bool* isResident) +{ +#ifdef __CUDA_ARCH__ + unsigned char res; + __nv_tex_surf_handler("__itex3D_sparse", ptr, obj, x, y, z, &res); + *isResident = (res != 0); +#endif +} + +template <class T> +static __device__ T tex3D(cudaTextureObject_t texObject, float x, float y, float z, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex3D(&ret, texObject, x, y, z, isResident); 
+ return ret; +#endif +} +#endif /* __NV_TEX_SPARSE */ + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex1DLayered(T *ptr, cudaTextureObject_t obj, float x, int layer) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex1DLayered", ptr, obj, x, layer); +#endif +} + +template <class T> +static __device__ T tex1DLayered(cudaTextureObject_t texObject, float x, int layer) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex1DLayered(&ret, texObject, x, layer); + return ret; +#endif +} + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex2DLayered", ptr, obj, x, y, layer); +#endif +} + +template <class T> +static __device__ T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex2DLayered(&ret, texObject, x, y, layer); + return ret; +#endif +} + +#if __NV_TEX_SPARSE +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2DLayered(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + unsigned char res; + __nv_tex_surf_handler("__itex2DLayered_sparse", ptr, obj, x, y, layer, &res); + *isResident = (res != 0); +#endif +} + +template <class T> +static __device__ T tex2DLayered(cudaTextureObject_t texObject, float x, float y, int layer, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex2DLayered(&ret, texObject, x, y, layer, isResident); + return ret; +#endif +} +#endif /* __NV_TEX_SPARSE */ + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type texCubemap(T *ptr, cudaTextureObject_t obj, float x, float y, float z) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itexCubemap", ptr, obj, x, y, z); +#endif +} + + +template <class T> +static __device__ T texCubemap(cudaTextureObject_t texObject, float x, float y, float z) +{ +#ifdef __CUDA_ARCH__ + T ret; + texCubemap(&ret, texObject, x, y, z); + return ret; +#endif +} + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type texCubemapLayered(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itexCubemapLayered", ptr, obj, x, y, z, layer); +#endif +} + +template <class T> +static __device__ T texCubemapLayered(cudaTextureObject_t texObject, float x, float y, float z, int layer) +{ +#ifdef __CUDA_ARCH__ + T ret; + texCubemapLayered(&ret, texObject, x, y, z, layer); + return ret; +#endif +} + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, int comp = 0) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex2Dgather", ptr, obj, x, y, comp); +#endif +} + +template <class T> +static __device__ T tex2Dgather(cudaTextureObject_t to, float x, float y, int comp = 0) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex2Dgather(&ret, to, x, y, comp); + return ret; +#endif +} + +#if __NV_TEX_SPARSE +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2Dgather(T *ptr, cudaTextureObject_t obj, float x, float y, bool* isResident, int comp = 0) +{ +#ifdef __CUDA_ARCH__ + unsigned char res; + __nv_tex_surf_handler("__itex2Dgather_sparse", ptr, obj, x, y, comp, &res); + *isResident = (res != 0); +#endif +} + +template <class T> +static __device__ T tex2Dgather(cudaTextureObject_t to, float x, float y, bool* isResident, int comp = 
0) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex2Dgather(&ret, to, x, y, isResident, comp); + return ret; +#endif +} + +#endif /* __NV_TEX_SPARSE */ + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex1DLod(T *ptr, cudaTextureObject_t obj, float x, float level) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex1DLod", ptr, obj, x, level); +#endif +} + +template <class T> +static __device__ T tex1DLod(cudaTextureObject_t texObject, float x, float level) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex1DLod(&ret, texObject, x, level); + return ret; +#endif +} + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex2DLod", ptr, obj, x, y, level); +#endif +} + +template <class T> +static __device__ T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex2DLod(&ret, texObject, x, y, level); + return ret; +#endif +} + +#if __NV_TEX_SPARSE + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float level, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + unsigned char res; + __nv_tex_surf_handler("__itex2DLod_sparse", ptr, obj, x, y, level, &res); + *isResident = (res != 0); +#endif +} + +template <class T> +static __device__ T tex2DLod(cudaTextureObject_t texObject, float x, float y, float level, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex2DLod(&ret, texObject, x, y, level, isResident); + return ret; +#endif +} + +#endif /* __NV_TEX_SPARSE */ + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex3DLod", ptr, obj, x, y, z, level); +#endif +} + +template <class T> +static __device__ T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex3DLod(&ret, texObject, x, y, z, level); + return ret; +#endif +} + +#if __NV_TEX_SPARSE +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex3DLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + unsigned char res; + __nv_tex_surf_handler("__itex3DLod_sparse", ptr, obj, x, y, z, level, &res); + *isResident = (res != 0); +#endif +} + +template <class T> +static __device__ T tex3DLod(cudaTextureObject_t texObject, float x, float y, float z, float level, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex3DLod(&ret, texObject, x, y, z, level, isResident); + return ret; +#endif +} + +#endif /* __NV_TEX_SPARSE */ + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex1DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex1DLayeredLod", ptr, obj, x, layer, level); +#endif +} + +template <class T> +static __device__ T tex1DLayeredLod(cudaTextureObject_t texObject, float x, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex1DLayeredLod(&ret, texObject, x, layer, level); + return ret; +#endif +} + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + 
__nv_tex_surf_handler("__itex2DLayeredLod", ptr, obj, x, y, layer, level); +#endif +} + +template <class T> +static __device__ T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex2DLayeredLod(&ret, texObject, x, y, layer, level); + return ret; +#endif +} + +#if __NV_TEX_SPARSE +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2DLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, int layer, float level, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + unsigned char res; + __nv_tex_surf_handler("__itex2DLayeredLod_sparse", ptr, obj, x, y, layer, level, &res); + *isResident = (res != 0); +#endif +} + +template <class T> +static __device__ T tex2DLayeredLod(cudaTextureObject_t texObject, float x, float y, int layer, float level, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex2DLayeredLod(&ret, texObject, x, y, layer, level, isResident); + return ret; +#endif +} +#endif /* __NV_TEX_SPARSE */ + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type texCubemapLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float level) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itexCubemapLod", ptr, obj, x, y, z, level); +#endif +} + +template <class T> +static __device__ T texCubemapLod(cudaTextureObject_t texObject, float x, float y, float z, float level) +{ +#ifdef __CUDA_ARCH__ + T ret; + texCubemapLod(&ret, texObject, x, y, z, level); + return ret; +#endif +} + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type texCubemapGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itexCubemapGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy); +#endif +} + +template <class T> +static __device__ T texCubemapGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + T ret; + texCubemapGrad(&ret, texObject, x, y, z, dPdx, dPdy); + return ret; +#endif +} + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredLod(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itexCubemapLayeredLod", ptr, obj, x, y, z, layer, level); +#endif +} + +template <class T> +static __device__ T texCubemapLayeredLod(cudaTextureObject_t texObject, float x, float y, float z, int layer, float level) +{ +#ifdef __CUDA_ARCH__ + T ret; + texCubemapLayeredLod(&ret, texObject, x, y, z, layer, level); + return ret; +#endif +} + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex1DGrad(T *ptr, cudaTextureObject_t obj, float x, float dPdx, float dPdy) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex1DGrad", ptr, obj, x, dPdx, dPdy); +#endif +} + +template <class T> +static __device__ T tex1DGrad(cudaTextureObject_t texObject, float x, float dPdx, float dPdy) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex1DGrad(&ret, texObject, x, dPdx, dPdy); + return ret; +#endif +} + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex2DGrad_v2", ptr, obj, x, y, &dPdx, &dPdy); +#endif + +} + +template <class T> +static __device__ T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy) +{ 
+#ifdef __CUDA_ARCH__ + T ret; + tex2DGrad(&ret, texObject, x, y, dPdx, dPdy); + return ret; +#endif +} + +#if __NV_TEX_SPARSE +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float2 dPdx, float2 dPdy, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + unsigned char res; + __nv_tex_surf_handler("__itex2DGrad_sparse", ptr, obj, x, y, &dPdx, &dPdy, &res); + *isResident = (res != 0); +#endif + +} + +template <class T> +static __device__ T tex2DGrad(cudaTextureObject_t texObject, float x, float y, float2 dPdx, float2 dPdy, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex2DGrad(&ret, texObject, x, y, dPdx, dPdy, isResident); + return ret; +#endif +} +#endif /* __NV_TEX_SPARSE */ + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex3DGrad_v2", ptr, obj, x, y, z, &dPdx, &dPdy); +#endif +} + +template <class T> +static __device__ T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy); + return ret; +#endif +} + +#if __NV_TEX_SPARSE +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex3DGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + unsigned char res; + __nv_tex_surf_handler("__itex3DGrad_sparse", ptr, obj, x, y, z, &dPdx, &dPdy, &res); + *isResident = (res != 0); +#endif +} + +template <class T> +static __device__ T tex3DGrad(cudaTextureObject_t texObject, float x, float y, float z, float4 dPdx, float4 dPdy, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex3DGrad(&ret, texObject, x, y, z, dPdx, dPdy, isResident); + return ret; +#endif +} + +#endif /* __NV_TEX_SPARSE */ + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex1DLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, int layer, float dPdx, float dPdy) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex1DLayeredGrad", ptr, obj, x, layer, dPdx, dPdy); +#endif +} + +template <class T> +static __device__ T tex1DLayeredGrad(cudaTextureObject_t texObject, float x, int layer, float dPdx, float dPdy) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex1DLayeredGrad(&ret, texObject, x, layer, dPdx, dPdy); + return ret; +#endif +} + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itex2DLayeredGrad_v2", ptr, obj, x, y, layer, &dPdx, &dPdy); +#endif +} + +template <class T> +static __device__ T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy); + return ret; +#endif +} + +#if __NV_TEX_SPARSE +template <typename T> +static __device__ typename __nv_itex_trait<T>::type tex2DLayeredGrad(T * ptr, cudaTextureObject_t obj, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + unsigned char res; + __nv_tex_surf_handler("__itex2DLayeredGrad_sparse", ptr, obj, x, y, layer, &dPdx, &dPdy, &res); + *isResident = (res != 0); +#endif +} + +template <class T> 
+static __device__ T tex2DLayeredGrad(cudaTextureObject_t texObject, float x, float y, int layer, float2 dPdx, float2 dPdy, bool* isResident) +{ +#ifdef __CUDA_ARCH__ + T ret; + tex2DLayeredGrad(&ret, texObject, x, y, layer, dPdx, dPdy, isResident); + return ret; +#endif +} +#endif /* __NV_TEX_SPARSE */ + + +template <typename T> +static __device__ typename __nv_itex_trait<T>::type texCubemapLayeredGrad(T *ptr, cudaTextureObject_t obj, float x, float y, float z, int layer, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + __nv_tex_surf_handler("__itexCubemapLayeredGrad_v2", ptr, obj, x, y, z, layer, &dPdx, &dPdy); +#endif +} + +template <class T> +static __device__ T texCubemapLayeredGrad(cudaTextureObject_t texObject, float x, float y, float z, int layer, float4 dPdx, float4 dPdy) +{ +#ifdef __CUDA_ARCH__ + T ret; + texCubemapLayeredGrad(&ret, texObject, x, y, z, layer, dPdx, dPdy); + return ret; +#endif +} + +#undef __NV_TEX_SPARSE + +#endif // __cplusplus && __CUDACC__ +#endif // __TEXTURE_INDIRECT_FUNCTIONS_H__ diff --git a/ext/cudart/include/texture_types.h b/ext/cudart/include/texture_types.h new file mode 100644 index 0000000000000000000000000000000000000000..ef319422874e887dd7e5ac7cf275714f7737c26e --- /dev/null +++ b/ext/cudart/include/texture_types.h @@ -0,0 +1,281 @@ +/* + * Copyright 1993-2012 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. 
Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__TEXTURE_TYPES_H__) +#define __TEXTURE_TYPES_H__ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "driver_types.h" + +/** + * \addtogroup CUDART_TYPES + * + * @{ + */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#define cudaTextureType1D 0x01 +#define cudaTextureType2D 0x02 +#define cudaTextureType3D 0x03 +#define cudaTextureTypeCubemap 0x0C +#define cudaTextureType1DLayered 0xF1 +#define cudaTextureType2DLayered 0xF2 +#define cudaTextureTypeCubemapLayered 0xFC + +/** + * CUDA texture address modes + */ +enum __device_builtin__ cudaTextureAddressMode +{ + cudaAddressModeWrap = 0, /**< Wrapping address mode */ + cudaAddressModeClamp = 1, /**< Clamp to edge address mode */ + cudaAddressModeMirror = 2, /**< Mirror address mode */ + cudaAddressModeBorder = 3 /**< Border address mode */ +}; + +/** + * CUDA texture filter modes + */ +enum __device_builtin__ cudaTextureFilterMode +{ + cudaFilterModePoint = 0, /**< Point filter mode */ + cudaFilterModeLinear = 1 /**< Linear filter mode */ +}; + +/** + * CUDA texture read modes + */ +enum __device_builtin__ cudaTextureReadMode +{ + cudaReadModeElementType = 0, /**< Read texture as specified element type */ + cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */ +}; + +/** + * CUDA texture reference + */ +struct __device_builtin__ textureReference +{ + /** + * Indicates whether texture reads are normalized or not + */ + int normalized; + /** + * Texture filter mode + */ + enum cudaTextureFilterMode filterMode; + /** + * Texture address mode for up to 3 dimensions + */ + enum cudaTextureAddressMode addressMode[3]; + /** + * Channel descriptor for the texture reference + */ + struct cudaChannelFormatDesc channelDesc; + /** + * Perform sRGB->linear conversion during texture read + */ + int sRGB; + /** + * Limit to the anisotropy ratio + */ + unsigned int maxAnisotropy; + /** + * Mipmap filter mode + */ + enum cudaTextureFilterMode mipmapFilterMode; + /** + * Offset applied to the supplied mipmap level + */ + float mipmapLevelBias; + /** + * Lower end of the mipmap level range to clamp access to + */ + float minMipmapLevelClamp; + /** + * Upper end of the mipmap level range to clamp access to + */ + float maxMipmapLevelClamp; + /** + * Disable any trilinear filtering optimizations. 
+ */ + int disableTrilinearOptimization; + int __cudaReserved[14]; +}; + +/** + * CUDA texture descriptor + */ +struct __device_builtin__ cudaTextureDesc +{ + /** + * Texture address mode for up to 3 dimensions + */ + enum cudaTextureAddressMode addressMode[3]; + /** + * Texture filter mode + */ + enum cudaTextureFilterMode filterMode; + /** + * Texture read mode + */ + enum cudaTextureReadMode readMode; + /** + * Perform sRGB->linear conversion during texture read + */ + int sRGB; + /** + * Texture Border Color + */ + float borderColor[4]; + /** + * Indicates whether texture reads are normalized or not + */ + int normalizedCoords; + /** + * Limit to the anisotropy ratio + */ + unsigned int maxAnisotropy; + /** + * Mipmap filter mode + */ + enum cudaTextureFilterMode mipmapFilterMode; + /** + * Offset applied to the supplied mipmap level + */ + float mipmapLevelBias; + /** + * Lower end of the mipmap level range to clamp access to + */ + float minMipmapLevelClamp; + /** + * Upper end of the mipmap level range to clamp access to + */ + float maxMipmapLevelClamp; + /** + * Disable any trilinear filtering optimizations. + */ + int disableTrilinearOptimization; +}; + +struct __device_builtin__ cudaTextureDesc_v2 +{ + /** + * Texture address mode for up to 3 dimensions + */ + enum cudaTextureAddressMode addressMode[3]; + /** + * Texture filter mode + */ + enum cudaTextureFilterMode filterMode; + /** + * Texture read mode + */ + enum cudaTextureReadMode readMode; + /** + * Perform sRGB->linear conversion during texture read + */ + int sRGB; + /** + * Texture Border Color + */ + float borderColor[4]; + /** + * Indicates whether texture reads are normalized or not + */ + int normalizedCoords; + /** + * Limit to the anisotropy ratio + */ + unsigned int maxAnisotropy; + /** + * Mipmap filter mode + */ + enum cudaTextureFilterMode mipmapFilterMode; + /** + * Offset applied to the supplied mipmap level + */ + float mipmapLevelBias; + /** + * Lower end of the mipmap level range to clamp access to + */ + float minMipmapLevelClamp; + /** + * Upper end of the mipmap level range to clamp access to + */ + float maxMipmapLevelClamp; + /** + * Disable any trilinear filtering optimizations. + */ + int disableTrilinearOptimization; + /** + * Enable seamless cube map filtering. + */ + int seamlessCubemap; +}; + +/** + * An opaque value that represents a CUDA texture object + */ +typedef __device_builtin__ unsigned long long cudaTextureObject_t; + +/** @} */ +/** @} */ /* END CUDART_TYPES */ + +#endif /* !__TEXTURE_TYPES_H__ */ diff --git a/ext/cudart/include/vector_functions.h b/ext/cudart/include/vector_functions.h new file mode 100644 index 0000000000000000000000000000000000000000..bee6cd32c36d94bde65aad1c867352493d07a0dc --- /dev/null +++ b/ext/cudart/include/vector_functions.h @@ -0,0 +1,175 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. 
Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#if !defined(__VECTOR_FUNCTIONS_H__) +#define __VECTOR_FUNCTIONS_H__ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +#if defined(__CUDACC_RTC__) +#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ +#endif /* __CUDACC_RTC__ */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x); + +__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x); + +__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y); + +__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y); + +__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z); + +__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z); + +__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w); + +__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w); + +__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x); + +__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x); + +__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y); + +__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y); + +__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z); + +__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z); + +__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w); + +__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w); + +__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x); + +__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x); + +__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y); + +__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y); + +__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z); + +__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z); + +__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w); + +__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w); + +__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x); + +__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x); + +__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y); + +__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y); + +__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z); + +__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z); + +__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w); + +__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w); + +__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x); + +__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y); + +__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float 
x, float y, float z); + +__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w); + +__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x); + +__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x); + +__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y); + +__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y); + +__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z); + +__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z); + +__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w); + +__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w); + +__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x); + +__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y); + +__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z); + +__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w); + +#undef __VECTOR_FUNCTIONS_DECL__ + +#if !defined(__CUDACC_RTC__) +#include "vector_functions.hpp" +#endif /* !__CUDACC_RTC__ */ + +#endif /* !__VECTOR_FUNCTIONS_H__ */ diff --git a/ext/cudart/include/vector_functions.hpp b/ext/cudart/include/vector_functions.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ab69cf38045a7c44dae67e7149d49ac4c6148747 --- /dev/null +++ b/ext/cudart/include/vector_functions.hpp @@ -0,0 +1,316 @@ +/* + * Copyright 1993-2014 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. 
These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__VECTOR_FUNCTIONS_HPP__) +#define __VECTOR_FUNCTIONS_HPP__ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#include "cuda_runtime_api.h" + +#if defined(__CUDACC_RTC__) +#define __VECTOR_FUNCTIONS_DECL__ __host__ __device__ +#else /* !__CUDACC_RTC__ */ +#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__ +#endif /* __CUDACC_RTC__ */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +__VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x) +{ + char1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x) +{ + uchar1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y) +{ + char2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y) +{ + uchar2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z) +{ + char3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z) +{ + uchar3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w) +{ + char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w) +{ + uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x) +{ + short1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x) +{ + ushort1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y) +{ + short2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y) +{ + ushort2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z) +{ + short3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z) +{ + ushort3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w) +{ + short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short 
z, unsigned short w) +{ + ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x) +{ + int1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x) +{ + uint1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y) +{ + int2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y) +{ + uint2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z) +{ + int3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z) +{ + uint3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w) +{ + int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w) +{ + uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x) +{ + long1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x) +{ + ulong1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y) +{ + long2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y) +{ + ulong2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z) +{ + long3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z) +{ + ulong3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w) +{ + long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w) +{ + ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x) +{ + float1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y) +{ + float2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z) +{ + float3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w) +{ + float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x) +{ + longlong1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x) +{ + ulonglong1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y) +{ + longlong2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y) +{ + ulonglong2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z) +{ + longlong3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z) +{ + ulonglong3 t; t.x = x; t.y = y; t.z = z; return t; +} + 
+__VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w) +{ + longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w) +{ + ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +__VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x) +{ + double1 t; t.x = x; return t; +} + +__VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y) +{ + double2 t; t.x = x; t.y = y; return t; +} + +__VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z) +{ + double3 t; t.x = x; t.y = y; t.z = z; return t; +} + +__VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w) +{ + double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t; +} + +#undef __VECTOR_FUNCTIONS_DECL__ + +#endif /* !__VECTOR_FUNCTIONS_HPP__ */ + diff --git a/ext/cudart/include/vector_types.h b/ext/cudart/include/vector_types.h new file mode 100644 index 0000000000000000000000000000000000000000..4cfabcff8a25adaf3f589d38d531bc63cae2fcf6 --- /dev/null +++ b/ext/cudart/include/vector_types.h @@ -0,0 +1,443 @@ +/* + * Copyright 1993-2018 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. 
Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +#if !defined(__VECTOR_TYPES_H__) +#define __VECTOR_TYPES_H__ + +#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) +#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__ +#endif + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#ifndef __DOXYGEN_ONLY__ +#include "crt/host_defines.h" +#endif + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +#if !defined(__CUDACC__) && !defined(__CUDACC_RTC__) && \ + defined(_WIN32) && !defined(_WIN64) + +#pragma warning(push) +#pragma warning(disable: 4201 4408) + +#define __cuda_builtin_vector_align8(tag, members) \ +struct __device_builtin__ tag \ +{ \ + union \ + { \ + struct { members }; \ + struct { long long int :1,:0; }; \ + }; \ +} + +#else /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */ + +#define __cuda_builtin_vector_align8(tag, members) \ +struct __device_builtin__ __align__(8) tag \ +{ \ + members \ +} + +#endif /* !__CUDACC__ && !__CUDACC_RTC__ && _WIN32 && !_WIN64 */ + +struct __device_builtin__ char1 +{ + signed char x; +}; + +struct __device_builtin__ uchar1 +{ + unsigned char x; +}; + + +struct __device_builtin__ __align__(2) char2 +{ + signed char x, y; +}; + +struct __device_builtin__ __align__(2) uchar2 +{ + unsigned char x, y; +}; + +struct __device_builtin__ char3 +{ + signed char x, y, z; +}; + +struct __device_builtin__ uchar3 +{ + unsigned char x, y, z; +}; + +struct __device_builtin__ __align__(4) char4 +{ + signed char x, y, z, w; +}; + +struct __device_builtin__ __align__(4) uchar4 +{ + unsigned char x, y, z, w; +}; + +struct __device_builtin__ short1 +{ + short x; +}; + +struct __device_builtin__ ushort1 +{ + unsigned short x; +}; + +struct __device_builtin__ __align__(4) short2 +{ + short x, y; +}; + +struct __device_builtin__ __align__(4) ushort2 +{ + unsigned short x, y; +}; + +struct __device_builtin__ short3 +{ + short x, y, z; +}; + +struct __device_builtin__ ushort3 +{ + unsigned short x, y, z; +}; + +__cuda_builtin_vector_align8(short4, short x; short y; short z; short w;); +__cuda_builtin_vector_align8(ushort4, unsigned short x; unsigned short y; unsigned short z; unsigned short w;); + +struct __device_builtin__ int1 +{ + int x; +}; + +struct __device_builtin__ uint1 +{ + unsigned int x; +}; + +__cuda_builtin_vector_align8(int2, int x; int y;); +__cuda_builtin_vector_align8(uint2, unsigned int x; unsigned int y;); + +struct __device_builtin__ int3 +{ + int x, y, z; +}; + +struct __device_builtin__ uint3 +{ + unsigned int x, y, z; +}; + +struct __device_builtin__ __builtin_align__(16) int4 +{ + int x, y, z, w; +}; + +struct __device_builtin__ __builtin_align__(16) uint4 +{ + unsigned int x, y, z, w; +}; + +struct __device_builtin__ long1 +{ + long int x; +}; + +struct __device_builtin__ ulong1 +{ + unsigned long x; +}; + +#if defined(_WIN32) +__cuda_builtin_vector_align8(long2, long int x; long int y;); +__cuda_builtin_vector_align8(ulong2, 
unsigned long int x; unsigned long int y;); +#else /* !_WIN32 */ + +struct __device_builtin__ __align__(2*sizeof(long int)) long2 +{ + long int x, y; +}; + +struct __device_builtin__ __align__(2*sizeof(unsigned long int)) ulong2 +{ + unsigned long int x, y; +}; + +#endif /* _WIN32 */ + +struct __device_builtin__ long3 +{ + long int x, y, z; +}; + +struct __device_builtin__ ulong3 +{ + unsigned long int x, y, z; +}; + +struct __device_builtin__ __builtin_align__(16) long4 +{ + long int x, y, z, w; +}; + +struct __device_builtin__ __builtin_align__(16) ulong4 +{ + unsigned long int x, y, z, w; +}; + +struct __device_builtin__ float1 +{ + float x; +}; + +#if !defined(__CUDACC__) && defined(__arm__) && \ + defined(__ARM_PCS_VFP) && __GNUC__ == 4 && __GNUC_MINOR__ == 6 + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-pedantic" + +struct __device_builtin__ __attribute__((aligned(8))) float2 +{ + float x; float y; float __cuda_gnu_arm_ice_workaround[0]; +}; + +#pragma GCC poison __cuda_gnu_arm_ice_workaround +#pragma GCC diagnostic pop + +#else /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP && + __GNUC__ == 4&& __GNUC_MINOR__ == 6 */ + +__cuda_builtin_vector_align8(float2, float x; float y;); + +#endif /* !__CUDACC__ && __arm__ && __ARM_PCS_VFP && + __GNUC__ == 4&& __GNUC_MINOR__ == 6 */ + +struct __device_builtin__ float3 +{ + float x, y, z; +}; + +struct __device_builtin__ __builtin_align__(16) float4 +{ + float x, y, z, w; +}; + +struct __device_builtin__ longlong1 +{ + long long int x; +}; + +struct __device_builtin__ ulonglong1 +{ + unsigned long long int x; +}; + +struct __device_builtin__ __builtin_align__(16) longlong2 +{ + long long int x, y; +}; + +struct __device_builtin__ __builtin_align__(16) ulonglong2 +{ + unsigned long long int x, y; +}; + +struct __device_builtin__ longlong3 +{ + long long int x, y, z; +}; + +struct __device_builtin__ ulonglong3 +{ + unsigned long long int x, y, z; +}; + +struct __device_builtin__ __builtin_align__(16) longlong4 +{ + long long int x, y, z ,w; +}; + +struct __device_builtin__ __builtin_align__(16) ulonglong4 +{ + unsigned long long int x, y, z, w; +}; + +struct __device_builtin__ double1 +{ + double x; +}; + +struct __device_builtin__ __builtin_align__(16) double2 +{ + double x, y; +}; + +struct __device_builtin__ double3 +{ + double x, y, z; +}; + +struct __device_builtin__ __builtin_align__(16) double4 +{ + double x, y, z, w; +}; + +#if !defined(__CUDACC__) && defined(_WIN32) && !defined(_WIN64) + +#pragma warning(pop) + +#endif /* !__CUDACC__ && _WIN32 && !_WIN64 */ + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +typedef __device_builtin__ struct char1 char1; +typedef __device_builtin__ struct uchar1 uchar1; +typedef __device_builtin__ struct char2 char2; +typedef __device_builtin__ struct uchar2 uchar2; +typedef __device_builtin__ struct char3 char3; +typedef __device_builtin__ struct uchar3 uchar3; +typedef __device_builtin__ struct char4 char4; +typedef __device_builtin__ struct uchar4 uchar4; +typedef __device_builtin__ struct short1 short1; +typedef __device_builtin__ struct ushort1 ushort1; +typedef __device_builtin__ struct short2 short2; +typedef __device_builtin__ struct ushort2 ushort2; +typedef __device_builtin__ struct short3 short3; +typedef __device_builtin__ struct ushort3 ushort3; +typedef __device_builtin__ struct short4 short4; +typedef __device_builtin__ struct ushort4 ushort4; 
+typedef __device_builtin__ struct int1 int1; +typedef __device_builtin__ struct uint1 uint1; +typedef __device_builtin__ struct int2 int2; +typedef __device_builtin__ struct uint2 uint2; +typedef __device_builtin__ struct int3 int3; +typedef __device_builtin__ struct uint3 uint3; +typedef __device_builtin__ struct int4 int4; +typedef __device_builtin__ struct uint4 uint4; +typedef __device_builtin__ struct long1 long1; +typedef __device_builtin__ struct ulong1 ulong1; +typedef __device_builtin__ struct long2 long2; +typedef __device_builtin__ struct ulong2 ulong2; +typedef __device_builtin__ struct long3 long3; +typedef __device_builtin__ struct ulong3 ulong3; +typedef __device_builtin__ struct long4 long4; +typedef __device_builtin__ struct ulong4 ulong4; +typedef __device_builtin__ struct float1 float1; +typedef __device_builtin__ struct float2 float2; +typedef __device_builtin__ struct float3 float3; +typedef __device_builtin__ struct float4 float4; +typedef __device_builtin__ struct longlong1 longlong1; +typedef __device_builtin__ struct ulonglong1 ulonglong1; +typedef __device_builtin__ struct longlong2 longlong2; +typedef __device_builtin__ struct ulonglong2 ulonglong2; +typedef __device_builtin__ struct longlong3 longlong3; +typedef __device_builtin__ struct ulonglong3 ulonglong3; +typedef __device_builtin__ struct longlong4 longlong4; +typedef __device_builtin__ struct ulonglong4 ulonglong4; +typedef __device_builtin__ struct double1 double1; +typedef __device_builtin__ struct double2 double2; +typedef __device_builtin__ struct double3 double3; +typedef __device_builtin__ struct double4 double4; + +/******************************************************************************* +* * +* * +* * +*******************************************************************************/ + +struct __device_builtin__ dim3 +{ + unsigned int x, y, z; +#if defined(__cplusplus) +#if __cplusplus >= 201103L + __host__ __device__ constexpr dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {} + __host__ __device__ constexpr dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} + __host__ __device__ constexpr operator uint3(void) const { return uint3{x, y, z}; } +#else + __host__ __device__ dim3(unsigned int vx = 1, unsigned int vy = 1, unsigned int vz = 1) : x(vx), y(vy), z(vz) {} + __host__ __device__ dim3(uint3 v) : x(v.x), y(v.y), z(v.z) {} + __host__ __device__ operator uint3(void) const { uint3 t; t.x = x; t.y = y; t.z = z; return t; } +#endif +#endif /* __cplusplus */ +}; + +typedef __device_builtin__ struct dim3 dim3; + +#undef __cuda_builtin_vector_align8 + +#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__) +#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ +#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_VECTOR_TYPES_H__ +#endif + +#endif /* !__VECTOR_TYPES_H__ */ diff --git a/ext/cudart/lib/Win32/OpenCL.lib b/ext/cudart/lib/Win32/OpenCL.lib new file mode 100644 index 0000000000000000000000000000000000000000..c06b10f396eb1ee2175d459efba571400892e47b Binary files /dev/null and b/ext/cudart/lib/Win32/OpenCL.lib differ diff --git a/ext/cudart/lib/Win32/cuda.lib b/ext/cudart/lib/Win32/cuda.lib new file mode 100644 index 0000000000000000000000000000000000000000..56151c765658e5be3e672dd47594ea6a8d35e9d1 Binary files /dev/null and b/ext/cudart/lib/Win32/cuda.lib differ diff --git a/ext/cudart/lib/Win32/cudadevrt.lib b/ext/cudart/lib/Win32/cudadevrt.lib new file mode 100644 index 
0000000000000000000000000000000000000000..a8f1e0e36f27a0161b0f4978256857ed77758c0a
Binary files /dev/null and b/ext/cudart/lib/Win32/cudadevrt.lib differ
diff --git a/ext/cudart/lib/Win32/cudart.lib b/ext/cudart/lib/Win32/cudart.lib
new file mode 100644
index 0000000000000000000000000000000000000000..385e92bd4660ffe8771e5ba30803990d7668f4ab
Binary files /dev/null and b/ext/cudart/lib/Win32/cudart.lib differ
diff --git a/ext/cudart/lib/Win32/cudart_static.lib b/ext/cudart/lib/Win32/cudart_static.lib
new file mode 100644
index 0000000000000000000000000000000000000000..b28c233b93739792cfc271ed1f75813de757eb48
Binary files /dev/null and b/ext/cudart/lib/Win32/cudart_static.lib differ
diff --git a/ext/cudart/lib/x64/OpenCL.lib b/ext/cudart/lib/x64/OpenCL.lib
new file mode 100644
index 0000000000000000000000000000000000000000..20ed60d9f2e07df188ab496f61c21f3f5e96862d
Binary files /dev/null and b/ext/cudart/lib/x64/OpenCL.lib differ
diff --git a/ext/cudart/lib/x64/cuda.lib b/ext/cudart/lib/x64/cuda.lib
new file mode 100644
index 0000000000000000000000000000000000000000..f146412d42b563d1962a708d3f628c1852692f69
Binary files /dev/null and b/ext/cudart/lib/x64/cuda.lib differ
diff --git a/ext/cudart/lib/x64/cudadevrt.lib b/ext/cudart/lib/x64/cudadevrt.lib
new file mode 100644
index 0000000000000000000000000000000000000000..d933b7fbdd650777540d5ca6cf62a5785700e851
Binary files /dev/null and b/ext/cudart/lib/x64/cudadevrt.lib differ
diff --git a/ext/cudart/lib/x64/cudart.lib b/ext/cudart/lib/x64/cudart.lib
new file mode 100644
index 0000000000000000000000000000000000000000..b023e8d3116164aa1dada4cb60eee025c748cd06
Binary files /dev/null and b/ext/cudart/lib/x64/cudart.lib differ
diff --git a/ext/cudart/lib/x64/cudart_static.lib b/ext/cudart/lib/x64/cudart_static.lib
new file mode 100644
index 0000000000000000000000000000000000000000..d975c98869fddb1a48fa7a8f5aa393501e9cc655
Binary files /dev/null and b/ext/cudart/lib/x64/cudart_static.lib differ
diff --git a/src/encoder/nvidia_encoder.cpp b/src/encoder/nvidia_encoder.cpp
index 0951a0d70c7aa38ae4390f1f41b11f395b46c6d6..45abd9100e5cf6d366d53a4d87c96a65acfae2d3 100644
--- a/src/encoder/nvidia_encoder.cpp
+++ b/src/encoder/nvidia_encoder.cpp
@@ -1,5 +1,7 @@
 #include "nvidia_encoder.hpp"
 
+#include <cuda.h>
+
 #if defined(__unix__)
 #include <dlfcn.h>
 #endif
@@ -8,6 +10,12 @@ typedef NVENCSTATUS(NVENCAPI* NvEncodeAPICreateInstance_Type)(NV_ENCODE_API_FUNC
 bool NvidiaEncoder::create(lava::device_ptr device, const lava::renderer& renderer, const glm::uvec2& size, uint32_t input_buffers)
 {
+    // The CUDA driver API must be initialized with cuInit() before any other driver call.
+    // Fail encoder creation if no usable CUDA driver is available.
+    if (cuInit(0) != CUDA_SUCCESS)
+    {
+        return false;
+    }
+
     if (!this->load_library())
     {
         return false;