// =============================================================================
// Machine-generated MLIR (LLVM + gpu/cf/arith dialects), x86_64 host module
// plus embedded sm_80/sm_89 device module, produced by a clang/MLIR pipeline
// from the CUDA+NVSHMEM test case
// rmaracebench/nvshmem/gpuInitiated/sync/001-nvshmem-sync-barrierall-local-yes.cu
// (see the #loc alias at the end of the file). Everything below sits on a few
// very long lines; this header maps the structure.
//
// Device side — gpu.module @__mlir_gpu_module [#nvvm.target<chip = "sm_80">]:
//   * ptx_kernel @_Z14nvshmem_kernelPiS_(%arg0, %arg1):
//       - stores 0 to *%arg0 and 1 to *%arg1, zero-inits a local i32 slot,
//         stores nvshmem_my_pe() into another local, then nvshmem_barrier_all.
//       - if my_pe == 0: nvshmem_int_get_nbi(%arg1, %arg0, /*nelems=*/1,
//         /*pe=*/1), reads *%arg1 back into the local slot (a non-blocking
//         get is read back with no intervening quiet — this is the data-race
//         pattern the benchmark exercises), then barrier; else: barrier only.
//       - finally every thread does *%arg0 += local slot.
//   * external decls for nvshmem_my_pe / nvshmem_barrier_all /
//     nvshmem_int_get_nbi (convergent, no_unwind).
//
// Host side (x86_64-unknown-linux-gnu):
//   * printf format strings @".str", @".str.1", @".str.2"; the kernel's
//     mangled name in @mlir.llvm.nameless_global_0.
//   * @mlir.llvm.nameless_global_1 (section ".nv_fatbin"): the serialized
//     device module embedded verbatim as escaped MLIR text. NOTE(review):
//     this is plain MLIR source acting as the "fatbin" payload, not a real
//     cubin/PTX container — presumably consumed by a matching MLIR runtime.
//     Do not edit its bytes; the \22 / \0A escapes must round-trip exactly.
//   * @__cuda_fatbin_wrapper (section ".nvFatBinSegment"): struct
//     {i32 1180844977 (= 0x466243B1, the usual CUDA fatbin wrapper magic —
//     TODO confirm), i32 1, ptr to payload, null}.
//   * @_Z29__device_stub__nvshmem_kernelPiS_: launch stub; packs kernel args,
//     calls __cudaPopCallConfiguration, unpacks the two dim3s via memcpy into
//     {i64, i32} pairs, and forwards everything to
//     @__mlir_launch_coerced_kernel__Z29__device_stub__nvshmem_kernelPiS_.
//   * @main: nvshmem_init; cudaSetDevice(nvshmem_team_my_pe(2));
//     if nvshmem_n_pes() != 2 prints @".str" and nvshmem_global_exit(1);
//     nvshmem_malloc's two 4-byte symmetric buffers; builds dim3(1,1,1) grid
//     and block via @_ZN4dim3C2Ejjj; launches the kernel through
//     nvshmemx_collective_launch; cudaMemcpy's both buffers back (4 bytes,
//     kind 2 — presumably cudaMemcpyDeviceToHost, confirm against the CUDA
//     enum); prints @".str.1" and @".str.2" around barriers; nvshmem_free's
//     both buffers; nvshmem_finalize; returns 0.
//   * @_ZN4dim3C2Ejjj: linkonce_odr dim3 constructor in comdat
//     @__llvm_global_comdat, writing x/y/z fields through GEPs.
//   * CUDA registration machinery: @__cuda_register_globals registers the
//     kernel via __cudaRegisterFunction; @__cuda_module_ctor (installed by
//     llvm.mlir.global_ctors at priority 65535) calls __cudaRegisterFatBinary
//     on the wrapper, stores the handle in @__cuda_gpubin_handle, registers
//     globals, calls __cudaRegisterFatBinaryEnd, and atexit-installs
//     @__cuda_module_dtor, which calls __cudaUnregisterFatBinary.
//   * remaining llvm.func ops are external declarations for the CUDA and
//     NVSHMEM host APIs used above.
//
// All code below is byte-for-byte the generator's output (only this comment
// header was added); #loc1 = unknown locations, #loc = the originating .cu.
// =============================================================================
#loc1 = loc(unknown) module attributes {dlti.dl_spec = #dlti.dl_spec<i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.stack_alignment" = 128 : i64>, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "clang version 20.0.0git (git@github.com:ivanradanov/llvm-project.git 872c28cfdf6140fafac11eddbb5895f11bc6f295)", llvm.target_triple = "x86_64-unknown-linux-gnu"} { gpu.module @__mlir_gpu_module [#nvvm.target<chip = "sm_80">] attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>, llvm.data_layout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"} { llvm.func ptx_kernelcc @_Z14nvshmem_kernelPiS_(%arg0: !llvm.ptr {llvm.noundef} loc(unknown), %arg1: !llvm.ptr {llvm.noundef} loc(unknown)) attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_inline, no_unwind, optimize_none, passthrough = ["mustprogress", "norecurse", ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "sm_89"], ["uniform-work-group-size", "true"]], target_cpu = "sm_89", target_features = #llvm.target_features<["+ptx84", "+sm_89"]>} { %0 = llvm.mlir.constant(1 : i32) : i32 loc(#loc1) %1 = llvm.mlir.constant(0 : i32) : i32 loc(#loc1) %2 = llvm.mlir.constant(1 : i64) : i64 loc(#loc1) %3 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) %4 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) %5 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> 
!llvm.ptr loc(#loc1) %6 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) llvm.store %arg0, %3 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1) llvm.store %arg1, %4 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1) %7 = llvm.load %3 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) llvm.store %1, %7 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) %8 = llvm.load %4 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) llvm.store %0, %8 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) llvm.store %1, %5 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) %9 = llvm.call @nvshmem_my_pe() {convergent, no_unwind} : () -> i32 loc(#loc1) llvm.store %9, %6 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) llvm.call @nvshmem_barrier_all() {convergent, no_unwind} : () -> () loc(#loc1) %10 = llvm.load %6 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) %11 = llvm.icmp "eq" %10, %1 : i32 loc(#loc1) cf.cond_br %11, ^bb1, ^bb2 loc(#loc1) ^bb1: // pred: ^bb0 %12 = llvm.load %4 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) %13 = llvm.load %3 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) llvm.call @nvshmem_int_get_nbi(%12, %13, %2, %0) {convergent, no_unwind} : (!llvm.ptr, !llvm.ptr, i64, i32) -> () loc(#loc1) %14 = llvm.load %4 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) %15 = llvm.load %14 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) llvm.store %15, %5 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) llvm.call @nvshmem_barrier_all() {convergent, no_unwind} : () -> () loc(#loc1) cf.br ^bb3 loc(#loc1) ^bb2: // pred: ^bb0 llvm.call @nvshmem_barrier_all() {convergent, no_unwind} : () -> () loc(#loc1) cf.br ^bb3 loc(#loc1) ^bb3: // 2 preds: ^bb1, ^bb2 %16 = llvm.load %5 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) %17 = llvm.load %3 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) %18 = llvm.load %17 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) %19 = llvm.add %18, 
%16 : i32 loc(#loc1) llvm.store %19, %17 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) llvm.return loc(#loc1) } loc(#loc1) llvm.func @nvshmem_my_pe() -> i32 attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_unwind, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "sm_89"]], target_cpu = "sm_89", target_features = #llvm.target_features<["+ptx84", "+sm_89"]>} loc(#loc1) llvm.func @nvshmem_barrier_all() attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_unwind, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "sm_89"]], target_cpu = "sm_89", target_features = #llvm.target_features<["+ptx84", "+sm_89"]>} loc(#loc1) llvm.func @nvshmem_int_get_nbi(!llvm.ptr {llvm.noundef}, !llvm.ptr {llvm.noundef}, i64 {llvm.noundef}, i32 {llvm.noundef}) attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_unwind, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "sm_89"]], target_cpu = "sm_89", target_features = #llvm.target_features<["+ptx84", "+sm_89"]>} loc(#loc1) } loc(#loc) llvm.comdat @__llvm_global_comdat { llvm.comdat_selector @_ZN4dim3C2Ejjj any loc(#loc) } loc(#loc) llvm.mlir.global private unnamed_addr constant @".str"("Got %d PEs, expected %d\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local} loc(#loc) llvm.mlir.global private unnamed_addr constant @".str.1"("PE %d: localbuf = %d, remote = %d\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local} loc(#loc) llvm.mlir.global private unnamed_addr constant @".str.2"("Process %d: Execution finished, variable contents: remote = %d, localbuf = %d\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local} loc(#loc) llvm.mlir.global private unnamed_addr constant @mlir.llvm.nameless_global_0("_Z14nvshmem_kernelPiS_\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local} loc(#loc) llvm.mlir.global 
private constant @mlir.llvm.nameless_global_1("#loc1 = loc(unknown)\0Amodule attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, \22dlti.endianness\22 = \22little\22>, llvm.data_layout = \22e-i64:64-i128:128-v16:16-v32:32-n16:32:64\22, llvm.target_triple = \22nvptx64-nvidia-cuda\22} {\0A  llvm.func ptx_kernelcc @_Z14nvshmem_kernelPiS_(%arg0: !llvm.ptr {llvm.noundef} loc(unknown), %arg1: !llvm.ptr {llvm.noundef} loc(unknown)) attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_inline, no_unwind, optimize_none, passthrough = [\22mustprogress\22, \22norecurse\22, [\22no-trapping-math\22, \22true\22], [\22stack-protector-buffer-size\22, \228\22], [\22target-cpu\22, \22sm_89\22], [\22uniform-work-group-size\22, \22true\22]], target_cpu = \22sm_89\22, target_features = #llvm.target_features<[\22+ptx84\22, \22+sm_89\22]>} {\0A    %c1_i32 = arith.constant 1 : i32 loc(#loc1)\0A    %c0_i32 = arith.constant 0 : i32 loc(#loc1)\0A    %c1_i64 = arith.constant 1 : i64 loc(#loc1)\0A    %0 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)\0A    %1 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)\0A    %2 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)\0A    %3 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)\0A    llvm.store %arg0, %0 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)\0A    llvm.store %arg1, %1 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)\0A    %4 = llvm.load %0 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)\0A    llvm.store %c0_i32, %4 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)\0A    %5 
= llvm.load %1 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)\0A    llvm.store %c1_i32, %5 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)\0A    llvm.store %c0_i32, %2 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)\0A    %6 = llvm.call @nvshmem_my_pe() {convergent, no_unwind} : () -> i32 loc(#loc1)\0A    llvm.store %6, %3 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)\0A    llvm.call @nvshmem_barrier_all() {convergent, no_unwind} : () -> () loc(#loc1)\0A    %7 = llvm.load %3 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)\0A    %8 = arith.cmpi eq, %7, %c0_i32 : i32 loc(#loc1)\0A    cf.cond_br %8, ^bb1, ^bb2 loc(#loc1)\0A  ^bb1:  // pred: ^bb0\0A    %9 = llvm.load %1 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)\0A    %10 = llvm.load %0 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)\0A    llvm.call @nvshmem_int_get_nbi(%9, %10, %c1_i64, %c1_i32) {convergent, no_unwind} : (!llvm.ptr, !llvm.ptr, i64, i32) -> () loc(#loc1)\0A    %11 = llvm.load %1 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)\0A    %12 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)\0A    llvm.store %12, %2 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)\0A    llvm.call @nvshmem_barrier_all() {convergent, no_unwind} : () -> () loc(#loc1)\0A    cf.br ^bb3 loc(#loc1)\0A  ^bb2:  // pred: ^bb0\0A    llvm.call @nvshmem_barrier_all() {convergent, no_unwind} : () -> () loc(#loc1)\0A    cf.br ^bb3 loc(#loc1)\0A  ^bb3:  // 2 preds: ^bb1, ^bb2\0A    %13 = llvm.load %2 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)\0A    %14 = llvm.load %0 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)\0A    %15 = llvm.load %14 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)\0A    %16 = arith.addi %15, %13 : i32 loc(#loc1)\0A    llvm.store %16, %14 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)\0A    llvm.return loc(#loc1)\0A  } loc(#loc1)\0A  llvm.func @nvshmem_my_pe() -> i32 attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_unwind, passthrough = [[\22no-trapping-math\22, \22true\22], 
[\22stack-protector-buffer-size\22, \228\22], [\22target-cpu\22, \22sm_89\22]], target_cpu = \22sm_89\22, target_features = #llvm.target_features<[\22+ptx84\22, \22+sm_89\22]>} loc(#loc1)\0A  llvm.func @nvshmem_barrier_all() attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_unwind, passthrough = [[\22no-trapping-math\22, \22true\22], [\22stack-protector-buffer-size\22, \228\22], [\22target-cpu\22, \22sm_89\22]], target_cpu = \22sm_89\22, target_features = #llvm.target_features<[\22+ptx84\22, \22+sm_89\22]>} loc(#loc1)\0A  llvm.func @nvshmem_int_get_nbi(!llvm.ptr {llvm.noundef}, !llvm.ptr {llvm.noundef}, i64 {llvm.noundef}, i32 {llvm.noundef}) attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_unwind, passthrough = [[\22no-trapping-math\22, \22true\22], [\22stack-protector-buffer-size\22, \228\22], [\22target-cpu\22, \22sm_89\22]], target_cpu = \22sm_89\22, target_features = #llvm.target_features<[\22+ptx84\22, \22+sm_89\22]>} loc(#loc1)\0A} loc(#loc)\0A#loc = loc(\22/home/sw339864/promotion/mlir-research/data-race-detection-benchmark-suite/rmaracebench/nvshmem/gpuInitiated/sync//001-nvshmem-sync-barrierall-local-yes.cu\22:0:0)\0A\00") {addr_space = 0 : i32, alignment = 8 : i64, dso_local, section = ".nv_fatbin"} loc(#loc) llvm.mlir.global internal constant @__cuda_fatbin_wrapper() {addr_space = 0 : i32, alignment = 8 : i64, dso_local, section = ".nvFatBinSegment"} : !llvm.struct<(i32, i32, ptr, ptr)> { %0 = llvm.mlir.zero : !llvm.ptr loc(#loc1) %1 = llvm.mlir.addressof @mlir.llvm.nameless_global_1 : !llvm.ptr loc(#loc1) %c1_i32 = arith.constant 1 : i32 loc(#loc1) %c1180844977_i32 = arith.constant 1180844977 : i32 loc(#loc1) %2 = llvm.mlir.undef : !llvm.struct<(i32, i32, ptr, ptr)> loc(#loc1) %3 = llvm.insertvalue %c1180844977_i32, %2[0] : !llvm.struct<(i32, i32, ptr, ptr)> loc(#loc1) %4 = llvm.insertvalue %c1_i32, %3[1] : !llvm.struct<(i32, i32, ptr, ptr)> loc(#loc1) %5 = llvm.insertvalue %1, %4[2] : !llvm.struct<(i32, i32, 
ptr, ptr)> loc(#loc1) %6 = llvm.insertvalue %0, %5[3] : !llvm.struct<(i32, i32, ptr, ptr)> loc(#loc1) llvm.return %6 : !llvm.struct<(i32, i32, ptr, ptr)> loc(#loc) } loc(#loc) llvm.mlir.global internal @__cuda_gpubin_handle() {addr_space = 0 : i32, alignment = 8 : i64, dso_local} : !llvm.ptr { %0 = llvm.mlir.zero : !llvm.ptr loc(#loc1) llvm.return %0 : !llvm.ptr loc(#loc) } loc(#loc) llvm.mlir.global_ctors {ctors = [@__cuda_module_ctor], priorities = [65535 : i32]} loc(#loc) llvm.func @_Z29__device_stub__nvshmem_kernelPiS_(%arg0: !llvm.ptr {llvm.noundef} loc(unknown), %arg1: !llvm.ptr {llvm.noundef} loc(unknown)) attributes {frame_pointer = #llvm.framePointerKind<all>, no_inline, no_unwind, optimize_none, passthrough = ["mustprogress", "norecurse", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"], ["uniform-work-group-size", "true"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} { %c1_i32 = arith.constant 1 : i32 loc(#loc1) %c2_i64 = arith.constant 2 : i64 loc(#loc1) %c12_i64 = arith.constant 12 : i64 loc(#loc1) %0 = llvm.mlir.addressof @_Z29__device_stub__nvshmem_kernelPiS_ : !llvm.ptr loc(#loc1) %1 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) %2 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) %3 = llvm.alloca %c1_i32 x !llvm.struct<"struct.dim3", (i32, i32, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) %4 = llvm.alloca %c1_i32 x !llvm.struct<"struct.dim3", (i32, i32, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) %5 = llvm.alloca %c1_i32 x i64 {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) %6 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) %7 = llvm.alloca %c1_i32 x !llvm.struct<(i64, i32)> {alignment = 8 : i64} : 
(i32) -> !llvm.ptr loc(#loc1) %8 = llvm.alloca %c1_i32 x !llvm.struct<(i64, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) llvm.store %arg0, %1 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1) llvm.store %arg1, %2 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1) %9 = llvm.alloca %c2_i64 x !llvm.ptr {alignment = 16 : i64} : (i64) -> !llvm.ptr loc(#loc1) llvm.store %1, %9 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1) %10 = llvm.getelementptr %9[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr loc(#loc1) llvm.store %2, %10 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1) %11 = llvm.call @__cudaPopCallConfiguration(%3, %4, %5, %6) : (!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32 loc(#loc1) %12 = llvm.load %5 {alignment = 8 : i64} : !llvm.ptr -> i64 loc(#loc1) %13 = llvm.load %6 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) "llvm.intr.memcpy"(%7, %3, %c12_i64) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () loc(#loc1) %14 = llvm.getelementptr inbounds %7[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1) %15 = llvm.load %14 {alignment = 8 : i64} : !llvm.ptr -> i64 loc(#loc1) %16 = llvm.getelementptr inbounds %7[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1) %17 = llvm.load %16 {alignment = 8 : i64} : !llvm.ptr -> i32 loc(#loc1) "llvm.intr.memcpy"(%8, %4, %c12_i64) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () loc(#loc1) %18 = llvm.getelementptr inbounds %8[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1) %19 = llvm.load %18 {alignment = 8 : i64} : !llvm.ptr -> i64 loc(#loc1) %20 = llvm.getelementptr inbounds %8[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1) %21 = llvm.load %20 {alignment = 8 : i64} : !llvm.ptr -> i32 loc(#loc1) llvm.call @__mlir_launch_coerced_kernel__Z29__device_stub__nvshmem_kernelPiS_(%0, %15, %17, %19, %21, %12, %13, %arg0, %arg1) : (!llvm.ptr, i64, i32, i64, i32, i64, !llvm.ptr, 
!llvm.ptr, !llvm.ptr) -> () loc(#loc1) llvm.return loc(#loc1) } loc(#loc1) llvm.func @__cudaPopCallConfiguration(!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32 loc(#loc1) llvm.func @cudaLaunchKernel(!llvm.ptr, i64, i32, i64, i32, !llvm.ptr, i64, !llvm.ptr) -> i32 loc(#loc1) llvm.func @main(%arg0: i32 {llvm.noundef} loc(unknown), %arg1: !llvm.ptr {llvm.noundef} loc(unknown)) -> (i32 {llvm.noundef}) attributes {frame_pointer = #llvm.framePointerKind<all>, no_inline, no_unwind, optimize_none, passthrough = ["mustprogress", "norecurse", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} { %c1_i32 = arith.constant 1 : i32 loc(#loc1) %c0_i32 = arith.constant 0 : i32 loc(#loc1) %c2_i32 = arith.constant 2 : i32 loc(#loc1) %0 = llvm.mlir.addressof @".str" : !llvm.ptr loc(#loc1) %c4_i64 = arith.constant 4 : i64 loc(#loc1) %c0_i64 = arith.constant 0 : i64 loc(#loc1) %c12_i64 = arith.constant 12 : i64 loc(#loc1) %1 = llvm.mlir.addressof @_Z29__device_stub__nvshmem_kernelPiS_ : !llvm.ptr loc(#loc1) %2 = llvm.mlir.zero : !llvm.ptr loc(#loc1) %3 = llvm.mlir.addressof @".str.1" : !llvm.ptr loc(#loc1) %4 = llvm.mlir.addressof @".str.2" : !llvm.ptr loc(#loc1) %5 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) %6 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) %7 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) %8 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) %9 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) %10 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) %11 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr 
loc(#loc1) %12 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) %13 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) %14 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) %15 = llvm.alloca %c1_i32 x i64 {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) %16 = llvm.alloca %c1_i32 x !llvm.array<2 x ptr> {alignment = 16 : i64} : (i32) -> !llvm.ptr loc(#loc1) %17 = llvm.alloca %c1_i32 x !llvm.struct<"struct.dim3", (i32, i32, i32)> {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) %18 = llvm.alloca %c1_i32 x !llvm.struct<"struct.dim3", (i32, i32, i32)> {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) %19 = llvm.alloca %c1_i32 x !llvm.struct<"struct.dim3", (i32, i32, i32)> {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) %20 = llvm.alloca %c1_i32 x !llvm.struct<"struct.dim3", (i32, i32, i32)> {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) %21 = llvm.alloca %c1_i32 x !llvm.struct<(i64, i32)> {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) %22 = llvm.alloca %c1_i32 x !llvm.struct<(i64, i32)> {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) llvm.store %c0_i32, %5 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) llvm.store %arg0, %6 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) llvm.store %arg1, %7 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1) llvm.call @nvshmem_init() : () -> () loc(#loc1) %23 = llvm.call @nvshmem_team_my_pe(%c2_i32) : (i32) -> i32 loc(#loc1) llvm.store %23, %10 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) %24 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) %25 = llvm.call @cudaSetDevice(%24) : (i32) -> i32 loc(#loc1) %26 = llvm.call @nvshmem_my_pe() : () -> i32 loc(#loc1) llvm.store %26, %11 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) %27 = llvm.call @nvshmem_n_pes() : () -> i32 loc(#loc1) llvm.store %27, %12 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) %28 = 
llvm.load %12 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) %29 = arith.cmpi ne, %28, %c2_i32 : i32 loc(#loc1) cf.cond_br %29, ^bb1, ^bb2 loc(#loc1) ^bb1: // pred: ^bb0 %30 = llvm.load %12 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) %31 = llvm.call @printf(%0, %30, %c2_i32) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32) -> i32 loc(#loc1) llvm.call @nvshmem_global_exit(%c1_i32) : (i32) -> () loc(#loc1) cf.br ^bb2 loc(#loc1) ^bb2: // 2 preds: ^bb0, ^bb1 %32 = llvm.call @nvshmem_malloc(%c4_i64) : (i64) -> !llvm.ptr loc(#loc1) llvm.store %32, %13 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1) %33 = llvm.call @nvshmem_malloc(%c4_i64) : (i64) -> !llvm.ptr loc(#loc1) llvm.store %33, %14 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1) llvm.store %c0_i64, %15 {alignment = 8 : i64} : i64, !llvm.ptr loc(#loc1) %34 = llvm.load %13 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) llvm.store %34, %16 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1) %35 = llvm.getelementptr inbounds %16[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr loc(#loc1) %36 = llvm.load %14 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) llvm.store %36, %35 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1) llvm.call @_ZN4dim3C2Ejjj(%17, %c1_i32, %c1_i32, %c1_i32) {no_unwind} : (!llvm.ptr, i32, i32, i32) -> () loc(#loc1) llvm.call @_ZN4dim3C2Ejjj(%18, %c1_i32, %c1_i32, %c1_i32) {no_unwind} : (!llvm.ptr, i32, i32, i32) -> () loc(#loc1) "llvm.intr.memcpy"(%19, %17, %c12_i64) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () loc(#loc1) "llvm.intr.memcpy"(%20, %18, %c12_i64) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () loc(#loc1) %37 = llvm.getelementptr inbounds %16[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x ptr> loc(#loc1) %38 = llvm.load %15 {alignment = 8 : i64} : !llvm.ptr -> i64 loc(#loc1) "llvm.intr.memcpy"(%21, %19, %c12_i64) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () loc(#loc1) %39 = 
llvm.getelementptr inbounds %21[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1) %40 = llvm.load %39 {alignment = 4 : i64} : !llvm.ptr -> i64 loc(#loc1) %41 = llvm.getelementptr inbounds %21[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1) %42 = llvm.load %41 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) "llvm.intr.memcpy"(%22, %20, %c12_i64) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () loc(#loc1) %43 = llvm.getelementptr inbounds %22[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1) %44 = llvm.load %43 {alignment = 4 : i64} : !llvm.ptr -> i64 loc(#loc1) %45 = llvm.getelementptr inbounds %22[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1) %46 = llvm.load %45 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) %47 = llvm.call @nvshmemx_collective_launch(%1, %40, %42, %44, %46, %37, %38, %2) : (!llvm.ptr, i64, i32, i64, i32, !llvm.ptr, i64, !llvm.ptr) -> i32 loc(#loc1) %48 = llvm.load %13 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) %49 = llvm.call @cudaMemcpy(%8, %48, %c4_i64, %c2_i32) : (!llvm.ptr, !llvm.ptr, i64, i32) -> i32 loc(#loc1) %50 = llvm.load %14 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) %51 = llvm.call @cudaMemcpy(%9, %50, %c4_i64, %c2_i32) : (!llvm.ptr, !llvm.ptr, i64, i32) -> i32 loc(#loc1) llvm.call @nvshmem_barrier_all() : () -> () loc(#loc1) %52 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) %53 = llvm.load %9 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) %54 = llvm.load %8 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) %55 = llvm.call @printf(%3, %52, %53, %54) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32, i32) -> i32 loc(#loc1) llvm.call @nvshmem_barrier_all() : () -> () loc(#loc1) %56 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) %57 = llvm.load %8 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) %58 = llvm.load %9 {alignment = 4 : i64} : 
!llvm.ptr -> i32 loc(#loc1) %59 = llvm.call @printf(%4, %56, %57, %58) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32, i32) -> i32 loc(#loc1) %60 = llvm.load %13 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) llvm.call @nvshmem_free(%60) : (!llvm.ptr) -> () loc(#loc1) %61 = llvm.load %14 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) llvm.call @nvshmem_free(%61) : (!llvm.ptr) -> () loc(#loc1) llvm.call @nvshmem_finalize() : () -> () loc(#loc1) llvm.return %c0_i32 : i32 loc(#loc1) } loc(#loc1) llvm.func @nvshmem_init() attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1) llvm.func @nvshmem_team_my_pe(i32 {llvm.noundef}) -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1) llvm.func @cudaSetDevice(i32 {llvm.noundef}) -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1) llvm.func @nvshmem_my_pe() -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1) 
llvm.func @nvshmem_n_pes() -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1) llvm.func @printf(!llvm.ptr {llvm.noundef}, ...) -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1) llvm.func @nvshmem_global_exit(i32 {llvm.noundef}) attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1) llvm.func @nvshmem_malloc(i64 {llvm.noundef}) -> !llvm.ptr attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1) llvm.func linkonce_odr unnamed_addr @_ZN4dim3C2Ejjj(%arg0: !llvm.ptr {llvm.align = 4 : i64, llvm.dereferenceable = 12 : i64, llvm.nonnull, llvm.noundef} loc(unknown), %arg1: i32 {llvm.noundef} loc(unknown), %arg2: i32 {llvm.noundef} loc(unknown), %arg3: i32 {llvm.noundef} loc(unknown)) comdat(@__llvm_global_comdat::@_ZN4dim3C2Ejjj) attributes {alignment = 2 : i64, frame_pointer = #llvm.framePointerKind<all>, no_inline, no_unwind, optimize_none, passthrough = ["mustprogress", ["uwtable", "2"], 
["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} { %c1_i32 = arith.constant 1 : i32 loc(#loc1) %0 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1) %1 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) %2 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) %3 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1) llvm.store %arg0, %0 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1) llvm.store %arg1, %1 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) llvm.store %arg2, %2 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) llvm.store %arg3, %3 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) %4 = llvm.load %0 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) %5 = llvm.getelementptr inbounds %4[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"struct.dim3", (i32, i32, i32)> loc(#loc1) %6 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) llvm.store %6, %5 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) %7 = llvm.getelementptr inbounds %4[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"struct.dim3", (i32, i32, i32)> loc(#loc1) %8 = llvm.load %2 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) llvm.store %8, %7 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) %9 = llvm.getelementptr inbounds %4[0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"struct.dim3", (i32, i32, i32)> loc(#loc1) %10 = llvm.load %3 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1) llvm.store %10, %9 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1) llvm.return loc(#loc1) } loc(#loc1) llvm.func @nvshmemx_collective_launch(!llvm.ptr {llvm.noundef}, i64, i32, i64, i32, !llvm.ptr {llvm.noundef}, i64 {llvm.noundef}, 
!llvm.ptr {llvm.noundef}) -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1) llvm.func @cudaMemcpy(!llvm.ptr {llvm.noundef}, !llvm.ptr {llvm.noundef}, i64 {llvm.noundef}, i32 {llvm.noundef}) -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1) llvm.func @nvshmem_barrier_all() attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1) llvm.func @nvshmem_free(!llvm.ptr {llvm.noundef}) attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1) llvm.func @nvshmem_finalize() attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1) llvm.func internal @__cuda_register_globals(%arg0: !llvm.ptr loc(unknown)) attributes {dso_local} { %0 = 
llvm.mlir.addressof @_Z29__device_stub__nvshmem_kernelPiS_ : !llvm.ptr loc(#loc1) %1 = llvm.mlir.addressof @mlir.llvm.nameless_global_0 : !llvm.ptr loc(#loc1) %c-1_i32 = arith.constant -1 : i32 loc(#loc1) %2 = llvm.mlir.zero : !llvm.ptr loc(#loc1) %3 = llvm.call @__cudaRegisterFunction(%arg0, %0, %1, %1, %c-1_i32, %2, %2, %2, %2, %2) : (!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32 loc(#loc1) llvm.return loc(#loc1) } loc(#loc1) llvm.func @__cudaRegisterFunction(!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32 loc(#loc1) llvm.func @__cudaRegisterVar(!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, i32, i64, i32, i32) loc(#loc1) llvm.func @__cudaRegisterManagedVar(!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, i64, i32) loc(#loc1) llvm.func @__cudaRegisterSurface(!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, i32, i32) loc(#loc1) llvm.func @__cudaRegisterTexture(!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, i32, i32, i32) loc(#loc1) llvm.func @__cudaRegisterFatBinary(!llvm.ptr) -> !llvm.ptr loc(#loc1) llvm.func internal @__cuda_module_ctor() attributes {dso_local} { %0 = llvm.mlir.addressof @__cuda_fatbin_wrapper : !llvm.ptr loc(#loc1) %1 = llvm.mlir.addressof @__cuda_gpubin_handle : !llvm.ptr loc(#loc1) %2 = llvm.mlir.addressof @__cuda_module_dtor : !llvm.ptr loc(#loc1) %3 = llvm.call @__cudaRegisterFatBinary(%0) : (!llvm.ptr) -> !llvm.ptr loc(#loc1) llvm.store %3, %1 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1) llvm.call @__cuda_register_globals(%3) : (!llvm.ptr) -> () loc(#loc1) llvm.call @__cudaRegisterFatBinaryEnd(%3) : (!llvm.ptr) -> () loc(#loc1) %4 = llvm.call @atexit(%2) : (!llvm.ptr) -> i32 loc(#loc1) llvm.return loc(#loc1) } loc(#loc1) llvm.func @__cudaRegisterFatBinaryEnd(!llvm.ptr) loc(#loc1) llvm.func @__cudaUnregisterFatBinary(!llvm.ptr) loc(#loc1) llvm.func internal @__cuda_module_dtor() attributes {dso_local} { %0 = 
llvm.mlir.addressof @__cuda_gpubin_handle : !llvm.ptr loc(#loc1) %1 = llvm.load %0 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1) llvm.call @__cudaUnregisterFatBinary(%1) : (!llvm.ptr) -> () loc(#loc1) llvm.return loc(#loc1) } loc(#loc1) llvm.func @atexit(!llvm.ptr) -> i32 loc(#loc1) llvm.func @__mlir_launch_coerced_kernel__Z29__device_stub__nvshmem_kernelPiS_(!llvm.ptr, i64, i32, i64, i32, i64, !llvm.ptr, !llvm.ptr, !llvm.ptr) loc(#loc1) } loc(#loc) #loc = loc("/home/sw339864/promotion/mlir-research/data-race-detection-benchmark-suite/rmaracebench/nvshmem/gpuInitiated/sync//001-nvshmem-sync-barrierall-local-yes.cu":0:0)