#loc1 = loc(unknown)
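// Host-side LLVM-dialect MLIR module (x86_64-unknown-linux-gnu) emitted by a
// clang/MLIR CUDA+NVSHMEM flow. It contains the embedded GPU module, the
// fatbin payload and wrapper, the kernel launch stub, the host driver @main,
// and the CUDA runtime registration machinery (ctor/dtor).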
module attributes {dlti.dl_spec = #dlti.dl_spec<i128 = dense<128> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, !llvm.ptr<271> = dense<32> : vector<4xi64>, !llvm.ptr<272> = dense<64> : vector<4xi64>, f128 = dense<128> : vector<2xi64>, !llvm.ptr<270> = dense<32> : vector<4xi64>, f16 = dense<16> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i1 = dense<8> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, f80 = dense<128> : vector<2xi64>, "dlti.endianness" = "little", "dlti.stack_alignment" = 128 : i64>, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "clang version 20.0.0git (git@github.com:ivanradanov/llvm-project.git 872c28cfdf6140fafac11eddbb5895f11bc6f295)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
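  // Device module for the NVPTX side. Note the #nvvm.target attribute names
  // chip sm_80 while the functions below request target_cpu = "sm_89".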
  gpu.module @__mlir_gpu_module [#nvvm.target<chip = "sm_80">] attributes {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>, llvm.data_layout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"} {
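    // Unoptimized kernel (every local in an alloca): *arg0 = 0, *arg1 = 1,
    // barrier; then PE 0 issues a non-blocking nvshmem_int_get_nbi of one int
    // from PE 1's arg0 into its local arg1 and immediately reads *arg1 with
    // no intervening nvshmem_quiet. Per the benchmark filename
    // (001-nvshmem-sync-barrierall-local-yes.cu) this appears to be the
    // intended local data race. Both paths barrier again before every PE
    // adds the (possibly stale) fetched value into *arg0.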
    llvm.func ptx_kernelcc @_Z14nvshmem_kernelPiS_(%arg0: !llvm.ptr {llvm.noundef} loc(unknown), %arg1: !llvm.ptr {llvm.noundef} loc(unknown)) attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_inline, no_unwind, optimize_none, passthrough = ["mustprogress", "norecurse", ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "sm_89"], ["uniform-work-group-size", "true"]], target_cpu = "sm_89", target_features = #llvm.target_features<["+ptx84", "+sm_89"]>} {
      %0 = llvm.mlir.constant(1 : i32) : i32 loc(#loc1)
      %1 = llvm.mlir.constant(0 : i32) : i32 loc(#loc1)
      %2 = llvm.mlir.constant(1 : i64) : i64 loc(#loc1)
      %3 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
      %4 = llvm.alloca %0 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
      %5 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
      %6 = llvm.alloca %0 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
      llvm.store %arg0, %3 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)
      llvm.store %arg1, %4 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)
      %7 = llvm.load %3 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
      llvm.store %1, %7 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
      %8 = llvm.load %4 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
      llvm.store %0, %8 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
      llvm.store %1, %5 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
      %9 = llvm.call @nvshmem_my_pe() {convergent, no_unwind} : () -> i32 loc(#loc1)
      llvm.store %9, %6 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
      llvm.call @nvshmem_barrier_all() {convergent, no_unwind} : () -> () loc(#loc1)
      %10 = llvm.load %6 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
      %11 = llvm.icmp "eq" %10, %1 : i32 loc(#loc1)
      cf.cond_br %11, ^bb1, ^bb2 loc(#loc1)
    ^bb1:  // pred: ^bb0
      %12 = llvm.load %4 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
      %13 = llvm.load %3 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
      llvm.call @nvshmem_int_get_nbi(%12, %13, %2, %0) {convergent, no_unwind} : (!llvm.ptr, !llvm.ptr, i64, i32) -> () loc(#loc1)
      %14 = llvm.load %4 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
      %15 = llvm.load %14 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
      llvm.store %15, %5 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
      llvm.call @nvshmem_barrier_all() {convergent, no_unwind} : () -> () loc(#loc1)
      cf.br ^bb3 loc(#loc1)
    ^bb2:  // pred: ^bb0
      llvm.call @nvshmem_barrier_all() {convergent, no_unwind} : () -> () loc(#loc1)
      cf.br ^bb3 loc(#loc1)
    ^bb3:  // 2 preds: ^bb1, ^bb2
      %16 = llvm.load %5 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
      %17 = llvm.load %3 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
      %18 = llvm.load %17 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
      %19 = llvm.add %18, %16 : i32 loc(#loc1)
      llvm.store %19, %17 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
      llvm.return loc(#loc1)
    } loc(#loc1)
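    // Device-side NVSHMEM declarations, presumably resolved against the
    // NVSHMEM device library at link time.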
    llvm.func @nvshmem_my_pe() -> i32 attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_unwind, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "sm_89"]], target_cpu = "sm_89", target_features = #llvm.target_features<["+ptx84", "+sm_89"]>} loc(#loc1)
    llvm.func @nvshmem_barrier_all() attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_unwind, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "sm_89"]], target_cpu = "sm_89", target_features = #llvm.target_features<["+ptx84", "+sm_89"]>} loc(#loc1)
    llvm.func @nvshmem_int_get_nbi(!llvm.ptr {llvm.noundef}, !llvm.ptr {llvm.noundef}, i64 {llvm.noundef}, i32 {llvm.noundef}) attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_unwind, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "sm_89"]], target_cpu = "sm_89", target_features = #llvm.target_features<["+ptx84", "+sm_89"]>} loc(#loc1)
  } loc(#loc)
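  // COMDAT group for the linkonce_odr dim3 constructor defined further down.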
  llvm.comdat @__llvm_global_comdat {
    llvm.comdat_selector @_ZN4dim3C2Ejjj any loc(#loc)
  } loc(#loc)
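  // printf format strings used by @main on the host.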
  llvm.mlir.global private unnamed_addr constant @".str"("Got %d PEs, expected %d\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local} loc(#loc)
  llvm.mlir.global private unnamed_addr constant @".str.1"("PE %d: localbuf = %d, remote = %d\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local} loc(#loc)
  llvm.mlir.global private unnamed_addr constant @".str.2"("Process %d: Execution finished, variable contents: remote = %d, localbuf = %d\0A\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local} loc(#loc)
  llvm.mlir.global private unnamed_addr constant @mlir.llvm.nameless_global_0("_Z14nvshmem_kernelPiS_\00") {addr_space = 0 : i32, alignment = 1 : i64, dso_local} loc(#loc)
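  // Fatbin payload placed in .nv_fatbin. In this pipeline the payload is the
  // NUL-terminated textual MLIR of the device module itself rather than a
  // compiled PTX/cubin fatbinary.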
  llvm.mlir.global private constant @mlir.llvm.nameless_global_1("#loc1 = loc(unknown)\0Amodule attributes {dlti.dl_spec = #dlti.dl_spec<i8 = dense<8> : vector<2xi64>, i16 = dense<16> : vector<2xi64>, i32 = dense<32> : vector<2xi64>, f16 = dense<16> : vector<2xi64>, f128 = dense<128> : vector<2xi64>, f64 = dense<64> : vector<2xi64>, i64 = dense<64> : vector<2xi64>, i128 = dense<128> : vector<2xi64>, !llvm.ptr = dense<64> : vector<4xi64>, i1 = dense<8> : vector<2xi64>, \22dlti.endianness\22 = \22little\22>, llvm.data_layout = \22e-i64:64-i128:128-v16:16-v32:32-n16:32:64\22, llvm.target_triple = \22nvptx64-nvidia-cuda\22} {\0A  llvm.func ptx_kernelcc @_Z14nvshmem_kernelPiS_(%arg0: !llvm.ptr {llvm.noundef} loc(unknown), %arg1: !llvm.ptr {llvm.noundef} loc(unknown)) attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_inline, no_unwind, optimize_none, passthrough = [\22mustprogress\22, \22norecurse\22, [\22no-trapping-math\22, \22true\22], [\22stack-protector-buffer-size\22, \228\22], [\22target-cpu\22, \22sm_89\22], [\22uniform-work-group-size\22, \22true\22]], target_cpu = \22sm_89\22, target_features = #llvm.target_features<[\22+ptx84\22, \22+sm_89\22]>} {\0A    %c1_i32 = arith.constant 1 : i32 loc(#loc1)\0A    %c0_i32 = arith.constant 0 : i32 loc(#loc1)\0A    %c1_i64 = arith.constant 1 : i64 loc(#loc1)\0A    %0 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)\0A    %1 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)\0A    %2 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)\0A    %3 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)\0A    llvm.store %arg0, %0 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)\0A    llvm.store %arg1, %1 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)\0A    %4 = llvm.load %0 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)\0A    llvm.store %c0_i32, %4 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)\0A    %5 = llvm.load %1 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)\0A    llvm.store %c1_i32, %5 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)\0A    llvm.store %c0_i32, %2 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)\0A    %6 = llvm.call @nvshmem_my_pe() {convergent, no_unwind} : () -> i32 loc(#loc1)\0A    llvm.store %6, %3 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)\0A    llvm.call @nvshmem_barrier_all() {convergent, no_unwind} : () -> () loc(#loc1)\0A    %7 = llvm.load %3 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)\0A    %8 = arith.cmpi eq, %7, %c0_i32 : i32 loc(#loc1)\0A    cf.cond_br %8, ^bb1, ^bb2 loc(#loc1)\0A  ^bb1:  // pred: ^bb0\0A    %9 = llvm.load %1 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)\0A    %10 = llvm.load %0 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)\0A    llvm.call @nvshmem_int_get_nbi(%9, %10, %c1_i64, %c1_i32) {convergent, no_unwind} : (!llvm.ptr, !llvm.ptr, i64, i32) -> () loc(#loc1)\0A    %11 = llvm.load %1 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)\0A    %12 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)\0A    llvm.store %12, %2 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)\0A    llvm.call @nvshmem_barrier_all() {convergent, no_unwind} : () -> () loc(#loc1)\0A    cf.br ^bb3 loc(#loc1)\0A  ^bb2:  // pred: ^bb0\0A    llvm.call @nvshmem_barrier_all() {convergent, no_unwind} : () -> () loc(#loc1)\0A    cf.br ^bb3 loc(#loc1)\0A  ^bb3:  // 2 preds: ^bb1, 
^bb2\0A    %13 = llvm.load %2 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)\0A    %14 = llvm.load %0 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)\0A    %15 = llvm.load %14 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)\0A    %16 = arith.addi %15, %13 : i32 loc(#loc1)\0A    llvm.store %16, %14 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)\0A    llvm.return loc(#loc1)\0A  } loc(#loc1)\0A  llvm.func @nvshmem_my_pe() -> i32 attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_unwind, passthrough = [[\22no-trapping-math\22, \22true\22], [\22stack-protector-buffer-size\22, \228\22], [\22target-cpu\22, \22sm_89\22]], target_cpu = \22sm_89\22, target_features = #llvm.target_features<[\22+ptx84\22, \22+sm_89\22]>} loc(#loc1)\0A  llvm.func @nvshmem_barrier_all() attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_unwind, passthrough = [[\22no-trapping-math\22, \22true\22], [\22stack-protector-buffer-size\22, \228\22], [\22target-cpu\22, \22sm_89\22]], target_cpu = \22sm_89\22, target_features = #llvm.target_features<[\22+ptx84\22, \22+sm_89\22]>} loc(#loc1)\0A  llvm.func @nvshmem_int_get_nbi(!llvm.ptr {llvm.noundef}, !llvm.ptr {llvm.noundef}, i64 {llvm.noundef}, i32 {llvm.noundef}) attributes {convergent, frame_pointer = #llvm.framePointerKind<all>, no_unwind, passthrough = [[\22no-trapping-math\22, \22true\22], [\22stack-protector-buffer-size\22, \228\22], [\22target-cpu\22, \22sm_89\22]], target_cpu = \22sm_89\22, target_features = #llvm.target_features<[\22+ptx84\22, \22+sm_89\22]>} loc(#loc1)\0A} loc(#loc)\0A#loc = loc(\22/home/sw339864/promotion/mlir-research/data-race-detection-benchmark-suite/rmaracebench/nvshmem/gpuInitiated/sync//001-nvshmem-sync-barrierall-local-yes.cu\22:0:0)\0A\00") {addr_space = 0 : i32, alignment = 8 : i64, dso_local, section = ".nv_fatbin"} loc(#loc)
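  // CUDA fatbin wrapper record {magic, version, fatbin ptr, null}: the magic
  // 1180844977 is 0x466243B1, the standard fatbinary-wrapper magic, with
  // version 1 and a pointer to the payload above.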
  llvm.mlir.global internal constant @__cuda_fatbin_wrapper() {addr_space = 0 : i32, alignment = 8 : i64, dso_local, section = ".nvFatBinSegment"} : !llvm.struct<(i32, i32, ptr, ptr)> {
    %0 = llvm.mlir.zero : !llvm.ptr loc(#loc1)
    %1 = llvm.mlir.addressof @mlir.llvm.nameless_global_1 : !llvm.ptr loc(#loc1)
    %c1_i32 = arith.constant 1 : i32 loc(#loc1)
    %c1180844977_i32 = arith.constant 1180844977 : i32 loc(#loc1)
    %2 = llvm.mlir.undef : !llvm.struct<(i32, i32, ptr, ptr)> loc(#loc1)
    %3 = llvm.insertvalue %c1180844977_i32, %2[0] : !llvm.struct<(i32, i32, ptr, ptr)>  loc(#loc1)
    %4 = llvm.insertvalue %c1_i32, %3[1] : !llvm.struct<(i32, i32, ptr, ptr)>  loc(#loc1)
    %5 = llvm.insertvalue %1, %4[2] : !llvm.struct<(i32, i32, ptr, ptr)>  loc(#loc1)
    %6 = llvm.insertvalue %0, %5[3] : !llvm.struct<(i32, i32, ptr, ptr)>  loc(#loc1)
    llvm.return %6 : !llvm.struct<(i32, i32, ptr, ptr)> loc(#loc)
  } loc(#loc)
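  // Handle returned by __cudaRegisterFatBinary; written by the module ctor
  // and read back by the dtor.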
  llvm.mlir.global internal @__cuda_gpubin_handle() {addr_space = 0 : i32, alignment = 8 : i64, dso_local} : !llvm.ptr {
    %0 = llvm.mlir.zero : !llvm.ptr loc(#loc1)
    llvm.return %0 : !llvm.ptr loc(#loc)
  } loc(#loc)
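  // Run @__cuda_module_ctor at startup (default priority 65535).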
  llvm.mlir.global_ctors {ctors = [@__cuda_module_ctor], priorities = [65535 : i32]} loc(#loc)
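  // Host-side kernel stub: stores the two pointer arguments into an argument
  // array, pops the launch configuration pushed by the <<<...>>> syntax via
  // __cudaPopCallConfiguration, unpacks the two dim3 values, and forwards
  // everything to the coerced launch wrapper declared at the end of the module.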
  llvm.func @_Z29__device_stub__nvshmem_kernelPiS_(%arg0: !llvm.ptr {llvm.noundef} loc(unknown), %arg1: !llvm.ptr {llvm.noundef} loc(unknown)) attributes {frame_pointer = #llvm.framePointerKind<all>, no_inline, no_unwind, optimize_none, passthrough = ["mustprogress", "norecurse", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"], ["uniform-work-group-size", "true"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
    %c1_i32 = arith.constant 1 : i32 loc(#loc1)
    %c2_i64 = arith.constant 2 : i64 loc(#loc1)
    %c12_i64 = arith.constant 12 : i64 loc(#loc1)
    %0 = llvm.mlir.addressof @_Z29__device_stub__nvshmem_kernelPiS_ : !llvm.ptr loc(#loc1)
    %1 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %2 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %3 = llvm.alloca %c1_i32 x !llvm.struct<"struct.dim3", (i32, i32, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %4 = llvm.alloca %c1_i32 x !llvm.struct<"struct.dim3", (i32, i32, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %5 = llvm.alloca %c1_i32 x i64 {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %6 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %7 = llvm.alloca %c1_i32 x !llvm.struct<(i64, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %8 = llvm.alloca %c1_i32 x !llvm.struct<(i64, i32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    llvm.store %arg0, %1 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)
    llvm.store %arg1, %2 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)
    %9 = llvm.alloca %c2_i64 x !llvm.ptr {alignment = 16 : i64} : (i64) -> !llvm.ptr loc(#loc1)
    llvm.store %1, %9 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)
    %10 = llvm.getelementptr %9[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr loc(#loc1)
    llvm.store %2, %10 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)
    %11 = llvm.call @__cudaPopCallConfiguration(%3, %4, %5, %6) : (!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32 loc(#loc1)
    %12 = llvm.load %5 {alignment = 8 : i64} : !llvm.ptr -> i64 loc(#loc1)
    %13 = llvm.load %6 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
    "llvm.intr.memcpy"(%7, %3, %c12_i64) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () loc(#loc1)
    %14 = llvm.getelementptr inbounds %7[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1)
    %15 = llvm.load %14 {alignment = 8 : i64} : !llvm.ptr -> i64 loc(#loc1)
    %16 = llvm.getelementptr inbounds %7[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1)
    %17 = llvm.load %16 {alignment = 8 : i64} : !llvm.ptr -> i32 loc(#loc1)
    "llvm.intr.memcpy"(%8, %4, %c12_i64) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () loc(#loc1)
    %18 = llvm.getelementptr inbounds %8[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1)
    %19 = llvm.load %18 {alignment = 8 : i64} : !llvm.ptr -> i64 loc(#loc1)
    %20 = llvm.getelementptr inbounds %8[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1)
    %21 = llvm.load %20 {alignment = 8 : i64} : !llvm.ptr -> i32 loc(#loc1)
    llvm.call @__mlir_launch_coerced_kernel__Z29__device_stub__nvshmem_kernelPiS_(%0, %15, %17, %19, %21, %12, %13, %arg0, %arg1) : (!llvm.ptr, i64, i32, i64, i32, i64, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> () loc(#loc1)
    llvm.return loc(#loc1)
  } loc(#loc1)
  llvm.func @__cudaPopCallConfiguration(!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32 loc(#loc1)
  llvm.func @cudaLaunchKernel(!llvm.ptr, i64, i32, i64, i32, !llvm.ptr, i64, !llvm.ptr) -> i32 loc(#loc1)
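  // Host driver: initialize NVSHMEM, pick the CUDA device from the node-local
  // PE id (team constant 2, presumably NVSHMEMX_TEAM_NODE), abort via
  // nvshmem_global_exit unless exactly 2 PEs are present, allocate two 4-byte
  // symmetric buffers, launch the kernel collectively on a 1x1x1 grid/block,
  // copy both ints back (cudaMemcpy kind 2 = cudaMemcpyDeviceToHost), print
  // them, then free and finalize.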
  llvm.func @main(%arg0: i32 {llvm.noundef} loc(unknown), %arg1: !llvm.ptr {llvm.noundef} loc(unknown)) -> (i32 {llvm.noundef}) attributes {frame_pointer = #llvm.framePointerKind<all>, no_inline, no_unwind, optimize_none, passthrough = ["mustprogress", "norecurse", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
    %c1_i32 = arith.constant 1 : i32 loc(#loc1)
    %c0_i32 = arith.constant 0 : i32 loc(#loc1)
    %c2_i32 = arith.constant 2 : i32 loc(#loc1)
    %0 = llvm.mlir.addressof @".str" : !llvm.ptr loc(#loc1)
    %c4_i64 = arith.constant 4 : i64 loc(#loc1)
    %c0_i64 = arith.constant 0 : i64 loc(#loc1)
    %c12_i64 = arith.constant 12 : i64 loc(#loc1)
    %1 = llvm.mlir.addressof @_Z29__device_stub__nvshmem_kernelPiS_ : !llvm.ptr loc(#loc1)
    %2 = llvm.mlir.zero : !llvm.ptr loc(#loc1)
    %3 = llvm.mlir.addressof @".str.1" : !llvm.ptr loc(#loc1)
    %4 = llvm.mlir.addressof @".str.2" : !llvm.ptr loc(#loc1)
    %5 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %6 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %7 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %8 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %9 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %10 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %11 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %12 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %13 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %14 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %15 = llvm.alloca %c1_i32 x i64 {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %16 = llvm.alloca %c1_i32 x !llvm.array<2 x ptr> {alignment = 16 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %17 = llvm.alloca %c1_i32 x !llvm.struct<"struct.dim3", (i32, i32, i32)> {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %18 = llvm.alloca %c1_i32 x !llvm.struct<"struct.dim3", (i32, i32, i32)> {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %19 = llvm.alloca %c1_i32 x !llvm.struct<"struct.dim3", (i32, i32, i32)> {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %20 = llvm.alloca %c1_i32 x !llvm.struct<"struct.dim3", (i32, i32, i32)> {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %21 = llvm.alloca %c1_i32 x !llvm.struct<(i64, i32)> {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %22 = llvm.alloca %c1_i32 x !llvm.struct<(i64, i32)> {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    llvm.store %c0_i32, %5 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
    llvm.store %arg0, %6 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
    llvm.store %arg1, %7 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)
    llvm.call @nvshmem_init() : () -> () loc(#loc1)
    %23 = llvm.call @nvshmem_team_my_pe(%c2_i32) : (i32) -> i32 loc(#loc1)
    llvm.store %23, %10 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
    %24 = llvm.load %10 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    %25 = llvm.call @cudaSetDevice(%24) : (i32) -> i32 loc(#loc1)
    %26 = llvm.call @nvshmem_my_pe() : () -> i32 loc(#loc1)
    llvm.store %26, %11 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
    %27 = llvm.call @nvshmem_n_pes() : () -> i32 loc(#loc1)
    llvm.store %27, %12 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
    %28 = llvm.load %12 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    %29 = arith.cmpi ne, %28, %c2_i32 : i32 loc(#loc1)
    cf.cond_br %29, ^bb1, ^bb2 loc(#loc1)
  ^bb1:  // pred: ^bb0
    %30 = llvm.load %12 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    %31 = llvm.call @printf(%0, %30, %c2_i32) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32) -> i32 loc(#loc1)
    llvm.call @nvshmem_global_exit(%c1_i32) : (i32) -> () loc(#loc1)
    cf.br ^bb2 loc(#loc1)
  ^bb2:  // 2 preds: ^bb0, ^bb1
    %32 = llvm.call @nvshmem_malloc(%c4_i64) : (i64) -> !llvm.ptr loc(#loc1)
    llvm.store %32, %13 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)
    %33 = llvm.call @nvshmem_malloc(%c4_i64) : (i64) -> !llvm.ptr loc(#loc1)
    llvm.store %33, %14 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)
    llvm.store %c0_i64, %15 {alignment = 8 : i64} : i64, !llvm.ptr loc(#loc1)
    %34 = llvm.load %13 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
    llvm.store %34, %16 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)
    %35 = llvm.getelementptr inbounds %16[1] : (!llvm.ptr) -> !llvm.ptr, !llvm.ptr loc(#loc1)
    %36 = llvm.load %14 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
    llvm.store %36, %35 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)
    llvm.call @_ZN4dim3C2Ejjj(%17, %c1_i32, %c1_i32, %c1_i32) {no_unwind} : (!llvm.ptr, i32, i32, i32) -> () loc(#loc1)
    llvm.call @_ZN4dim3C2Ejjj(%18, %c1_i32, %c1_i32, %c1_i32) {no_unwind} : (!llvm.ptr, i32, i32, i32) -> () loc(#loc1)
    "llvm.intr.memcpy"(%19, %17, %c12_i64) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () loc(#loc1)
    "llvm.intr.memcpy"(%20, %18, %c12_i64) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () loc(#loc1)
    %37 = llvm.getelementptr inbounds %16[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<2 x ptr> loc(#loc1)
    %38 = llvm.load %15 {alignment = 8 : i64} : !llvm.ptr -> i64 loc(#loc1)
    "llvm.intr.memcpy"(%21, %19, %c12_i64) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () loc(#loc1)
    %39 = llvm.getelementptr inbounds %21[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1)
    %40 = llvm.load %39 {alignment = 4 : i64} : !llvm.ptr -> i64 loc(#loc1)
    %41 = llvm.getelementptr inbounds %21[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1)
    %42 = llvm.load %41 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    "llvm.intr.memcpy"(%22, %20, %c12_i64) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i64) -> () loc(#loc1)
    %43 = llvm.getelementptr inbounds %22[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1)
    %44 = llvm.load %43 {alignment = 4 : i64} : !llvm.ptr -> i64 loc(#loc1)
    %45 = llvm.getelementptr inbounds %22[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<(i64, i32)> loc(#loc1)
    %46 = llvm.load %45 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    %47 = llvm.call @nvshmemx_collective_launch(%1, %40, %42, %44, %46, %37, %38, %2) : (!llvm.ptr, i64, i32, i64, i32, !llvm.ptr, i64, !llvm.ptr) -> i32 loc(#loc1)
    %48 = llvm.load %13 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
    %49 = llvm.call @cudaMemcpy(%8, %48, %c4_i64, %c2_i32) : (!llvm.ptr, !llvm.ptr, i64, i32) -> i32 loc(#loc1)
    %50 = llvm.load %14 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
    %51 = llvm.call @cudaMemcpy(%9, %50, %c4_i64, %c2_i32) : (!llvm.ptr, !llvm.ptr, i64, i32) -> i32 loc(#loc1)
    llvm.call @nvshmem_barrier_all() : () -> () loc(#loc1)
    %52 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    %53 = llvm.load %9 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    %54 = llvm.load %8 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    %55 = llvm.call @printf(%3, %52, %53, %54) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32, i32) -> i32 loc(#loc1)
    llvm.call @nvshmem_barrier_all() : () -> () loc(#loc1)
    %56 = llvm.load %11 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    %57 = llvm.load %8 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    %58 = llvm.load %9 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    %59 = llvm.call @printf(%4, %56, %57, %58) vararg(!llvm.func<i32 (ptr, ...)>) : (!llvm.ptr, i32, i32, i32) -> i32 loc(#loc1)
    %60 = llvm.load %13 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
    llvm.call @nvshmem_free(%60) : (!llvm.ptr) -> () loc(#loc1)
    %61 = llvm.load %14 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
    llvm.call @nvshmem_free(%61) : (!llvm.ptr) -> () loc(#loc1)
    llvm.call @nvshmem_finalize() : () -> () loc(#loc1)
    llvm.return %c0_i32 : i32 loc(#loc1)
  } loc(#loc1)
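  // Host-side NVSHMEM and CUDA runtime API declarations.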
  llvm.func @nvshmem_init() attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1)
  llvm.func @nvshmem_team_my_pe(i32 {llvm.noundef}) -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1)
  llvm.func @cudaSetDevice(i32 {llvm.noundef}) -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1)
  llvm.func @nvshmem_my_pe() -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1)
  llvm.func @nvshmem_n_pes() -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1)
  llvm.func @printf(!llvm.ptr {llvm.noundef}, ...) -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1)
  llvm.func @nvshmem_global_exit(i32 {llvm.noundef}) attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1)
  llvm.func @nvshmem_malloc(i64 {llvm.noundef}) -> !llvm.ptr attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1)
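  // dim3::dim3(unsigned, unsigned, unsigned): stores the three components
  // field by field; linkonce_odr in the COMDAT group declared above.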
  llvm.func linkonce_odr unnamed_addr @_ZN4dim3C2Ejjj(%arg0: !llvm.ptr {llvm.align = 4 : i64, llvm.dereferenceable = 12 : i64, llvm.nonnull, llvm.noundef} loc(unknown), %arg1: i32 {llvm.noundef} loc(unknown), %arg2: i32 {llvm.noundef} loc(unknown), %arg3: i32 {llvm.noundef} loc(unknown)) comdat(@__llvm_global_comdat::@_ZN4dim3C2Ejjj) attributes {alignment = 2 : i64, frame_pointer = #llvm.framePointerKind<all>, no_inline, no_unwind, optimize_none, passthrough = ["mustprogress", ["uwtable", "2"], ["min-legal-vector-width", "0"], ["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} {
    %c1_i32 = arith.constant 1 : i32 loc(#loc1)
    %0 = llvm.alloca %c1_i32 x !llvm.ptr {alignment = 8 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %1 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %2 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    %3 = llvm.alloca %c1_i32 x i32 {alignment = 4 : i64} : (i32) -> !llvm.ptr loc(#loc1)
    llvm.store %arg0, %0 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)
    llvm.store %arg1, %1 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
    llvm.store %arg2, %2 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
    llvm.store %arg3, %3 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
    %4 = llvm.load %0 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
    %5 = llvm.getelementptr inbounds %4[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"struct.dim3", (i32, i32, i32)> loc(#loc1)
    %6 = llvm.load %1 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    llvm.store %6, %5 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
    %7 = llvm.getelementptr inbounds %4[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"struct.dim3", (i32, i32, i32)> loc(#loc1)
    %8 = llvm.load %2 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    llvm.store %8, %7 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
    %9 = llvm.getelementptr inbounds %4[0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"struct.dim3", (i32, i32, i32)> loc(#loc1)
    %10 = llvm.load %3 {alignment = 4 : i64} : !llvm.ptr -> i32 loc(#loc1)
    llvm.store %10, %9 {alignment = 4 : i64} : i32, !llvm.ptr loc(#loc1)
    llvm.return loc(#loc1)
  } loc(#loc1)
  llvm.func @nvshmemx_collective_launch(!llvm.ptr {llvm.noundef}, i64, i32, i64, i32, !llvm.ptr {llvm.noundef}, i64 {llvm.noundef}, !llvm.ptr {llvm.noundef}) -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1)
  llvm.func @cudaMemcpy(!llvm.ptr {llvm.noundef}, !llvm.ptr {llvm.noundef}, i64 {llvm.noundef}, i32 {llvm.noundef}) -> i32 attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1)
  llvm.func @nvshmem_barrier_all() attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1)
  llvm.func @nvshmem_free(!llvm.ptr {llvm.noundef}) attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1)
  llvm.func @nvshmem_finalize() attributes {frame_pointer = #llvm.framePointerKind<all>, passthrough = [["no-trapping-math", "true"], ["stack-protector-buffer-size", "8"], ["target-cpu", "x86-64"]], target_cpu = "x86-64", target_features = #llvm.target_features<["+cmov", "+cx8", "+fxsr", "+mmx", "+sse", "+sse2", "+x87"]>, tune_cpu = "generic"} loc(#loc1)
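  // Registers the kernel stub with the CUDA runtime under its mangled device
  // name; the -1 is presumably the thread_limit argument (no limit) and the
  // trailing nulls fill the unused tid/bid/dim/size slots.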
  llvm.func internal @__cuda_register_globals(%arg0: !llvm.ptr loc(unknown)) attributes {dso_local} {
    %0 = llvm.mlir.addressof @_Z29__device_stub__nvshmem_kernelPiS_ : !llvm.ptr loc(#loc1)
    %1 = llvm.mlir.addressof @mlir.llvm.nameless_global_0 : !llvm.ptr loc(#loc1)
    %c-1_i32 = arith.constant -1 : i32 loc(#loc1)
    %2 = llvm.mlir.zero : !llvm.ptr loc(#loc1)
    %3 = llvm.call @__cudaRegisterFunction(%arg0, %0, %1, %1, %c-1_i32, %2, %2, %2, %2, %2) : (!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32 loc(#loc1)
    llvm.return loc(#loc1)
  } loc(#loc1)
  llvm.func @__cudaRegisterFunction(!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, i32, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> i32 loc(#loc1)
  llvm.func @__cudaRegisterVar(!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, i32, i64, i32, i32) loc(#loc1)
  llvm.func @__cudaRegisterManagedVar(!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, i64, i32) loc(#loc1)
  llvm.func @__cudaRegisterSurface(!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, i32, i32) loc(#loc1)
  llvm.func @__cudaRegisterTexture(!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, i32, i32, i32) loc(#loc1)
  llvm.func @__cudaRegisterFatBinary(!llvm.ptr) -> !llvm.ptr loc(#loc1)
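  // Module constructor: register the fatbin wrapper, stash the handle,
  // register the kernel, finish registration, and queue the destructor with
  // atexit.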
  llvm.func internal @__cuda_module_ctor() attributes {dso_local} {
    %0 = llvm.mlir.addressof @__cuda_fatbin_wrapper : !llvm.ptr loc(#loc1)
    %1 = llvm.mlir.addressof @__cuda_gpubin_handle : !llvm.ptr loc(#loc1)
    %2 = llvm.mlir.addressof @__cuda_module_dtor : !llvm.ptr loc(#loc1)
    %3 = llvm.call @__cudaRegisterFatBinary(%0) : (!llvm.ptr) -> !llvm.ptr loc(#loc1)
    llvm.store %3, %1 {alignment = 8 : i64} : !llvm.ptr, !llvm.ptr loc(#loc1)
    llvm.call @__cuda_register_globals(%3) : (!llvm.ptr) -> () loc(#loc1)
    llvm.call @__cudaRegisterFatBinaryEnd(%3) : (!llvm.ptr) -> () loc(#loc1)
    %4 = llvm.call @atexit(%2) : (!llvm.ptr) -> i32 loc(#loc1)
    llvm.return loc(#loc1)
  } loc(#loc1)
  llvm.func @__cudaRegisterFatBinaryEnd(!llvm.ptr) loc(#loc1)
  llvm.func @__cudaUnregisterFatBinary(!llvm.ptr) loc(#loc1)
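  // Module destructor: unregister the fatbin using the saved handle.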
  llvm.func internal @__cuda_module_dtor() attributes {dso_local} {
    %0 = llvm.mlir.addressof @__cuda_gpubin_handle : !llvm.ptr loc(#loc1)
    %1 = llvm.load %0 {alignment = 8 : i64} : !llvm.ptr -> !llvm.ptr loc(#loc1)
    llvm.call @__cudaUnregisterFatBinary(%1) : (!llvm.ptr) -> () loc(#loc1)
    llvm.return loc(#loc1)
  } loc(#loc1)
  llvm.func @atexit(!llvm.ptr) -> i32 loc(#loc1)
  llvm.func @__mlir_launch_coerced_kernel__Z29__device_stub__nvshmem_kernelPiS_(!llvm.ptr, i64, i32, i64, i32, i64, !llvm.ptr, !llvm.ptr, !llvm.ptr) loc(#loc1)
} loc(#loc)
#loc = loc("/home/sw339864/promotion/mlir-research/data-race-detection-benchmark-suite/rmaracebench/nvshmem/gpuInitiated/sync//001-nvshmem-sync-barrierall-local-yes.cu":0:0)