Problem of inlining nvshmemx_collective_launch usages
Authored by Semih Burak
From nvshmem-collectiveLaunch-example.cu, I currently obtain nvshmem-collectiveLaunch-example.mlir by applying:
clangFlags="-O0 $S -I$CPATH -fno-exceptions -mllvm --emit-mlir -mllvm --transformer-enable -mllvm \
--transformer-pre-merge-mlir-pipeline="canonicalize,convert-llvm-to-cf,canonicalize,convert-llvm-to-arith,canonicalize" "
$clang $clangFlags $inputPath -o $outputFile
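For reference, the same invocation written out as a single command; $clang, $S, $CPATH, $inputPath, and $outputFile depend on the local setup and are left as placeholders here:

$clang -O0 $S -I$CPATH -fno-exceptions \
  -mllvm --emit-mlir -mllvm --transformer-enable \
  -mllvm --transformer-pre-merge-mlir-pipeline=canonicalize,convert-llvm-to-cf,canonicalize,convert-llvm-to-arith,canonicalize \
  $inputPath -o $outputFile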
nvshmem-collectiveLaunch-example.cu (2.87 KiB):
// RACE LABELS BEGIN
/*
{
"RACE_KIND": "local",
"ACCESS_SET": ["local buffer write","load"],
"RACE_PAIR": ["nvshmem_int_get_nbi@36","LOAD@38"],
"NPROCS": 2,
"CONSISTENCY_CALLS": ["nvshmem_barrier_all"],
"SYNC_CALLS": ["nvshmem_barrier_all"],
"DESCRIPTION": "Two conflicting operations get_nbi and load which are not correctly separated by an nvshmem_barrier_all leading to a local race. The shmem_quiet should be in between the two conflicting operations to be correct."
}
*/
// RACE LABELS END
#include <cuda.h>
#include <nvshmem.h>
#include <nvshmemx.h>
#include <stdio.h>
// Number of processing elements
#define PROC_NUM 2
__global__ void nvshmem_kernel(int *remote, int *localbuf) {
// Initialize memory
*remote = 0;
*localbuf = 1;
int x = 0;
int my_pe = nvshmem_my_pe();
// Synchronize across all PEs
nvshmem_barrier_all();
if (my_pe == 0) {
    // CONFLICT: non-blocking get of *remote on PE 1 into the local buffer
    nvshmem_int_get_nbi(localbuf, remote, 1, 1);
    // CONFLICT: local load may read localbuf while the get is still in flight
    x = *localbuf;
nvshmem_barrier_all();
} else {
nvshmem_barrier_all();
}
*remote += x;
}
int main(int argc, char **argv) {
int remote, localbuf;
// Initialize NVSHMEM
nvshmem_init();
int mype_node = nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE);
cudaSetDevice(mype_node);
// Get the number of PEs and the current PE's rank
int my_pe = nvshmem_my_pe();
int num_pe = nvshmem_n_pes();
// Ensure the required number of PEs
if (num_pe != PROC_NUM) {
printf("Got %d PEs, expected %d\n", num_pe, PROC_NUM);
nvshmem_global_exit(1);
}
// Allocate symmetric memory on the device
int *remote_d = (int *)nvshmem_malloc(sizeof(int));
int *localbuf_d = (int *)nvshmem_malloc(sizeof(int));
  // Dynamic shared memory for the kernel launch (none needed here)
  size_t shared_data_size = 0 * sizeof(int);
  // Kernel arguments: the launch API expects an array of pointers to each argument
  void *args[] = {&remote_d, &localbuf_d};
dim3 blocks(1);
dim3 threads(1);
// Launch kernel collectively across all PEs
nvshmemx_collective_launch((void *)nvshmem_kernel, blocks, threads, args, shared_data_size, 0);
// Copy data back to host
cudaMemcpy(&remote, remote_d, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&localbuf, localbuf_d, sizeof(int), cudaMemcpyDeviceToHost);
// Synchronize
nvshmem_barrier_all();
printf("PE %d: localbuf = %d, remote = %d\n", my_pe, localbuf, remote);
// Synchronize again
nvshmem_barrier_all();
printf("Process %d: Execution finished, variable contents: remote = %d, localbuf = %d\n", my_pe, remote, localbuf);
// Free NVSHMEM symmetric memory
nvshmem_free(remote_d);
nvshmem_free(localbuf_d);
// Finalize NVSHMEM
nvshmem_finalize();
return 0;
}
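A minimal sketch of the race-free variant implied by the DESCRIPTION label above: the non-blocking get is completed with nvshmem_quiet before localbuf is read locally. The kernel is otherwise unchanged; the _fixed name is illustrative and not part of the original snippet.

__global__ void nvshmem_kernel_fixed(int *remote, int *localbuf) {
  *remote = 0;
  *localbuf = 1;
  int x = 0;
  int my_pe = nvshmem_my_pe();
  nvshmem_barrier_all();
  if (my_pe == 0) {
    nvshmem_int_get_nbi(localbuf, remote, 1, 1);
    // Complete the outstanding non-blocking get before the local load
    nvshmem_quiet();
    x = *localbuf;
    nvshmem_barrier_all();
  } else {
    nvshmem_barrier_all();
  }
  *remote += x;
}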
nvshmem-collectiveLaunch-example.mlir (35.21 KiB)
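Since main() exits unless exactly PROC_NUM = 2 PEs are present, the binary has to be started on two PEs with an NVSHMEM-aware launcher, for example along these lines (launcher name and flags vary by installation; this assumes the Hydra launcher shipped with NVSHMEM and a hypothetical binary name):

nvshmrun -n 2 ./nvshmem-collectiveLaunch-example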