{"id":21628,"library":"nvshmem4py-cu12","title":"nvshmem4py-cu12","description":"Python bindings for NVSHMEM (NVIDIA's implementation of OpenSHMEM for GPUs). Version 0.3.0 requires Python >=3.9 and CUDA 12.x. This package enables peer-to-peer GPU communication across NVLink and InfiniBand. Under active development with frequent breaking changes.","status":"active","version":"0.3.0","language":"python","source_language":"en","source_url":"https://github.com/NVIDIA/nvshmem4py","tags":["NVSHMEM","GPU","CUDA","MPI","distributed","peer-to-peer","NVIDIA"],"install":[{"cmd":"pip install nvshmem4py-cu12","lang":"bash","label":"Install from PyPI"}],"dependencies":[{"reason":"Required for GPU array operations and device memory management.","package":"cupy-cuda12x","optional":false},{"reason":"Needed when MPI is used for parallel process initialization (e.g., MPI.COMM_WORLD).","package":"mpi4py","optional":true}],"imports":[{"note":"nvshmem.init() is correct. Some old examples used 'from nvshmem4py import init' which no longer works.","symbol":"init","correct":"from nvshmem import init"},{"symbol":"barrier","correct":"from nvshmem import barrier"},{"symbol":"my_pe_n","correct":"from nvshmem import my_pe_n"},{"symbol":"n_pes","correct":"from nvshmem import n_pes"}],"quickstart":{"code":"import os\nimport cupy as cp\nimport nvshmem\nfrom nvshmem import init, barrier, my_pe_n, n_pes\n\n# Initialize NVSHMEM (must be called after MPI_Init or similar)\ninit()\n\nrank = my_pe_n()\nnranks = n_pes()\n\n# Allocate symmetric memory on GPU\nbuf = cp.empty(1024, dtype=cp.float32)\n\n# Barrier to synchronize\nbarrier()\n\nprint(f\"Rank {rank}/{nranks} ready.\", flush=True)\n\n# Example: send data from rank 0 to rank 1 (if nranks > 1)\nif nranks > 1:\n    if rank == 0:\n        buf[:] = 1.0\n        nvshmem.putmem(buf.data.ptr, 1, 0, 1024 * 4)  # put to rank 1\n    elif rank == 1:\n        nvshmem.getmem(buf.data.ptr, 0, 0, 1024 * 4)  # get from rank 0\n    barrier()\n\nprint(f\"Rank {rank} finished.\", 
flush=True)","lang":"python","description":"Initialize NVSHMEM, allocate symmetric GPU memory, perform put/get, and barrier."},"warnings":[{"fix":"Ensure MPI_Init is called before nvshmem.init() in the same process.","message":"NVSHMEM must be initialized after MPI_Init or equivalent. Calling init() before MPI will cause undefined behavior.","severity":"breaking","affected_versions":"all"},{"fix":"Use 'from nvshmem import ...' instead of 'from nvshmem4py import ...'.","message":"The import module name changed from nvshmem4py to nvshmem; the PyPI package for the CUDA 12 variant is nvshmem4py-cu12. Importing 'nvshmem4py' directly will fail.","severity":"breaking","affected_versions":">=0.3.0"},{"fix":"Replace sync_all() calls with barrier().","message":"The function nvshmem.sync_all() is deprecated. Use barrier() for synchronizing all PEs.","severity":"deprecated","affected_versions":"<=0.2.x"},{"fix":"Use cp.empty() or cp.zeros() to allocate device arrays for NVSHMEM operations.","message":"Symmetric memory must be allocated via CuPy or another supported allocator that respects NVSHMEM's memory pool. Using raw cudaMalloc may lead to errors.","severity":"gotcha","affected_versions":"all"},{"fix":"Always synchronize with barrier() after each phase of communication.","message":"NVSHMEM operations require all processes to participate. Missing a barrier between collective operations can cause deadlocks.","severity":"gotcha","affected_versions":"all"}],"env_vars":null,"last_verified":"2026-04-27T00:00:00.000Z","next_check":"2026-07-26T00:00:00.000Z","problems":[{"fix":"Install nvshmem4py-cu12: pip install nvshmem4py-cu12. Then use 'from nvshmem import init'.","cause":"Importing 'nvshmem' without the package installed, or still using the old module name 'nvshmem4py'.","error":"ImportError: No module named 'nvshmem'"},{"fix":"Install cupy-cuda12x: pip install cupy-cuda12x","cause":"CuPy for CUDA 12 is not installed.","error":"nvshmem4py-cu12 requires CuPy with CUDA 12.x. 
ImportError: No module named 'cupy'"},{"fix":"Allocate memory via CuPy (cp.empty) or directly via nvshmem.shmalloc.","cause":"Using non-symmetric memory (e.g., numpy arrays) in NVSHMEM operations.","error":"RuntimeError: NVSHMEM internal error: invalid symmetric memory region"},{"fix":"Call nvshmem.init() before any other NVSHMEM function.","cause":"Calling NVSHMEM functions before init().","error":"RuntimeError: NVSHMEM not initialized. Call nvshmem.init() first."}],"ecosystem":"pypi","meta_description":null,"install_score":null,"install_tag":null,"quickstart_score":null,"quickstart_tag":null}