Source code for nabu.cuda.utils

import atexit
from math import ceil
import numpy as np
from ..resources.gpu import GPUDescription

# Optional GPU dependencies.
# Each guard below sets a module-level flag instead of failing at import time,
# so that this module can be imported on hosts without the GPU stack.
try:
    import pycuda
    import pycuda.driver as cuda
    from pycuda import gpuarray as garray
    from pycuda.tools import clear_context_caches
    from pycuda.compiler import get_nvcc_version as pycuda_get_nvcc_version

    __has_pycuda__ = True
    __pycuda_error_msg__ = None
    if pycuda.VERSION[0] < 2020:
        # Record the reason as well: detect_cuda_gpus() returns this message
        # when pycuda is flagged as unusable, and a None message would be
        # indistinguishable from "no error".
        __pycuda_error_msg__ = "need pycuda >= 2020.1"
        print("Error: " + __pycuda_error_msg__)
        __has_pycuda__ = False
except ImportError as err:
    __has_pycuda__ = False
    __pycuda_error_msg__ = str(err)

# scikit-cuda is only probed for availability here
try:
    import skcuda

    __has_cufft__ = True
except ImportError:
    __has_cufft__ = False

# cupy is needed by the pycuda <-> cupy conversion helpers of this module
try:
    import cupy

    __has_cupy__ = True
except ImportError:
    __has_cupy__ = False


def get_cuda_context(device_id=None, cleanup_at_exit=True):
    """
    Return the current CUDA context, creating one if there is none.

    Parameters
    ----------
    device_id: int, optional
        Index of the device to create the context on. Defaults to device 0.
        Ignored when a context is already current.
    cleanup_at_exit: bool, optional
        Whether to register an `atexit` handler that pops the newly created
        context and clears the pycuda context caches.

    Returns
    -------
    context: pycuda.driver.Context
        The already-current context, or the newly created and pushed one.
    """
    existing_ctx = cuda.Context.get_current()
    # TODO what if the device used is different from device_id ?
    if existing_ctx is not None:
        # A context is already current: re-use it
        return existing_ctx

    # No current context: build one
    cuda.init()
    device_index = 0 if device_id is None else device_id
    # Retain the device's primary context, which is the one used by the
    # CUDA runtime API (ex. scikit-cuda). Unlike Context.make_context(),
    # the retained context is not made current, hence the explicit push().
    context = cuda.Device(device_index).retain_primary_context()
    context.push()

    def _finish_up(context):
        # Clean-up routine executed at interpreter exit
        if context is not None:
            context.pop()
        context = None
        clear_context_caches()

    if cleanup_at_exit:
        atexit.register(_finish_up, context)
    return context
def count_cuda_devices():
    """
    Return the number of CUDA devices reported by the driver.

    The driver is initialized first when no CUDA context is current.
    """
    needs_init = cuda.Context.get_current() is None
    if needs_init:
        cuda.init()
    n_devices = cuda.Device.count()
    return n_devices
def get_gpu_memory(device_id):
    """
    Return the total memory of a device, in gigabytes (1e9 bytes).

    Parameters
    ----------
    device_id: int
        Index of the CUDA device to query.
    """
    cuda.init()
    device = cuda.Device(device_id)
    total_bytes = device.total_memory()
    return total_bytes / 1e9
def is_gpu_usable():
    """
    Tell whether at least one Nvidia GPU can be used.

    Returns
    -------
    usable: bool
        True if the driver reports one or more devices. False if it reports
        none, or if driver initialization fails with the "no CUDA-capable
        device" error. Any other exception is propagated.
    """
    try:
        return count_cuda_devices() > 0
    except Exception as exc:
        # Fragile: relies on the exact wording of the driver error message
        if str(exc) == "cuInit failed: no CUDA-capable device is detected":
            return False
        raise
def detect_cuda_gpus():
    """
    Detect the available Nvidia CUDA GPUs on the current host.

    Returns
    --------
    gpus: dict
        Dictionary where the key is the GPU ID, and the value is a
        `pycuda.driver.Device` object. Empty when an error occurred.
    error_msg: str
        Error message if pycuda is unusable, or if initializing the driver or
        counting the devices failed. None otherwise.
    """
    if not __has_pycuda__:
        return {}, __pycuda_error_msg__

    # Driver initialization
    try:
        cuda.init()
    except Exception as exc:
        return {}, str(exc)

    # Devices enumeration
    try:
        n_gpus = cuda.Device.count()
    except Exception as exc:
        return {}, str(exc)

    return {gpu_id: cuda.Device(gpu_id) for gpu_id in range(n_gpus)}, None
def collect_cuda_gpus():
    """
    Describe each CUDA-compatible GPU of the current host.

    Returns
    -------
    cuda_gpus: dict or None
        Dictionary mapping each GPU id to the dictionary produced by
        `GPUDescription(device).get_dict()`. None if the detection failed.
    """
    devices, error_msg = detect_cuda_gpus()
    if error_msg is not None:
        return None
    return {gpu_id: GPUDescription(device).get_dict() for gpu_id, device in devices.items()}
def get_nvcc_version(nvcc_cmd="nvcc"):
    """
    Return the release number of the NVCC compiler, as a string.

    Parameters
    ----------
    nvcc_cmd: str, optional
        Name or path of the nvcc executable. Default is "nvcc".

    Returns
    -------
    ver: str or None
        The token following the word "release" in the text returned by pycuda's
        `get_nvcc_version` (for example "11.8"). None if that text could
        not be obtained or parsed.
    """
    try:
        banner = "".join(pycuda_get_nvcc_version(nvcc_cmd))
        # Expected pattern: "... release <major>.<minor>, V<full version>"
        ver = banner.split("release")[1].strip().split(" ")[0].strip(",")
    except Exception:
        # Best-effort: any failure means "unknown version". Unlike a bare
        # "except", this lets KeyboardInterrupt and SystemExit propagate.
        ver = None
    return ver
def check_textures_availability():
    """
    Tell whether Cuda textures can be used.

    The only limitation is pycuda, which does not support texture objects.
    Texture references were deprecated, and removed from Cuda 12, so textures
    are considered available only when the NVCC major version is below 12.

    Returns
    -------
    available: bool
        False if the NVCC version is 12 or above, or cannot be determined.
    """
    version = get_nvcc_version()
    if version is None:
        # unknown - can't parse NVCC version for some reason
        return False
    major, _, _ = version.partition(".")
    return int(major) < 12
""" pycuda/driver.py np.complex64: SIGNED_INT32, num_channels = 2 np.float64: SIGNED_INT32, num_channels = 2 np.complex128: array_format.SIGNED_INT32, num_channels = 4 double precision: pycuda-helpers.hpp: typedef float fp_tex_float; // --> float32 typedef int2 fp_tex_double; // --> float64 typedef uint2 fp_tex_cfloat; // --> complex64 typedef int4 fp_tex_cdouble; // --> complex128 """
def cuarray_format_to_dtype(cuarr_fmt):
    """
    Convert a `pycuda.driver.array_format` value to the matching numpy type.

    This is the reverse of `cuda.dtype_to_array_format`.

    Raises
    ------
    TypeError
        If the format has no known numpy counterpart.
    """
    fmt = cuda.array_format
    pairs = [
        (fmt.UNSIGNED_INT8, np.uint8),
        (fmt.UNSIGNED_INT16, np.uint16),
        (fmt.UNSIGNED_INT32, np.uint32),
        (fmt.SIGNED_INT8, np.int8),
        (fmt.SIGNED_INT16, np.int16),
        (fmt.SIGNED_INT32, np.int32),
        (fmt.FLOAT, np.float32),
    ]
    dtype = dict(pairs).get(cuarr_fmt)
    if dtype is None:
        raise TypeError("Unknown format %s" % cuarr_fmt)
    return dtype
def cuarray_shape_dtype(cuarray):
    """
    Return the shape and data type of a `pycuda.driver.Array`.

    The shape is (height, width), with depth prepended when the 3D
    descriptor reports a depth greater than zero.
    """
    descriptor = cuarray.get_descriptor_3d()
    dims = [descriptor.height, descriptor.width]
    if descriptor.depth > 0:
        dims.insert(0, descriptor.depth)
    return tuple(dims), cuarray_format_to_dtype(descriptor.format)
def get_shape_dtype(arr):
    """
    Return the (shape, dtype) pair of an array.

    Supported types are `pycuda.gpuarray.GPUArray`, `numpy.ndarray` and
    `pycuda.driver.Array`.

    Raises
    ------
    ValueError
        If the array type is not one of the above.
    """
    if isinstance(arr, (garray.GPUArray, np.ndarray)):
        return arr.shape, arr.dtype
    if isinstance(arr, cuda.Array):
        return cuarray_shape_dtype(arr)
    raise ValueError("Unknown array type %s" % str(type(arr)))
def copy_array(dst, src, check=False, src_dtype=None, dst_x_in_bytes=0, dst_y=0):
    """
    Copy a source array to a destination array.
    Source and destination can be either numpy.ndarray, pycuda.Driver.Array,
    or pycuda.gpuarray.GPUArray.

    Parameters
    -----------
    dst: pycuda.driver.Array or pycuda.gpuarray.GPUArray or numpy.ndarray
        Destination array. Its content will be overwritten by copy.
    src: pycuda.driver.Array or pycuda.gpuarray.GPUArray or numpy.ndarray
        Source array. It must have 2 or 3 dimensions.
    check: bool, optional
        Whether to check src and dst shape and data type.
    src_dtype: optional
        If provided (and truthy), used instead of the data type inferred from
        `src`, both for the `check` comparison and for computing the row
        width in bytes.
    dst_x_in_bytes: int, optional
        Horizontal offset, in bytes, in the destination. Only applied when
        `dst` is a pycuda.driver.Array.
    dst_y: int, optional
        Vertical offset in the destination. Only applied when `dst` is a
        pycuda.driver.Array.

    Raises
    ------
    ValueError
        If `check` is True and shapes or data types differ, or if the source
        does not have 2 or 3 dimensions.
    """
    shape_src, dtype_src = get_shape_dtype(src)
    shape_dst, dtype_dst = get_shape_dtype(dst)
    # An explicit src_dtype takes precedence over the inferred one
    dtype_src = src_dtype or dtype_src
    if check:
        if shape_src != shape_dst:
            raise ValueError("shape_src != shape_dst : have %s and %s" % (str(shape_src), str(shape_dst)))
        if dtype_src != dtype_dst:
            raise ValueError("dtype_src != dtype_dst : have %s and %s" % (str(dtype_src), str(dtype_dst)))
    # Pick the copy descriptor matching the number of dimensions
    if len(shape_src) == 2:
        copy = cuda.Memcpy2D()
        h, w = shape_src
    elif len(shape_src) == 3:
        copy = cuda.Memcpy3D()
        d, h, w = shape_src
        copy.depth = d
    else:
        raise ValueError("Expected arrays with 2 or 3 dimensions")
    # Source side: anything that is neither a cuda.Array nor a GPUArray is
    # handed over as host memory
    if isinstance(src, cuda.Array):
        copy.set_src_array(src)
    elif isinstance(src, garray.GPUArray):
        copy.set_src_device(src.gpudata)
    else:  # numpy
        copy.set_src_host(src)
    # Destination side, same dispatch as for the source
    if isinstance(dst, cuda.Array):
        copy.set_dst_array(dst)
        # Support offset (x, y) in target (for copying to texture)
        copy.dst_x_in_bytes = dst_x_in_bytes
        copy.dst_y = dst_y
    elif isinstance(dst, garray.GPUArray):
        copy.set_dst_device(dst.gpudata)
    else:  # numpy
        copy.set_dst_host(dst)
    # Extent of the copy: row width in bytes (also used as destination pitch)
    # and number of rows, both taken from the source shape
    copy.width_in_bytes = copy.dst_pitch = w * np.dtype(dtype_src).itemsize
    copy.dst_height = copy.height = h
    # ??
    # NOTE(review): the positional True is presumably pycuda's "aligned" flag
    # of Memcpy2D.__call__ - confirm against the pycuda driver documentation
    if len(shape_src) == 2:
        copy(True)
    else:
        copy()
###
def copy_big_gpuarray(dst, src, itemsize=4, checks=False):
    """
    Copy a big `pycuda.gpuarray.GPUArray` into another.
    Transactions of more than 2**32 -1 octets fail, so are doing several
    partial copies of smaller arrays.

    Parameters
    ----------
    dst: pycuda.gpuarray.GPUArray or numpy.ndarray
        Destination array. If it is a numpy array, the copy is a
        device-to-host transfer done with `src.get()`.
    src: pycuda.gpuarray.GPUArray
        Source array.
    itemsize: int, optional
        Size in bytes of one array element, used to estimate the size of the
        transfers. Default is 4.
    checks: bool, optional
        Whether to assert that `dst` and `src` have the same dtype and shape.
    """
    d2h = isinstance(dst, np.ndarray)
    if checks:
        assert dst.dtype == src.dtype
        assert dst.shape == src.shape
    limit = 2**32 - 1

    def n_bytes(shape):
        # Convert to a Python int before multiplying, so that the byte count
        # cannot overflow a fixed-width numpy integer
        return int(np.prod(shape, dtype=np.int64)) * itemsize

    # Small enough: one single transfer
    if n_bytes(dst.shape) < limit:
        if d2h:
            src.get(ary=dst)
        else:
            dst[:] = src[:]
        return

    def get_shape2(shape):
        # Halve the extent along the first axis until one chunk fits in the
        # limit. The chunk size is computed with the actual itemsize, and the
        # extent never goes below 1 (a zero extent would make the number of
        # transfers undefined).
        shape2 = list(shape)
        while shape2[0] > 1 and n_bytes(shape2) > limit:
            shape2[0] //= 2
        return tuple(shape2)

    shape2 = get_shape2(dst.shape)
    nz0 = dst.shape[0]
    nz = shape2[0]
    n_transfers = ceil(nz0 / nz)
    # Transfer the array by slabs of (at most) nz rows along the first axis
    for i in range(n_transfers):
        zmin = i * nz
        zmax = min((i + 1) * nz, nz0)
        if d2h:
            src[zmin:zmax].get(ary=dst[zmin:zmax])
        else:
            dst[zmin:zmax] = src[zmin:zmax]
def replace_array_memory(arr, new_shape):
    """
    Swap the device buffer backing a `pycuda.gpuarray.GPUArray` for a new one.

    This function is dangerous ! The current buffer is freed, and a new one
    sized for `new_shape` is obtained from the array's allocator. Strides are
    not updated, so this should merely be used to clear memory: the array
    should not be used afterwards.

    Parameters
    ----------
    arr: pycuda.gpuarray.GPUArray
        Array whose buffer is replaced. It is modified in place.
    new_shape: tuple of int
        Shape used to size the new buffer, and assigned to `arr.shape`.

    Returns
    -------
    arr: pycuda.gpuarray.GPUArray
        The same object that was passed.
    """
    arr.gpudata.free()
    n_bytes = int(np.prod(new_shape) * arr.dtype.itemsize)
    arr.gpudata = arr.allocator(n_bytes)
    arr.shape = new_shape
    # TODO re-compute strides
    return arr
def pycuda_to_cupy(arr_pycuda):
    """
    Wrap a `pycuda.gpuarray.GPUArray` into a `cupy.ndarray`, without copy.

    The device memory remains owned by the pycuda array, which is passed as
    the owner of the cupy memory object.

    Parameters
    ----------
    arr_pycuda: pycuda.gpuarray.GPUArray
        Array to wrap.

    Returns
    -------
    arr_cupy: cupy.ndarray
        Array with the same shape and dtype, built on the same device pointer.
    """
    # UnownedMemory expects the size of the allocation in bytes, hence nbytes
    # and not size (which is a number of elements)
    arr_cupy_mem = cupy.cuda.UnownedMemory(arr_pycuda.ptr, arr_pycuda.nbytes, arr_pycuda)
    arr_cupy_memptr = cupy.cuda.MemoryPointer(arr_cupy_mem, offset=0)
    return cupy.ndarray(arr_pycuda.shape, dtype=arr_pycuda.dtype, memptr=arr_cupy_memptr)  # pylint: disable=E1123
def cupy_to_pycuda(arr_cupy):
    """
    Build a `pycuda.gpuarray.GPUArray` on top of the memory of a cupy array.

    The returned array has the shape and dtype of `arr_cupy`, and uses its
    device pointer as `gpudata`.
    """
    device_ptr = arr_cupy.data.ptr
    shape, dtype = arr_cupy.shape, arr_cupy.dtype
    return garray.empty(shape, dtype, gpudata=device_ptr)