Source code for nabu.reconstruction.rings_cuda

import numpy as np
from ..utils import docstring, get_cuda_srcfile, updiv
from ..cuda.processing import CudaProcessing, __has_pycuda__
from ..processing.padding_cuda import CudaPadding
from ..processing.fft_cuda import get_fft_class, get_available_fft_implems
from ..processing.transpose import CudaTranspose
from ..thirdparty.tomocupy_remove_stripe import remove_all_stripe_pycuda, __have_tomocupy_deringer__
from .rings import MunchDeringer, SinoMeanDeringer, VoDeringer

if __has_pycuda__:
    import pycuda.gpuarray as garray
    from ..cuda.kernel import CudaKernel

try:
    from pycudwt import Wavelets

    __have_pycudwt__ = True
except ImportError:
    __have_pycudwt__ = False


[docs] class CudaMunchDeringer(MunchDeringer): def __init__( self, sigma, sinos_shape, levels=None, wname="db15", padding=None, padding_mode="edge", fft_backend="skcuda", cuda_options=None, ): """ Initialize a "Munch Et Al" sinogram deringer with the Cuda backend. See References for more information. Parameters ----------- sigma: float Standard deviation of the damping parameter. The higher value of sigma, the more important the filtering effect on the rings. levels: int, optional Number of wavelets decomposition levels. By default (None), the maximum number of decomposition levels is used. wname: str, optional Default is "db15" (Daubechies, 15 vanishing moments) sinos_shape: tuple, optional Shape of the sinogram (or sinograms stack). References ---------- B. Munch, P. Trtik, F. Marone, M. Stampanoni, Stripe and ring artifact removal with combined wavelet-Fourier filtering, Optics Express 17(10):8567-8591, 2009. """ super().__init__(sigma, sinos_shape, levels=levels, wname=wname, padding=padding, padding_mode=padding_mode) self._check_can_use_wavelets() self.cuda_processing = CudaProcessing(**(cuda_options or {})) self.ctx = self.cuda_processing.ctx self._init_pycudwt() self._init_padding() self._init_fft(fft_backend) self._setup_fw_kernel() def _check_can_use_wavelets(self): if not (__have_pycudwt__): raise ValueError("Needs pycudwt to use this class") def _init_padding(self): if self.padding is None: return self.padder = CudaPadding( self.sinos_shape[1:], ((0, 0), self.padding), mode=self.padding_mode, cuda_options={"ctx": self.cuda_processing.ctx}, ) def _init_fft(self, fft_backend): self.fft_cls = get_fft_class(backend=fft_backend) # For all k >= 1, we perform a batched (I)FFT along axis 0 on an array # of shape (n_a/2^k, n_x/2^k) (up to DWT size rounding) if self.fft_cls.implem == "vkfft": self._create_plans_vkfft() else: self._create_plans_skfft() def _create_plans_skfft(self): self._fft_plans = {} for level, d_vcoeff in self._d_vertical_coeffs.items(): self._fft_plans[level] = self.fft_cls(d_vcoeff.shape, np.float32, r2c=True, axes=(0,), ctx=self.ctx) def _create_plans_vkfft(self): """ VKFFT does not support batched R2C transforms along axis 0 ("slow axis"). We can either use C2C (faster, but needs more memory) or transpose the arrays to do R2C along axis=1. Here we transpose the arrays. """ self._fft_plans = {} self._transpose_forward_1 = {} self._transpose_forward_2 = {} self._transpose_inverse_1 = {} self._transpose_inverse_2 = {} for level, d_vcoeff in self._d_vertical_coeffs.items(): shape = d_vcoeff.shape # Normally, a batched 1D fft on 2D data of shape (Ny, Nx) along axis 0 returns an array of shape (Ny/2+1, Nx): # # (Ny, Nx) --[fft_0]--> (Ny/2, Nx) # f32 c64 # # In this case, we can only do batched 1D transform along axis 1, so we have to trick with transposes: # # (Ny, Nx) --[T]--> (Nx, Ny) --[fft_1]--> (Nx, Ny/2) --[T]--> (Ny/2, Nx) # f32 f32 c64 c64 # # (In both cases IFFT is done the same way from right to left) self._transpose_forward_1[level] = CudaTranspose(shape, np.float32, ctx=self.ctx) self._fft_plans[level] = self.fft_cls(shape[::-1], np.float32, r2c=True, ctx=self.ctx) self._transpose_forward_2[level] = CudaTranspose((shape[1], shape[0] // 2 + 1), np.complex64, ctx=self.ctx) self._transpose_inverse_1[level] = CudaTranspose((shape[0] // 2 + 1, shape[1]), np.complex64, ctx=self.ctx) self._transpose_inverse_2[level] = CudaTranspose(shape[::-1], np.float32, ctx=self.ctx) def _init_pycudwt(self): if self.levels is None: self.levels = 100 # will be clipped by pycudwt sino_shape = self.sinos_shape[1:] if self.padding is None else self.sino_padded_shape self.cudwt = Wavelets(np.zeros(sino_shape, "f"), self.wname, self.levels) self.levels = self.cudwt.levels # Access memory allocated by "pypwt" from pycuda self._d_sino = garray.empty(sino_shape, np.float32, gpudata=self.cudwt.image_int_ptr()) self._get_vertical_coeffs() def _get_vertical_coeffs(self): self._d_vertical_coeffs = {} # Transfer the (0-memset) coefficients in order to get all the shapes coeffs = self.cudwt.coeffs for i in range(self.cudwt.levels): shape = coeffs[i + 1][1].shape self._d_vertical_coeffs[i + 1] = garray.empty( shape, np.float32, gpudata=self.cudwt.coeff_int_ptr(3 * i + 2) ) def _setup_fw_kernel(self): self._fw_kernel = CudaKernel( "kern_fourierwavelets", filename=get_cuda_srcfile("fourier_wavelets.cu"), signature="Piif", ) def _apply_fft(self, level): d_coeffs = self._d_vertical_coeffs[level] # All the memory is allocated (or re-used) under the hood if self.fft_cls.implem == "vkfft": d_coeffs_t = self._transpose_forward_1[level]( d_coeffs ) # allocates self._transpose_forward_1[level].processing.dst d_coeffs_t_f = self._fft_plans[level].fft(d_coeffs_t) # allocates self._fft_plans[level].output_fft d_coeffs_f = self._transpose_forward_2[level]( d_coeffs_t_f ) # allocates self._transpose_forward_2[level].processing.dst else: d_coeffs_f = self._fft_plans[level].fft(d_coeffs) return d_coeffs_f def _apply_ifft(self, d_coeffs_f, level): d_coeffs = self._d_vertical_coeffs[level] if self.fft_cls.implem == "vkfft": d_coeffs_t_f = self._transpose_inverse_1[level](d_coeffs_f, dst=self._fft_plans[level].output_fft) d_coeffs_t = self._fft_plans[level].ifft( d_coeffs_t_f, output=self._transpose_forward_1[level].processing.dst ) self._transpose_inverse_2[level](d_coeffs_t, dst=d_coeffs) else: self._fft_plans[level].ifft(d_coeffs_f, output=d_coeffs) def _destripe_2D(self, d_sino, output): if not (d_sino.flags.c_contiguous): sino = self.cuda_processing.allocate_array("_d_sino", d_sino.shape, np.float32) sino[:] = d_sino[:] else: sino = d_sino if self.padding is not None: sino = self.padder.pad(sino) # set the "image" for DWT (memcpy D2D) self._d_sino.set(sino) # perform forward DWT self.cudwt.forward() for i in range(self.cudwt.levels): level = i + 1 Ny, Nx = self._d_vertical_coeffs[level].shape # Batched FFT along axis 0 d_vertical_coeffs_f = self._apply_fft(level) # Dampen the wavelets coefficients self._fw_kernel(d_vertical_coeffs_f, Nx, Ny, self.sigma) # IFFT self._apply_ifft(d_vertical_coeffs_f, level) # Finally, inverse DWT self.cudwt.inverse() d_out = self._d_sino if self.padding is not None: d_out = self._d_sino[:, self.padding[0] : -self.padding[1]] # memcpy2D output.set(d_out) return output
[docs] def can_use_cuda_deringer(): """ Check wether cuda implementation of deringer can be used. Checking for installed modules is not enough, as for example pyvkfft can be installed without cuda devices """ can_do_fft = get_available_fft_implems() != [] return can_do_fft and __have_pycudwt__
[docs] class CudaVoDeringer(VoDeringer): """ An interface to topocupy's "remove_all_stripe". """ def _check_requirement(self): if not (__have_tomocupy_deringer__): raise ImportError("need cupy")
[docs] def remove_rings_radios(self, radios): return remove_all_stripe_pycuda(radios, **self._remove_all_stripe_kwargs)
[docs] def remove_rings_sinograms(self, sinos): radios = sinos.transpose(axes=(1, 0, 2)) # view, no copy self.remove_rings_radios(radios) return sinos
[docs] def remove_rings_sinogram(self, sino): radios = sino.reshape(sino.shape[0], 1, sino.shape[1]) # no copy self.remove_rings_radios(radios) return sino
remove_rings = remove_rings_sinograms
[docs] class CudaSinoMeanDeringer(SinoMeanDeringer): @docstring(SinoMeanDeringer) def __init__( self, sinos_shape, mode="subtract", filter_cutoff=None, padding_mode="edge", fft_num_threads=None, **cuda_options, ): self.processing = CudaProcessing(**(cuda_options or {})) super().__init__(sinos_shape, mode, filter_cutoff, padding_mode, fft_num_threads) self._init_kernels() def _init_kernels(self): self.d_sino_profile = self.processing.allocate_array("sino_profile", self.n_x) self._mean_kernel = self.processing.kernel( "vertical_mean", filename=get_cuda_srcfile("normalization.cu"), signature="PPiii", ) self._mean_kernel_block = (32, 1, 1) self._mean_kernel_grid = [updiv(self.sinos_shape[-1], self._mean_kernel_block[0]), 1, 1] self._mean_kernel_args = [self.d_sino_profile, np.int32(self.n_x), np.int32(self.n_angles), np.int32(1)] self._mean_kernel_kwargs = { "grid": self._mean_kernel_grid, "block": self._mean_kernel_block, } self._op_kernel = self.processing.kernel( "inplace_generic_op_3Dby1D", filename=get_cuda_srcfile("ElementOp.cu"), signature="PPiii", options=["-DGENERIC_OP=%d" % (3 if self.mode == "divide" else 1)], ) self._op_kernel_block = (16, 16, 1) self._op_kernel_grid = [updiv(a, b) for a, b in zip(self.sinos_shape[1:][::-1], self._op_kernel_block[:-1])] + [ 1 ] self._op_kernel_args = [self.d_sino_profile, np.int32(self.n_x), np.int32(self.n_angles), np.int32(1)] self._op_kernel_kwargs = { "grid": self._op_kernel_grid, "block": self._op_kernel_block, } def _init_filter(self, filter_cutoff, fft_num_threads, padding_mode): super()._init_filter(filter_cutoff, fft_num_threads, padding_mode) if filter_cutoff is None: return self._d_filter_f = self.processing.to_device("_filter_f", self._filter_f) self.padder = CudaPadding( (self.n_x, 1), ((self._pad_left, self._pad_right), (0, 0)), mode=self.padding_mode, cuda_options={"ctx": self.processing.ctx}, ) fft_cls = get_fft_class() self._fft = fft_cls(self._filter_size, np.float32, r2c=True) def _apply_filter(self, sino_profile): if self._filter_f is None: return sino_profile sino_profile = sino_profile.reshape((-1, 1)) # view sino_profile_p = self.padder.pad(sino_profile).ravel() sino_profile_f = self._fft.fft(sino_profile_p) sino_profile_f *= self._d_filter_f self._fft.ifft(sino_profile_f, output=sino_profile_p) self.d_sino_profile[:] = sino_profile_p[self._pad_left : -self._pad_right] return self.d_sino_profile
[docs] def remove_rings_sinogram(self, sino, output=None): # if output is not None: raise NotImplementedError # if not (sino.flags.c_contiguous): d_sino = self.processing.allocate_array("d_sino", sino.shape, np.float32) d_sino[:] = sino[:] else: d_sino = sino self._mean_kernel(d_sino, *self._mean_kernel_args, **self._mean_kernel_kwargs) self._apply_filter(self.d_sino_profile) self._op_kernel(d_sino, *self._op_kernel_args, **self._op_kernel_kwargs) if not (sino.flags.c_contiguous): sino[:] = self.processing.d_sino[:] return sino
[docs] def remove_rings_sinograms(self, sinograms): for i in range(sinograms.shape[0]): self.remove_rings_sinogram(sinograms[i])