Source code for nabu.io.reader

import os
from math import ceil
from multiprocessing.pool import ThreadPool
import numpy as np
from silx.io.dictdump import h5todict
from tomoscan.io import HDF5File
from .utils import get_compacted_dataslices, convert_dict_values
from ..misc.binning import binning as image_binning
from ..utils import subsample_dict, get_3D_subregion, get_num_threads

try:
    from fabio.edfimage import EdfImage
except ImportError:
    EdfImage = None


class Reader:
    """
    Abstract class for various file readers.
    """

    def __init__(self, sub_region=None):
        """
        Parameters
        ----------
        sub_region: tuple, optional
            Coordinates in the form (start_x, end_x, start_y, end_y), to read
            a subset of each frame. It can be used for Regions of Interest (ROI).
            Indices start at zero!
        """
        self._set_default_parameters(sub_region)

    def _set_default_parameters(self, sub_region):
        self._set_subregion(sub_region)

    def _set_subregion(self, sub_region):
        self.sub_region = sub_region
        if sub_region is not None:
            start_x, end_x, start_y, end_y = sub_region
            self.start_x = start_x
            self.end_x = end_x
            self.start_y = start_y
            self.end_y = end_y
        else:
            self.start_x = 0
            self.end_x = None
            self.start_y = 0
            self.end_y = None
    def get_data(self, data_url):
        """
        Get data from a silx.io.url.DataUrl
        """
        raise ValueError("Base class")
    def release(self):
        """
        Release the file if needed.
        """
        pass
class NPReader(Reader):
    multi_load = True

    def __init__(self, sub_region=None, mmap=True):
        """
        Reader for NPY/NPZ files. Mostly used for internal development.
        Please refer to the documentation of nabu.io.reader.Reader
        """
        super().__init__(sub_region=sub_region)
        self._file_desc = {}
        self._set_mmap(mmap)

    def _set_mmap(self, mmap):
        self.mmap_mode = "r" if mmap else None

    def _open(self, data_url):
        file_path = data_url.file_path()
        file_ext = self._get_file_type(file_path)
        if file_ext == "npz":
            if file_path not in self._file_desc:
                self._file_desc[file_path] = np.load(file_path, mmap_mode=self.mmap_mode)
            data_ref = self._file_desc[file_path][data_url.data_path()]
        else:
            data_ref = np.load(file_path, mmap_mode=self.mmap_mode)
        return data_ref

    @staticmethod
    def _get_file_type(fname):
        if fname.endswith(".npy"):
            return "npy"
        elif fname.endswith(".npz"):
            return "npz"
        else:
            raise ValueError("Not a numpy file: %s" % fname)
    def get_data(self, data_url):
        data_ref = self._open(data_url)
        data_slice = data_url.data_slice()
        if data_slice is None:
            res = data_ref[self.start_y : self.end_y, self.start_x : self.end_x]
        else:
            res = data_ref[data_slice, self.start_y : self.end_y, self.start_x : self.end_x]
        return res
    def release(self):
        for fname, fdesc in self._file_desc.items():
            if fdesc is not None:
                fdesc.close()
                self._file_desc[fname] = None
    def __del__(self):
        self.release()
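
# --- Illustrative usage sketch (not part of the original module) ---
# Shows how NPReader could be used to read a cropped frame from a NPY file
# through a silx DataUrl. The file name "stack.npy" is a hypothetical example.
def _example_npreader():
    from silx.io.url import DataUrl

    # sub_region is (start_x, end_x, start_y, end_y)
    reader = NPReader(sub_region=(0, 100, 0, 50))
    url = DataUrl(file_path="stack.npy")
    data = reader.get_data(url)  # shape (50, 100) if the file holds a single 2D image
    reader.release()
    return data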
class EDFReader(Reader):
    multi_load = False  # not implemented

    def __init__(self, sub_region=None):
        """
        A class for reading series of EDF files.
        Multi-frame EDF files are not supported.
        """
        if EdfImage is None:
            raise ImportError("Need fabio to use this reader")
        super().__init__(sub_region=sub_region)
        self._reader = EdfImage()
        self._first_fname = None
    def read(self, fname):
        if self._first_fname is None:
            self._first_fname = fname
        self._reader.read(fname)
        if self.sub_region is None:
            data = self._reader.data
        else:
            data = self._reader.fast_read_roi(
                fname, (slice(self.start_y, self.end_y), slice(self.start_x, self.end_x))
            )
        self._reader.close()
        return data
    def get_data(self, data_url):
        return self.read(data_url.file_path())
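
# --- Illustrative usage sketch (not part of the original module) ---
# Shows how EDFReader might read a region of interest from a single-frame EDF
# file. The file name "proj_0000.edf" is a hypothetical example, and fabio
# must be installed for this reader to be available.
def _example_edfreader():
    from silx.io.url import DataUrl

    reader = EDFReader(sub_region=(10, 200, 0, 100))  # (start_x, end_x, start_y, end_y)
    url = DataUrl(file_path="proj_0000.edf")
    roi = reader.get_data(url)  # expected shape: (100, 190)
    return roi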
class HDF5Reader(Reader):
    multi_load = True

    def __init__(self, sub_region=None):
        """
        A class for reading an HDF5 file.
        """
        super().__init__(sub_region=sub_region)
        self._file_desc = {}

    def _open(self, file_path):
        if file_path not in self._file_desc:
            self._file_desc[file_path] = HDF5File(file_path, "r", swmr=True)
    def get_data(self, data_url):
        file_path = data_url.file_path()
        self._open(file_path)
        h5dataset = self._file_desc[file_path][data_url.data_path()]
        data_slice = data_url.data_slice()
        if data_slice is None:
            res = h5dataset[self.start_y : self.end_y, self.start_x : self.end_x]
        else:
            res = h5dataset[data_slice, self.start_y : self.end_y, self.start_x : self.end_x]
        return res
    def release(self):
        for fname, fdesc in self._file_desc.items():
            if fdesc is not None:
                try:
                    fdesc.close()
                    self._file_desc[fname] = None
                except Exception as exc:
                    print("Error while closing %s: %s" % (fname, str(exc)))
    def __del__(self):
        self.release()
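
# --- Illustrative usage sketch (not part of the original module) ---
# Shows how HDF5Reader could read a stack of frames from a HDF5/NX file,
# combining a frame selection (data_slice) with a spatial sub-region.
# The file and data paths below are hypothetical examples.
def _example_hdf5reader():
    from silx.io.url import DataUrl

    reader = HDF5Reader(sub_region=(0, 2048, 100, 200))  # (start_x, end_x, start_y, end_y)
    url = DataUrl(
        file_path="scan0001.nx",
        data_path="/entry/instrument/detector/data",
        data_slice=slice(0, 10),  # first 10 frames
    )
    frames = reader.get_data(url)  # expected shape: (10, 100, 2048)
    reader.release()
    return frames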
class HDF5Loader:
    """
    An alternative class to HDF5Reader, where information is first passed at class instantiation.
    """

    def __init__(self, fname, data_path, sub_region=None, data_buffer=None, pre_allocate=True, dtype="f"):
        self.fname = fname
        self.data_path = data_path
        self._set_subregion(sub_region)
        if not ((data_buffer is not None) ^ (pre_allocate is True)):
            raise ValueError("Please provide either 'data_buffer' or 'pre_allocate'")
        self.data = data_buffer
        self._loaded = False
        if pre_allocate:
            expected_shape = get_hdf5_dataset_shape(fname, data_path, sub_region=sub_region)
            self.data = np.zeros(expected_shape, dtype=dtype)

    def _set_subregion(self, sub_region):
        self.sub_region = sub_region
        if sub_region is not None:
            start_z, end_z, start_y, end_y, start_x, end_x = sub_region
            self.start_x, self.end_x = start_x, end_x
            self.start_y, self.end_y = start_y, end_y
            self.start_z, self.end_z = start_z, end_z
        else:
            self.start_x, self.end_x = None, None
            self.start_y, self.end_y = None, None
            self.start_z, self.end_z = None, None
    def load_data(self, force_load=False):
        if self._loaded and not force_load:
            return self.data
        with HDF5File(self.fname, "r") as fdesc:
            if self.data is None:
                self.data = fdesc[self.data_path][
                    self.start_z : self.end_z, self.start_y : self.end_y, self.start_x : self.end_x
                ]
            else:
                self.data[:] = fdesc[self.data_path][
                    self.start_z : self.end_z, self.start_y : self.end_y, self.start_x : self.end_x
                ]
        self._loaded = True
        return self.data
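
# --- Illustrative usage sketch (not part of the original module) ---
# Shows how HDF5Loader might load a sub-volume into a pre-allocated array.
# Here sub_region is (start_z, end_z, start_y, end_y, start_x, end_x); the
# file and data paths are hypothetical examples.
def _example_hdf5loader():
    loader = HDF5Loader(
        "reconstruction.h5",
        "/entry/data/data",
        sub_region=(0, 50, None, None, None, None),  # first 50 slices, full (y, x) extent
        pre_allocate=True,
        dtype="f",
    )
    vol = loader.load_data()  # cached; call load_data(force_load=True) to re-read
    return vol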
class ChunkReader:
    """
    A reader of chunks of images.
    """

    def __init__(
        self,
        files,
        sub_region=None,
        detector_corrector=None,
        pre_allocate=True,
        data_buffer=None,
        convert_float=False,
        shape=None,
        dtype=None,
        binning=None,
        dataset_subsampling=None,
        num_threads=None,
    ):
        """
        Initialize a "ChunkReader". A chunk is a stack of images.

        Parameters
        ----------
        files: dict
            Dictionary where the key is the file/data index, and the value is a
            silx.io.url.DataUrl pointing to the data.
            The dict must contain only the files which shall be used!
            Note: the shape and data type are inferred from the first data file.
        sub_region: tuple, optional
            If provided, this must be a tuple in the form
            (start_x, end_x, start_y, end_y). Each image will be cropped to this
            region. This is used to specify a chunk of files.
            Each of the parameters can be None, in which case the default start
            and end are taken in each dimension.
        pre_allocate: bool
            Whether to pre-allocate data before reading.
        data_buffer: array-like, optional
            If `pre_allocate` is set to False, this parameter has to be provided.
            It is an array-like object which will hold the data.
        convert_float: bool
            Whether to convert data to float32, regardless of the input data type.
        shape: tuple, optional
            Shape of each image. If not provided, it is inferred from the first
            image in the collection.
        dtype: `numpy.dtype`, optional
            Data type of each image. If not provided, it is inferred from the
            first image in the collection.
        binning: int or tuple of int, optional
            Whether to bin the data. If multi-dimensional binning is done, the
            parameter must be in the form (binning_x, binning_y). Each image
            will be binned by these factors.
        dataset_subsampling: int or tuple, optional
            Subsampling factor when reading the images.
            If an integer `n` is provided, then one image out of `n` will be read.
            If a tuple of integers (step, begin) is given, the data is read as
            data[begin::step].
        num_threads: int, optional
            Number of threads to use for binning the data.
            Default is to use all available threads. This parameter has no effect
            when binning is disabled.

        Notes
        -----
        The files are provided as a collection of `silx.io.url.DataUrl`. The file
        type is inferred from the extension.

        Binning is different from subsampling. Using binning will not speed up the
        data retrieval (quite the opposite), since the whole (sub-region of) data
        is read before binning is performed.
        """
        self.detector_corrector = detector_corrector
        self._get_reader_class(files)
        self.dataset_subsampling = dataset_subsampling
        self.num_threads = get_num_threads(num_threads)
        self._set_files(files)
        self._get_shape_and_dtype(shape, dtype, binning)
        self._set_subregion(sub_region)
        self._init_reader()
        self._loaded = False
        self.convert_float = convert_float
        if convert_float:
            self.out_dtype = np.float32
        else:
            self.out_dtype = self.dtype
        if not ((data_buffer is not None) ^ (pre_allocate is True)):
            raise ValueError("Please provide either 'data_buffer' or 'pre_allocate'")
        self.files_data = data_buffer
        if data_buffer is not None:
            # overwrite out_dtype
            self.out_dtype = data_buffer.dtype
            if data_buffer.shape != self.chunk_shape:
                raise ValueError("Expected shape %s but got %s" % (self.chunk_shape, data_buffer.shape))
        if pre_allocate:
            self.files_data = np.zeros(self.chunk_shape, dtype=self.out_dtype)
        if (self.binning is not None) and (np.dtype(self.out_dtype).kind in ["u", "i"]):
            raise ValueError(
                "Output datatype cannot be integer when using binning. "
                "Please set the 'convert_float' parameter to True or specify a 'data_buffer'."
            )
    def _set_files(self, files):
        if len(files) == 0:
            raise ValueError("Expected at least one data file")
        self._files_begin_idx = 0
        if isinstance(self.dataset_subsampling, (tuple, list)):
            self._files_begin_idx = self.dataset_subsampling[1]
            self.dataset_subsampling = self.dataset_subsampling[0]
        self.n_files = len(files)
        self.files = files
        self._sorted_files_indices = sorted(files.keys())
        self._fileindex_to_idx = dict.fromkeys(self._sorted_files_indices)
        self._configure_subsampling()

    def _infer_file_type(self, files):
        fname = files[sorted(files.keys())[0]].file_path()
        ext = os.path.splitext(fname)[-1].replace(".", "")
        if ext not in Readers:
            raise ValueError("Unknown file format %s. Supported formats are: %s" % (ext, str(Readers.keys())))
        return ext

    def _get_reader_class(self, files):
        ext = self._infer_file_type(files)
        reader_class = Readers[ext]
        self._reader_class = reader_class

    def _get_shape_and_dtype(self, shape, dtype, binning):
        if shape is None or dtype is None:
            shape, dtype = self._infer_shape_and_dtype()
        assert len(shape) == 2, "Expected the shape of an image (2-tuple)"
        self.shape_total = shape
        self.dtype = dtype
        self._set_binning(binning)

    def _configure_subsampling(self):
        dataset_subsampling = self.dataset_subsampling
        self.files_subsampled = self.files
        if dataset_subsampling is not None and dataset_subsampling > 1:
            self.files_subsampled = subsample_dict(self.files, dataset_subsampling)
            self.n_files = len(self.files_subsampled)
            if not (self._reader_class.multi_load):
                # 3D loading not supported for this reader.
                # Data is loaded frame by frame, so subsample self.files directly
                self.files = self.files_subsampled
                self._sorted_files_indices = sorted(self.files.keys())
                self._fileindex_to_idx = dict.fromkeys(self._sorted_files_indices)

    def _infer_shape_and_dtype(self):
        self._reader_entire_image = self._reader_class()
        first_file_dataurl = self.files[self._sorted_files_indices[0]]
        first_file_data = self._reader_entire_image.get_data(first_file_dataurl)
        return first_file_data.shape, first_file_data.dtype

    def _set_subregion(self, sub_region):
        sub_region = sub_region or (None, None, None, None)
        start_x, end_x, start_y, end_y = sub_region
        if start_x is None:
            start_x = 0
        if start_y is None:
            start_y = 0
        if end_x is None:
            end_x = self.shape_total[1]
        if end_y is None:
            end_y = self.shape_total[0]
        self.sub_region = (start_x, end_x, start_y, end_y)
        self.shape = (end_y - start_y, end_x - start_x)
        if self.binning is not None:
            self.shape = (self.shape[0] // self.binning[1], self.shape[1] // self.binning[0])
        self.chunk_shape = (self.n_files,) + self.shape
        if self.detector_corrector is not None:
            self.detector_corrector.set_sub_region_transformation(target_sub_region=self.sub_region)

    def _init_reader(self):
        # instantiate reader with user params
        if self.detector_corrector is not None:
            adapted_subregion = self.detector_corrector.get_adapted_subregion(self.sub_region)
        else:
            adapted_subregion = self.sub_region
        self.file_reader = self._reader_class(sub_region=adapted_subregion)

    def _set_binning(self, binning):
        if binning is None:
            self.binning = None
            return
        if np.isscalar(binning):
            binning = (binning, binning)
        else:
            assert len(binning) == 2, "Expected binning in the form (binning_x, binning_y)"
        if binning[0] == 1 and binning[1] == 1:
            self.binning = None
            return
        for b in binning:
            if int(b) != b:
                raise ValueError("Expected an integer number for binning values, but got %s" % binning)
        self.binning = binning
    def get_data(self, file_url):
        """
        Get the data associated with a file url.
        """
        arr = self.file_reader.get_data(file_url)
        if arr.ndim == 2:
            if self.detector_corrector is not None:
                arr = self.detector_corrector.transform(arr)
            if self.binning is not None:
                arr = image_binning(arr, self.binning[::-1])
        else:
            if self.detector_corrector is not None:
                (
                    _,
                    (src_x_start, src_x_end, src_z_start, src_z_end),
                ) = self.detector_corrector.get_actual_shapes_source_target()
                arr_target = np.empty([len(arr), src_z_end - src_z_start, src_x_end - src_x_start], "f")

                def apply_corrector(i_img_tuple):
                    i, img = i_img_tuple
                    arr_target[i] = self.detector_corrector.transform(img)

                with ThreadPool(self.num_threads) as tp:
                    tp.map(apply_corrector, enumerate(arr))
                arr = arr_target
            if self.binning is not None:
                nz = arr.shape[0]
                res = np.zeros((nz,) + image_binning(arr[0], self.binning[::-1]).shape, dtype="f")

                def apply_binning(img_res_tuple):
                    img, res = img_res_tuple
                    res[:] = image_binning(img, self.binning[::-1])

                with ThreadPool(self.num_threads) as tp:
                    tp.map(apply_binning, zip(arr, res))
                arr = res
        return arr
    def _load_single(self):
        for i, fileidx in enumerate(self._sorted_files_indices):
            file_url = self.files[fileidx]
            self.files_data[i] = self.get_data(file_url)
            self._fileindex_to_idx[fileidx] = i

    def _load_multi(self):
        urls_compacted = get_compacted_dataslices(
            self.files, subsampling=self.dataset_subsampling, begin=self._files_begin_idx
        )
        loaded = {}
        start_idx = 0
        sorted_files_indices = sorted(urls_compacted.keys())
        for idx in sorted_files_indices:
            url = urls_compacted[idx]
            url_str = str(url)
            is_loaded = loaded.get(url_str, False)
            if is_loaded:
                continue
            ds = url.data_slice()
            delta_z = ds.stop - ds.start
            if ds.step is not None and ds.step > 1:
                delta_z = ceil(delta_z / ds.step)
            end_idx = start_idx + delta_z
            self.files_data[start_idx:end_idx] = self.get_data(url)
            start_idx += delta_z
            loaded[url_str] = True
    def load_files(self, overwrite: bool = False):
        """
        Load the files whose links were provided at class instantiation.

        Parameters
        ----------
        overwrite: bool, optional
            Whether to force reloading the files if already loaded.
        """
        if self._loaded and not (overwrite):
            raise ValueError("Radios were already loaded. Call load_files(overwrite=True) to force reloading")
        if self.file_reader.multi_load:
            self._load_multi()
        else:
            self._load_single()
        self._loaded = True
    load_data = load_files

    @property
    def data(self):
        return self.files_data
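
# --- Illustrative usage sketch (not part of the original module) ---
# Shows how a ChunkReader might be set up to read a cropped, binned chunk of
# projections from a NX/HDF5 dataset. The file/data paths and frame indices
# below are hypothetical examples.
def _example_chunkreader():
    from silx.io.url import DataUrl

    files = {
        i: DataUrl(
            file_path="scan0001.nx",
            data_path="/entry/instrument/detector/data",
            data_slice=i,  # frame index within the 3D dataset
        )
        for i in range(100)
    }
    chunk_reader = ChunkReader(
        files,
        sub_region=(0, 2048, 500, 600),  # (start_x, end_x, start_y, end_y)
        convert_float=True,              # needed here since binning is enabled
        binning=2,
        dataset_subsampling=2,           # read one frame out of two
    )
    chunk_reader.load_files()
    return chunk_reader.data  # 3D numpy array: (n_frames, height, width) after binning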
Readers = {
    "edf": EDFReader,
    "hdf5": HDF5Reader,
    "h5": HDF5Reader,
    "nx": HDF5Reader,
    "npz": NPReader,
    "npy": NPReader,
}
def load_images_from_dataurl_dict(data_url_dict, **chunk_reader_kwargs):
    """
    Load a dictionary of DataUrl into numpy arrays.

    Parameters
    ----------
    data_url_dict: dict
        A dictionary where the keys are integers (the index of each image in the
        dataset), and the values are silx.io.url.DataUrl objects pointing to the
        images.

    Other Parameters
    ----------------
    chunk_reader_kwargs: params
        Named parameters passed to `nabu.io.reader.ChunkReader`.

    Returns
    -------
    res: dict
        A dictionary where the keys are the same as `data_url_dict`, and the
        values are numpy arrays.
    """
    chunk_reader = ChunkReader(data_url_dict, **chunk_reader_kwargs)
    img_dict = {}
    for img_idx, img_url in data_url_dict.items():
        img_dict[img_idx] = chunk_reader.get_data(img_url)
    return img_dict
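
# --- Illustrative usage sketch (not part of the original module) ---
# Shows how load_images_from_dataurl_dict might be used to load a couple of
# flat-field images indexed by their position in the scan. The file/data
# paths and indices are hypothetical examples.
def _example_load_images_from_dataurl_dict():
    from silx.io.url import DataUrl

    flats_urls = {
        idx: DataUrl(
            file_path="scan0001.nx",
            data_path="/entry/instrument/detector/data",
            data_slice=idx,
        )
        for idx in (0, 1000)
    }
    flats = load_images_from_dataurl_dict(flats_urls)
    return flats  # {0: <2D array>, 1000: <2D array>}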
def load_images_stack_from_hdf5(fname, h5_data_path, sub_region=None):
    """
    Load a 3D dataset from a HDF5 file.

    Parameters
    ----------
    fname: str
        File path
    h5_data_path: str
        Data path within the HDF5 file
    sub_region: tuple, optional
        Tuple indicating which sub-volume to load, in the form
        (xmin, xmax, ymin, ymax, zmin, zmax), where the 3D dataset has the
        python shape (N_Z, N_Y, N_X).
        This means that the data will be loaded as
        `data[zmin:zmax, ymin:ymax, xmin:xmax]`.
    """
    xmin, xmax, ymin, ymax, zmin, zmax = get_3D_subregion(sub_region)
    with HDF5File(fname, "r") as f:
        d_ptr = f[h5_data_path]
        data = d_ptr[zmin:zmax, ymin:ymax, xmin:xmax]
    return data
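
# --- Illustrative usage sketch (not part of the original module) ---
# Shows how load_images_stack_from_hdf5 might load the first 100 projections
# of a dataset while keeping the full field of view. The file/data paths are
# hypothetical examples, and the sub_region order follows the docstring above.
def _example_load_images_stack_from_hdf5():
    data = load_images_stack_from_hdf5(
        "scan0001.nx",
        "/entry/instrument/detector/data",
        sub_region=(None, None, None, None, 0, 100),  # (xmin, xmax, ymin, ymax, zmin, zmax)
    )
    return data  # expected shape: (100, N_Y, N_X)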
def get_hdf5_dataset_shape(fname, h5_data_path, sub_region=None):
    zmin, zmax, ymin, ymax, xmin, xmax = get_3D_subregion(sub_region)
    with HDF5File(fname, "r") as f:
        d_ptr = f[h5_data_path]
        shape = d_ptr.shape
    n_z, n_y, n_x = shape
    # perhaps there is a more elegant way
    res_shape = []
    for n, bounds in zip([n_z, n_y, n_x], ((zmin, zmax), (ymin, ymax), (xmin, xmax))):
        res_shape.append(np.arange(n)[bounds[0] : bounds[1]].size)
    return tuple(res_shape)
def check_virtual_sources_exist(fname, data_path):
    with HDF5File(fname, "r") as f:
        if data_path not in f:
            print("No dataset %s in file %s" % (data_path, fname))
            return False
        dptr = f[data_path]
        if not dptr.is_virtual:
            return True
        for vsource in dptr.virtual_sources():
            vsource_fname = os.path.join(os.path.dirname(dptr.file.filename), vsource.file_name)
            if not os.path.isfile(vsource_fname):
                print("No such file: %s" % vsource_fname)
                return False
            elif not check_virtual_sources_exist(vsource_fname, vsource.dset_name):
                print("Error with virtual source %s" % vsource_fname)
                return False
    return True
def import_h5_to_dict(h5file, h5path, asarray=False):
    """
    Wrapper on top of silx.io.dictdump.h5todict, replacing the string "None" with None.

    Parameters
    ----------
    h5file: str
        File name
    h5path: str
        Path in the HDF5 file
    asarray: bool, optional
        Whether to convert each numeric value to a 0D array. Default is False.
    """
    dic = h5todict(h5file, path=h5path, asarray=asarray)
    modified_dic = convert_dict_values(dic, {"None": None}, bytes_tostring=True)
    return modified_dic
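
# --- Illustrative usage sketch (not part of the original module) ---
# Shows how import_h5_to_dict might be used to re-load a configuration group
# that was dumped to HDF5, recovering None values stored as the string "None".
# The file name and HDF5 path below are hypothetical examples.
def _example_import_h5_to_dict():
    options = import_h5_to_dict("nabu_processes.h5", "/entry/process/configuration")
    return options  # plain python dict, with "None" strings converted back to None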