Source code for silx.opencl.sift.plan

#!/usr/bin/env python
#
#
#    Project: Sift implementation in Python + OpenCL
#             https://github.com/silx-kit/silx
#
#    Copyright (C) 2013-2024  European Synchrotron Radiation Facility, Grenoble, France
#
# Permission is hereby granted, free of charge, to any person
# obtaining a copy of this software and associated documentation
# files (the "Software"), to deal in the Software without
# restriction, including without limitation the rights to use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following
# conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.

"""
Contains a class for creating a plan, allocating arrays, compiling kernels and
other things like that...
to calculate SIFT keypoints and descriptors.


This code implements the SIFT algorithm
The SIFT algorithm belongs to the University of British Columbia. It is
protected by patent US6711293. If you are on a country where this pattent
applies (like the USA), please check if you are allowed to use it. The
University of British Columbia does not require a license for its use for
non-commercial research applications.


This algorithm is patented: U.S. Patent 6,711,293:
"Method and apparatus for identifying scale invariant features in an image and use of same for locating an object in an image",
David Lowe's patent for the SIFT algorithm,  Mar. 8, 1999. 
It is due to expire in March 2019. 
"""

__authors__ = ["Jérôme Kieffer", "Pierre Paleo"]
__contact__ = "jerome.kieffer@esrf.eu"
__license__ = "MIT"
__copyright__ = "European Synchrotron Radiation Facility, Grenoble, France"
__date__ = "10/11/2020"
__status__ = "production"

import os
import time
import math
import logging
import gc
import numpy
from .param import par
from silx.opencl import ocl, pyopencl, kernel_workgroup_size
from silx.opencl.utils import get_opencl_code, nextpower
from ..processing import OpenclProcessing, BufferDescription
from .utils import calc_size, kernel_size

logger = logging.getLogger(__name__)


[docs] class SiftPlan(OpenclProcessing): """This class implements a way to calculate SIFT keypoints. How to calculate a set of SIFT keypoint on an image:: siftp = sift.SiftPlan(img.shape,img.dtype,devicetype="GPU") kp = siftp.keypoints(img) kp is a nx132 array. the second dimension is composed of x,y, scale and angle as well as 128 floats describing the keypoint This SIFT algorithm is patented: U.S. Patent 6,711,293: "Method and apparatus for identifying scale invariant features in an image and use of same for locating an object in an image", """ kernels_max_wg_size = { "orientation_cpu": 1, "orientation_gpu": 128, "descriptor_gpu1": (8, 4, 4), "descriptor_gpu2": (8, 8, 8), "descriptor_cpu": (1,), } converter = { numpy.dtype(numpy.uint8): "u8_to_float", numpy.dtype(numpy.uint16): "u16_to_float", numpy.dtype(numpy.uint32): "u32_to_float", numpy.dtype(numpy.uint64): "u64_to_float", numpy.dtype(numpy.int32): "s32_to_float", numpy.dtype(numpy.int64): "s64_to_float", # numpy.dtype(numpy.float64): "double_to_float", } sigmaRatio = 2.0 ** (1.0 / par.Scales) PIX_PER_KP = 10 # pre_allocate buffers for keypoints dtype_kp = numpy.dtype( [ ("x", numpy.float32), ("y", numpy.float32), ("scale", numpy.float32), ("angle", numpy.float32), ("desc", (numpy.uint8, 128)), ] ) def __init__( self, shape=None, dtype=None, template=None, PIX_PER_KP=None, init_sigma=None, ctx=None, devicetype="all", platformid=None, deviceid=None, block_size=None, memory=None, profile=False, ): """ Constructor of the class :param shape: shape of the input image :param dtype: data type of the input image :param devicetype: can be 'CPU' or 'GPU' :param template: extract shape and dtype from an image :param profile: collect timing info :param device: 2-tuple of integers :param PIX_PER_KP: number of keypoint pre-allocated: 1 for 10 pixel :param block_size: set to 1 under macosX on CPU :param context: provide an external context :param init_sigma: blurring width, you should have good reasons to modify the 1.6 default value... """ self.kernels_max_wg_size = self.__class__.kernels_max_wg_size.copy() if template is not None: self.shape = template.shape self.dtype = template.dtype else: self.shape = shape self.dtype = numpy.dtype(dtype) if len(self.shape) == 3: self.RGB = True self.shape = self.shape[:2] elif len(self.shape) == 2: self.RGB = False else: raise RuntimeError( "Unable to process image of shape %s" % ( tuple( self.shape, ) ) ) if PIX_PER_KP: self.PIX_PER_KP = int(PIX_PER_KP) self.kpsize = None if init_sigma is None: init_sigma = par.InitSigma # no test on the values, just make sure it is a float self._init_sigma = float(init_sigma) memory = self._calc_memory(block_size) OpenclProcessing.__init__( self, ctx=ctx, devicetype=devicetype, platformid=platformid, deviceid=deviceid, block_size=block_size, profile=profile, memory=memory, ) # TODO WORKGROUP .... self.kernels_wg = {} self.scales = [] # in XY order self.procsize = ( [] ) # same as procsize but with dimension in (X,Y) not (slow, fast) self.wgsize = [] self.octave_max = None self.red_size = None self._calc_scales() self.LOW_END = 0 self._calc_workgroups() self.compile_kernels() self._allocate_buffers() self.cnt = numpy.empty(1, dtype=numpy.int32) if "CPU" in self.device.type: self.USE_CPU = True else: self.USE_CPU = False def _calc_scales(self): """ Nota scales are in XY order """ shape = self.shape[-1::-1] self.scales = [tuple(numpy.int32(i) for i in shape)] min_size = 2 * par.BorderDist + 2 while min(shape) > min_size: shape = tuple(numpy.int32(i // 2) for i in shape) self.scales.append(shape) self.scales.pop() self.octave_max = len(self.scales) def _calc_memory(self, block_size=None): """ Estimates the memory footprint of all buffer to ensure it fits on the device """ block_size = int(block_size) if block_size else 4096 # upper limit # Just the context + kernel takes about 75MB on the GPU memory = 75 * 2**20 size_of_float = numpy.dtype(numpy.float32).itemsize size_of_input = numpy.dtype(self.dtype).itemsize # raw images: size = self.shape[0] * self.shape[1] memory += size * size_of_input # initial_image (no raw_float) if self.RGB: memory += 2 * size * (size_of_input) # one of three was already counted nr_blur = par.Scales + 3 # 3 blurs and 2 tmp nr_dogs = par.Scales + 2 memory += size * (nr_blur + nr_dogs) * size_of_float self.kpsize = int( self.shape[0] * self.shape[1] // self.PIX_PER_KP ) # Is the number of kp independant of the octave ? int64 causes problems with pyopencl memory += ( self.kpsize * size_of_float * 4 * 2 ) # those are array of float4 to register keypoints, we need two of them memory += self.kpsize * 128 # stores the descriptors: 128 unsigned chars memory += 4 # keypoint index Counter wg_float = min(block_size, numpy.sqrt(self.shape[0] * self.shape[1])) self.red_size = nextpower(wg_float) memory += 4 * 2 * self.red_size # temporary storage for reduction ######################################################################## # Calculate space for gaussian kernels ######################################################################## curSigma = 1.0 if par.DoubleImSize else 0.5 if self._init_sigma > curSigma: sigma = math.sqrt(self._init_sigma**2 - curSigma**2) size = kernel_size(sigma, True) logger.debug("pre-Allocating %s float for init blur" % size) memory += size * size_of_float prevSigma = self._init_sigma for _ in range(par.Scales + 2): increase = prevSigma * math.sqrt(self.sigmaRatio**2 - 1.0) size = kernel_size(increase, True) logger.debug( "pre-Allocating %s float for blur sigma: %s" % (size, increase) ) memory += size * size_of_float prevSigma *= self.sigmaRatio # self.memory = memory return memory def _allocate_buffers(self): """ All buffers are allocated here """ shape = self.shape buffers = [ BufferDescription("min", 1, numpy.float32, None), BufferDescription("max", 1, numpy.float32, None), BufferDescription("255", 1, numpy.float32, None), BufferDescription("cnt", 1, numpy.int32, None), BufferDescription("Kp_1", (self.kpsize, 4), numpy.float32, None), BufferDescription("Kp_2", (self.kpsize, 4), numpy.float32, None), BufferDescription("descr", (self.kpsize, 128), numpy.uint8, None), BufferDescription("descriptors", (self.kpsize, 128), numpy.uint8, None), BufferDescription("tmp", shape, numpy.float32, None), BufferDescription("ori", shape, numpy.float32, None), BufferDescription( "DoGs", (par.Scales + 2, shape[0], shape[1]), numpy.float32, None ), BufferDescription( "max_min", (self.red_size, 2), numpy.float32, None ), # temporary buffer for max/min reduction ] if self.dtype != numpy.float32: if self.RGB: rgbshape = self.shape[0], self.shape[1], 3 buffers.append(BufferDescription("raw", rgbshape, self.dtype, None)) else: buffers.append(BufferDescription("raw", shape, self.dtype, None)) for scale in range(par.Scales + 3): buffers.append( BufferDescription("scale_%i" % scale, shape, numpy.float32, None) ) self.allocate_buffers(buffers, use_array=True) self.cl_mem["255"].fill(255.0) ######################################################################## # Allocate space for gaussian kernels ######################################################################## curSigma = 1.0 if par.DoubleImSize else 0.5 if self._init_sigma > curSigma: sigma = math.sqrt(self._init_sigma**2 - curSigma**2) self._init_gaussian(sigma) prevSigma = self._init_sigma for _ in range(par.Scales + 2): increase = prevSigma * math.sqrt(self.sigmaRatio**2 - 1.0) self._init_gaussian(increase) prevSigma *= self.sigmaRatio def _init_gaussian(self, sigma): """Create a buffer of the right size according to the width of the gaussian ... :param sigma: width of the gaussian, the length of the function will be 8*sigma + 1 Same calculation done on CPU x = numpy.arange(size) - (size - 1.0) / 2.0 gaussian = numpy.exp(-(x / sigma) ** 2 / 2.0).astype(numpy.float32) gaussian /= gaussian.sum(dtype=numpy.float32) """ pyopencl.enqueue_barrier(self.queue).wait() name = "gaussian_%s" % sigma size = kernel_size(sigma, True) wg_size = nextpower(size) logger.info( "Allocating %s float for blur sigma: %s. wg=%s max_wg=%s", size, sigma, wg_size, self.block_size, ) wg1 = self.kernels_wg["gaussian"] if wg1 >= wg_size: gaussian_gpu = pyopencl.array.empty(self.queue, size, dtype=numpy.float32) pyopencl.enqueue_barrier(self.queue).wait() kernel = self.kernels.get_kernel("gaussian") shm1 = pyopencl.LocalMemory(4 * wg_size) shm2 = pyopencl.LocalMemory(4 * wg_size) evt = kernel( self.queue, (wg_size,), (wg_size,), gaussian_gpu.data, numpy.float32(sigma), # const float sigma, numpy.int32(size), # const int SIZE shm1, shm2, ) # some shared memory pyopencl.enqueue_barrier(self.queue).wait() if self.profile: self.events.append(("gaussian %s" % sigma, evt)) else: logger.info( "Workgroup size error: gaussian wg: %s < max_work_group_size: %s", wg1, self.block_size, ) # common bug on OSX when running on CPU x = numpy.arange(size) - (size - 1.0) / 2.0 gaus = numpy.exp(-((x / sigma) ** 2) / 2.0).astype(numpy.float32) gaus /= gaus.sum(dtype=numpy.float32) gaussian_gpu = pyopencl.array.to_device(self.queue, gaus) self.cl_mem[name] = gaussian_gpu return gaussian_gpu
[docs] def compile_kernels(self): """Call the OpenCL compiler TODO: use the parameters to define the compile-time constants and use them all in kernels. """ to_compile = [ "sift", "convolution", "preprocess", "algebra", "image", "gaussian", "reductions", "memset", ] to_compile += list(self.kernels_max_wg_size.keys()) compile_options = "-D WORKGROUP_SIZE=%s" % self.block_size try: OpenclProcessing.compile_kernels( self, [os.path.join("sift", kernel) for kernel in to_compile], compile_options=compile_options, ) except Exception as err: logger.error("error while compiling sift: %s", err) else: for kn in self.kernels.get_kernels(): res = self.check_workgroup_size(kn) self.kernels_wg[kn] = min(res, self.block_size)
def _free_kernels(self): """free all kernels""" self.programs = {} def _calc_workgroups(self): """First try to guess the best workgroup size, then calculate all global worksize Nota: The workgroup size is limited by the device, some devices report wrong size. The workgroup size is limited to the 2**n below then image size (hence changes with octaves) The second dimension of the wg size should be large, the first small: i.e. (1,64) The processing size should be a multiple of workgroup size. """ device = self.ctx.devices[0] max_work_item_sizes = device.max_work_item_sizes if self.block_size: self.block_size = min(max_work_item_sizes[0], self.block_size) else: self.block_size = max_work_item_sizes[0] # MacOSX driver on CPU usually reports bad workgroup size: this is addressed in ocl self.block_size = min(self.block_size, self.device.max_work_group_size) for k, v in self.kernels_max_wg_size.items(): if isinstance(v, int): self.kernels_wg[k] = min(v, self.block_size) else: # probably a list prod = numpy.prod(v) if prod <= self.block_size: self.kernels_wg[k] = prod # else it is not possible to run this kernel. # If the kernel is not present in the dict, it should not be used. wg_float = min(self.block_size, numpy.sqrt(self.shape[0] * self.shape[1])) self.red_size = nextpower(wg_float) # we recalculate the shapes ... shape = self.shape min_size = 2 * par.BorderDist + 2 while min(shape) > min_size: wg = (min(nextpower(shape[-1]), self.block_size), 1) self.wgsize.append(wg) self.procsize.append(calc_size(shape[-1::-1], wg)) shape = tuple(i // 2 for i in shape)
[docs] def keypoints(self, image, mask=None): """Calculates the keypoints of the image TODO: use a temporary list with events and use a single test at the end :param image: ndimage of 2D (or 3D if RGB) :param mask: TODO: implement a mask for sieving out the keypoints :return: vector of keypoint (1D numpy array) """ # self.reset_timer() with self.sem: total_size = 0 keypoints = [] descriptors = [] assert image.shape[:2] == self.shape assert image.dtype in [self.dtype, numpy.float32] # old versions of pyopencl do not check for data contiguity if not (isinstance(image, pyopencl.array.Array)) and not ( image.flags["C_CONTIGUOUS"] ): image = numpy.ascontiguousarray(image) t0 = time.time() if image.dtype == numpy.float32: if isinstance(image, pyopencl.array.Array): evt = pyopencl.enqueue_copy( self.queue, self.cl_mem["scale_0"].data, image.data ) else: evt = pyopencl.enqueue_copy( self.queue, self.cl_mem["scale_0"].data, image ) if self.profile: self.events.append(("copy H->D", evt)) elif self.dtype == numpy.float64: # A preprocessing kernel double_to_float exists, but is commented (RUNS ONLY ON GPU WITH FP64) # TODO: benchmark this kernel vs the current pure CPU format conversion with numpy.float32 # and uncomment it if it proves faster (dubious, because of data transfer bottleneck) evt = pyopencl.enqueue_copy( self.queue, self.cl_mem["scale_0"].data, image.astype(numpy.float32) ) if self.profile: self.events.append(("copy H->D", evt)) elif ( (len(image.shape) == 3) and (image.dtype == numpy.uint8) and (self.RGB) ): if isinstance(image, pyopencl.array.Array): evt = pyopencl.enqueue_copy( self.queue, self.cl_mem["raw"].data, image.data ) else: evt = pyopencl.enqueue_copy( self.queue, self.cl_mem["raw"].data, image ) if self.profile: self.events.append(("copy H->D", evt)) evt = self.kernels.get_kernel("rgb_to_float")( self.queue, self.procsize[0], self.wgsize[0], self.cl_mem["raw"].data, self.cl_mem["scale_0"].data, *self.scales[0], ) if self.profile: self.events.append(("RGB -> float", evt)) elif self.dtype in self.converter: program = self.kernels.get_kernel(self.converter[self.dtype]) evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["raw"].data, image) if self.profile: self.events.append(("copy H->D", evt)) evt = program( self.queue, self.procsize[0], self.wgsize[0], self.cl_mem["raw"].data, self.cl_mem["scale_0"].data, *self.scales[0], ) if self.profile: self.events.append(("convert -> float", evt)) else: raise RuntimeError( "invalid input format error (%s)" % (str(self.dtype)) ) wg1 = self.kernels_wg["max_min_global_stage1"] wg2 = self.kernels_wg["max_min_global_stage2"] if min(wg1, wg2) < self.red_size: # common bug on OSX when running on CPU logger.info( "Unable to use MinMax Reduction: stage1 wg: %s; stage2 wg: %s < max_work_group_size: %s, expected: %s", wg1, wg2, self.block_size, self.red_size, ) kernel = self.kernels.get_kernel("max_min_vec16") k = kernel( self.queue, (1,), (1,), self.cl_mem["scale_0"].data, numpy.int32(self.shape[0] * self.shape[1]), self.cl_mem["max"].data, self.cl_mem["min"].data, ) if self.profile: self.events.append(("max_min_serial", k)) # python implementation: # buffer_ = self.cl_mem["scale_0"].get() # self.cl_mem["max"].set(numpy.array([buffer_.max()], dtype=numpy.float32)) # self.cl_mem["min"].set(numpy.array([buffer_.min()], dtype=numpy.float32)) else: kernel1 = self.kernels.get_kernel("max_min_global_stage1") kernel2 = self.kernels.get_kernel("max_min_global_stage2") # logger.debug("self.red_size: %s", self.red_size) shm = pyopencl.LocalMemory(self.red_size * 2 * 4) k1 = kernel1( self.queue, (self.red_size * self.red_size,), (self.red_size,), self.cl_mem["scale_0"].data, self.cl_mem["max_min"].data, numpy.int32(self.shape[0] * self.shape[1]), shm, ) k2 = kernel2( self.queue, (self.red_size,), (self.red_size,), self.cl_mem["max_min"].data, self.cl_mem["max"].data, self.cl_mem["min"].data, shm, ) if self.profile: self.events.append(("max_min_stage1", k1)) self.events.append(("max_min_stage2", k2)) evt = self.kernels.get_kernel("normalizes")( self.queue, self.procsize[0], self.wgsize[0], self.cl_mem["scale_0"].data, self.cl_mem["min"].data, self.cl_mem["max"].data, self.cl_mem["255"].data, *self.scales[0], ) if self.profile: self.events.append(("normalize", evt)) curSigma = 1.0 if par.DoubleImSize else 0.5 octave = 0 if self._init_sigma > curSigma: logger.debug("Bluring image to achieve std: %f", self._init_sigma) sigma = math.sqrt(self._init_sigma**2 - curSigma**2) self._gaussian_convolution( self.cl_mem["scale_0"], self.cl_mem["scale_0"], sigma, 0 ) for octave in range(self.octave_max): kp, descriptor = self._one_octave(octave) logger.info("in octave %i found %i kp" % (octave, kp.shape[0])) if len(kp): # sieve out coordinates with NaNs mask = numpy.where(numpy.logical_not(numpy.isnan(kp.sum(axis=-1)))) keypoints.append(kp[mask]) descriptors.append(descriptor[mask]) total_size += len(mask[0]) ######################################################################## # Merge keypoints in central memory ######################################################################## output = numpy.recarray(shape=(total_size,), dtype=self.dtype_kp) last = 0 for ds, desc in zip(keypoints, descriptors): l = ds.shape[0] if l > 0: output[last : last + l].x = ds[:, 0] output[last : last + l].y = ds[:, 1] output[last : last + l].scale = ds[:, 2] output[last : last + l].angle = ds[:, 3] output[last : last + l].desc = desc last += l logger.info("Execution time: %.3fms" % (1000 * (time.time() - t0))) return output
__call__ = keypoints def _gaussian_convolution(self, input_data, output_data, sigma, octave=0): """ Calculate the gaussian convolution with precalculated kernels. :param input_data: pyopencl array with input :param output_data: pyopencl array with result :param sigma: width of the gaussian :param octave: related to the size on the input images * Uses a temporary buffer * Needs gaussian kernel to be available on device """ temp_data = self.cl_mem["tmp"] gaussian = self.cl_mem["gaussian_%s" % sigma] k1 = self.kernels.get_kernel("horizontal_convolution")( self.queue, self.procsize[octave], self.wgsize[octave], input_data.data, temp_data.data, gaussian.data, numpy.int32(gaussian.size), *self.scales[octave], ) k2 = self.kernels.get_kernel("vertical_convolution")( self.queue, self.procsize[octave], self.wgsize[octave], temp_data.data, output_data.data, gaussian.data, numpy.int32(gaussian.size), *self.scales[octave], ) if self.profile: self.events += [ ("Blur sigma %s octave %s" % (sigma, octave), k1), ("Blur sigma %s octave %s" % (sigma, octave), k2), ] def _one_octave(self, octave): """ Does all scales within an octave :param octave: number of the octave """ prevSigma = self._init_sigma logger.info("Calculating octave %i" % octave) wgsize = (128,) # (max(self.wgsize[octave]),) #TODO: optimize kpsize32 = numpy.int32(self.kpsize) self._reset_keypoints() octsize = numpy.int32(2**octave) last_start = numpy.int32(0) for scale in range(par.Scales + 2): sigma = prevSigma * math.sqrt(self.sigmaRatio**2 - 1.0) logger.info( "Octave %i scale %s blur with sigma %s" % (octave, scale, sigma) ) ######################################################################## # Calculate gaussian blur and DoG ######################################################################## self._gaussian_convolution( self.cl_mem["scale_%i" % scale], self.cl_mem["scale_%i" % (scale + 1)], sigma, octave, ) prevSigma *= self.sigmaRatio evt = self.kernels.get_kernel("combine")( self.queue, self.procsize[octave], self.wgsize[octave], self.cl_mem["scale_%i" % (scale + 1)].data, numpy.float32(-1.0), self.cl_mem["scale_%i" % (scale)].data, numpy.float32(+1.0), self.cl_mem["DoGs"].data, numpy.int32(scale), *self.scales[octave], ) if self.profile: self.events.append(("DoG %s %s" % (octave, scale), evt)) for scale in range(1, par.Scales + 1): evt = self.kernels.get_kernel("local_maxmin")( self.queue, self.procsize[octave], self.wgsize[octave], self.cl_mem["DoGs"].data, # __global float* DOGS, self.cl_mem["Kp_1"].data, # __global keypoint* output, numpy.int32(par.BorderDist), # int border_dist, numpy.float32(par.PeakThresh), # float peak_thresh, octsize, # int octsize, numpy.float32(par.EdgeThresh1), # float EdgeThresh0, numpy.float32(par.EdgeThresh), # float EdgeThresh, self.cl_mem["cnt"].data, # __global int* counter, kpsize32, # int nb_keypoints, numpy.int32(scale), # int scale, *self.scales[octave], ) # int width, int height) if self.profile: self.events.append(("local_maxmin %s %s" % (octave, scale), evt)) procsize = calc_size((self.kpsize,), wgsize) cp_evt = pyopencl.enqueue_copy( self.queue, self.cnt, self.cl_mem["cnt"].data ) # TODO: modify interp_keypoint so that it reads end_keypoint from GPU memory evt = self.kernels.get_kernel("interp_keypoint")( self.queue, procsize, wgsize, self.cl_mem["DoGs"].data, # __global float* DOGS, self.cl_mem["Kp_1"].data, # __global keypoint* keypoints, last_start, # int start_keypoint, self.cnt[0], # int end_keypoint, numpy.float32(par.PeakThresh), # float peak_thresh, numpy.float32(self._init_sigma), # float InitSigma, *self.scales[octave], ) # int width, int height) if self.profile: self.events += [ ("get cnt", cp_evt), ("interp_keypoint %s %s" % (octave, scale), evt), ] newcnt = self._compact(last_start) evt = self.kernels.get_kernel("compute_gradient_orientation")( self.queue, self.procsize[octave], self.wgsize[octave], self.cl_mem["scale_%s" % (scale)].data, # __global float* igray, self.cl_mem["tmp"].data, # __global float *grad, self.cl_mem["ori"].data, # __global float *ori, *self.scales[octave], ) # int width,int height if self.profile: self.events.append( ("compute_gradient_orientation %s %s" % (octave, scale), evt) ) # Orientation assignement: 1D kernel, rather heavy kernel if ( newcnt and newcnt > last_start ): # launch kernel only if neededwgsize = (128,) if self.USE_CPU: orientation_name = "orientation_cpu" scales = self.scales[octave] else: orientation_name = "orientation_gpu" scales = list(self.scales[octave]) + [ pyopencl.LocalMemory(36 * 4), pyopencl.LocalMemory(128 * 4), pyopencl.LocalMemory(128 * 4), ] orientation = self.kernels.get_kernel(orientation_name) wg = self.kernels_max_wg_size[orientation_name] wgsize2 = (wg,) procsize = (int(newcnt * wg),) evt = orientation( self.queue, procsize, wgsize2, self.cl_mem["Kp_1"].data, # __global keypoint* keypoints, self.cl_mem["tmp"].data, # __global float* grad, self.cl_mem["ori"].data, # __global float* ori, self.cl_mem["cnt"].data, # __global int* counter, octsize, # int octsize, numpy.float32( par.OriSigma ), # float OriSigma, //WARNING: (1.5), it is not "InitSigma (=1.6)" kpsize32, # int max of nb_keypoints, numpy.int32(last_start), # int keypoints_start, newcnt, # int keypoints_end, *scales, ) # int grad_width, int grad_height) # newcnt = self.cl_mem["cnt"].get()[0] #do not forget to update numbers of keypoints, modified above ! evt_cp = pyopencl.enqueue_copy( self.queue, self.cnt, self.cl_mem["cnt"].data ) newcnt = self.cnt[ 0 ] # do not forget to update numbers of keypoints, modified above ! for _ in range(3): # up to 3 attempts if self.USE_CPU or (self.LOW_END > 1): logger.info("Computing descriptors with CPU optimized kernels") descriptor_name = "descriptor_cpu" wg = self.kernels_max_wg_size[descriptor_name][0] wgsize2 = (wg,) procsize2 = (int(newcnt * wg),) else: if self.LOW_END: logger.info( "Computing descriptors with older-GPU optimized kernels" ) descriptor_name = "descriptor_gpu1" wgsize2 = self.kernels_max_wg_size[descriptor_name] procsize2 = ( int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2], ) # if self.kernels_wg[descriptor_name] < numpy.prod(wgsize2): # # will fail anyway: # self.LOW_END += 1 # continue else: logger.info( "Computing descriptors with newer-GPU optimized kernels" ) descriptor_name = "descriptor_gpu2" wgsize2 = self.kernels_max_wg_size[descriptor_name] procsize2 = ( int(newcnt * wgsize2[0]), wgsize2[1], wgsize2[2], ) # if self.kernels_wg[descriptor_name] < numpy.prod(wgsize2): # # will fail anyway: # self.LOW_END += 1 # continue try: descriptor = self.kernels.get_kernel(descriptor_name) evt2 = descriptor( self.queue, procsize2, wgsize2, self.cl_mem["Kp_1"].data, # __global keypoint* keypoints, self.cl_mem[ "descriptors" ].data, # ___global unsigned char *descriptors self.cl_mem["tmp"].data, # __global float* grad, self.cl_mem["ori"].data, # __global float* ori, octsize, # int octsize, numpy.int32(last_start), # int keypoints_start, self.cl_mem["cnt"].data, # int* keypoints_end, *self.scales[octave], ) # int grad_width, int grad_height) evt2.wait() except (pyopencl.RuntimeError, pyopencl._cl.LogicError) as error: self.LOW_END += 1 logger.error( "Descriptor failed with %s. Switching to lower_end mode" % error ) continue else: break if self.profile: self.events += [ ("%s %s %s" % (orientation_name, octave, scale), evt), ("copy cnt D->H", evt_cp), ("%s %s %s" % (descriptor_name, octave, scale), evt2), ] evt_cp = pyopencl.enqueue_copy( self.queue, self.cnt, self.cl_mem["cnt"].data ) last_start = self.cnt[0] if self.profile: self.events.append(("copy cnt D->H", evt_cp)) ######################################################################## # Rescale all images to populate all octaves ######################################################################## if octave < self.octave_max - 1: evt = self.kernels.get_kernel("shrink")( self.queue, self.procsize[octave + 1], self.wgsize[octave + 1], self.cl_mem["scale_%i" % (par.Scales)].data, self.cl_mem["scale_0"].data, numpy.int32(2), numpy.int32(2), self.scales[octave][0], self.scales[octave][1], *self.scales[octave + 1], ) if self.profile: self.events.append( ( "shrink %s->%s" % (self.scales[octave], self.scales[octave + 1]), evt, ) ) results = numpy.empty((last_start, 4), dtype=numpy.float32) descriptors = numpy.empty((last_start, 128), dtype=numpy.uint8) if last_start: evt = pyopencl.enqueue_copy(self.queue, results, self.cl_mem["Kp_1"].data) evt2 = pyopencl.enqueue_copy( self.queue, descriptors, self.cl_mem["descriptors"].data ) if self.profile: self.events += [("copy D->H", evt), ("copy D->H", evt2)] return results, descriptors def _compact(self, start=numpy.int32(0)): """ Compact the vector of keypoints starting from start :param start: start compacting at this adress. Before just copy :type start: numpy.int32 """ wgsize = (self.kernels_wg["compact"],) cp0_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data) kp_counter = self.cnt[0] procsize = calc_size((self.kpsize,), wgsize) if kp_counter > 0.9 * self.kpsize: logger.warning( "Keypoint counter overflow risk: counted %s / %s" % (kp_counter, self.kpsize) ) logger.info("Compact %s -> %s / %s" % (start, kp_counter, self.kpsize)) self.cnt[0] = start cp1_evt = pyopencl.enqueue_copy(self.queue, self.cl_mem["cnt"].data, self.cnt) evt = self.kernels.get_kernel("compact")( self.queue, procsize, wgsize, self.cl_mem["Kp_1"].data, # __global keypoint* keypoints, self.cl_mem["Kp_2"].data, # __global keypoint* output, self.cl_mem["cnt"].data, # __global int* counter, start, # int start, kp_counter, ) # int nbkeypoints cp2_evt = pyopencl.enqueue_copy(self.queue, self.cnt, self.cl_mem["cnt"].data) # swap keypoints: self.cl_mem["Kp_1"], self.cl_mem["Kp_2"] = ( self.cl_mem["Kp_2"], self.cl_mem["Kp_1"], ) # memset buffer Kp_2 # self.cl_mem["Kp_2"].fill(-1, self.queue) mem_evt = self.kernels.get_kernel("memset_float")( self.queue, calc_size((4 * self.kpsize,), wgsize), wgsize, self.cl_mem["Kp_2"].data, numpy.float32(-1), numpy.int32(4 * self.kpsize), ) if self.profile: self.events += [ ("copy cnt D->H", cp0_evt), ("copy cnt H->D", cp1_evt), ("compact", evt), ("copy cnt D->H", cp2_evt), ("memset 2", mem_evt), ] return self.cnt[0] def _reset_keypoints(self): """ Todo: implement directly in OpenCL instead of relying on pyOpenCL """ wg_size = (self.kernels_wg["memset_float"],) evt1 = self.kernels.get_kernel("memset_float")( self.queue, calc_size((4 * self.kpsize,), wg_size), wg_size, self.cl_mem["Kp_1"].data, numpy.float32(-1), numpy.int32(4 * self.kpsize), ) # evt2 = self.kernels.get_kernel("memset"].memset_float(self.queue, calc_size((4 * self.kpsize,), wg_size), wg_size, self.cl_mem["Kp_2"].data, numpy.float32(-1), numpy.int32(4 * self.kpsize)) evt3 = self.kernels.get_kernel("memset_int")( self.queue, (1,), (1,), self.cl_mem["cnt"].data, numpy.int32(0), numpy.int32(1), ) if self.profile: self.events += [("memset 1", evt1), ("memset cnt", evt3)] # self.cl_mem["Kp_1"].fill(-1, self.queue) # self.cl_mem["Kp_2"].fill(-1, self.queue) # self.cl_mem["cnt"].fill(0, self.queue)
[docs] def count_kp(self, output): """ Print the number of keypoint per octave """ kpt = 0 for octave, data in enumerate(output): if output.shape[0] > 0: ksum = (data[:, 1] != -1.0).sum() kpt += ksum print( "octave %i kp count %i/%i size %s ratio:%s" % ( octave, ksum, self.kpsize, self.scales[octave], 1000.0 * ksum / self.scales[octave][1] / self.scales[octave][0], ) ) print( "Found total %i guess %s pixels per keypoint" % (kpt, self.shape[0] * self.shape[1] / kpt) )
def demo(): # Prepare debugging from scipy.datasets import ascent img = ascent() s = SiftPlan(template=img) print(s.keypoints(img)) if __name__ == "__main__": demo()