Bartzi · June 12, 2017 08:57
diff --git a/bilinear_interpolation.py b/bilinear_interpolation.py
 from chainer import Function
 from chainer import cuda
 from chainer.utils import type_check, force_array


 def cupy_place(arr, mask, val):
    _place_kernel = cuda.cupy.ElementwiseKernel(
        'T a, S mask, T val',
        'T out',
        'out = mask ? val : a',
        'cupy_place')

    val = cuda.cupy.array(val, dtype=arr.dtype)
    out = cuda.cupy.empty_like(arr)
    return _place_kernel(arr, mask, val, out)


 def apply_with_mask(arr, mask, elementwise_func, *args):
    value = elementwise_func(arr, *args)
    return cupy_place(arr, mask, value)


 class BilinearInterpolation(Function):

    def __init__(self, out_height, out_width):
        self.out_height = out_height
        self.out_width = out_width
        self.x_max = None
        self.y_max = None
        self.x_min = None
        self.y_min = None

    def check_type_forward(self, in_types):
        type_check.expect(in_types.size() == 2)
        x_type, points_type = in_types
        type_check.expect(
            x_type.ndim == 3,
            x_type.dtype.kind == 'f',
            points_type.ndim == 3,
            points_type.dtype.kind == 'f',
            points_type.shape[-1] == self.out_width * self.out_height,
        )

    def forward_cpu(self, inputs):
        xp = cuda.get_array_module(*inputs)
        U, points = inputs
        batch_size, height, width = U.shape

        # resize grid to be in image dimensions
        points = (points + 1) / 2
        points[:, 0] *= width
        points[:, 1] *= height

        x_s = points[:, 0]
        y_s = points[:, 1]

        # determine min indices
        self.x_min = xp.clip(xp.floor(x_s * 0.999999), 0, width - 1).astype(xp.int32)
        self.y_min = xp.clip(xp.floor(y_s * 0.999999), 0, width - 1).astype(xp.int32)

        # determine max indices
        self.x_max = xp.clip((self.x_min + 1), 0, width - 1).astype(xp.int32)
        self.y_max = xp.clip((self.y_min + 1), 0, height - 1).astype(xp.int32)

        # remove points outside of image
        x_invalid = xp.logical_or(x_s < 0, (width - 1) < x_s)
        y_invalid = xp.logical_or(y_s < 0, (height - 1) < y_s)
        self.invalid = xp.logical_or(x_invalid, y_invalid)

        self.left_weight = 1.0 - (x_s - self.x_min).astype(U.dtype)
        self.top_weight = 1.0 - (y_s - self.y_min).astype(U.dtype)
        self.right_weight = 1.0 - self.left_weight.astype(U.dtype)
        self.bottom_weight = 1.0 - self.top_weight.astype(U.dtype)

        self.left_weight[self.invalid] = 0
        self.top_weight[self.invalid] = 0
        self.right_weight[self.invalid] = 0
        self.bottom_weight[self.invalid] = 0

        batch_axis = xp.expand_dims(xp.arange(batch_size), 1)
        top_interpolated = self.left_weight * U[batch_axis, self.y_min, self.x_min] + self.right_weight * U[batch_axis, self.y_min, self.x_max]
        bottom_interpolated = self.left_weight * U[batch_axis, self.y_max, self.x_min] + self.right_weight * U[batch_axis, self.y_max, self.x_max]

        V = self.top_weight * top_interpolated + self.bottom_weight * bottom_interpolated
        V = V.reshape((batch_size, self.out_height, self.out_width))

        return force_array(V, dtype=U.dtype),

    def forward_gpu(self, inputs):
        xp = cuda.get_array_module(*inputs)
        U, points = inputs
        batch_size, height, width = U.shape

        # resize grid to be in image dimensions
        points = (points + 1) / 2
        points[:, 0] *= width
        points[:, 1] *= height

        x_s = points[:, 0]
        y_s = points[:, 1]

        # move points on boundary a little bit inward to ensure correct computation
        # apply masking twice in order to avoid race conditions
        x_s_copy = x_s.copy()
        y_s_copy = y_s.copy()

        on_boundary = (x_s == 0)
        replacement = apply_with_mask(x_s_copy, on_boundary, xp.nextafter, xp.array(1.0).astype(x_s.dtype))
        x_s = cupy_place(x_s_copy, on_boundary, replacement)

        on_boundary = (x_s == width - 1)
        replacement = apply_with_mask(x_s_copy, on_boundary, xp.nextafter, xp.array(0.0).astype(x_s.dtype))
        x_s = cupy_place(x_s_copy, on_boundary, replacement)

        on_boundary = (y_s == 0)
        replacement = apply_with_mask(y_s_copy, on_boundary, xp.nextafter, xp.array(1.0).astype(y_s.dtype))
        y_s = cupy_place(y_s_copy, on_boundary, replacement)

        on_boundary = (y_s == width - 1)
        replacement = apply_with_mask(y_s_copy, on_boundary, xp.nextafter, xp.array(0.0).astype(y_s.dtype))
        y_s = cupy_place(y_s_copy, on_boundary, replacement)

        # determine min indices
        self.x_min = xp.clip(xp.floor(x_s), 0, width - 1).astype(xp.int32)
        self.y_min = xp.clip(xp.floor(y_s), 0, width - 1).astype(xp.int32)

        # determine max indices
        self.x_max = xp.clip((self.x_min + 1), 0, width - 1).astype(xp.int32)
        self.y_max = xp.clip((self.y_min + 1), 0, height - 1).astype(xp.int32)

        # remove points outside of image
        x_invalid = xp.logical_or(x_s < 0, (width - 1) < x_s)
        y_invalid = xp.logical_or(y_s < 0, (height - 1) < y_s)
        self.invalid = xp.logical_or(x_invalid, y_invalid)

        self.left_weight = 1.0 - (x_s - self.x_min).astype(U.dtype)
        self.top_weight = 1.0 - (y_s - self.y_min).astype(U.dtype)
        self.right_weight = 1.0 - self.left_weight.astype(U.dtype)
        self.bottom_weight = 1.0 - self.top_weight.astype(U.dtype)

        replacement = xp.zeros_like(self.invalid, dtype=U.dtype)
        self.left_weight = cupy_place(self.left_weight.copy(), self.invalid, replacement.copy())
        self.top_weight = cupy_place(self.top_weight.copy(), self.invalid, replacement.copy())
        self.right_weight = cupy_place(self.right_weight.copy(), self.invalid, replacement.copy())
        self.bottom_weight = cupy_place(self.bottom_weight.copy(), self.invalid, replacement.copy())

        interpolate_kernel = cuda.cupy.ElementwiseKernel(
            'raw T U, T left, T right, T top, T bottom, int32 x_min, int32 y_min, int32 x_max, int32 y_max, int32 pixels_per_batch, int32 width, int32 height',
            'T V',
            '''
                int batchNum = i / pixels_per_batch;
                int batchOffset = batchNum * width * height;

                T top_left = left * U[batchOffset + y_min * width + x_min];
                T top_right = right * U[batchOffset + y_min * width + x_max];
                T bottom_left = left * U[batchOffset + y_max * width + x_min];
                T bottom_right = right * U[batchOffset + y_max * width + x_max];

                V = top * (top_left + top_right) + bottom * (bottom_left + bottom_right);
            ''',
            'interpolation_kernel'
        )

        V = interpolate_kernel(
            xp.ravel(U),
            self.left_weight,
            self.right_weight,
            self.top_weight,
            self.bottom_weight,
            self.x_min,
            self.y_min,
            self.x_max,
            self.y_max,
            self.out_height * self.out_width,
            width,
            height,
        )

        V = V.reshape((batch_size, self.out_height, self.out_width))
        return force_array(V, dtype=U.dtype),

    def backward_cpu(self, inputs, gradients):
        xp = cuda.get_array_module(*inputs)
        U, points = inputs
        gV = gradients[0]
        
        batch_size, height, width = U.shape
        flattened_gV = xp.reshape(xp.ravel(gV), (batch_size, -1))

        gU = xp.zeros((batch_size, height * width), dtype=U.dtype)

        batch_axis = xp.expand_dims(xp.arange(batch_size), 1)

        U_tl = U[batch_axis, self.y_min, self.x_min]
        U_tr = U[batch_axis, self.y_min, self.x_max]
        U_bl = U[batch_axis, self.y_max, self.x_min]
        U_br = U[batch_axis, self.y_max, self.x_max]

        gx = self.top_weight * (U_tr - U_tl) + self.bottom_weight * (U_br - U_bl)
        gy = self.left_weight * (U_bl - U_tl) + self.right_weight * (U_br - U_tr)
        gx = gx * flattened_gV
        gy = gy * flattened_gV

        gx = xp.expand_dims(gx, 1)
        gy = xp.expand_dims(gy, 1)
        gpoints = xp.hstack((gx, gy))
        
        top_left_indices = width * self.y_min + self.x_min
        top_right_indices = width * self.y_min + self.x_max
        bottom_left_indices = width * self.y_max + self.x_min
        bottom_right_indices = width * self.y_max + self.x_max
        indices = xp.hstack((top_left_indices, top_right_indices, bottom_left_indices, bottom_right_indices))

        top_left_weight = self.top_weight * self.left_weight * flattened_gV
        top_right_weight = self.top_weight * self.right_weight * flattened_gV
        bottom_left_weight = self.bottom_weight * self.left_weight * flattened_gV
        bottom_right_weight = self.bottom_weight * self.right_weight * flattened_gV
        weights = xp.hstack((top_left_weight, top_right_weight, bottom_left_weight, bottom_right_weight))

        for b in range(batch_size):
            gU[b] = xp.bincount(indices[b], weights=weights[b], minlength=height * width)

        return force_array(gU.reshape(*U.shape), dtype=gV.dtype), force_array(gpoints.reshape(*points.shape), dtype=points.dtype),

    def backward_gpu(self, inputs, gradients):
        xp = cuda.get_array_module(*inputs)
        U, points = inputs
        batch_size, height, width = U.shape

        gV, = gradients
        flattened_gV = xp.reshape(xp.ravel(gV), (batch_size, -1))

        batch_indices = xp.asarray(xp.repeat(xp.arange(batch_size, dtype=xp.int32), points.shape[-1]))
        top_left_indices = batch_indices * (width * height) + self.y_min.ravel() * width + self.x_min.ravel()
        top_right_indices = batch_indices * (width * height) + self.y_min.ravel() * width + self.x_max.ravel()
        bottom_left_indices = batch_indices * (width * height) + self.y_max.ravel() * width + self.x_min.ravel()
        bottom_right_indices = batch_indices * (width * height) + self.y_max.ravel() * width + self.x_max.ravel()

        U_tl = xp.take(U, top_left_indices).reshape((batch_size, -1))
        U_tr = xp.take(U, top_right_indices).reshape((batch_size, -1))
        U_bl = xp.take(U, bottom_left_indices).reshape((batch_size, -1))
        U_br = xp.take(U, bottom_right_indices).reshape((batch_size, -1))

        gx = self.top_weight * (U_tr - U_tl) + self.bottom_weight * (U_br - U_bl)
        gy = self.left_weight * (U_bl - U_tl) + self.right_weight * (U_br - U_tr)

        gx = xp.expand_dims(gx * flattened_gV, 1)
        gy = xp.expand_dims(gy * flattened_gV, 1)
        gpoints = xp.hstack((gx, gy))

        top_left_weight = self.top_weight * self.left_weight * flattened_gV
        top_right_weight = self.top_weight * self.right_weight * flattened_gV
        bottom_left_weight = self.bottom_weight * self.left_weight * flattened_gV
        bottom_right_weight = self.bottom_weight * self.right_weight * flattened_gV

        gU_kernel = cuda.cupy.ElementwiseKernel(
            'int32 elements_per_batch, int32 width, int32 height, T top_left_weight, T top_right_weight, T bottom_left_weight, T bottom_right_weight, int32 x_min, int32 x_max, int32 y_min, int32 y_max',
            'raw T gU',
            '''
                int batchNum = i / elements_per_batch;
                int batchOffset = batchNum * width * height;

                int index = (batchOffset + y_min * width + x_min);
                atomicAdd(&gU[index], top_left_weight);

                index = (batchOffset + y_min * width + x_max);
                atomicAdd(&gU[index], top_right_weight);
                
                index = (batchOffset + y_max * width + x_min);
                atomicAdd(&gU[index], bottom_left_weight);

                index = (batchOffset + y_max * width + x_max);
                atomicAdd(&gU[index], bottom_right_weight);
            ''',
            'compute_gU_bilinear_interpolation',
        )

        gU = xp.zeros_like(U, dtype=U.dtype)
        gU = gU_kernel(self.out_width * self.out_height, width, height, top_left_weight, top_right_weight, bottom_left_weight, bottom_right_weight, self.x_min, self.x_max, self.y_min, self.y_max, gU)

        return force_array(gU.reshape(*U.shape), dtype=U.dtype), force_array(gpoints, dtype=points.dtype)


 def bilinear_interpolation(batch, points_s, out_height, out_width):
    return BilinearInterpolation(out_height, out_width)(batch, points_s)
	from chainer import Function
	from chainer import cuda
	from chainer.utils import type_check, force_array


	def cupy_place(arr, mask, val):
	_place_kernel = cuda.cupy.ElementwiseKernel(
	'T a, S mask, T val',
	'T out',
	'out = mask ? val : a',
	'cupy_place')

	val = cuda.cupy.array(val, dtype=arr.dtype)
	out = cuda.cupy.empty_like(arr)
	return _place_kernel(arr, mask, val, out)


	def apply_with_mask(arr, mask, elementwise_func, *args):
	value = elementwise_func(arr, *args)
	return cupy_place(arr, mask, value)


	class BilinearInterpolation(Function):

	def __init__(self, out_height, out_width):
	self.out_height = out_height
	self.out_width = out_width
	self.x_max = None
	self.y_max = None
	self.x_min = None
	self.y_min = None

	def check_type_forward(self, in_types):
	type_check.expect(in_types.size() == 2)
	x_type, points_type = in_types
	type_check.expect(
	x_type.ndim == 3,
	x_type.dtype.kind == 'f',
	points_type.ndim == 3,
	points_type.dtype.kind == 'f',
	points_type.shape[-1] == self.out_width * self.out_height,
	)

	def forward_cpu(self, inputs):
	xp = cuda.get_array_module(*inputs)
	U, points = inputs
	batch_size, height, width = U.shape

	# resize grid to be in image dimensions
	points = (points + 1) / 2
	points[:, 0] *= width
	points[:, 1] *= height

	x_s = points[:, 0]
	y_s = points[:, 1]

	# determine min indices
	self.x_min = xp.clip(xp.floor(x_s * 0.999999), 0, width - 1).astype(xp.int32)
	self.y_min = xp.clip(xp.floor(y_s * 0.999999), 0, width - 1).astype(xp.int32)

	# determine max indices
	self.x_max = xp.clip((self.x_min + 1), 0, width - 1).astype(xp.int32)
	self.y_max = xp.clip((self.y_min + 1), 0, height - 1).astype(xp.int32)

	# remove points outside of image
	x_invalid = xp.logical_or(x_s < 0, (width - 1) < x_s)
	y_invalid = xp.logical_or(y_s < 0, (height - 1) < y_s)
	self.invalid = xp.logical_or(x_invalid, y_invalid)

	self.left_weight = 1.0 - (x_s - self.x_min).astype(U.dtype)
	self.top_weight = 1.0 - (y_s - self.y_min).astype(U.dtype)
	self.right_weight = 1.0 - self.left_weight.astype(U.dtype)
	self.bottom_weight = 1.0 - self.top_weight.astype(U.dtype)

	self.left_weight[self.invalid] = 0
	self.top_weight[self.invalid] = 0
	self.right_weight[self.invalid] = 0
	self.bottom_weight[self.invalid] = 0

	batch_axis = xp.expand_dims(xp.arange(batch_size), 1)
	top_interpolated = self.left_weight * U[batch_axis, self.y_min, self.x_min] + self.right_weight * U[batch_axis, self.y_min, self.x_max]
	bottom_interpolated = self.left_weight * U[batch_axis, self.y_max, self.x_min] + self.right_weight * U[batch_axis, self.y_max, self.x_max]

	V = self.top_weight * top_interpolated + self.bottom_weight * bottom_interpolated
	V = V.reshape((batch_size, self.out_height, self.out_width))

	return force_array(V, dtype=U.dtype),

	def forward_gpu(self, inputs):
	xp = cuda.get_array_module(*inputs)
	U, points = inputs
	batch_size, height, width = U.shape

	# resize grid to be in image dimensions
	points = (points + 1) / 2
	points[:, 0] *= width
	points[:, 1] *= height

	x_s = points[:, 0]
	y_s = points[:, 1]

	# move points on boundary a little bit inward to ensure correct computation
	# apply masking twice in order to avoid race conditions
	x_s_copy = x_s.copy()
	y_s_copy = y_s.copy()

	on_boundary = (x_s == 0)
	replacement = apply_with_mask(x_s_copy, on_boundary, xp.nextafter, xp.array(1.0).astype(x_s.dtype))
	x_s = cupy_place(x_s_copy, on_boundary, replacement)

	on_boundary = (x_s == width - 1)
	replacement = apply_with_mask(x_s_copy, on_boundary, xp.nextafter, xp.array(0.0).astype(x_s.dtype))
	x_s = cupy_place(x_s_copy, on_boundary, replacement)

	on_boundary = (y_s == 0)
	replacement = apply_with_mask(y_s_copy, on_boundary, xp.nextafter, xp.array(1.0).astype(y_s.dtype))
	y_s = cupy_place(y_s_copy, on_boundary, replacement)

	on_boundary = (y_s == width - 1)
	replacement = apply_with_mask(y_s_copy, on_boundary, xp.nextafter, xp.array(0.0).astype(y_s.dtype))
	y_s = cupy_place(y_s_copy, on_boundary, replacement)

	# determine min indices
	self.x_min = xp.clip(xp.floor(x_s), 0, width - 1).astype(xp.int32)
	self.y_min = xp.clip(xp.floor(y_s), 0, width - 1).astype(xp.int32)

	# determine max indices
	self.x_max = xp.clip((self.x_min + 1), 0, width - 1).astype(xp.int32)
	self.y_max = xp.clip((self.y_min + 1), 0, height - 1).astype(xp.int32)

	# remove points outside of image
	x_invalid = xp.logical_or(x_s < 0, (width - 1) < x_s)
	y_invalid = xp.logical_or(y_s < 0, (height - 1) < y_s)
	self.invalid = xp.logical_or(x_invalid, y_invalid)

	self.left_weight = 1.0 - (x_s - self.x_min).astype(U.dtype)
	self.top_weight = 1.0 - (y_s - self.y_min).astype(U.dtype)
	self.right_weight = 1.0 - self.left_weight.astype(U.dtype)
	self.bottom_weight = 1.0 - self.top_weight.astype(U.dtype)

	replacement = xp.zeros_like(self.invalid, dtype=U.dtype)
	self.left_weight = cupy_place(self.left_weight.copy(), self.invalid, replacement.copy())
	self.top_weight = cupy_place(self.top_weight.copy(), self.invalid, replacement.copy())
	self.right_weight = cupy_place(self.right_weight.copy(), self.invalid, replacement.copy())
	self.bottom_weight = cupy_place(self.bottom_weight.copy(), self.invalid, replacement.copy())

	interpolate_kernel = cuda.cupy.ElementwiseKernel(
	'raw T U, T left, T right, T top, T bottom, int32 x_min, int32 y_min, int32 x_max, int32 y_max, int32 pixels_per_batch, int32 width, int32 height',
	'T V',
	'''
	int batchNum = i / pixels_per_batch;
	int batchOffset = batchNum * width * height;

	T top_left = left * U[batchOffset + y_min * width + x_min];
	T top_right = right * U[batchOffset + y_min * width + x_max];
	T bottom_left = left * U[batchOffset + y_max * width + x_min];
	T bottom_right = right * U[batchOffset + y_max * width + x_max];

	V = top * (top_left + top_right) + bottom * (bottom_left + bottom_right);
	''',
	'interpolation_kernel'
	)

	V = interpolate_kernel(
	xp.ravel(U),
	self.left_weight,
	self.right_weight,
	self.top_weight,
	self.bottom_weight,
	self.x_min,
	self.y_min,
	self.x_max,
	self.y_max,
	self.out_height * self.out_width,
	width,
	height,
	)

	V = V.reshape((batch_size, self.out_height, self.out_width))
	return force_array(V, dtype=U.dtype),

	def backward_cpu(self, inputs, gradients):
	xp = cuda.get_array_module(*inputs)
	U, points = inputs
	gV = gradients[0]

	batch_size, height, width = U.shape
	flattened_gV = xp.reshape(xp.ravel(gV), (batch_size, -1))

	gU = xp.zeros((batch_size, height * width), dtype=U.dtype)

	batch_axis = xp.expand_dims(xp.arange(batch_size), 1)

	U_tl = U[batch_axis, self.y_min, self.x_min]
	U_tr = U[batch_axis, self.y_min, self.x_max]
	U_bl = U[batch_axis, self.y_max, self.x_min]
	U_br = U[batch_axis, self.y_max, self.x_max]

	gx = self.top_weight * (U_tr - U_tl) + self.bottom_weight * (U_br - U_bl)
	gy = self.left_weight * (U_bl - U_tl) + self.right_weight * (U_br - U_tr)
	gx = gx * flattened_gV
	gy = gy * flattened_gV

	gx = xp.expand_dims(gx, 1)
	gy = xp.expand_dims(gy, 1)
	gpoints = xp.hstack((gx, gy))

	top_left_indices = width * self.y_min + self.x_min
	top_right_indices = width * self.y_min + self.x_max
	bottom_left_indices = width * self.y_max + self.x_min
	bottom_right_indices = width * self.y_max + self.x_max
	indices = xp.hstack((top_left_indices, top_right_indices, bottom_left_indices, bottom_right_indices))

	top_left_weight = self.top_weight * self.left_weight * flattened_gV
	top_right_weight = self.top_weight * self.right_weight * flattened_gV
	bottom_left_weight = self.bottom_weight * self.left_weight * flattened_gV
	bottom_right_weight = self.bottom_weight * self.right_weight * flattened_gV
	weights = xp.hstack((top_left_weight, top_right_weight, bottom_left_weight, bottom_right_weight))

	for b in range(batch_size):
	gU[b] = xp.bincount(indices[b], weights=weights[b], minlength=height * width)

	return force_array(gU.reshape(U.shape), dtype=gV.dtype), force_array(gpoints.reshape(points.shape), dtype=points.dtype),

	def backward_gpu(self, inputs, gradients):
	xp = cuda.get_array_module(*inputs)
	U, points = inputs
	batch_size, height, width = U.shape

	gV, = gradients
	flattened_gV = xp.reshape(xp.ravel(gV), (batch_size, -1))

	batch_indices = xp.asarray(xp.repeat(xp.arange(batch_size, dtype=xp.int32), points.shape[-1]))
	top_left_indices = batch_indices * (width * height) + self.y_min.ravel() * width + self.x_min.ravel()
	top_right_indices = batch_indices * (width * height) + self.y_min.ravel() * width + self.x_max.ravel()
	bottom_left_indices = batch_indices * (width * height) + self.y_max.ravel() * width + self.x_min.ravel()
	bottom_right_indices = batch_indices * (width * height) + self.y_max.ravel() * width + self.x_max.ravel()

	U_tl = xp.take(U, top_left_indices).reshape((batch_size, -1))
	U_tr = xp.take(U, top_right_indices).reshape((batch_size, -1))
	U_bl = xp.take(U, bottom_left_indices).reshape((batch_size, -1))
	U_br = xp.take(U, bottom_right_indices).reshape((batch_size, -1))

	gx = self.top_weight * (U_tr - U_tl) + self.bottom_weight * (U_br - U_bl)
	gy = self.left_weight * (U_bl - U_tl) + self.right_weight * (U_br - U_tr)

	gx = xp.expand_dims(gx * flattened_gV, 1)
	gy = xp.expand_dims(gy * flattened_gV, 1)
	gpoints = xp.hstack((gx, gy))

	top_left_weight = self.top_weight * self.left_weight * flattened_gV
	top_right_weight = self.top_weight * self.right_weight * flattened_gV
	bottom_left_weight = self.bottom_weight * self.left_weight * flattened_gV
	bottom_right_weight = self.bottom_weight * self.right_weight * flattened_gV

	gU_kernel = cuda.cupy.ElementwiseKernel(
	'int32 elements_per_batch, int32 width, int32 height, T top_left_weight, T top_right_weight, T bottom_left_weight, T bottom_right_weight, int32 x_min, int32 x_max, int32 y_min, int32 y_max',
	'raw T gU',
	'''
	int batchNum = i / elements_per_batch;
	int batchOffset = batchNum * width * height;

	int index = (batchOffset + y_min * width + x_min);
	atomicAdd(&gU[index], top_left_weight);

	index = (batchOffset + y_min * width + x_max);
	atomicAdd(&gU[index], top_right_weight);

	index = (batchOffset + y_max * width + x_min);
	atomicAdd(&gU[index], bottom_left_weight);

	index = (batchOffset + y_max * width + x_max);
	atomicAdd(&gU[index], bottom_right_weight);
	''',
	'compute_gU_bilinear_interpolation',
	)

	gU = xp.zeros_like(U, dtype=U.dtype)
	gU = gU_kernel(self.out_width * self.out_height, width, height, top_left_weight, top_right_weight, bottom_left_weight, bottom_right_weight, self.x_min, self.x_max, self.y_min, self.y_max, gU)

	return force_array(gU.reshape(*U.shape), dtype=U.dtype), force_array(gpoints, dtype=points.dtype)


	def bilinear_interpolation(batch, points_s, out_height, out_width):
	return BilinearInterpolation(out_height, out_width)(batch, points_s)
No results found