Created
June 12, 2017 08:57
-
-
Save Bartzi/d21426edfca1d2ce70d91fc918d41eea to your computer and use it in GitHub Desktop.
Revisions
-
Bartzi created this gist
Jun 12, 2017 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,285 @@ from chainer import Function from chainer import cuda from chainer.utils import type_check, force_array def cupy_place(arr, mask, val): _place_kernel = cuda.cupy.ElementwiseKernel( 'T a, S mask, T val', 'T out', 'out = mask ? val : a', 'cupy_place') val = cuda.cupy.array(val, dtype=arr.dtype) out = cuda.cupy.empty_like(arr) return _place_kernel(arr, mask, val, out) def apply_with_mask(arr, mask, elementwise_func, *args): value = elementwise_func(arr, *args) return cupy_place(arr, mask, value) class BilinearInterpolation(Function): def __init__(self, out_height, out_width): self.out_height = out_height self.out_width = out_width self.x_max = None self.y_max = None self.x_min = None self.y_min = None def check_type_forward(self, in_types): type_check.expect(in_types.size() == 2) x_type, points_type = in_types type_check.expect( x_type.ndim == 3, x_type.dtype.kind == 'f', points_type.ndim == 3, points_type.dtype.kind == 'f', points_type.shape[-1] == self.out_width * self.out_height, ) def forward_cpu(self, inputs): xp = cuda.get_array_module(*inputs) U, points = inputs batch_size, height, width = U.shape # resize grid to be in image dimensions points = (points + 1) / 2 points[:, 0] *= width points[:, 1] *= height x_s = points[:, 0] y_s = points[:, 1] # determine min indices self.x_min = xp.clip(xp.floor(x_s * 0.999999), 0, width - 1).astype(xp.int32) self.y_min = xp.clip(xp.floor(y_s * 0.999999), 0, width - 1).astype(xp.int32) # determine max indices self.x_max = xp.clip((self.x_min + 1), 0, width - 1).astype(xp.int32) self.y_max = xp.clip((self.y_min + 1), 0, height - 1).astype(xp.int32) # remove points outside of image x_invalid = xp.logical_or(x_s < 0, (width - 1) < x_s) y_invalid = xp.logical_or(y_s < 0, (height - 1) < y_s) self.invalid = xp.logical_or(x_invalid, y_invalid) self.left_weight = 1.0 - (x_s - self.x_min).astype(U.dtype) self.top_weight = 1.0 - (y_s - self.y_min).astype(U.dtype) self.right_weight = 1.0 - self.left_weight.astype(U.dtype) self.bottom_weight = 1.0 - self.top_weight.astype(U.dtype) self.left_weight[self.invalid] = 0 self.top_weight[self.invalid] = 0 self.right_weight[self.invalid] = 0 self.bottom_weight[self.invalid] = 0 batch_axis = xp.expand_dims(xp.arange(batch_size), 1) top_interpolated = self.left_weight * U[batch_axis, self.y_min, self.x_min] + self.right_weight * U[batch_axis, self.y_min, self.x_max] bottom_interpolated = self.left_weight * U[batch_axis, self.y_max, self.x_min] + self.right_weight * U[batch_axis, self.y_max, self.x_max] V = self.top_weight * top_interpolated + self.bottom_weight * bottom_interpolated V = V.reshape((batch_size, self.out_height, self.out_width)) return force_array(V, dtype=U.dtype), def forward_gpu(self, inputs): xp = cuda.get_array_module(*inputs) U, points = inputs batch_size, height, width = U.shape # resize grid to be in image dimensions points = (points + 1) / 2 points[:, 0] *= width points[:, 1] *= height x_s = points[:, 0] y_s = points[:, 1] # move points on boundary a little bit inward to ensure correct computation # apply masking twice in order to avoid race conditions x_s_copy = x_s.copy() y_s_copy = y_s.copy() on_boundary = (x_s == 0) replacement = apply_with_mask(x_s_copy, on_boundary, xp.nextafter, xp.array(1.0).astype(x_s.dtype)) x_s = cupy_place(x_s_copy, on_boundary, replacement) on_boundary = (x_s == width - 1) replacement = apply_with_mask(x_s_copy, on_boundary, xp.nextafter, xp.array(0.0).astype(x_s.dtype)) x_s = cupy_place(x_s_copy, on_boundary, replacement) on_boundary = (y_s == 0) replacement = apply_with_mask(y_s_copy, on_boundary, xp.nextafter, xp.array(1.0).astype(y_s.dtype)) y_s = cupy_place(y_s_copy, on_boundary, replacement) on_boundary = (y_s == width - 1) replacement = apply_with_mask(y_s_copy, on_boundary, xp.nextafter, xp.array(0.0).astype(y_s.dtype)) y_s = cupy_place(y_s_copy, on_boundary, replacement) # determine min indices self.x_min = xp.clip(xp.floor(x_s), 0, width - 1).astype(xp.int32) self.y_min = xp.clip(xp.floor(y_s), 0, width - 1).astype(xp.int32) # determine max indices self.x_max = xp.clip((self.x_min + 1), 0, width - 1).astype(xp.int32) self.y_max = xp.clip((self.y_min + 1), 0, height - 1).astype(xp.int32) # remove points outside of image x_invalid = xp.logical_or(x_s < 0, (width - 1) < x_s) y_invalid = xp.logical_or(y_s < 0, (height - 1) < y_s) self.invalid = xp.logical_or(x_invalid, y_invalid) self.left_weight = 1.0 - (x_s - self.x_min).astype(U.dtype) self.top_weight = 1.0 - (y_s - self.y_min).astype(U.dtype) self.right_weight = 1.0 - self.left_weight.astype(U.dtype) self.bottom_weight = 1.0 - self.top_weight.astype(U.dtype) replacement = xp.zeros_like(self.invalid, dtype=U.dtype) self.left_weight = cupy_place(self.left_weight.copy(), self.invalid, replacement.copy()) self.top_weight = cupy_place(self.top_weight.copy(), self.invalid, replacement.copy()) self.right_weight = cupy_place(self.right_weight.copy(), self.invalid, replacement.copy()) self.bottom_weight = cupy_place(self.bottom_weight.copy(), self.invalid, replacement.copy()) interpolate_kernel = cuda.cupy.ElementwiseKernel( 'raw T U, T left, T right, T top, T bottom, int32 x_min, int32 y_min, int32 x_max, int32 y_max, int32 pixels_per_batch, int32 width, int32 height', 'T V', ''' int batchNum = i / pixels_per_batch; int batchOffset = batchNum * width * height; T top_left = left * U[batchOffset + y_min * width + x_min]; T top_right = right * U[batchOffset + y_min * width + x_max]; T bottom_left = left * U[batchOffset + y_max * width + x_min]; T bottom_right = right * U[batchOffset + y_max * width + x_max]; V = top * (top_left + top_right) + bottom * (bottom_left + bottom_right); ''', 'interpolation_kernel' ) V = interpolate_kernel( xp.ravel(U), self.left_weight, self.right_weight, self.top_weight, self.bottom_weight, self.x_min, self.y_min, self.x_max, self.y_max, self.out_height * self.out_width, width, height, ) V = V.reshape((batch_size, self.out_height, self.out_width)) return force_array(V, dtype=U.dtype), def backward_cpu(self, inputs, gradients): xp = cuda.get_array_module(*inputs) U, points = inputs gV = gradients[0] batch_size, height, width = U.shape flattened_gV = xp.reshape(xp.ravel(gV), (batch_size, -1)) gU = xp.zeros((batch_size, height * width), dtype=U.dtype) batch_axis = xp.expand_dims(xp.arange(batch_size), 1) U_tl = U[batch_axis, self.y_min, self.x_min] U_tr = U[batch_axis, self.y_min, self.x_max] U_bl = U[batch_axis, self.y_max, self.x_min] U_br = U[batch_axis, self.y_max, self.x_max] gx = self.top_weight * (U_tr - U_tl) + self.bottom_weight * (U_br - U_bl) gy = self.left_weight * (U_bl - U_tl) + self.right_weight * (U_br - U_tr) gx = gx * flattened_gV gy = gy * flattened_gV gx = xp.expand_dims(gx, 1) gy = xp.expand_dims(gy, 1) gpoints = xp.hstack((gx, gy)) top_left_indices = width * self.y_min + self.x_min top_right_indices = width * self.y_min + self.x_max bottom_left_indices = width * self.y_max + self.x_min bottom_right_indices = width * self.y_max + self.x_max indices = xp.hstack((top_left_indices, top_right_indices, bottom_left_indices, bottom_right_indices)) top_left_weight = self.top_weight * self.left_weight * flattened_gV top_right_weight = self.top_weight * self.right_weight * flattened_gV bottom_left_weight = self.bottom_weight * self.left_weight * flattened_gV bottom_right_weight = self.bottom_weight * self.right_weight * flattened_gV weights = xp.hstack((top_left_weight, top_right_weight, bottom_left_weight, bottom_right_weight)) for b in range(batch_size): gU[b] = xp.bincount(indices[b], weights=weights[b], minlength=height * width) return force_array(gU.reshape(*U.shape), dtype=gV.dtype), force_array(gpoints.reshape(*points.shape), dtype=points.dtype), def backward_gpu(self, inputs, gradients): xp = cuda.get_array_module(*inputs) U, points = inputs batch_size, height, width = U.shape gV, = gradients flattened_gV = xp.reshape(xp.ravel(gV), (batch_size, -1)) batch_indices = xp.asarray(xp.repeat(xp.arange(batch_size, dtype=xp.int32), points.shape[-1])) top_left_indices = batch_indices * (width * height) + self.y_min.ravel() * width + self.x_min.ravel() top_right_indices = batch_indices * (width * height) + self.y_min.ravel() * width + self.x_max.ravel() bottom_left_indices = batch_indices * (width * height) + self.y_max.ravel() * width + self.x_min.ravel() bottom_right_indices = batch_indices * (width * height) + self.y_max.ravel() * width + self.x_max.ravel() U_tl = xp.take(U, top_left_indices).reshape((batch_size, -1)) U_tr = xp.take(U, top_right_indices).reshape((batch_size, -1)) U_bl = xp.take(U, bottom_left_indices).reshape((batch_size, -1)) U_br = xp.take(U, bottom_right_indices).reshape((batch_size, -1)) gx = self.top_weight * (U_tr - U_tl) + self.bottom_weight * (U_br - U_bl) gy = self.left_weight * (U_bl - U_tl) + self.right_weight * (U_br - U_tr) gx = xp.expand_dims(gx * flattened_gV, 1) gy = xp.expand_dims(gy * flattened_gV, 1) gpoints = xp.hstack((gx, gy)) top_left_weight = self.top_weight * self.left_weight * flattened_gV top_right_weight = self.top_weight * self.right_weight * flattened_gV bottom_left_weight = self.bottom_weight * self.left_weight * flattened_gV bottom_right_weight = self.bottom_weight * self.right_weight * flattened_gV gU_kernel = cuda.cupy.ElementwiseKernel( 'int32 elements_per_batch, int32 width, int32 height, T top_left_weight, T top_right_weight, T bottom_left_weight, T bottom_right_weight, int32 x_min, int32 x_max, int32 y_min, int32 y_max', 'raw T gU', ''' int batchNum = i / elements_per_batch; int batchOffset = batchNum * width * height; int index = (batchOffset + y_min * width + x_min); atomicAdd(&gU[index], top_left_weight); index = (batchOffset + y_min * width + x_max); atomicAdd(&gU[index], top_right_weight); index = (batchOffset + y_max * width + x_min); atomicAdd(&gU[index], bottom_left_weight); index = (batchOffset + y_max * width + x_max); atomicAdd(&gU[index], bottom_right_weight); ''', 'compute_gU_bilinear_interpolation', ) gU = xp.zeros_like(U, dtype=U.dtype) gU = gU_kernel(self.out_width * self.out_height, width, height, top_left_weight, top_right_weight, bottom_left_weight, bottom_right_weight, self.x_min, self.x_max, self.y_min, self.y_max, gU) return force_array(gU.reshape(*U.shape), dtype=U.dtype), force_array(gpoints, dtype=points.dtype) def bilinear_interpolation(batch, points_s, out_height, out_width): return BilinearInterpolation(out_height, out_width)(batch, points_s)