From eca80f5d45ed9d3f9a3a165a8503d83767f6357e Mon Sep 17 00:00:00 2001 From: Sean Morgan Date: Sun, 13 Jan 2019 20:14:54 -0500 Subject: [PATCH 01/22] Update BUILD --- change.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 change.sh diff --git a/change.sh b/change.sh new file mode 100644 index 0000000000..dcd166fd73 --- /dev/null +++ b/change.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +git filter-branch -f --env-filter ' + +OLD_EMAIL="sunjiahe@sss-2.local" +CORRECT_NAME="public" +CORRECT_EMAIL="975759105@qq.com" + +if [ "$GIT_COMMITTER_EMAIL" = "$OLD_EMAIL" ] +then + export GIT_COMMITTER_NAME="$CORRECT_NAME" + export GIT_COMMITTER_EMAIL="$CORRECT_EMAIL" +fi +if [ "$GIT_AUTHOR_EMAIL" = "$OLD_EMAIL" ] +then + export GIT_AUTHOR_NAME="$CORRECT_NAME" + export GIT_AUTHOR_EMAIL="$CORRECT_EMAIL" +fi +' -- --all \ No newline at end of file From 350d26ca68c36921373d1ab57cf192a16a94b2a3 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 08:41:34 -0500 Subject: [PATCH 02/22] Add DeformableConv2D and DeformablePSROIAlign operator with cpu and gpu --- docs/tutorials/deformable_conv2d_ops.ipynb | 142 ++ tensorflow_addons/custom_ops/layers/BUILD | 18 + .../cc/kernels/deformable_conv2d_utils.h | 250 +++ .../layers/cc/kernels/deformable_conv_op.cc | 1517 +++++++++++++++++ .../layers/cc/kernels/deformable_conv_op.h | 317 ++++ .../cc/kernels/deformable_conv_op_gpu.cu.cc | 995 +++++++++++ .../layers/cc/ops/deformable_conv2d.cc | 287 ++++ tensorflow_addons/layers/BUILD | 8 + tensorflow_addons/layers/__init__.py | 3 + tensorflow_addons/layers/deformable_conv2d.py | 414 +++++ .../layers/deformable_conv2d_test.py | 279 +++ 11 files changed, 4230 insertions(+) create mode 100644 docs/tutorials/deformable_conv2d_ops.ipynb create mode 100755 tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv2d_utils.h create mode 100644 tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc create mode 100755 tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h create mode 100755 tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc create mode 100644 tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc create mode 100644 tensorflow_addons/layers/deformable_conv2d.py create mode 100644 tensorflow_addons/layers/deformable_conv2d_test.py diff --git a/docs/tutorials/deformable_conv2d_ops.ipynb b/docs/tutorials/deformable_conv2d_ops.ipynb new file mode 100644 index 0000000000..7509feba66 --- /dev/null +++ b/docs/tutorials/deformable_conv2d_ops.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Setup\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "try:\n", + " %tensorflow_version 2.x\n", + "except:\n", + " pass\n", + "\n", + "import tensorflow as tf\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "!pip install -q --no-deps tensorflow-addons~=0.9" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import numpy as np\n", + "import tensorflow_addons as tfa\n", + "import matplotlib.pyplot as plt\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Usage 
example\n", + "\n", + "(DeformableConv2D is just like normal tf.keras.layers.Conv2D)\n", + "\n", + "(DeformablePSROIAlign is something like tf.image.crop_to_bounding_box)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "filters = 64\n", + "batch_size = 4\n", + "channels = 3\n", + "kernel_size = (3, 3)\n", + "padding = \"same\"\n", + "featuremap = tf.random.uniform(shape=[batch_size, 20, 20, channels], dtype=tf.float32)\n", + "\n", + "deformable_layer = tfa.layers.DeformableConv2D(filters, kernel_size, padding=padding)\n", + "result = deformable_layer(featuremap)\n", + "\n", + "image_featuremap = tf.random.normal(shape=[2, 64, 100, 100])\n", + "rois = tf.convert_to_tensor([[0, 1, 1, 800, 800], [1, 2, 2, 400, 400]], dtype=tf.float32)\n", + "out_dim = 64\n", + "spatial_scale = 1 / 16\n", + "group_size = 1\n", + "pooled_size = 7\n", + "sample_per_part = 4\n", + "part_size = 7\n", + "trans_std = 1\n", + "data_format = \"channels_first\"\n", + "psroi_align_layer = tfa.layers.DeformablePSROIAlign(out_dim, spatial_scale, group_size,\n", + " pooled_size, sample_per_part, part_size,\n", + " trans_std, data_format)\n", + "features = psroi_align_layer([image_featuremap, rois])" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/tensorflow_addons/custom_ops/layers/BUILD b/tensorflow_addons/custom_ops/layers/BUILD index 5ff49efe99..2a59ba9e19 100644 --- a/tensorflow_addons/custom_ops/layers/BUILD +++ b/tensorflow_addons/custom_ops/layers/BUILD @@ -19,3 +19,21 @@ custom_op_library( "cc/kernels/correlation_cost_op_gpu.cu.cc", ], ) + +custom_op_library( + name = "_deformable_conv2d_ops.so", + srcs = [ + "cc/kernels/deformable_conv2d_utils.h", + "cc/kernels/deformable_conv_op.cc", + "cc/kernels/deformable_conv_op.h", + "cc/ops/deformable_conv2d.cc", + ], + cuda_deps = [ + "@cub_archive//:cub", + ], + cuda_srcs = [ + "cc/kernels/deformable_conv2d_utils.h", + "cc/kernels/deformable_conv_op.h", + "cc/kernels/deformable_conv_op_gpu.cu.cc", + ], +) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv2d_utils.h b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv2d_utils.h new file mode 100755 index 0000000000..51943105bb --- /dev/null +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv2d_utils.h @@ -0,0 +1,250 @@ + +#ifndef TF_OPS_DEFORMABLE_CONV2D_UTILS_H +#define TF_OPS_DEFORMABLE_CONV2D_UTILS_H + +#include + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool" + +namespace tensorflow { +namespace addons { + +namespace functor { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +using namespace 
tensorflow::shape_inference; +Status CheckFormatConstraintsOnShape(const TensorFormat tensor_format, + const ShapeHandle shape_handle, + const string &tensor_name, + InferenceContext *ctx) { + if (tensor_format == FORMAT_NCHW_VECT_C) { + const int num_dims = ctx->Rank(shape_handle); + DimensionHandle vect_dim = ctx->Dim( + shape_handle, GetTensorInnerFeatureDimIndex(num_dims, tensor_format)); + DimensionHandle unused_vect_dim; + TF_RETURN_IF_ERROR(ctx->WithValue(vect_dim, 4, &unused_vect_dim)); + } + return Status::OK(); +} +Status DimensionsFromShape(ShapeHandle shape, TensorFormat format, + DimensionHandle *batch_dim, + gtl::MutableArraySlice spatial_dims, + DimensionHandle *filter_dim, InferenceContext *ctx) { + const int32 rank = GetTensorDimsFromSpatialDims(spatial_dims.size(), format); + *batch_dim = ctx->Dim(shape, GetTensorBatchDimIndex(rank, format)); + for (int spatial_dim_index = 0; spatial_dim_index < spatial_dims.size(); + ++spatial_dim_index) { + spatial_dims[spatial_dim_index] = ctx->Dim( + shape, GetTensorSpatialDimIndex(rank, format, spatial_dim_index)); + } + *filter_dim = ctx->Dim(shape, GetTensorFeatureDimIndex(rank, format)); + if (format == FORMAT_NCHW_VECT_C) { + TF_RETURN_IF_ERROR(ctx->Multiply( + *filter_dim, + ctx->Dim(shape, GetTensorInnerFeatureDimIndex(rank, format)), + filter_dim)); + } + return Status::OK(); +} +Status ShapeFromDimensions(DimensionHandle batch_dim, + gtl::ArraySlice spatial_dims, + DimensionHandle filter_dim, TensorFormat format, + InferenceContext *ctx, ShapeHandle *shape) { + const int32 rank = GetTensorDimsFromSpatialDims(spatial_dims.size(), format); + std::vector out_dims(rank); + out_dims[GetTensorBatchDimIndex(rank, format)] = batch_dim; + for (int spatial_dim_index = 0; spatial_dim_index < spatial_dims.size(); + ++spatial_dim_index) { + out_dims[GetTensorSpatialDimIndex(rank, format, spatial_dim_index)] = + spatial_dims[spatial_dim_index]; + } + if (format == FORMAT_NCHW_VECT_C) { + TF_RETURN_IF_ERROR( + ctx->Divide(filter_dim, 4, true, + &out_dims[GetTensorFeatureDimIndex(rank, format)])); + out_dims[GetTensorInnerFeatureDimIndex(rank, format)] = ctx->MakeDim(4); + } else { + out_dims[GetTensorFeatureDimIndex(rank, format)] = filter_dim; + } + *shape = ctx->MakeShape(out_dims); + return Status::OK(); +} +template +EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool FastBoundsCheck(const Ta index, + const Tb limit) { + static_assert(std::is_integral::value && std::is_integral::value, + "FastBoundsCheck can only be used on integer types."); + typedef typename std::make_unsigned::type UIndex; + return TF_PREDICT_TRUE(static_cast(index) < + static_cast(limit)); +} + +#define TF_REQUIRES(EXP, STATUS) \ + do { \ + if (!TF_PREDICT_TRUE(EXP)) return (STATUS); \ + } while (false) + +Status InitDeformableConv2DParameters(const OpKernelConstruction *ctx, + DeformableConv2DParameters *params) { + TF_RETURN_IF_ERROR(ctx->GetAttr("dilations", ¶ms->dilations)); + TF_RETURN_IF_ERROR(ctx->GetAttr("strides", ¶ms->strides)); + TF_RETURN_IF_ERROR(ctx->GetAttr("padding", ¶ms->padding)); + string data_format_string; + TF_RETURN_IF_ERROR(ctx->GetAttr("data_format", &data_format_string)); + TF_RETURN_IF_ERROR(ctx->GetAttr("num_groups", ¶ms->num_groups)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("deformable_groups", ¶ms->deformable_groups)); + TF_RETURN_IF_ERROR(ctx->GetAttr("im2col_step", ¶ms->im2col_step)); + TF_RETURN_IF_ERROR(ctx->GetAttr("no_bias", ¶ms->no_bias)); + TF_REQUIRES(FormatFromString(data_format_string, ¶ms->data_format), + 
errors::InvalidArgument("Invalid data format")); + const auto &strides = params->strides; + const auto &dilations = params->dilations; + const auto &data_format = params->data_format; + TF_REQUIRES(dilations.size() == 4, + errors::InvalidArgument("Sliding window dilations field must " + "specify 4 dimensions")); + TF_REQUIRES(strides.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + const int64 stride_n = GetTensorDim(strides, data_format, 'N'); + const int64 stride_c = GetTensorDim(strides, data_format, 'C'); + const int64 stride_h = GetTensorDim(strides, data_format, 'H'); + const int64 stride_w = GetTensorDim(strides, data_format, 'W'); + TF_REQUIRES( + stride_n == 1 && stride_c == 1, + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + TF_REQUIRES(stride_h > 0 && stride_w > 0, + errors::InvalidArgument( + "Row and column strides should be larger than 0.")); + + const int64 dilation_n = GetTensorDim(dilations, data_format, 'N'); + const int64 dilation_c = GetTensorDim(dilations, data_format, 'C'); + const int64 dilation_h = GetTensorDim(dilations, data_format, 'H'); + const int64 dilation_w = GetTensorDim(dilations, data_format, 'W'); + TF_REQUIRES( + dilation_n == 1 && dilation_c == 1, + errors::InvalidArgument("Current implementation does not yet support " + "dilations in the batch and depth dimensions.")); + TF_REQUIRES( + dilation_h > 0 && dilation_w > 0, + errors::InvalidArgument("Dilated rates should be larger than 0.")); + + return Status::OK(); +} +Status ComputeDeformableConv2DDimension( + const DeformableConv2DParameters ¶ms, const Tensor &input, + const Tensor &filter, DeformableConv2DDimensions *dimensions, int flag) { + TF_REQUIRES(input.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + input.shape().DebugString())); + TF_REQUIRES(filter.dims() == 4, + errors::InvalidArgument("filter must be 4-dimensional: ", + filter.shape().DebugString())); + for (int i = 3; i > 0; i--) { + TF_REQUIRES( + FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); + } + const int64 in_depth_raw = GetTensorDim(input, params.data_format, 'C'); + const int64 patch_depth_raw = filter.dim_size(1); + TF_REQUIRES(FastBoundsCheck(in_depth_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input depth too large")); + TF_REQUIRES(FastBoundsCheck(patch_depth_raw, std::numeric_limits::max()), + errors::InvalidArgument("Patch depth too large")); + const int in_depth = static_cast(in_depth_raw); + const int patch_depth = static_cast(patch_depth_raw); + TF_REQUIRES(in_depth % patch_depth == 0, + errors::InvalidArgument( + "input depth must be evenly divisible by filter depth: ", + in_depth, " vs ", patch_depth, " flag: ", flag)); + + // The first dimension for filter is out_depth. 
+ const int out_depth = static_cast(filter.dim_size(0)); + const int64 input_rows_raw = GetTensorDim(input, params.data_format, 'H'); + TF_REQUIRES(FastBoundsCheck(input_rows_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); + const int input_rows = static_cast(input_rows_raw); + const int filter_rows = static_cast(filter.dim_size(2)); + const int64 input_cols_raw = GetTensorDim(input, params.data_format, 'W'); + TF_REQUIRES(FastBoundsCheck(input_cols_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); + const int input_cols = static_cast(input_cols_raw); + const int filter_cols = static_cast(filter.dim_size(3)); + const int64 batch_raw = GetTensorDim(input, params.data_format, 'N'); + TF_REQUIRES(FastBoundsCheck(batch_raw, std::numeric_limits::max()), + errors::InvalidArgument("batch is too large")); + const int batch = static_cast(batch_raw); + const int stride_rows = GetTensorDim(params.strides, params.data_format, 'H'); + const int stride_cols = GetTensorDim(params.strides, params.data_format, 'W'); + const int dilation_rows = + GetTensorDim(params.dilations, params.data_format, 'H'); + const int dilation_cols = + GetTensorDim(params.dilations, params.data_format, 'W'); + + // Compute windowed output sizes for rows and columns. + int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; + TF_RETURN_IF_ERROR(GetWindowedOutputSizeV2( + input_rows, filter_rows, dilation_rows, stride_rows, params.padding, + &out_rows, &pad_rows)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeV2( + input_cols, filter_cols, dilation_cols, stride_cols, params.padding, + &out_cols, &pad_cols)); + + dimensions->batch = batch; + dimensions->input_rows = input_rows; + dimensions->input_cols = input_cols; + dimensions->in_depth = in_depth; + dimensions->filter_rows = filter_rows; + dimensions->filter_cols = filter_cols; + dimensions->patch_depth = patch_depth; + dimensions->out_depth = out_depth; + dimensions->stride_rows = stride_rows; + dimensions->stride_cols = stride_cols; + dimensions->dilation_rows = dilation_rows; + dimensions->dilation_cols = dilation_cols; + dimensions->out_rows = out_rows; + dimensions->out_cols = out_cols; + dimensions->pad_rows = pad_rows; + dimensions->pad_cols = pad_cols; + return Status::OK(); +} + +inline TShape ToVector(const TShape &shape) { return shape; } + +inline std::vector ToVector(const TensorShape &shape) { + std::vector res; + for (int i = 0; i < shape.dims(); ++i) { + res.push_back(shape.dim_size(i)); + } + return res; +} + +inline std::vector SubVector(const TensorShape &shape, int start, + int end) { + std::vector res; + for (int i = start; i < end; i++) { + res.push_back(shape.dim_size(i)); + } + return res; +} + +inline TShape SubVector(const TShape &shape, int start, int end) { + TShape res; + for (int i = start; i < end; i++) { + res.push_back(shape[i]); + } + return res; +} + +} // namespace functor +} // namespace addons +} // namespace tensorflow + +#endif // TF_OPS_DEFORMABLE_CONV2D_UTILS_H diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc new file mode 100644 index 0000000000..54e2ab2176 --- /dev/null +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc @@ -0,0 +1,1517 @@ + +#include "tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h" + +#include +#include + +#include "tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv2d_utils.h" + 
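// ---------------------------------------------------------------------------
// Illustrative sketch (editorial; BilinearSample is a hypothetical name, not
// part of the original kernels): every sampling step in this file reduces to
// 2x2 bilinear interpolation at a fractional location (h, w), with
// out-of-range neighbours treated as zero. A minimal standalone version:
//
//   float BilinearSample(const float* img, int height, int width,
//                        float h, float w) {
//     const int h0 = static_cast<int>(std::floor(h));
//     const int w0 = static_cast<int>(std::floor(w));
//     const float lh = h - h0, lw = w - w0;
//     auto at = [&](int y, int x) -> float {
//       return (y < 0 || y >= height || x < 0 || x >= width)
//                  ? 0.0f
//                  : img[y * width + x];
//     };
//     return (1 - lh) * (1 - lw) * at(h0, w0) +
//            (1 - lh) * lw * at(h0, w0 + 1) +
//            lh * (1 - lw) * at(h0 + 1, w0) +
//            lh * lw * at(h0 + 1, w0 + 1);
//   }
//
// DmcnIm2colBilinear below applies this same weighting; DmcnGetGradientWeight
// returns the weight a given integer neighbour received (the derivative of
// the sample with respect to the image values), and DmcnGetCoordinateWeight
// returns the derivative of the sample with respect to h or w.
// ---------------------------------------------------------------------------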
+namespace tensorflow { +namespace addons { + +namespace functor { + +template +DType DmcnIm2colBilinear(const DType *bottom_data, const int data_width, + const int height, const int width, DType h, DType w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + DType lh = h - h_low; + DType lw = w - w_low; + DType hh = 1 - lh, hw = 1 - lw; + + DType v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; + DType v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + DType v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + DType v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + DType w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + DType val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} +template +DType DmcnGetGradientWeight(DType argmax_h, DType argmax_w, const int h, + const int w, const int height, const int width) { + /* + * offset h, offset w, (h, w) coordinate + */ + if (argmax_h <= -1 || argmax_w <= -1 || argmax_h >= height || + argmax_w >= width) { + return 0; + } + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + DType weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} +template +DType DmcnGetCoordinateWeight(DType argmax_h, DType argmax_w, const int height, + const int width, const DType *im_data, + const int data_width, const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + DType weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + 
im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +using ull = unsigned long long int; +using uInt = unsigned int; +typedef Eigen::GpuDevice GPUDevice; +typedef Eigen::ThreadPoolDevice CPUDevice; + +Eigen::IndexPair ContractionDims(bool adj_x, bool adj_y) { + return {adj_x ? 0 : 1, adj_y ? 1 : 0}; +} + +void AtomicAdd(float *address, float val) { + auto *address_as_ull = reinterpret_cast(address); + uInt old = *address_as_ull; + uInt assumed; + float desired; + do { + assumed = old; + desired = *reinterpret_cast(&assumed) + static_cast(val); + old = __sync_val_compare_and_swap(address_as_ull, assumed, + *reinterpret_cast(&desired)); + } while (assumed != old); +} + +void AtomicAdd(double *address, double val) { + auto *address_as_ull = reinterpret_cast(address); + ull old = *address_as_ull; + ull assumed; + double desired; + do { + assumed = old; + desired = *reinterpret_cast(&assumed) + static_cast(val); + old = __sync_val_compare_and_swap(address_as_ull, assumed, + *reinterpret_cast(&desired)); + } while (assumed != old); +} + +template +void SwapAxisKernel(const CPUDevice &d, const int n, const int cuda_mem_size, + const int min_unit_size, DType *input_data, + const int dim_num, const int axis_x_dims, + const int axis_y_dims, const int axis_x, const int axis_y) { + d.parallelFor(n, + Eigen::TensorOpCost(cuda_mem_size, cuda_mem_size, + cuda_mem_size * axis_y_dims * axis_x_dims), + [min_unit_size, input_data, dim_num, axis_x_dims, axis_y_dims, + axis_x, axis_y, cuda_mem_size](int64 start, int64 end) { + for (int64 index = start; index < end; index++) { + auto *device_data = new DType[cuda_mem_size]; + DType *input_data_ptr = input_data + index * cuda_mem_size; + for (int j = 0; j < axis_y_dims; j++) { + for (int i = 0; i < axis_x_dims; i++) { + DType *temp_ptr = input_data_ptr + + (i * axis_x_dims + j) * min_unit_size; + DType *device_data_temp_ptr = + device_data + (j * axis_y_dims + i) * min_unit_size; + for (int k = 0; k < min_unit_size; k++) { + *(device_data_temp_ptr + k) = *(temp_ptr + k); + } + } + } + for (int idx = 0; idx < cuda_mem_size; idx++) { + *(input_data_ptr + idx) = *(device_data + idx); + } + delete[] device_data; + } + }); +} + +template +void DeformablePSROIPoolBackwardCpuAccKernel( + const CPUDevice &d, const int count, const T *top_diff, const T *top_count, + const int num_rois, const T spatial_scale, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int output_dim, T *bottom_data_diff, + T *bottom_trans_diff, const T *bottom_data, const T *bottom_rois, + const T *bottom_trans, const int no_trans, const T trans_std, + const int sample_per_part, const int group_size, const int part_size, + const int num_classes, const int channels_each_class) { + auto f = [count, top_diff, top_count, num_rois, spatial_scale, channels, + height, width, pooled_height, pooled_width, output_dim, + bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois, + bottom_trans, no_trans, trans_std, sample_per_part, group_size, + part_size, num_classes, + channels_each_class](int64 start, int64 end) { + for (int64 index = start; index < end; ++index) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = 
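// (Orientation note, editorial: each bottom_rois row appears to be laid out
// as [batch_index, x1, y1, x2, y2] in input-image coordinates, matching the
// notebook example rows such as [0, 1, 1, 800, 800]; the coordinates are
// mapped onto the feature map by spatial_scale, and the -0.5 shifts align
// the ROI with pixel centres.)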
offset_bottom_rois[0]; + T roi_start_w = (T)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = (T)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = + (T)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = + (T)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + // Force too small ROIs to be 1x1 + T roi_width = + std::max(roi_end_w - roi_start_w, static_cast(0.1)); // avoid 0 + T roi_height = std::max(roi_end_h - roi_start_h, static_cast(0.1)); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor((T)(ph) / pooled_height * part_size); + int part_w = floor((T)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = + no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + T trans_y = no_trans + ? (T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * + part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + + T wstart = (T)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = (T)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = + bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = + bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor((T)(pw)*group_size / pooled_width); + int gh = floor((T)(ph)*group_size / pooled_height); + gw = std::min(std::max(gw, 0), group_size - 1); + gh = std::min(std::max(gh, 0), group_size - 1); + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = std::min(std::max(w, static_cast(0.)), + static_cast(width - 1.)); + h = std::min(std::max(h, static_cast(0.)), + static_cast(height - 1.)); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + AtomicAdd( + offset_bottom_data_diff + bottom_index_base + y0 * width + x0, + q00 * diff_val); + AtomicAdd( + offset_bottom_data_diff + bottom_index_base + y1 * width + x0, + q01 * diff_val); + AtomicAdd( + offset_bottom_data_diff + bottom_index_base + y0 * width + x1, + q10 * diff_val); + AtomicAdd( + offset_bottom_data_diff + bottom_index_base + y1 * width + x1, + q11 * diff_val); + + if (no_trans) { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - + U00 * (1 - dist_y)) * + trans_std * 
diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - + U00 * (1 - dist_x)) * + trans_std * diff_val; + diff_y *= roi_height; + + AtomicAdd( + bottom_trans_diff + + (((n * num_classes + class_id) * 2) * part_size + part_h) * + part_size + + part_w, + diff_x); + AtomicAdd(bottom_trans_diff + + (((n * num_classes + class_id) * 2 + 1) * part_size + + part_h) * + part_size + + part_w, + diff_y); + } + } + } + }; + d.parallelFor(count, Eigen::TensorOpCost(count, count, count), f); +} + +template +void DeformablePSROIPoolForwardCpuKernel( + const CPUDevice &d, const int count, const T *bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, const int no_trans, + const T trans_std, const int sample_per_part, const int output_dim, + const int group_size, const int part_size, const int num_classes, + const int channels_each_class, T *top_data, T *top_count) { + auto f = [count, bottom_data, spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, output_dim, group_size, part_size, + num_classes, channels_each_class, top_data, + top_count](int64 start, int64 end) { + for (int64 index = start; index < end; ++index) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = (T)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = (T)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = + (T)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = + (T)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + // Force too small ROIs to be 1x1 + T roi_width = + std::max(roi_end_w - roi_start_w, static_cast(0.1)); // avoid 0 + T roi_height = std::max(roi_end_h - roi_start_h, static_cast(0.1)); + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = + no_trans + ? (T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + T trans_y = no_trans + ? 
(T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * + part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + T sum = 0; + int total = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = std::min(std::max(gw, 0), group_size - 1); + gh = std::min(std::max(gh, 0), group_size - 1); + const T *offset_bottom_data = + bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ++ih) { + for (int iw = 0; iw < sample_per_part; ++iw) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = std::min(std::max(w, static_cast(0.)), + static_cast(width - 1.)); + h = std::min(std::max(h, static_cast(0.)), + static_cast(height - 1.)); + int c = (ctop * group_size + gh) * group_size + gw; + T val = DmcnIm2colBilinear(offset_bottom_data + c * height * width, w, + h, w, (T)height, (T)width); + sum += val; + total++; + } + } + top_data[index] = total == 0 ? (T)(0) : sum / total; + top_count[index] = total; + } + }; + d.parallelFor(count, Eigen::TensorOpCost(count, count, count), f); +} +template +void DeformableConv2DIm2ColCPUKernel( + const CPUDevice &d, const int n, const DType *data_im, + const DType *data_offset, const DType *data_mask, + + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + + const int + channel_per_deformable_group, // 输入图通道数除以deformable_group的数量, + const int batch_size, const int num_channels, + const int + deformable_group, //这里的batch_size代表的是im2col_step_, 一般就设为1了 + const int height_col, const int width_col, DType *data_col) { + auto f = [n, data_im, data_offset, data_mask, height, width, kernel_h, + kernel_w, pad_h, pad_w, stride_w, stride_h, dilation_w, dilation_h, + channel_per_deformable_group, batch_size, num_channels, + deformable_group, height_col, width_col, + data_col](int64 start, int64 end) { + for (int64 index = start; index < end; index++) { + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + DType *data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + + w_col; + const DType *data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const DType *data_offset_ptr = + data_offset + (b_col * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * + width_col; // + + const DType *data_mask_ptr = + data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; // + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col 
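// (Layout note, editorial: per deformable group the offset tensor holds
// 2 * kernel_h * kernel_w channels; channel 2*(i*kernel_w+j) carries the
// vertical offset and channel 2*(i*kernel_w+j)+1 the horizontal offset for
// kernel tap (i, j), while the mask tensor holds kernel_h * kernel_w
// channels, one modulation scalar per tap. The three index expressions here
// select those entries for the current output location (h_col, w_col).)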
+ h_col) * width_col + + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType mask = data_mask_ptr[data_mask_hw_ptr]; + auto val = static_cast(0); + const DType h_im = h_in + i * dilation_h + offset_h; + const DType w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, + w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } + }; + d.parallelFor(n, Eigen::TensorOpCost(n, n, n), f); +} + +template +void DeformableConv2DCol2ImCPUKernel( + const CPUDevice &d, const int n, const DType *data_col, + const DType *data_offset, const DType *data_mask, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int deformable_group, const int height_col, const int width_col, + DType *grad_im) { + auto f = [n, data_col, data_offset, data_mask, channels, height, width, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, + dilation_w, channel_per_deformable_group, batch_size, + deformable_group, height_col, width_col, + grad_im](int64 start, int64 end) { + for (int64 index = start; index < end; ++index) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + const int deformable_group_index = c / channel_per_deformable_group; + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const DType *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const DType *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType mask = data_mask_ptr[data_mask_hw_ptr]; + const DType cur_inv_h_data = h_in + i * dilation_h + offset_h; + const DType cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const DType cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && 
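// (Editorial note: the dy/dx window of [-2, 2] together with the |delta| < 1
// tests restricts the scatter to the at most four integer neighbours of the
// fractional sample point that received a non-zero bilinear weight in the
// forward pass; DmcnGetGradientWeight recomputes that weight and the
// contribution is accumulated with AtomicAdd.)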
+ abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + DType weight = + DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, + cur_h + dy, cur_w + dx, height, width); + AtomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + // *(grad_im + cur_bottom_grad_pos) += weight * + // cur_top_grad; + } + } + } + } + }; + d.parallelFor(n, Eigen::TensorOpCost(n, n, n), f); +} +template +void DeformableConv2DCol2ImCoordCPUKernel( + const CPUDevice &d, const int n, const DType *data_col, + const DType *data_im, const DType *data_offset, const DType *data_mask, + const int channels, const int height, const int width, // 输入的C, H, W + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, DType *grad_offset, + DType *grad_mask) { + auto f = [n, data_col, data_im, data_offset, data_mask, channels, height, + width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, batch_size, + offset_channels, deformable_group, height_col, width_col, + grad_offset, grad_mask](int64 start, int64 end) { + for (int64 index = start; index < end; index++) { + DType val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const DType *data_col_ptr = + data_col + deformable_group_index * channel_per_deformable_group * + batch_size * width_col * height_col; + const DType *data_im_ptr = + data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * + height * width; + const DType *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const DType *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % + kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const int data_mask_hw_ptr = + (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType mask = 
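// (Editorial note: for each offset channel this loop visits the matching
// column entries; grad_offset accumulates col_grad * mask * d(sample)/d(coord)
// via DmcnGetCoordinateWeight, while grad_mask, written once per kernel tap
// on the even offset channel, accumulates col_grad * sampled_value (mval).)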
data_mask_ptr[data_mask_hw_ptr]; + DType inv_h = h_in + i * dilation_h + offset_h; + DType inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -2; + } else { + mval += data_col_ptr[col_pos] * + DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width, + height, width, inv_h, inv_w); + } + const DType weight = DmcnGetCoordinateWeight( + inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + + grad_offset[index] = val; + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + if (offset_c % 2 == 0) { + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * + kernel_w + + offset_c / 2) * + height_col + + h) * + width_col + + w] = mval; + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + + // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * + // height_col + h) * width_col + w], mask_req, mval); + } + } + }; + d.parallelFor(n, Eigen::TensorOpCost(n, n, n), f); +} +template +void PureAddToKernel(const CPUDevice &d, const int n, DType *result_data, + const DType *right_data) { + auto f = [n, result_data, right_data](int64 start, int64 end) { + for (int64 index = start; index < end; index++) { + *(result_data + index) += (right_data[index]); + } + }; + d.parallelFor(n, Eigen::TensorOpCost(n, n, n), f); +} +template +void SetZeroKernel(const CPUDevice &d, const int n, DType *result_data) { + auto f = [n, result_data](int64 start, int64 end) { + for (int64 index = start; index < end; ++index) { + *(result_data + index) = DType(0); + } + }; + d.parallelFor(n, Eigen::TensorOpCost(n, n, n), f); +} +template +void SetOneKernel(const CPUDevice &d, const int n, DType *result_data) { + auto f = [n, result_data](int64 start, int64 end) { + for (int64 index = start; index < end; ++index) { + *(result_data + index) = DType(1); + } + }; + d.parallelFor(n, Eigen::TensorOpCost(n, n, n), f); +} + +template +void DeformableConv2DCol2ImCoord::operator()( + const Eigen::ThreadPoolDevice &d, const DType *data_col, + const DType *data_im, const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, const TShape &kernel_shape, + const TShape &pad, const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *grad_offset, DType *grad_mask) { + int num_spatial_axes = kernel_shape.size(); + int num_kernels = col_shape[1] * col_shape[2] * col_shape[3] * 2 * + kernel_shape[0] * kernel_shape[1] * deformable_group; + int channel_per_deformable_group = col_shape[0] / deformable_group; + switch (num_spatial_axes) { + case 2: + DeformableConv2DCol2ImCoordCPUKernel( + d, num_kernels, data_col, data_im, data_offset, data_mask, + im_shape[1], im_shape[2], im_shape[3], kernel_shape[0], + kernel_shape[1], pad[0], pad[1], stride[0], stride[1], dilation[0], + dilation[1], channel_per_deformable_group, col_shape[1], + 2 * kernel_shape[0] * kernel_shape[1] * deformable_group, + deformable_group, col_shape[2], col_shape[3], grad_offset, grad_mask); + break; + default: + LOG(FATAL) << "col2im_nd_gpu does not support computation with " + << num_spatial_axes << "spatial axes"; + } +} + +template +void DeformableConv2DCol2Im::operator()( + const Eigen::ThreadPoolDevice &d, const DType *data_col, + const DType *data_offset, const DType *data_mask, const TShape &im_shape, + const TShape &col_shape, const TShape &kernel_shape, const TShape &pad, + const TShape &stride, 
const TShape &dilation, + const int32_t deformable_group, DType *grad_im) { + int num_spatial_axes = kernel_shape.size(); + int channel_per_deformable_group = im_shape[1] / deformable_group; + int num_kernels = ProdShape(col_shape, 0, col_shape.size()); + // num_axes should be smaller than block size + // using namespace mxnet_op; + switch (num_spatial_axes) { + case 2: + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // NOLINT_NEXT_LINE(whitespace/operators) + DeformableConv2DCol2ImCPUKernel( + d, num_kernels, data_col, data_offset, data_mask, im_shape[1], + im_shape[2], im_shape[3], kernel_shape[0], kernel_shape[1], pad[0], + pad[1], stride[0], stride[1], dilation[0], dilation[1], + channel_per_deformable_group, col_shape[1], deformable_group, + col_shape[2], col_shape[3], grad_im); + break; + default: + LOG(FATAL) << "col2im_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + +template +void DeformableConv2DIm2Col::operator()( + const Eigen::ThreadPoolDevice &d, const DType *data_im, + const DType *data_offset, const DType *data_mask, const TShape &im_shape, + const TShape &col_shape, const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *data_col) { + int num_spatial_axes = kernel_shape.size(); + int channel_per_deformable_group = + im_shape[1] / deformable_group; // imshape[1] = 输入图的通道数 + int num_kernels = + im_shape[1] * + ProdShape(col_shape, 1, + col_shape.size()); // K * N / k.Size(), k = filter, col_shape = + // [K, im2col_step_, H, W] + switch (num_spatial_axes) { + case 2: + DeformableConv2DIm2ColCPUKernel( + d, num_kernels, data_im, data_offset, data_mask, im_shape[2], + im_shape[3], kernel_shape[0], kernel_shape[1], pad[0], pad[1], + stride[0], stride[1], dilation[0], dilation[1], + channel_per_deformable_group, col_shape[1], im_shape[1], + deformable_group, col_shape[2], col_shape[3], data_col); + // MSHADOW_CUDA_POST_KERNEL_CHECK(modulated_deformable_im2col_gpu_kernel); + break; + default: + LOG(FATAL) << "im2col_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + +template +void SetZeros::operator()(const Eigen::ThreadPoolDevice &d, + int n, DType *result_data) { + SetZeroKernel(d, n, result_data); +} +template +void PureAddTo::operator()(const Eigen::ThreadPoolDevice &d, + const int n, DType *result_data, + const DType *right_data) { + PureAddToKernel(d, n, result_data, right_data); +} +template +void SetOne::operator()(const Eigen::ThreadPoolDevice &d, + int n, DType *result_data) { + SetOneKernel(d, n, result_data); +} +template +void SetNumAtIndex::operator()( + const Eigen::ThreadPoolDevice &d, DType num, int index, DType *data) { + *(data + index) = num; +} + +template +void LaunchBatchMatMul::launch(OpKernelContext *context, + const TensorShape &in_x_shape, + const TensorShape &in_y_shape, + const T *in_x_ptr, + const T *in_y_ptr, bool adj_x, + bool adj_y, T *out) { + const int64 m = in_x_shape.dim_size(adj_x ? 2 : 1); + const int64 k = in_x_shape.dim_size(adj_x ? 1 : 2); + const int64 n = in_y_shape.dim_size(adj_y ? 
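// (Editorial note: on the CPU the batched matmul is realised as one Eigen
// tensor contraction per batch element; ContractionDims selects the axes to
// contract according to adj_x / adj_y, and each [M x K] * [K x N] product is
// written into the corresponding slice of the output.)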
1 : 2); + const uint64 batch_size = in_x_shape.dim_size(0); + Eigen::TensorMap> t_in_x( + in_x_ptr, in_x_shape.AsEigenDSizes<3, Eigen::DenseIndex>()); + Eigen::TensorMap> t_in_y( + in_y_ptr, in_y_shape.AsEigenDSizes<3, Eigen::DenseIndex>()); + Eigen::TensorMap> t_out(out, batch_size, + m, n); + Eigen::array, 1> contract_pairs; + contract_pairs[0] = ContractionDims(adj_x, adj_y); + auto &device = context->eigen_device(); + for (int i = 0; i < t_out.dimension(0); ++i) { + t_out.template chip<0>(i).device(device) = + (t_in_x.template chip<0>(i)) + .template contract(t_in_y.template chip<0>(i), contract_pairs); + } +} + +template +void DeformablePSROIPoolForward::operator()( + const CPUDevice &d, const int count, const T *bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, const int no_trans, + const T trans_std, const int sample_per_part, const int output_dim, + const int group_size, const int part_size, const int num_classes, + const int channels_each_class, T *top_data, T *top_count) { + DeformablePSROIPoolForwardCpuKernel( + d, count, bottom_data, spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, output_dim, group_size, part_size, + num_classes, channels_each_class, top_data, top_count); +} + +template +void DeformablePSROIPoolBackwardKernel::operator()( + const CPUDevice &d, const int count, const T *top_diff, const T *top_count, + const int num_rois, const T spatial_scale, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int output_dim, T *bottom_data_diff, + T *bottom_trans_diff, const T *bottom_data, const T *bottom_rois, + const T *bottom_trans, const int no_trans, const T trans_std, + const int sample_per_part, const int group_size, const int part_size, + const int num_classes, const int channels_each_class) { + DeformablePSROIPoolBackwardCpuAccKernel( + d, count, top_diff, top_count, num_rois, spatial_scale, channels, height, + width, pooled_height, pooled_width, output_dim, bottom_data_diff, + bottom_trans_diff, bottom_data, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, group_size, part_size, num_classes, + channels_each_class); +} +template struct DeformableConv2DIm2Col; +template struct DeformableConv2DCol2Im; +template struct DeformableConv2DCol2ImCoord; +template struct PureAddTo; +template struct SetOne; +template struct SetZeros; +template struct SwapAxis; +template struct SetNumAtIndex; + +template struct DeformableConv2DIm2Col; +template struct DeformableConv2DCol2Im; +template struct DeformableConv2DCol2ImCoord; +template struct PureAddTo; +template struct SetOne; +template struct SetZeros; +template struct SwapAxis; +template struct SetNumAtIndex; + +template struct LaunchBatchMatMul; +template struct LaunchBatchMatMul; +template struct DeformablePSROIPoolForward; +template struct DeformablePSROIPoolForward; +template struct DeformablePSROIPoolBackwardKernel; +template struct DeformablePSROIPoolBackwardKernel; + +template +class DeformableConv2DOp : public OpKernel { + public: + explicit DeformableConv2DOp(OpKernelConstruction *ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, InitDeformableConv2DParameters(ctx, ¶ms_)); + } + void Compute(OpKernelContext *context) override { + // Input tensor's shape + // [batch, channels, height, weight] + const Tensor &input = 
context->input(0); + const TensorShape &input_shape = input.shape(); + // [out_channels, in_channels, filter_height, filter_weight] + const Tensor &filter = context->input(1); + const TensorShape &filter_shape = filter.shape(); + // [batch, 2 * filter.Size(), out_height, out_weight] + const Tensor &offset = context->input(2); + const TensorShape &offset_shape = offset.shape(); + // [batch, filter.Size(), out_height, out_weight] + const Tensor &mask = context->input(3); + const TensorShape &mask_shape = mask.shape(); + + DeformableConv2DDimensions dimensions; + OP_REQUIRES_OK(context, ComputeDeformableConv2DDimension( + params_, input, filter, &dimensions, 0)); + // data_format = NCHW + // 这个地方我出了bug,原因是shapefromformat的参数必须是data_format, N, H, W, + // C,因为其内部是根据data_format来决定是否需要进行transpose, + // 如何第三个参数给了C, 且第一个参数为NCHW,那最后得到的结果会是NWCH + TensorShape out_shape = ShapeFromFormat( + params_.data_format, dimensions.batch, dimensions.out_rows, + dimensions.out_cols, dimensions.out_depth); + + // Output tensor is of the following dimensions: + // [ in_batch, out_depth, out_rows, out_cols] + // Tensor* output = nullptr; + // OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); + VLOG(2) << "DeformableConv2D: in_depth = " << dimensions.in_depth + << ", patch_depth = " << dimensions.patch_depth + << ", input_cols = " << dimensions.input_cols + << ", filter_cols = " << dimensions.filter_cols + << ", input_rows = " << dimensions.input_rows + << ", filter_rows = " << dimensions.filter_rows + << ", stride_rows = " << dimensions.stride_rows + << ", stride_cols = " << dimensions.stride_cols + << ", dilation_rows = " << dimensions.dilation_rows + << ", dilation_cols = " << dimensions.dilation_cols + << ", out_depth = " << dimensions.out_depth; + + // If there is nothing to compute, return. 
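// (Editorial summary of the buffering scheme that follows: the batch is
// processed im2col_step_ images at a time. col_buffer has shape
// [filter.dim_size(1) * filter_rows * filter_cols, im2col_step_, out_rows,
// out_cols]; DeformableConv2DIm2Col fills it with offset- and mask-modulated
// bilinear samples, and LaunchBatchMatMul contracts the filter reshaped to
// [num_groups, M, K] (M = out_channels / num_groups, K = kernel_dim_) with
// the buffer viewed as [num_groups, K, N] (N = im2col_step_ * out_rows *
// out_cols), writing each chunk directly into the pre-zeroed output. The
// Chinese comment above ShapeFromFormat records a pitfall: its arguments
// must be passed as (data_format, N, H, W, C); with NCHW data and the
// dimensions supplied in a different order the resulting shape comes out
// transposed.)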
+ if (out_shape.num_elements() == 0) { + return; + } + + /** + * from here i stop use the traditional convolution implement of the + * official code which was defined in conv_ops.cc and began to use the + * implement of the deformable conv2d of the msra version + * **/ + LayerSetUp(input_shape, filter_shape, offset_shape, mask_shape, out_shape); + // notice the fact that the flat function return a reference of a pointer, + // but in fact we only need a pointer + const T *in_data_ptr = input.template flat().data(); + const T *offset_ptr = offset.template flat().data(); + const T *mask_ptr = mask.template flat().data(); + const Device &d = context->eigen_device(); + int col_buffer_shape_temp[4]; // calculate the shape of col_buffer, + // mxnet源码是 + 1, 多了一个im2col_step_ + col_buffer_shape_temp[0] = ProdShape( + filter_shape, 1, + filter_shape + .dims()); // 卷积核的参数个数,注意卷积核的形状应该是[out_depth, + // in_depth, height, weight] + col_buffer_shape_temp[1] = im2col_step_; + col_buffer_shape_temp[2] = out_shape.dim_size(2); + col_buffer_shape_temp[3] = out_shape.dim_size(3); + TensorShape col_buffer_shape = + TensorShape({col_buffer_shape_temp[0], col_buffer_shape_temp[1], + col_buffer_shape_temp[2], col_buffer_shape_temp[3]}); + + Tensor col_buffer; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + col_buffer_shape, &col_buffer)); + T *col_buffer_ptr = col_buffer.template flat().data(); + + int32_t M = conv_out_channels_ / group_; // filter的数量 + int32_t N = im2col_step_ * conv_out_spatial_dim_; + int32_t K = kernel_dim_; // 卷积的参数个数 + + Tensor weight_3d; + TensorShape weight_3d_shape = TensorShape({group_, M, K}); + OP_REQUIRES(context, weight_3d.CopyFrom(filter, weight_3d_shape), + errors::InvalidArgument("shape doesn't match")); + T *weight_3d_ptr = weight_3d.template flat().data(); + + Tensor *output_temp_4d = NULL; + OP_REQUIRES_OK(context, + context->allocate_output(0, out_shape, &output_temp_4d)); + auto output_temp_4d_ptr = output_temp_4d->template flat().data(); + // auto output__ptr = output_temp_4d->flat(); + /** + * 这样的话下面计算矩阵乘法的时候直接就写到这个输出里了 + * 但是注意的是作者实现的时候划分step,这个时候其实是往shape为{num_ + * / im2col_step_, group_, M, + * N}的输出里写的,所以最后一定要置换一下维度的位置 + * **/ + SetZeros()(d, ProdShape(out_shape, 0, out_shape.dims()), + output_temp_4d_ptr); + TShape pads; + pads.push_back(dimensions.pad_rows); + pads.push_back(dimensions.pad_cols); + for (int32_t n = 0; n < num_ / im2col_step_; ++n) { // 分batch进行 + // transform image to col_buffer in order to use gemm + DeformableConv2DIm2Col()( + d, + in_data_ptr + + n * im2col_step_ * + input_dim_, // dptr是获取输入数据的指针 + n * im2col_step_* + // input_dim 是让指针向后移动 一张图片的数据 + offset_ptr + n * im2col_step_ * input_offset_dim_, // + mask_ptr + n * im2col_step_ * input_mask_dim_, ToVector(input_shape), + ToVector(col_buffer_shape), SubVector(filter_shape, 2, 4), pads, + SubVector(params_.strides, 2, 4), SubVector(params_.dilations, 2, 4), + params_.deformable_groups, col_buffer_ptr); + TensorShape col_buffer_3d_shape = TensorShape({group_, K, N}); + + auto output_temp_group_ptr = output_temp_4d_ptr + (n * group_ * M * N); + + LaunchBatchMatMul::launch( + context, weight_3d_shape, col_buffer_3d_shape, weight_3d_ptr, + col_buffer_ptr, false, false, output_temp_group_ptr); + } + } + + private: + DeformableConv2DParameters params_; + bool use_cudnn_; + bool cudnn_use_autotune_; + int32_t channel_axis_; // channel axis of the input + int32_t channels_; // number of channels of input image + int32_t num_spatial_axes_; // number of spatial axes + int32_t 
num_; // batch size + int32_t group_; // number of groups + int32_t conv_out_channels_; // number of output channels (num_filter) + int32_t + conv_out_spatial_dim_; // number of pixels of output images per channel + int32_t conv_in_channels_; // number of input channels + int32_t kernel_dim_; // number of input channels per group * kernel size + int32_t weight_offset_; // number of output channels per group * kernel_dim_ + int32_t col_offset_; + int32_t output_offset_; + int32_t col_buffer_size_; + int32_t input_dim_; + int32_t input_offset_dim_; + int32_t input_mask_dim_; + int32_t output_dim_; + int32_t num_kernels_im2col_; + int32_t num_kernels_col2im_; + int32_t im2col_step_; + bool bias_term_; // has bias term? + bool is_1x1_; + void LayerSetUp(const TensorShape &ishape, const TensorShape &filter_shape, + const TensorShape &offset_shape, + const TensorShape &mask_shape, const TensorShape &oshape) { + channel_axis_ = 1; // hard code channel axis, fixed the input data_format + const int32_t first_spatial_axis = channel_axis_ + 1; + const int32_t num_axes = filter_shape.dims(); + num_spatial_axes_ = + num_axes - + first_spatial_axis; //表示的是空间坐标个数,比如说2维卷积里,就是2, + // 3维卷积里就是3 + is_1x1_ = true; // 判断是否为1x1卷积 + for (int32_t i = 2; i < filter_shape.dims(); ++i) { + // is_1x1_ &= filter_shape.dim_size(i) == 1 && params_.stride[i] == 1 && + // params_.pad[i] == 0; + is_1x1_ &= + filter_shape.dim_size(i) == 1; // only judge by the filter's shape + if (!is_1x1_) break; + } + num_ = ishape.dim_size(0); // batch size + channels_ = ishape.dim_size(1); // number of input channels + group_ = params_.num_groups; // + conv_out_channels_ = filter_shape.dim_size(0); // output channel nums + conv_in_channels_ = channels_; // input channel nums + bias_term_ = !params_.no_bias; // + kernel_dim_ = + conv_in_channels_ / group_ * filter_shape.dim_size(2) * + filter_shape.dim_size( + 3); // Size()返回tensor中元素个数,即各维度大小的乘积,所以这里的kernel_dim的意思是卷积核的参数个数了. 
+ conv_out_spatial_dim_ = ProdShape( + oshape, 2, + oshape + .dims()); // ProdShape(dimstart, dimend)返回指定维度大小乘积, + // 这个变量代表每个通道的像素点个数, + // oshape.ndim()返回这个shape的维度,假设是NCHW那么返回4,则为 + // H * W, + // col_offset_ = kernel_dim_ * + // conv_out_spatial_dim_;//kernel_dim代表一个卷积核参数的个数,conv_out_spatial_dim_相当于特征图上的坐标个数,那这个变量相当于总共需要的偏移量 + // weight_offset_ = conv_out_channels_ * kernel_dim_ / + // group_;//这里应该是所有的权重的个数,也就是需要求的权重偏移的个数 + // output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / + // group_;//这里是输出通道数乘上每个通道的像素点的个数,所以结果应该是输出的总维度,就是C*H*W + im2col_step_ = std::min(params_.im2col_step, num_); + col_buffer_size_ = + kernel_dim_ * group_ * im2col_step_ * + conv_out_spatial_dim_; // 开辟的缓存大小// size of the column buffer + // used for storing im2col-ed pixels + + input_dim_ = ProdShape( + ishape, 1, + ishape.dims()); // input image size (#channels * height * width) + input_offset_dim_ = + ProdShape(offset_shape, 1, offset_shape.dims()); // 18 * H * W + input_mask_dim_ = ProdShape(mask_shape, 1, mask_shape.dims()); // 9 * H * W + output_dim_ = ProdShape(oshape, 1, oshape.dims()); //输出的元素个数 + + num_kernels_im2col_ = + conv_in_channels_ * + conv_out_spatial_dim_; //如果输出和输入的分辨率不变的话,代表输入数据的dim,我个人觉得就是把整个输入展开为一个一维向量,在求其维度大小 + num_kernels_col2im_ = input_dim_; //输入数据的dim + } +}; + +template +class DeformableConv2DBackPropOp : public OpKernel { + public: + explicit DeformableConv2DBackPropOp(OpKernelConstruction *context) + : OpKernel(context) { + OP_REQUIRES_OK(context, InitDeformableConv2DParameters(context, ¶ms_)); + } + void Compute(OpKernelContext *ctx) override { + const Tensor &x = ctx->input(0); + const TensorShape &x_shape = x.shape(); + const Tensor &filter = ctx->input(1); + const TensorShape &filter_shape = filter.shape(); + const Tensor &offset = ctx->input(2); + const TensorShape &offset_shape = offset.shape(); + const Tensor &mask = ctx->input(3); + const TensorShape &mask_shape = mask.shape(); + const Tensor &out_grad = ctx->input(4); + const TensorShape &out_grad_shape = out_grad.shape(); + DeformableConv2DDimensions dimensions; + OP_REQUIRES_OK(ctx, ComputeDeformableConv2DDimension(params_, x, filter, + &dimensions, 1)); + LayerSetUp(x_shape, filter_shape, offset_shape, mask_shape, out_grad_shape); + const Device &d = ctx->eigen_device(); + int col_buffer_shape_temp[4]; + col_buffer_shape_temp[0] = ProdShape(filter_shape, 1, filter_shape.dims()); + col_buffer_shape_temp[1] = im2col_step_; + col_buffer_shape_temp[2] = out_grad_shape.dim_size(2); + col_buffer_shape_temp[3] = out_grad_shape.dim_size(3); + TensorShape col_buffer_shape = + TensorShape({col_buffer_shape_temp[0], col_buffer_shape_temp[1], + col_buffer_shape_temp[2], col_buffer_shape_temp[3]}); + int32_t M = kernel_dim_; + int32_t N = im2col_step_ * conv_out_spatial_dim_; + int32_t K = conv_out_channels_ / group_; + const auto x_ptr = x.template flat().data(); + const auto offset_ptr = offset.template flat().data(); + const auto mask_ptr = mask.template flat().data(); + const auto weight_3d_ptr = filter.template flat().data(); + TensorShape weight_3d_shape = TensorShape({group_, K, M}); + Tensor out_grad_4d; + TensorShape out_grad_4d_shape = + TensorShape({num_ / im2col_step_, im2col_step_, conv_out_channels_, + conv_out_spatial_dim_}); + OP_REQUIRES(ctx, out_grad_4d.CopyFrom(out_grad, out_grad_4d_shape), + errors::InvalidArgument("shape doesn't match")); + auto out_grad_4d_ptr = out_grad_4d.template flat().data(); + out_grad_4d_shape = TensorShape({num_ / im2col_step_, group_, K, N}); + Tensor col_buffer; + 
OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::value, + col_buffer_shape, &col_buffer)); + auto col_buffer_3d_ptr = col_buffer.template flat().data(); + TensorShape col_buffer_3d_shape = TensorShape({group_, M, N}); + Tensor *dweight_3d = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, filter_shape, &dweight_3d)); + T *dweight_3d_ptr = dweight_3d->template flat().data(); + Tensor *x_grad = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, x_shape, &x_grad)); + T *x_grad_ptr = x_grad->template flat().data(); + Tensor *offset_grad = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(2, offset_shape, &offset_grad)); + T *offset_grad_ptr = offset_grad->template flat().data(); + + Tensor *mask_grad = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(3, mask_shape, &mask_grad)); + T *mask_grad_ptr = mask_grad->template flat().data(); + TShape pads; + pads.push_back(dimensions.pad_rows); + pads.push_back(dimensions.pad_cols); + TShape kernel_shape = SubVector(filter_shape, 2, 4); + TShape stride_shape = SubVector(params_.strides, 2, 4); + TShape dilation_shape = SubVector(params_.dilations, 2, 4); + Tensor dweight_3d_temp; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::value, + filter_shape, &dweight_3d_temp)); + T *dweight_3d_temp_ptr = dweight_3d_temp.template flat().data(); + SetZeros()(d, group_ * M * N, col_buffer_3d_ptr); + SetZeros()(d, ProdShape(x_shape, 0, x_shape.dims()), x_grad_ptr); + SetZeros()(d, ProdShape(filter_shape, 0, filter_shape.dims()), + dweight_3d_ptr); + SetZeros()(d, ProdShape(filter_shape, 0, filter_shape.dims()), + dweight_3d_temp_ptr); + for (int n = 0; n < num_ / im2col_step_; ++n) { + TensorShape out_grad_3d_shape = TensorShape({group_, K, N}); + T *out_grad_3d_ptr = out_grad_4d_ptr + n * group_ * K * N; + LaunchBatchMatMul::launch( + ctx, weight_3d_shape, out_grad_3d_shape, weight_3d_ptr, + out_grad_3d_ptr, true, false, col_buffer_3d_ptr); + DeformableConv2DCol2ImCoord()( + d, col_buffer_3d_ptr, x_ptr + n * im2col_step_ * input_dim_, + offset_ptr + n * im2col_step_ * input_offset_dim_, + mask_ptr + n * im2col_step_ * input_mask_dim_, ToVector(x_shape), + ToVector(col_buffer_shape), kernel_shape, pads, stride_shape, + dilation_shape, params_.deformable_groups, + offset_grad_ptr + n * im2col_step_ * input_offset_dim_, + mask_grad_ptr + n * im2col_step_ * input_mask_dim_); + DeformableConv2DCol2Im()( + d, col_buffer_3d_ptr, + offset_ptr + n * im2col_step_ * input_offset_dim_, + mask_ptr + n * im2col_step_ * input_mask_dim_, ToVector(x_shape), + ToVector(col_buffer_shape), kernel_shape, pads, stride_shape, + dilation_shape, params_.deformable_groups, + x_grad_ptr + n * im2col_step_ * input_dim_); + DeformableConv2DIm2Col()( + d, x_ptr + n * im2col_step_ * input_dim_, + offset_ptr + n * im2col_step_ * input_offset_dim_, + mask_ptr + n * im2col_step_ * input_mask_dim_, ToVector(x_shape), + ToVector(col_buffer_shape), kernel_shape, pads, stride_shape, + dilation_shape, params_.deformable_groups, col_buffer_3d_ptr); + if (n == 0) { + LaunchBatchMatMul::launch( + ctx, out_grad_3d_shape, col_buffer_3d_shape, out_grad_3d_ptr, + col_buffer_3d_ptr, false, true, dweight_3d_ptr); + } else { + LaunchBatchMatMul::launch( + ctx, out_grad_3d_shape, col_buffer_3d_shape, out_grad_3d_ptr, + col_buffer_3d_ptr, false, true, dweight_3d_temp_ptr); + PureAddTo()(d, + ProdShape(filter_shape, 0, filter_shape.dims()), + dweight_3d_ptr, dweight_3d_temp_ptr); + } + } + } + + private: + DeformableConv2DParameters params_; + // bool use_cudnn_; + // bool 
cudnn_use_autotune_; + int32_t channel_axis_; // channel axis of the input + int32_t channels_; // number of channels of input image + int32_t num_spatial_axes_; // number of spatial axes + int32_t num_; // batch size + int32_t group_; // number of groups + int32_t conv_out_channels_; // number of output channels (num_filter) + int32_t + conv_out_spatial_dim_; // number of pixels of output images per channel + int32_t conv_in_channels_; // number of input channels + int32_t kernel_dim_; // number of input channels per group * kernel size + int32_t weight_offset_; // number of output channels per group * kernel_dim_ + int32_t col_offset_; + int32_t output_offset_; + int32_t col_buffer_size_; + int32_t input_dim_; + int32_t input_offset_dim_; + int32_t input_mask_dim_; + int32_t output_dim_; + int32_t num_kernels_im2col_; + int32_t num_kernels_col2im_; + int32_t im2col_step_; + bool bias_term_; // has bias term? + bool is_1x1_; + void LayerSetUp(const TensorShape &ishape, const TensorShape &filter_shape, + const TensorShape &offset_shape, + const TensorShape &mask_shape, const TensorShape &oshape) { + channel_axis_ = 1; // hard code channel axis, fixed the input data_format + const int32_t first_spatial_axis = channel_axis_ + 1; + const int32_t num_axes = filter_shape.dims(); + num_spatial_axes_ = + num_axes - + first_spatial_axis; //表示的是空间坐标个数,比如说2维卷积里,就是2, + // 3维卷积里就是3 + is_1x1_ = true; // 判断是否为1x1卷积 + for (int32_t i = 2; i < filter_shape.dims(); ++i) { + // is_1x1_ &= filter_shape.dim_size(i) == 1 && params_.stride[i] == 1 && + // params_.pad[i] == 0; + is_1x1_ &= + filter_shape.dim_size(i) == 1; // only judge by the filter's shape + if (!is_1x1_) break; + } + num_ = ishape.dim_size(0); // batch size + channels_ = ishape.dim_size(1); // number of input channels + group_ = params_.num_groups; // + conv_out_channels_ = filter_shape.dim_size(0); // output channel nums + conv_in_channels_ = channels_; // input channel nums + bias_term_ = !params_.no_bias; // + kernel_dim_ = + conv_in_channels_ / group_ * filter_shape.dim_size(2) * + filter_shape.dim_size( + 3); // Size()返回tensor中元素个数,即各维度大小的乘积,所以这里的kernel_dim的意思是卷积核的参数个数了. 
+ conv_out_spatial_dim_ = ProdShape( + oshape, 2, + oshape + .dims()); // ProdShape(dimstart, dimend)返回指定维度大小乘积, + // 这个变量代表每个通道的像素点个数, + // oshape.ndim()返回这个shape的维度,假设是NCHW那么返回4,则为 + // H * W, + col_offset_ = + kernel_dim_ * + conv_out_spatial_dim_; // kernel_dim代表一个卷积核参数的个数,conv_out_spatial_dim_相当于特征图上的坐标个数,那这个变量相当于总共需要的偏移量 + weight_offset_ = + conv_out_channels_ * kernel_dim_ / + group_; //这里应该是所有的权重的个数,也就是需要求的权重偏移的个数 + output_offset_ = + conv_out_channels_ * conv_out_spatial_dim_ / + group_; //这里是输出通道数乘上每个通道的像素点的个数,所以结果应该是输出的总维度,就是C*H*W + im2col_step_ = std::min(params_.im2col_step, num_); + col_buffer_size_ = + kernel_dim_ * group_ * im2col_step_ * + conv_out_spatial_dim_; // 开辟的缓存大小// size of the column buffer + // used for storing im2col-ed pixels + + input_dim_ = ProdShape( + ishape, 1, + ishape.dims()); // input image size (#channels * height * width) + input_offset_dim_ = + ProdShape(offset_shape, 1, offset_shape.dims()); // 18 * H * W + input_mask_dim_ = ProdShape(mask_shape, 1, mask_shape.dims()); // 9 * H * W + output_dim_ = ProdShape(oshape, 1, oshape.dims()); //输出的元素个数 + + num_kernels_im2col_ = + conv_in_channels_ * + conv_out_spatial_dim_; //如果输出和输入的分辨率不变的话,代表输入数据的dim,我个人觉得就是把整个输入展开为一个一维向量,在求其维度大小 + num_kernels_col2im_ = input_dim_; //输入数据的dim + }; +}; + +template +class DeformablePSROIPoolOp : public OpKernel { + public: + explicit DeformablePSROIPoolOp(OpKernelConstruction *ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("pooled_size", &pool_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("no_trans", &no_trans)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("spatial_scale", &spatial_scale)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_dim", &output_dim)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("group_size", &group_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("part_size", &part_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("sample_per_part", &sample_per_part)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("trans_std", &trans_std)); + } + void Compute(OpKernelContext *ctx) override { + const Tensor &data = ctx->input(0); + const Tensor &bbox = ctx->input(1); + const Tensor &trans = ctx->input(2); + const int batch = data.dim_size(0); + const int channels = data.dim_size(1); + const int height = data.dim_size(2); + const int width = data.dim_size(3); + const int channels_trans = no_trans ? 2 : trans.dim_size(1); + const int num_bbox = bbox.dim_size(0); + Tensor *output; + Tensor *top_count; + const int pooled_width = pool_size; + const int pooled_height = pool_size; + const int count = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = + no_trans ? output_dim : output_dim / num_classes; + TensorShape output_shape{num_bbox, output_dim, pooled_height, pooled_width}; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, output_shape, &top_count)); + const Type *bottom_data = data.flat().data(); + const Type *bottom_rois = bbox.flat().data(); + const Type *bottom_trans = no_trans ? 
nullptr : trans.flat().data(); + Type *top_data = output->flat().data(); + Type *top_count_data = top_count->flat().data(); + const Device &d = ctx->eigen_device(); + DeformablePSROIPoolForward()( + d, count, bottom_data, spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, output_dim, group_size, part_size, + num_classes, channels_each_class, top_data, top_count_data); + } + + private: + int pool_size; + int no_trans; + float spatial_scale; + int output_dim; + int group_size; + int part_size; + int sample_per_part; + float trans_std; +}; + +template +class DeformablePSROIPoolBackPropOp : public OpKernel { + public: + explicit DeformablePSROIPoolBackPropOp(OpKernelConstruction *ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("pooled_size", &pool_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("no_trans", &no_trans)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("spatial_scale", &spatial_scale)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_dim", &output_dim)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("group_size", &group_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("part_size", &part_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("sample_per_part", &sample_per_part)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("trans_std", &trans_std)); + } + void Compute(OpKernelContext *ctx) override { + const Tensor &data = ctx->input(0); + const Tensor &bbox = ctx->input(1); + const Tensor &trans = ctx->input(2); + const Tensor &top_count = ctx->input(3); + const Tensor &out_grad = ctx->input(4); + const int batch = data.dim_size(0); + const int channels = data.dim_size(1); + const int height = data.dim_size(2); + const int width = data.dim_size(3); + const int channels_trans = no_trans ? 2 : trans.dim_size(1); + const int num_bbox = bbox.dim_size(0); + const int num_rois = num_bbox; + const int pooled_height = pool_size; + const int pooled_width = pool_size; + const int count = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = + no_trans ? output_dim : output_dim / num_classes; + Tensor *in_grad = nullptr; + Tensor *trans_grad = nullptr; + const TensorShape &in_grad_shape = data.shape(); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in_grad_shape, &in_grad)); + TensorShape trans_grad_shape; + const Type *top_diff = out_grad.flat().data(); + const Type *bottom_data = data.flat().data(); + const Type *bottom_rois = bbox.flat().data(); + trans_grad_shape = trans.shape(); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, trans_grad_shape, &trans_grad)); + const Type *bottom_trans = no_trans ? nullptr : trans.flat().data(); + Type *bottom_data_diff = in_grad->flat().data(); + Type *bottom_trans_diff = + no_trans ? 
nullptr : trans_grad->flat().data(); + const Type *top_count_data = top_count.flat().data(); + const Device &d = ctx->eigen_device(); + DeformablePSROIPoolBackwardKernel()( + d, count, top_diff, top_count_data, num_rois, spatial_scale, channels, + height, width, pooled_height, pooled_width, output_dim, + bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois, + bottom_trans, no_trans, trans_std, sample_per_part, group_size, + part_size, num_classes, channels_each_class); + } + + private: + int pool_size; + int no_trans; + float spatial_scale; + int output_dim; + int group_size; + int part_size; + int sample_per_part; + float trans_std; +}; + +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformableConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + DeformableConv2DOp); \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformableConv2DBackProp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + DeformableConv2DBackPropOp); +REGISTER_CPU(float); +REGISTER_CPU(double); +#undef REGISTER_CPU +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformablePsroiPool") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + DeformablePSROIPoolOp); \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformablePsroiPoolBackProp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + DeformablePSROIPoolBackPropOp); +REGISTER_CPU(float); +REGISTER_CPU(double); +#undef REGISTER_CPU + +#ifdef GOOGLE_CUDA +#define REGISTER_GPU(T) \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformableConv2D") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T"), \ + DeformableConv2DOp); \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformableConv2DBackProp") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T"), \ + DeformableConv2DBackPropOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#undef REGISTER_GPU +#define REGISTER_GPU(T) \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformablePsroiPool") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T"), \ + DeformablePSROIPoolOp); \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformablePsroiPoolBackProp") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T"), \ + DeformablePSROIPoolBackPropOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif + +} // namespace functor +} // namespace addons +} // namespace tensorflow diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h new file mode 100755 index 0000000000..04413f1f60 --- /dev/null +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h @@ -0,0 +1,317 @@ + +#ifndef TF_OPS_DEFORMABLE_CONV2D_H +#define TF_OPS_DEFORMABLE_CONV2D_H + +#ifdef __JETBRAINS_IDE__ +#define __host__ +#define __device__ +#define __shared__ +#define __constant__ +#define __global__ + +// This is slightly mental, but gets it to properly index device function calls +// like __popc and whatever. +//#define __CUDACC__ + +// These headers are all implicitly present when you compile CUDA with clang. +// Clion doesn't know that, so we include them explicitly to make the indexer +// happy. 
Doing this when you actually build is, obviously, a terrible idea :D +//#include <__clang_cuda_builtin_vars.h> +//#include <__clang_cuda_intrinsics.h> +//#include <__clang_cuda_math_forward_declares.h> +//#include <__clang_cuda_complex_builtins.h> +//#include <__clang_cuda_cmath.h> +#endif // __JETBRAINS_IDE__ + +#define EIGEN_USE_THREADS +#define EIGEN_USE_GPU + +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace addons { + +namespace functor { + +using TShape = std::vector; + +typedef Eigen::GpuDevice GPUDevice; +typedef Eigen::ThreadPoolDevice CPUDevice; + +inline int ProdShape(const TShape &shape, int start, int end) { + int res = 1; + for (int i = start; i < end; ++i) { + res *= shape[i]; + } + return res; +} +inline int ProdShape(const TensorShape &shape, int start, int end) { + int res = 1; + for (int i = start; i < end; ++i) { + res *= shape.dim_size(i); + } + return res; +} + +template +struct PureAddTo { + void operator()(const Device &d, const int n, DType *result_data, + const DType *right_data); +}; +struct DeformableConv2DParameters { + TShape dilations; + TShape strides; + Padding padding; + int32_t num_groups; + int32_t deformable_groups; + int32_t im2col_step; + bool no_bias; + TensorFormat data_format; +}; +struct DeformableConv2DDimensions { + int batch; + int input_rows; + int input_cols; + int in_depth; + int filter_rows; + int filter_cols; + int patch_depth; + int out_depth; + int stride_rows; + int stride_cols; + int dilation_rows; + int dilation_cols; + int out_rows; + int out_cols; + int pad_rows; + int pad_cols; +}; +template +struct LaunchBatchMatMul; + +template +struct DeformableConv2DCol2ImCoord { + void operator()(const Device &d, const DType *data_col, const DType *data_im, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *grad_offset, + DType *grad_mask); +}; +template +struct SwapAxis { + void operator()(const Device &d, DType *input_data, + const TShape &origin_shape, const int axis_x, + const int axis_y); +}; +template +struct DeformableConv2DCol2Im { + void operator()(const Device &d, const DType *data_col, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *grad_im); +}; +template +struct DeformableConv2DIm2Col { + void operator()(const Device &d, const DType *data_im, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *data_col); +}; +template +struct SetZeros { + void operator()(const Device &d, int n, DType *result_data); +}; +template +struct SetOne { + void operator()(const Device &d, int n, DType *result_data); +}; +template +struct SetNumAtIndex { + void operator()(const Device &d, DType num, int index, DType *data); +}; 
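+// Usage sketch (for orientation only; it mirrors the calls made in
+// deformable_conv_op.cc, assuming the shape vectors produced there by
+// LayerSetUp):
+//
+//   DeformableConv2DIm2Col<Device, T>()(
+//       d, in_data_ptr, offset_ptr, mask_ptr, ToVector(input_shape),
+//       ToVector(col_buffer_shape), SubVector(filter_shape, 2, 4), pads,
+//       SubVector(params_.strides, 2, 4), SubVector(params_.dilations, 2, 4),
+//       params_.deformable_groups, col_buffer_ptr);
+//
+// The declarations guarded by GOOGLE_CUDA below are the GPU specializations;
+// the Eigen::ThreadPoolDevice declarations that follow them are the CPU
+// counterparts.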
+#ifdef GOOGLE_CUDA +template +struct DeformableConv2DIm2Col { + void operator()(const Eigen::GpuDevice &d, const DType *data_im, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *data_col); +}; +template +struct DeformableConv2DCol2Im { + void operator()(const Eigen::GpuDevice &d, const DType *data_col, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *grad_im); +}; +template +struct DeformableConv2DCol2ImCoord { + void operator()(const Eigen::GpuDevice &d, const DType *data_col, + const DType *data_im, const DType *data_offset, + const DType *data_mask, const TShape &im_shape, + const TShape &col_shape, const TShape &kernel_shape, + const TShape &pad, const TShape &stride, + const TShape &dilation, const int32_t deformable_group, + DType *grad_offset, DType *grad_mask); +}; +template +struct SetNumAtIndex { + void operator()(const Eigen::GpuDevice &d, DType num, int index, DType *data); +}; +template +struct SetZeros { + void operator()(const Eigen::GpuDevice &d, int n, DType *result_data); +}; +template +struct SetOne { + void operator()(const Eigen::GpuDevice &d, int n, DType *result_data); +}; +template +struct PureAddTo { + void operator()(const Eigen::GpuDevice &d, const int n, DType *result_data, + const DType *right_data); +}; +#endif +template +struct DeformableConv2DIm2Col { + void operator()(const Eigen::ThreadPoolDevice &d, const DType *data_im, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *data_col); +}; +template +struct DeformableConv2DCol2Im { + void operator()(const Eigen::ThreadPoolDevice &d, const DType *data_col, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *grad_im); +}; +template +struct DeformableConv2DCol2ImCoord { + void operator()(const Eigen::ThreadPoolDevice &d, const DType *data_col, + const DType *data_im, const DType *data_offset, + const DType *data_mask, const TShape &im_shape, + const TShape &col_shape, const TShape &kernel_shape, + const TShape &pad, const TShape &stride, + const TShape &dilation, const int32_t deformable_group, + DType *grad_offset, DType *grad_mask); +}; +template +struct SetNumAtIndex { + void operator()(const Eigen::ThreadPoolDevice &d, DType num, int index, + DType *data); +}; +template +struct SetZeros { + void operator()(const Eigen::ThreadPoolDevice &d, int n, DType *result_data); +}; +template +struct SetOne { + void operator()(const Eigen::ThreadPoolDevice &d, int n, DType *result_data); +}; +template +struct PureAddTo { + void operator()(const Eigen::ThreadPoolDevice &d, const int n, + DType *result_data, const DType *right_data); +}; + +template +struct LaunchBatchMatMul { + static void launch(OpKernelContext *context, const TensorShape &in_x_shape, + const TensorShape &in_y_shape, const T *in_x_ptr, + const T *in_y_ptr, bool adj_x, bool adj_y, T *out); +}; +template 
+struct LaunchBatchMatMul { + static void launch(OpKernelContext *context, const TensorShape &in_x_shape, + const TensorShape &in_y_shape, const T *in_x_ptr, + const T *in_y_ptr, bool adj_x, bool adj_y, T *out); +}; + +template +struct DeformablePSROIPoolForward {}; +template +struct DeformablePSROIPoolForward { + void operator()(const CPUDevice &d, const int count, const T *bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, + const int pooled_width, const T *bottom_rois, + const T *bottom_trans, const int no_trans, const T trans_std, + const int sample_per_part, const int output_dim, + const int group_size, const int part_size, + const int num_classes, const int channels_each_class, + T *top_data, T *top_count); +}; +template +struct DeformablePSROIPoolForward { + void operator()(const GPUDevice &d, const int count, const T *bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, + const int pooled_width, const T *bottom_rois, + const T *bottom_trans, const int no_trans, const T trans_std, + const int sample_per_part, const int output_dim, + const int group_size, const int part_size, + const int num_classes, const int channels_each_class, + T *top_data, T *top_count); +}; + +template +struct DeformablePSROIPoolBackwardKernel {}; + +template +struct DeformablePSROIPoolBackwardKernel { + void operator()(const GPUDevice &d, const int count, const T *top_diff, + const T *top_count, const int num_rois, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, T *bottom_data_diff, + T *bottom_trans_diff, const T *bottom_data, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, const T trans_std, + const int sample_per_part, const int group_size, + const int part_size, const int num_classes, + const int channels_each_class); +}; + +template +struct DeformablePSROIPoolBackwardKernel { + void operator()(const CPUDevice &d, const int count, const T *top_diff, + const T *top_count, const int num_rois, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, T *bottom_data_diff, + T *bottom_trans_diff, const T *bottom_data, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, const T trans_std, + const int sample_per_part, const int group_size, + const int part_size, const int num_classes, + const int channels_each_class); +}; +} // namespace functor +} // namespace addons +} // namespace tensorflow + +#endif // TF_OPS_DEFORMABLE_CONV2D_H diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc new file mode 100755 index 0000000000..d7f52aaa12 --- /dev/null +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc @@ -0,0 +1,995 @@ + +#include +#include + +#include "tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h" + +#ifdef GOOGLE_CUDA +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" +#endif + +namespace tensorflow { +namespace addons { + +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; +typedef Eigen::ThreadPoolDevice CPUDevice; + +template +__device__ DType DmcnIm2colBilinear(const DType *bottom_data, + const int data_width, 
const int height, + const int width, DType h, DType w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + DType lh = h - h_low; + DType lw = w - w_low; + DType hh = 1 - lh, hw = 1 - lw; + + DType v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; + DType v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + DType v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + DType v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + DType w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + DType val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} +template +__device__ DType DmcnGetGradientWeight(DType argmax_h, DType argmax_w, + const int h, const int w, + const int height, const int width) { + /* + * offset h, offset w, (h, w) coordinate + */ + if (argmax_h <= -1 || argmax_w <= -1 || argmax_h >= height || + argmax_w >= width) { + return 0; + } + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + DType weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} +template +__device__ DType DmcnGetCoordinateWeight(DType argmax_h, DType argmax_w, + const int height, const int width, + const DType *im_data, + const int data_width, + const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + DType weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +#ifdef GOOGLE_CUDA +template +__global__ void 
SwapAxisKernel(const int n, const int cuda_mem_size, + const int min_unit_size, DType *input_data, + const int dim_num, const int axis_x_dims, + const int axis_y_dims, const int axis_x, + const int axis_y) { + CUDA_1D_KERNEL_LOOP(index, n) { + DType *device_data = new DType[cuda_mem_size]; + DType *input_data_ptr = input_data + index * cuda_mem_size; + for (int j = 0; j < axis_y_dims; j++) { + for (int i = 0; i < axis_x_dims; i++) { + DType *temp_ptr = + input_data_ptr + (i * axis_x_dims + j) * min_unit_size; + DType *device_data_temp_ptr = + device_data + (j * axis_y_dims + i) * min_unit_size; + for (int k = 0; k < min_unit_size; k++) { + *(device_data_temp_ptr + k) = *(temp_ptr + k); + } + } + } + for (int i = 0; i < cuda_mem_size; i++) { + *(input_data_ptr + i) = *(device_data + i); + } + delete[] device_data; + } +} +template +__global__ void DeformableConv2DIm2ColKernel( + const int n, const DType *data_im, const DType *data_offset, + const DType *data_mask, const int height, const int width, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, DType *data_col) { + /* + * channel_per_deformable_group // 输入图通道数除以deformable_group的数量, + * //这里的batch_size代表的是im2col_step_, 一般就设为1了 + */ + CUDA_1D_KERNEL_LOOP(index, n) { + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + DType *data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const DType *data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const DType *data_offset_ptr = + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const DType *data_mask_ptr = + data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType mask = data_mask_ptr[data_mask_hw_ptr]; + DType val = static_cast(0); + const DType h_im = h_in + i * dilation_h + offset_h; + const DType w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} +template +__global__ void DeformablePSROIPoolForwardKernel( + const int count, const T *bottom_data, const T 
spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const T *bottom_rois, + const T *bottom_trans, const int no_trans, const T trans_std, + const int sample_per_part, const int output_dim, const int group_size, + const int part_size, const int num_classes, const int channels_each_class, + T *top_data, T *top_count) { + CUDA_1D_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = (T)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = (T)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = (T)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = (T)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, static_cast(0.1)); // avoid 0 + T roi_height = max(roi_end_h - roi_start_h, static_cast(0.1)); + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = + no_trans + ? (T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + T trans_y = + no_trans + ? (T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + T sum = 0; + int total = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + const T *offset_bottom_data = + bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ++ih) { + for (int iw = 0; iw < sample_per_part; ++iw) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = min(max(w, static_cast(0.)), static_cast(width - 1.)); + h = min(max(h, static_cast(0.)), static_cast(height - 1.)); + int c = (ctop * group_size + gh) * group_size + gw; + T val = DmcnIm2colBilinear(offset_bottom_data + c * height * width, w, + h, w, (T)height, (T)width); + sum += val; + total++; + } + } + top_data[index] = total == 0 ? 
(T)(0) : sum / total; + top_count[index] = total; + } +} +template +__global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, const T *top_diff, const T *top_count, const int num_rois, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int output_dim, T *bottom_data_diff, T *bottom_trans_diff, + const T *bottom_data, const T *bottom_rois, const T *bottom_trans, + const int no_trans, const T trans_std, const int sample_per_part, + const int group_size, const int part_size, const int num_classes, + const int channels_each_class) { + CUDA_1D_KERNEL_LOOP(index, count) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = (T)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = (T)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = (T)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = (T)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, static_cast(0.1)); // avoid 0 + T roi_height = max(roi_end_h - roi_start_h, static_cast(0.1)); + + // Compute w and h at bottom + T bin_size_h = roi_height / (T)(pooled_height); + T bin_size_w = roi_width / (T)(pooled_width); + + T sub_bin_size_h = bin_size_h / (T)(sample_per_part); + T sub_bin_size_w = bin_size_w / (T)(sample_per_part); + + int part_h = floor((T)(ph) / pooled_height * part_size); + int part_w = floor((T)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = + no_trans + ? (T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + T trans_y = + no_trans + ? 
(T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + + T wstart = (T)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = (T)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = + bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = + bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor((T)(pw)*group_size / pooled_width); + int gh = floor((T)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + CudaAtomicAdd( + offset_bottom_data_diff + bottom_index_base + y0 * width + x0, + q00 * diff_val); + CudaAtomicAdd( + offset_bottom_data_diff + bottom_index_base + y1 * width + x0, + q01 * diff_val); + CudaAtomicAdd( + offset_bottom_data_diff + bottom_index_base + y0 * width + x1, + q10 * diff_val); + CudaAtomicAdd( + offset_bottom_data_diff + bottom_index_base + y1 * width + x1, + q11 * diff_val); + + if (no_trans) { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - + U00 * (1 - dist_y)) * + trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - + U00 * (1 - dist_x)) * + trans_std * diff_val; + diff_y *= roi_height; + + CudaAtomicAdd( + bottom_trans_diff + + (((n * num_classes + class_id) * 2) * part_size + part_h) * + part_size + + part_w, + diff_x); + CudaAtomicAdd( + bottom_trans_diff + + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * + part_size + + part_w, + diff_y); + } + } + } +} +template +__global__ void DeformableConv2DCol2ImKernel( + const int n, const DType *data_col, const DType *data_offset, + const DType *data_mask, const int channels, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int deformable_group, const int height_col, const int width_col, + DType *grad_im) { + CUDA_1D_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / 
batch_size / kernel_w / kernel_h; + // compute the start and end of the output + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const DType *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const DType *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType mask = data_mask_ptr[data_mask_hw_ptr]; + const DType cur_inv_h_data = h_in + i * dilation_h + offset_h; + const DType cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const DType cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + DType weight = + DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, + cur_w + dx, height, width); + CudaAtomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} +template +__global__ void DeformableConv2DCol2ImCoordGPUKernel( + const int n, const DType *data_col, const DType *data_im, + const DType *data_offset, const DType *data_mask, const int channels, + const int height, const int width, // 输入的C, H, W + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, DType *grad_offset, + DType *grad_mask) { + CUDA_1D_KERNEL_LOOP(index, n) { + DType val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const DType *data_col_ptr = + data_col + deformable_group_index * channel_per_deformable_group * + batch_size * width_col * height_col; + const DType *data_im_ptr = + data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * + height * width; + const DType *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const DType *data_mask_ptr = + data_mask + (b * deformable_group + 
deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const int data_mask_hw_ptr = + (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType mask = data_mask_ptr[data_mask_hw_ptr]; + DType inv_h = h_in + i * dilation_h + offset_h; + DType inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -2; + } else { + mval += data_col_ptr[col_pos] * + DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width, + height, width, inv_h, inv_w); + } + const DType weight = DmcnGetCoordinateWeight( + inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + + grad_offset[index] = val; + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + if (offset_c % 2 == 0) { + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * + kernel_w + + offset_c / 2) * + height_col + + h) * + width_col + + w] = mval; + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + + // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * + // height_col + h) * width_col + w], mask_req, mval); + } + } +} +template +__global__ void PureAddToKernel(const int n, DType *result_data, + const DType *right_data) { + CUDA_1D_KERNEL_LOOP(index, n) { + CudaAtomicAdd(result_data + index, right_data[index]); + } +} +template +__global__ void SetZeroKernel(const int n, DType *result_data) { + CUDA_1D_KERNEL_LOOP(index, n) { *(result_data + index) = DType(0); } +} +template +__global__ void SetOneKernel(const int n, DType *result_data) { + CUDA_1D_KERNEL_LOOP(index, n) { *(result_data + index) = DType(1); } +} +template +__global__ void SetNumAtIndexKernel(DType num, int index, DType *data) { + *(data + index) = num; +} +template +void DeformableConv2DCol2ImCoord::operator()( + const Eigen::GpuDevice &d, const DType *data_col, const DType *data_im, + const DType *data_offset, const DType *data_mask, const TShape &im_shape, + const TShape &col_shape, const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *grad_offset, DType *grad_mask) { + int num_spatial_axes = kernel_shape.size(); + int num_kernels = col_shape[1] * col_shape[2] * col_shape[3] * 2 * + kernel_shape[0] * kernel_shape[1] * deformable_group; + int channel_per_deformable_group = col_shape[0] / deformable_group; + // num_axes should be smaller than block size + CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); + 
CHECK_LT(num_spatial_axes, config.thread_per_block); + switch (num_spatial_axes) { + case 2: + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // NOLINT_NEXT_LINE(whitespace/operators) + + DeformableConv2DCol2ImCoordGPUKernel + <<>>( + num_kernels, data_col, data_im, data_offset, data_mask, + im_shape[1], im_shape[2], im_shape[3], kernel_shape[0], + kernel_shape[1], pad[0], pad[1], stride[0], stride[1], + dilation[0], dilation[1], channel_per_deformable_group, + col_shape[1], + 2 * kernel_shape[0] * kernel_shape[1] * deformable_group, + deformable_group, col_shape[2], col_shape[3], grad_offset, + grad_mask); + // MSHADOW_CUDA_POST_KERNEL_CHECK(DeformableConv2DCol2ImCoordGPUKernel); + break; + default: + LOG(FATAL) << "col2im_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} +template +void DeformableConv2DCol2Im::operator()( + const GPUDevice &d, const DType *data_col, const DType *data_offset, + const DType *data_mask, const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, const TShape &stride, + const TShape &dilation, const int32_t deformable_group, DType *grad_im) { + int num_spatial_axes = kernel_shape.size(); + int im_size = ProdShape(im_shape, 1, im_shape.size()); + int channel_per_deformable_group = im_shape[1] / deformable_group; + int num_kernels = ProdShape(col_shape, 0, col_shape.size()); + // num_axes should be smaller than block size + CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); + CHECK_LT(num_spatial_axes, config.thread_per_block); + // using namespace mxnet_op; + switch (num_spatial_axes) { + case 2: + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
+ // NOLINT_NEXT_LINE(whitespace/operators) + DeformableConv2DCol2ImKernel + <<>>( + num_kernels, data_col, data_offset, data_mask, im_shape[1], + im_shape[2], im_shape[3], kernel_shape[0], kernel_shape[1], + pad[0], pad[1], stride[0], stride[1], dilation[0], dilation[1], + channel_per_deformable_group, col_shape[1], deformable_group, + col_shape[2], col_shape[3], grad_im); + // MSHADOW_CUDA_POST_KERNEL_CHECK(modulated_deformable_col2im_gpu_kernel); + break; + default: + LOG(FATAL) << "col2im_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + +template +void DeformableConv2DIm2Col::operator()( + const GPUDevice &d, const DType *data_im, const DType *data_offset, + const DType *data_mask, const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, const TShape &stride, + const TShape &dilation, const int32_t deformable_group, DType *data_col) { + int num_spatial_axes = kernel_shape.size(); + int channel_per_deformable_group = + im_shape[1] / deformable_group; // imshape[1] = 输入图的通道数 + int num_kernels = + im_shape[1] * + ProdShape(col_shape, 1, + col_shape.size()); // K * N / k.Size(), k = filter, col_shape = + // [K, im2col_step_, H, W] + CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); + CHECK_LT(num_spatial_axes, config.thread_per_block); + switch (num_spatial_axes) { + case 2: + DeformableConv2DIm2ColKernel< + DType> // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + // CUDA对device(GPU + // )的内存管理主要通过cudaMalloc()、cudaFree()、cudaMemcpy() + // 进行管理。另外,从上述代码我们可以看到, add() + // 函数的调用比较奇怪相对于C语言来说,需要用add<<>> + // 这种形式表明这是一个从host(CPU)代码调用device的代码, + //并且括号中的数值表明,M个block,每个block有 N个线程, + //所以这个函数总共有M*N个线程。 + num_kernels, data_im, data_offset, data_mask, im_shape[2], + im_shape[3], kernel_shape[0], kernel_shape[1], pad[0], pad[1], + stride[0], stride[1], dilation[0], dilation[1], + channel_per_deformable_group, col_shape[1], im_shape[1], + deformable_group, col_shape[2], col_shape[3], data_col); + // MSHADOW_CUDA_POST_KERNEL_CHECK(modulated_deformable_im2col_gpu_kernel); + break; + default: + LOG(FATAL) << "im2col_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + +template +void SetZeros::operator()(const GPUDevice &d, int n, + DType *result_data) { + CudaLaunchConfig config = GetCudaLaunchConfig(n, d); + SetZeroKernel + <<>>( + n, result_data); +} + +template +void PureAddTo::operator()(const GPUDevice &d, const int n, + DType *result_data, + const DType *right_data) { + CudaLaunchConfig config = GetCudaLaunchConfig(n, d); + PureAddToKernel + <<>>( + n, result_data, right_data); +} + +template +void SetOne::operator()(const GPUDevice &d, int n, + DType *result_data) { + CudaLaunchConfig config = GetCudaLaunchConfig(n, d); + SetOneKernel + <<>>( + n, result_data); +} + +template +void SetNumAtIndex::operator()(const GPUDevice &d, DType num, + int index, DType *data) { + CudaLaunchConfig config = GetCudaLaunchConfig(1, d); + SetNumAtIndexKernel + <<>>( + num, index, data); +} + +// 如果没有在这里实例化的话, 生成的.so会报类似于 undefined symbol: +// _ZN10tensorflow13setNumAtIndexIN5Eigen9GpuDeviceEfEclERKS2_fiPf的错误 I guess +// the reason for instancing the functional structure below is that certifying +// single functor instance for every functor. 
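+// The explicit instantiations below make nvcc emit the GPU definitions of
+// these functors for float and double in this translation unit. The op
+// kernels in deformable_conv_op.cc are compiled separately and only see the
+// declarations in deformable_conv_op.h, so without these instantiations
+// loading the shared library fails with an "undefined symbol" error like the
+// one quoted above.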
+template struct DeformableConv2DIm2Col; +template struct DeformableConv2DCol2Im; +template struct DeformableConv2DCol2ImCoord; +template struct PureAddTo; +template struct SetOne; +template struct SetZeros; +template struct SwapAxis; +template struct SetNumAtIndex; + +template struct DeformableConv2DIm2Col; +template struct DeformableConv2DCol2Im; +template struct DeformableConv2DCol2ImCoord; +template struct PureAddTo; +template struct SetOne; +template struct SetZeros; +template struct SwapAxis; +template struct SetNumAtIndex; +template +se::DeviceMemory AsDeviceMemory(const T *cuda_memory) { + se::DeviceMemoryBase wrapped(const_cast(cuda_memory)); + se::DeviceMemory typed(wrapped); + return typed; +} + +class CublasScratchAllocator : public se::ScratchAllocator { + public: + using Stream = se::Stream; + using DeviceMemoryBytes = se::DeviceMemory; + + CublasScratchAllocator(OpKernelContext *context) : context_(context) {} + + int64 GetMemoryLimitInBytes() override { return -1; } + + se::port::StatusOr AllocateBytes( + int64 byte_size) override { + Tensor temporary_memory; + + Status allocation_status(context_->allocate_temp( + DT_UINT8, TensorShape({byte_size}), &temporary_memory)); + if (!allocation_status.ok()) { + return se::port::StatusOr( + DeviceMemoryBytes::MakeFromByteSize(nullptr, 0)); + } + // Hold the reference of the allocated tensors until the end of the + // allocator. + allocated_tensors_.push_back(temporary_memory); + return se::port::StatusOr( + DeviceMemoryBytes::MakeFromByteSize( + temporary_memory.flat().data(), + temporary_memory.flat().size())); + } + + se::port::StatusOr AllocateBytes(Stream *stream, + int64 byte_size) { + Tensor temporary_memory; + + Status allocation_status(context_->allocate_temp( + DT_UINT8, TensorShape({byte_size}), &temporary_memory)); + if (!allocation_status.ok()) { + return se::port::StatusOr( + DeviceMemoryBytes::MakeFromByteSize(nullptr, 0)); + } + // Hold the reference of the allocated tensors until the end of the + // allocator. + allocated_tensors_.push_back(temporary_memory); + return se::port::StatusOr( + DeviceMemoryBytes::MakeFromByteSize( + temporary_memory.flat().data(), + temporary_memory.flat().size())); + } + + private: + OpKernelContext *context_; + std::vector allocated_tensors_; +}; + +template +void LaunchBatchMatMul::launch( + OpKernelContext *context, const TensorShape &in_x_shape, + const TensorShape &in_y_shape, const Scalar *in_x_ptr, + const Scalar *in_y_ptr, bool adj_x, bool adj_y, Scalar *out) { + constexpr se::blas::Transpose kTranspose = + is_complex::value ? se::blas::Transpose::kConjugateTranspose + : se::blas::Transpose::kTranspose; + se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose, kTranspose}; + + const uint64 m = in_x_shape.dim_size(adj_x ? 2 : 1); + const uint64 k = in_x_shape.dim_size(adj_x ? 1 : 2); + const uint64 n = in_y_shape.dim_size(adj_y ? 
1 : 2); + const uint64 batch_size = in_x_shape.dim_size(0); + auto blas_transpose_a = trans[adj_x]; + auto blas_transpose_b = trans[adj_y]; + + auto *stream = context->op_device_context()->stream(); + OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); + + typedef se::DeviceMemory DeviceMemoryType; + std::vector a_device_memory; + std::vector b_device_memory; + std::vector c_device_memory; + std::vector a_ptrs; + std::vector b_ptrs; + std::vector c_ptrs; + a_device_memory.reserve(batch_size); + b_device_memory.reserve(batch_size); + c_device_memory.reserve(batch_size); + a_ptrs.reserve(batch_size); + b_ptrs.reserve(batch_size); + c_ptrs.reserve(batch_size); + auto *a_base_ptr = in_x_ptr; + auto *b_base_ptr = in_y_ptr; + auto *c_base_ptr = out; + for (int64 i = 0; i < batch_size; ++i) { + a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k)); + b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n)); + c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n)); + a_ptrs.push_back(&a_device_memory.back()); + b_ptrs.push_back(&b_device_memory.back()); + c_ptrs.push_back(&c_device_memory.back()); + } + + typedef Scalar Coefficient; + + // Cublas does + // C = A x B + // where A, B and C are assumed to be in column major. + // We want the output to be in row-major, so we can compute + // C' = B' x A', where ' stands for transpose (not adjoint). + // TODO(yangzihao): Choose the best of the three strategies using autotune. + if (batch_size == 1) { + // This is a regular matrix*matrix or matrix*vector multiply. Avoid the + // overhead of the scratch allocator and the batch interface. + if (n == 1 && + blas_transpose_b != se::blas::Transpose::kConjugateTranspose && + blas_transpose_a != se::blas::Transpose::kConjugateTranspose) { + // This is a matrix*vector multiply so use GEMV to compute A * b. + // Here we are multiplying in the natural order, so we have to flip + // the transposition flag to compensate for the tensor being stored + // row-major. Since GEMV doesn't provide a way to just conjugate an + // argument, we have to defer those cases to GEMM below. + auto gemv_trans_a = blas_transpose_a == se::blas::Transpose::kTranspose + ? se::blas::Transpose::kNoTranspose + : se::blas::Transpose::kTranspose; + bool blas_launch_status = + stream + ->ThenBlasGemv(gemv_trans_a, adj_x ? m : k, adj_x ? k : m, + static_cast(1.0), *(a_ptrs[0]), + adj_x ? m : k, *(b_ptrs[0]), 1, + static_cast(0.0), c_ptrs[0], 1) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas xGEMV launch failed : a.shape=", in_x_shape.DebugString(), + ", b.shape=", in_y_shape.DebugString(), ", m=", m, ", n=", n, + ", k=", k)); + } + } else { + bool blas_launch_status = + stream + ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, + static_cast(1.0), *(b_ptrs[0]), + adj_y ? k : n, *(a_ptrs[0]), adj_x ? m : k, + static_cast(0.0), c_ptrs[0], n) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas xGEMM launch failed : a.shape=", in_x_shape.DebugString(), + ", b.shape=", in_y_shape.DebugString(), ", m=", m, ", n=", n, + ", k=", k)); + } + } + } else { + CublasScratchAllocator scratch_allocator(context); + bool blas_launch_status = + stream + ->ThenBlasGemmBatchedWithScratch( + blas_transpose_b, blas_transpose_a, n, m, k, + static_cast(1.0), b_ptrs, adj_y ? k : n, a_ptrs, + adj_x ? 
m : k, static_cast(0.0), c_ptrs, n, + batch_size, &scratch_allocator) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas xGEMMBatched launch failed : a.shape=", + in_x_shape.DebugString(), ", b.shape=", in_y_shape.DebugString(), + ", m=", m, ", n=", n, ", k=", k, ", batch_size=", batch_size)); + } + } +} +template +void DeformablePSROIPoolForward::operator()( + const GPUDevice &d, const int count, const T *bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, const int no_trans, + const T trans_std, const int sample_per_part, const int output_dim, + const int group_size, const int part_size, const int num_classes, + const int channels_each_class, T *top_data, T *top_count) { + auto config = GetCudaLaunchConfig(count, d); + DeformablePSROIPoolForwardKernel + <<>>( + count, bottom_data, spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, output_dim, group_size, part_size, + num_classes, channels_each_class, top_data, top_count); +} +template +void DeformablePSROIPoolBackwardKernel::operator()( + const GPUDevice &d, const int count, const T *top_diff, const T *top_count, + const int num_rois, const T spatial_scale, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int output_dim, T *bottom_data_diff, + T *bottom_trans_diff, const T *bottom_data, const T *bottom_rois, + const T *bottom_trans, const int no_trans, const T trans_std, + const int sample_per_part, const int group_size, const int part_size, + const int num_classes, const int channels_each_class) { + auto config = GetCudaLaunchConfig(count, d); + DeformablePSROIPoolBackwardAccKernel + <<>>( + count, top_diff, top_count, num_rois, spatial_scale, channels, height, + width, pooled_height, pooled_width, output_dim, bottom_data_diff, + bottom_trans_diff, bottom_data, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, group_size, part_size, num_classes, + channels_each_class); +} +template struct LaunchBatchMatMul; +template struct LaunchBatchMatMul; +template struct DeformablePSROIPoolForward; +template struct DeformablePSROIPoolForward; +template struct DeformablePSROIPoolBackwardKernel; +template struct DeformablePSROIPoolBackwardKernel; +#endif +} // namespace functor +} // namespace addons +} // namespace tensorflow diff --git a/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc b/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc new file mode 100644 index 0000000000..c9ec963b13 --- /dev/null +++ b/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc @@ -0,0 +1,287 @@ +// +// Created by 孙嘉禾 on 2019/12/31. 
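+// This file registers the deformable conv2d and deformable PS-ROI pooling ops +// together with their shape inference functions.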
+// + +#include +#include +#include +#include + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +namespace addons { + +namespace functor { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +REGISTER_OP("AddonsDeformableConv2D") + .Input("input: T") + .Input("filter: T") + .Input("offset: T") + .Input("mask: T") + .Output("output: T") + .Attr("T: {float, double}") + .Attr("strides: list(int)") + // .Attr("use_cudnn_on_gpu: bool = true") + .Attr("num_groups: int") + .Attr("deformable_groups: int") + .Attr("im2col_step: int") + .Attr("no_bias: bool = true") + .Attr(GetPaddingAttrString()) + .Attr("data_format: {'NCHW' } = 'NCHW' ") + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext *c) { + ShapeHandle input_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape)); + ShapeHandle filter_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &filter_shape)); + ShapeHandle offset_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 4, &offset_shape)); + ShapeHandle mask_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 4, &mask_shape)); + + std::vector strides; + TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); + if (strides.size() != 4) { + return errors::InvalidArgument( + "Deformconv requires the stride attribute to contain 4 values, but " + "got: ", + strides.size()); + } + + std::vector rates; + TF_RETURN_IF_ERROR(c->GetAttr("dilations", &rates)); + if (rates.size() != 4) { + return errors::InvalidArgument( + "Deformconv requires the dilations attribute to contain 4 values, " + "but " + "got: ", + rates.size()); + } + string data_format; + TensorFormat data_format_; + TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format)); + FormatFromString(data_format, &data_format_); + const int32 stride_rows = GetTensorDim(strides, data_format_, 'H'); + const int32 stride_cols = GetTensorDim(strides, data_format_, 'W'); + + const int32 rate_rows = GetTensorDim(rates, data_format_, 'H'); + const int32 rate_cols = GetTensorDim(rates, data_format_, 'W'); + + int groups; + TF_RETURN_IF_ERROR(c->GetAttr("num_groups", &groups)); + int deform_groups; + TF_RETURN_IF_ERROR(c->GetAttr("deformable_groups", &deform_groups)); + + DimensionHandle batch_size_dim = c->Dim(input_shape, 0); + DimensionHandle in_depths_dim = c->Dim(input_shape, 1); + DimensionHandle in_rows_dim = c->Dim(input_shape, 2); + DimensionHandle in_cols_dim = c->Dim(input_shape, 3); + DimensionHandle filter_rows_dim = c->Dim(filter_shape, 2); + DimensionHandle filter_cols_dim = c->Dim(filter_shape, 3); + DimensionHandle filter_depth_dim = c->Dim(filter_shape, 1); + DimensionHandle output_depth_dim = c->Dim(filter_shape, 0); + DimensionHandle multiplied_depth; + DimensionHandle depth_per_dfgps; + auto filter_row = c->Value(filter_rows_dim); + auto filter_col = c->Value(filter_cols_dim); + auto offset_dpt = c->Value(c->Dim(offset_shape, 1)); + if ((offset_dpt % (filter_row * filter_col) != 0) || + (offset_dpt / (2 * filter_row * filter_col) != deform_groups)) { + return errors::InvalidArgument( + "Deformconv requires the offset compatible with filter, but " + "got: ", + c->DebugString(offset_shape)); + } + + auto mask_dpt = c->Value(c->Dim(mask_shape, 1)); + if ((mask_dpt % (filter_row * filter_col) != 0) || + (mask_dpt / (filter_row * filter_col) != deform_groups)) { + return 
errors::InvalidArgument( + "Deformconv requires the mask compatible with filter, but " + "got: ", + c->DebugString(offset_shape)); + } + + TF_RETURN_IF_ERROR( + c->Multiply(filter_depth_dim, groups, &multiplied_depth)); + TF_RETURN_IF_ERROR( + c->Divide(filter_depth_dim, deform_groups, true, &depth_per_dfgps)); + TF_RETURN_IF_ERROR( + c->Divide(in_depths_dim, deform_groups, true, &depth_per_dfgps)); + + if (!c->ValueKnown(in_rows_dim) || !c->ValueKnown(in_cols_dim) || + !c->ValueKnown(filter_rows_dim) || !c->ValueKnown(filter_cols_dim)) { + ShapeHandle output_shape = c->MakeShape( + {batch_size_dim, output_depth_dim, InferenceContext::kUnknownDim, + InferenceContext::kUnknownDim}); + c->set_output(0, output_shape); + return Status::OK(); + } + DimensionHandle unused; + TF_RETURN_IF_ERROR( + c->Merge(c->Dim(input_shape, 1), multiplied_depth, &unused)); + + auto in_rows = c->Value(in_rows_dim); + auto in_cols = c->Value(in_cols_dim); + auto filter_rows = c->Value(filter_rows_dim); + auto filter_cols = c->Value(filter_cols_dim); + auto filter_rows_eff = filter_rows + (filter_rows - 1) * (rate_rows - 1); + auto filter_cols_eff = filter_cols + (filter_cols - 1) * (rate_cols - 1); + + Padding padding; + TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding)); + + int64 output_rows, output_cols; + int64 padding_before, padding_after; + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_rows, filter_rows_eff, stride_rows, padding, &output_rows, + &padding_before, &padding_after)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_cols, filter_cols_eff, stride_cols, padding, &output_cols, + &padding_before, &padding_after)); + + ShapeHandle output_shape = c->MakeShape( + {batch_size_dim, output_depth_dim, output_rows, output_cols}); + c->set_output(0, output_shape); + // shape_inference::ShapeHandle offset_shape = c->input(2); + // shape_inference::ShapeHandle mask_shape = c->input(3); + shape_inference::DimensionHandle offset_batch = c->Dim(offset_shape, 0); + shape_inference::DimensionHandle offset_channel = c->Dim(offset_shape, 1); + shape_inference::DimensionHandle offset_height = c->Dim(offset_shape, 2); + shape_inference::DimensionHandle offset_weight = c->Dim(offset_shape, 3); + shape_inference::DimensionHandle mask_channel = c->Dim(mask_shape, 1); + shape_inference::DimensionHandle mask_height = c->Dim(mask_shape, 2); + shape_inference::DimensionHandle mask_weight = c->Dim(mask_shape, 3); + shape_inference::DimensionHandle mask_batch = c->Dim(mask_shape, 0); + TF_RETURN_IF_ERROR(c->WithRank(offset_shape, 4, &offset_shape)); + TF_RETURN_IF_ERROR(c->WithRank(mask_shape, 4, &mask_shape)); + TF_RETURN_IF_ERROR( + c->WithValue(offset_batch, c->Value(batch_size_dim), &offset_batch)); + TF_RETURN_IF_ERROR(c->WithValue( + offset_channel, + 2 * c->Value(filter_rows_dim) * c->Value(filter_cols_dim), + &offset_channel)); + TF_RETURN_IF_ERROR( + c->WithValue(offset_height, output_rows, &offset_height)); + TF_RETURN_IF_ERROR( + c->WithValue(offset_weight, output_cols, &offset_weight)); + TF_RETURN_IF_ERROR( + c->WithValue(mask_batch, c->Value(batch_size_dim), &mask_batch)); + TF_RETURN_IF_ERROR(c->WithValue( + mask_channel, c->Value(filter_rows_dim) * c->Value(filter_cols_dim), + &mask_channel)); + TF_RETURN_IF_ERROR(c->WithValue(mask_height, output_rows, &mask_height)); + TF_RETURN_IF_ERROR(c->WithValue(mask_weight, output_cols, &mask_weight)); + return Status::OK(); + }) + .Doc(R"doc( + DeformableConv2D is a new convolution operation with the deformable kernel locations. 
+ The inputs should have format NCHW, which is faster on GPUs. + The offset and mask should have the same spatial resolution as the input. + Also, the output shape depends on the stride; only the case of dilation rate = 1 is considered. + )doc"); + +// OpKernel definition. +// The template parameter is the data type of the tensors. +// In my opinion, the deformable convolution op ought to be implemented by +// extending Conv2DOp; however, conv_ops.h is not available when the op is +// dynamically linked. + +REGISTER_OP("AddonsDeformableConv2DBackProp") + .Input("input: T") + .Input("filter: T") + .Input("offset: T") + .Input("mask: T") + .Input("out_grad: T") + .Output("x_grad: T") + .Output("filter_grad: T") + .Output("offset_grad: T") + .Output("mask_grad: T") + .Attr("T: {float, double}") + .Attr("strides: list(int)") + // .Attr("use_cudnn_on_gpu: bool = true") + .Attr("num_groups: int") + .Attr("deformable_groups: int") + .Attr("im2col_step: int") + .Attr("no_bias: bool = true") + .Attr(GetPaddingAttrString()) + .Attr("data_format: { 'NCHW' } = 'NCHW' ") + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext *c) { + c->set_output(0, c->input(0)); + c->set_output(1, c->input(1)); + c->set_output(2, c->input(2)); + c->set_output(3, c->input(3)); + return Status::OK(); + }) + .Doc(R"doc(only support NCHW now)doc"); + +REGISTER_OP("AddonsDeformablePsroiPool") + .Input("input: T") + .Input("bbox: T") + .Input("trans: T") + .Output("output: T") + .Output("top_count: T") + .Attr("T: {float, double}") + .Attr("pooled_size: int") + .Attr("no_trans: int") + .Attr("spatial_scale: float") + .Attr("output_dim: int") + .Attr("group_size: int") + .Attr("part_size: int") + .Attr("sample_per_part: int") + .Attr("trans_std: float") + .SetShapeFn([](InferenceContext *ctx) { + int pooled_size; + int output_dim; + TF_RETURN_IF_ERROR(ctx->GetAttr("pooled_size", &pooled_size)); + TF_RETURN_IF_ERROR(ctx->GetAttr("output_dim", &output_dim)); + auto input_handle = ctx->input(0); + auto batch = ctx->Dim(input_handle, 0); + auto output_dim_handle = ctx->MakeDim(output_dim); + auto pooled_size_handle = ctx->MakeDim(pooled_size); + ctx->set_output( + 0, ctx->MakeShape({batch, output_dim_handle, pooled_size_handle, + pooled_size_handle})); + ctx->set_output( + 1, ctx->MakeShape({batch, output_dim_handle, pooled_size_handle, + pooled_size_handle})); + return Status::OK(); + }) + .Doc( + R"doc(DeformablePsROIPool is a new pooling operation with the deformable +kernel locations.
The inputs should have format NCHW, which is faster on GPUs.)doc"); +REGISTER_OP("AddonsDeformablePsroiPoolBackProp") + .Input("data: T") + .Input("bbox: T") + .Input("trans: T") + .Input("top_count: T") + .Input("out_grad: T") + .Output("in_grad: T") + .Output("trans_grad: T") + .Attr("pooled_size: int") + .Attr("T: {float, double}") + .Attr("no_trans: int") + .Attr("spatial_scale: float") + .Attr("output_dim: int") + .Attr("group_size: int") + .Attr("part_size: int") + .Attr("sample_per_part: int") + .Attr("trans_std: float") + .SetShapeFn([](InferenceContext *ctx) { + ctx->set_output(0, ctx->input(0)); + ctx->set_output(1, ctx->input(2)); + return Status::OK(); + }) + .Doc(R"doc("BackProp operation for DeformablePSROIPool")doc"); + +} // namespace functor +} // namespace addons +} // namespace tensorflow diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index 2ee52d4f6b..5483f6a64f 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -6,6 +6,7 @@ py_library( name = "layers", srcs = [ "__init__.py", + "deformable_conv2d.py", "gelu.py", "maxout.py", "multihead_attention.py", @@ -20,6 +21,7 @@ py_library( ], data = [ "//tensorflow_addons/custom_ops/layers:_correlation_cost_ops.so", + "//tensorflow_addons/custom_ops/layers:_deformable_conv2d_ops.so", ], deps = [ "//tensorflow_addons/activations", @@ -156,6 +158,12 @@ py_test( "multihead_attention_test.py", ], main = "multihead_attention_test.py", + name = "deformable_conv_test", + size = "small", + srcs = [ + "deformable_conv2d_test.py", + ], + main = "deformable_conv2d_test.py", deps = [ ":layers", ], diff --git a/tensorflow_addons/layers/__init__.py b/tensorflow_addons/layers/__init__.py index 9606352b2d..733cb79c76 100644 --- a/tensorflow_addons/layers/__init__.py +++ b/tensorflow_addons/layers/__init__.py @@ -14,6 +14,7 @@ # ============================================================================== """Additional layers that conform to Keras API.""" + from tensorflow_addons.layers.gelu import GELU from tensorflow_addons.layers.maxout import Maxout from tensorflow_addons.layers.multihead_attention import MultiHeadAttention @@ -25,3 +26,5 @@ from tensorflow_addons.layers.sparsemax import Sparsemax from tensorflow_addons.layers.tlu import TLU from tensorflow_addons.layers.wrappers import WeightNormalization +from tensorflow_addons.layers.deformable_conv2d import DeformableConv2D +from tensorflow_addons.layers.deformable_conv2d import DeformablePSROIAlign diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py new file mode 100644 index 0000000000..948a48a80d --- /dev/null +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -0,0 +1,414 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# ============================================================================ +"""Tensorflow ops performing deformable convolution and deformable PS-ROI pooling.""" + +import tensorflow as tf +from typeguard import typechecked +from tensorflow_addons.utils.resource_loader import LazySO +from tensorflow.python.keras.utils import conv_utils + +# _deformable_conv2d_ops_so = LazySO("custom_ops/layers/_deformable_conv2d_ops.so") +_deformable_conv2d_ops_so = LazySO( + "/home/admin-seu/TempData/sss/custom_ops/deformable_conv2d_ops_new/deformable_conv2D.so" +) +# _deformable_conv2d_ops_so = LazySO("/home/admin-seu/TempData/sss/SoftWare/addons/bazel-bin/tensorflow_addons/custom_ops/layers/_deformable_conv2d_ops.so") +# _deformable_conv2d_ops_so = tf.load_op_library("custom_ops/layers/_deformable_conv2d_ops.so") + + +def _deformable_conv2d( + input, + filter, + offset, + mask, + strides=[1, 1, 1, 1], + num_groups=1, + deformable_groups=1, + im2col_step=1, + no_bias=True, + padding="VALID", + data_format="NHWC", + dilations=[1, 1, 1, 1], +): + if data_format == "NHWC": + input = tf.transpose(input, [0, 3, 1, 2]) + filter = tf.transpose(filter, [3, 2, 0, 1]) + offset = tf.transpose(offset, [0, 3, 1, 2]) + mask = tf.transpose(mask, [0, 3, 1, 2]) + ret = _deformable_conv2d_ops_so.ops.addons_deformable_conv2d( + input=input, + filter=filter, + offset=offset, + mask=mask, + strides=strides, + num_groups=num_groups, + deformable_groups=deformable_groups, + im2col_step=im2col_step, + no_bias=no_bias, + padding=padding, + data_format="NCHW", + dilations=dilations, + ) + if data_format == "NHWC": + return tf.transpose(ret, [0, 2, 3, 1]) + return ret + + +@tf.RegisterGradient("AddonsDeformableConv2D") +def _deformable_conv2d_back_prop(op, grad): + """The gradients for `deform_conv`. + Args: + op: The `deform_conv` `Operation` that we are differentiating, which we can use + to find the inputs and outputs of the original op. + grad: Gradient with respect to the output of the `deform_conv` op. + Returns: + Gradients with respect to the input of `deform_conv`.
+ """ + data = op.inputs[0] + filter = op.inputs[1] + offset = op.inputs[2] + mask = op.inputs[3] + """ + .Attr("strides: list(int)") + .Attr("num_groups: int") + .Attr("deformable_groups: int") + .Attr("im2col_step: int") + .Attr("no_bias: bool = true") + .Attr(GetPaddingAttrString()) + .Attr("data_format: {'NCHW' } = 'NCHW' ") + .Attr("dilations: list(int) = [1, 1, 1, 1]") + """ + strides = op.get_attr("strides") + dilations = op.get_attr("dilations") + data_format = op.get_attr("data_format") + im2col_step = op.get_attr("im2col_step") + no_bias = op.get_attr("no_bias") + pads = op.get_attr("padding") + num_groups = op.get_attr("num_groups") + deformable_groups = op.get_attr("deformable_groups") + """ + REGISTER_OP("Addons>DeformableConv2DBackProp") + .Input("input: T") + .Input("filter: T") + .Input("offset: T") + .Input("mask: T") + .Input("out_grad: T") + .Output("x_grad: T") + .Output("filter_grad: T") + .Output("offset_grad: T") + .Output("mask_grad: T") + .Attr("T: {float, double}") + .Attr("strides: list(int)") + .Attr("num_groups: int") + .Attr("deformable_groups: int") + .Attr("im2col_step: int") + .Attr("no_bias: bool = true") + .Attr(GetPaddingAttrString()) + .Attr("data_format: { 'NCHW' } = 'NCHW' ") + .Attr("dilations: list(int) = [1, 1, 1, 1]") + """ + # compute gradient + data_grad = _deformable_conv2d_ops_so.ops.addons_deformable_conv2d_back_prop( + data, + filter, + offset, + mask, + grad, + strides=strides, + num_groups=num_groups, + deformable_groups=deformable_groups, + im2col_step=im2col_step, + no_bias=no_bias, + padding=pads, + data_format=data_format, + dilations=dilations, + ) + return data_grad # List of 4 Tensor, since we have 4 input + + +# @tf.keras.utils.register_keras_serializable(package="Addons") +class DeformableConv2D(tf.keras.layers.Layer): + @typechecked + def __init__( + self, + filters: int, + kernel_size=(3, 3), + num_groups=1, + deformable_groups=1, + strides=(1, 1), + im2col=1, + use_bias=False, + padding="valid", + data_format="channels_last", + dilations=(1, 1), + ): + super(DeformableConv2D, self).__init__() + self.filters = filters + self.kernel_size = kernel_size + self.num_groups = num_groups + self.deformable_groups = deformable_groups + self.strides = strides + self.im2col = im2col + self.use_bias = use_bias + self.padding = padding + self.data_format = data_format + self.dilations = dilations + self.conv_offset = tf.keras.layers.Conv2D( + self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1], + kernel_size=self.kernel_size, + strides=(1, 1), + padding=self.padding, + use_bias=True, + data_format=data_format, + ) + + def build(self, input_shape): + if self.data_format == "channels_last": + channel = int(input_shape[-1]) + else: + channel = int(input_shape[1]) + if self.data_format == "channels_last": + self.filter = tf.Variable( + initial_value=tf.random.normal( + shape=[ + self.kernel_size[0], + self.kernel_size[1], + channel, + self.filters, + ] + ) + ) + else: + self.filter = tf.Variable( + initial_value=tf.random.normal( + shape=[ + self.filters, + channel, + self.kernel_size[0], + self.kernel_size[1], + ] + ) + ) + self.built = True + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_last": + space = input_shape[1:-1] + new_space = [] + for i in range(len(space)): + new_dim = conv_utils.conv_output_length( + space[i], + self.kernel_size[i], + padding=self.padding, + stride=self.strides[i], + dilation=self.dilation_rate[i], + ) + 
new_space.append(new_dim) + return tensor_shape.TensorShape( + [input_shape[0]] + new_space + [self.filters] + ) + else: + space = input_shape[2:] + new_space = [] + for i in range(len(space)): + new_dim = conv_utils.conv_output_length( + space[i], + self.kernel_size[i], + padding=self.padding, + stride=self.strides[i], + dilation=self.dilation_rate[i], + ) + new_space.append(new_dim) + return tf.TensorShape([input_shape[0], self.filters] + new_space) + + def call(self, inputs, **kwargs): + """ + Build static Graph + :param inputs: [B, Height, Width, Channel] + :param kwargs: + :return: + """ + weight_info = self.conv_offset(inputs) + tf_data_format = "NCHW" + tf_padding = "VALID" + if self.padding == "same": + tf_padding = "SAME" + if self.data_format == "channels_last": + tf_data_format = "NHWC" + o1, o2, mask = tf.split(weight_info, 3, axis=-1) + offset = tf.concat((o1, o2), axis=-1) + mask = tf.sigmoid(mask) + else: + o1, o2, mask = tf.split(weight_info, 3, axis=1) + offset = tf.concat((o1, o2), axis=1) + mask = tf.sigmoid(mask) + result = _deformable_conv2d( + input=inputs, + filter=self.filter, + offset=offset, + mask=mask, + strides=[1, self.strides[0], self.strides[1], 1], + num_groups=self.num_groups, + deformable_groups=self.deformable_groups, + im2col_step=self.im2col, + no_bias=(not self.use_bias), + padding=tf_padding, + data_format=tf_data_format, + dilations=[1, self.dilations[0], self.dilations[1], 1], + ) + return result + + def get_config(self): + config = { + "kernel_size": self.kernel_size, + "filters": self.filters, + "num_groups": self.num_groups, + "deformable_groups": self.deformable_groups, + "strides": self.strides, + "im2col": self.im2col, + "use_bias": self.use_bias, + "padding": self.padding, + "data_format": self.data_format, + "dilations": self.dilations, + } + base_config = super().get_config() + return {**base_config, **config} + + +@tf.RegisterGradient("AddonsDeformablePsroiPool") +def _deformable_psroi_pool_back_prop(op, *grad): + data = op.inputs[0] + bbox = op.inputs[1] + trans = op.inputs[2] + top_count = op.outputs[1] + pooled_size = op.get_attr("pooled_size") + no_trans = op.get_attr("no_trans") + spatial_scale = op.get_attr("spatial_scale") + output_dim = op.get_attr("output_dim") + group_size = op.get_attr("group_size") + part_size = op.get_attr("part_size") + sample_per_part = op.get_attr("sample_per_part") + trans_std = op.get_attr("trans_std") + data_grad = _deformable_conv2d_ops_so.ops.addons_deformable_psroi_pool_back_prop( + data, + bbox, + trans, + top_count, + grad[0], + pooled_size=pooled_size, + no_trans=no_trans, + spatial_scale=spatial_scale, + output_dim=output_dim, + group_size=group_size, + part_size=part_size, + sample_per_part=sample_per_part, + trans_std=trans_std, + ) + return [data_grad[0], tf.zeros_like(bbox), data_grad[1]] + + +# @tf.keras.utils.register_keras_serializable(package="Addons") +class DeformablePSROIAlign(tf.keras.layers.Layer): + def __init__( + self, + output_dim=256, + spatial_scale=1 / 16, + group_size=1, + pooled_size=7, + sample_per_part=4, + part_size=7, + trans_std=1, + data_format="channels_last", + ): + super(DeformablePSROIAlign, self).__init__() + self.spatial_scale = spatial_scale + self.group_size = group_size + self.output_dim = output_dim + self.pooled_size = pooled_size + self.sample_per_part = sample_per_part + self.part_size = part_size + self.trans_std = trans_std + self.data_format = data_format + self.flat = tf.keras.layers.Flatten(data_format="channels_first") + self.fully_connect = 
tf.keras.layers.Dense( + self.pooled_size * self.pooled_size * 2 + ) + + def compute_output_shape(self, input_shape): + data_shape = input_shape[0] + batch_size = data_shape[0] + if self.data_format == "channels_last": + return tf.TensorShape( + [batch_size, self.pooled_size, self.pooled_size, self.output_dim] + ) + else: + return tf.TensorShape( + [batch_size, self.output_dim, self.pooled_size, self.pooled_size] + ) + + def call(self, inputs, **kwargs): + featuremap = inputs[0] + rois = inputs[1] + if self.data_format == "channels_last": + featuremap = tf.transpose(featuremap, perm=[0, 3, 1, 2]) + ( + offset_t, + top_count, + ) = _deformable_conv2d_ops_so.ops.addons_deformable_psroi_pool( + featuremap, + rois, + tf.convert_to_tensor(0.0), + pooled_size=self.pooled_size, + no_trans=True, + spatial_scale=self.spatial_scale, + output_dim=self.output_dim, + group_size=self.group_size, + part_size=self.part_size, + sample_per_part=self.sample_per_part, + trans_std=1.0, + ) + offset_flat = self.flat(offset_t) + offset = self.fully_connect(offset_flat) + offset_reshape = tf.reshape(offset, shape=[-1, 2, 7, 7], name="offset_reshape") + ret, ret_count = _deformable_conv2d_ops_so.ops.addons_deformable_psroi_pool( + featuremap, + rois, + offset_reshape, + pooled_size=self.pooled_size, + no_trans=False, + spatial_scale=self.spatial_scale, + output_dim=self.output_dim, + group_size=self.group_size, + part_size=self.part_size, + sample_per_part=self.sample_per_part, + trans_std=self.trans_std, + ) + if self.data_format == "channels_last": + ret = tf.transpose(ret, [0, 2, 3, 1]) + return ret + + def get_config(self): + config = { + "spatial_scale": self.spatial_scale, + "group_size": self.group_size, + "output_dim": self.output_dim, + "pooled_size": self.pooled_size, + "sample_per_part": self.sample_per_part, + "part_size": self.part_size, + "trans_std": self.trans_std, + "data_format": self.data_format, + } + base_config = super().get_config() + return {**config, **base_config} diff --git a/tensorflow_addons/layers/deformable_conv2d_test.py b/tensorflow_addons/layers/deformable_conv2d_test.py new file mode 100644 index 0000000000..08e9e0a7b1 --- /dev/null +++ b/tensorflow_addons/layers/deformable_conv2d_test.py @@ -0,0 +1,279 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import numpy as np +import tensorflow as tf +from tensorflow_addons.layers.deformable_conv2d import ( + DeformableConv2D, + DeformablePSROIAlign, +) +from tensorflow_addons.layers.deformable_conv2d import ( + _deformable_conv2d, + _deformable_conv2d_ops_so, +) +from tensorflow_addons.utils import test_utils + + +@test_utils.run_all_in_graph_and_eager_modes +class DeformableConv2DTest(tf.test.TestCase): + def _forward( + self, + input, + filters, + kernel_size=(3, 3), + num_groups=1, + deformable_groups=1, + strides=(1, 1), + im2col=1, + use_bias=False, + padding="valid", + data_format="channels_last", + dilations=(1, 1), + ): + input_op = tf.convert_to_tensor(input) + output = DeformableConv2D( + filters, + kernel_size, + num_groups, + deformable_groups, + strides, + im2col, + use_bias, + padding, + data_format, + dilations, + )(input_op) + return output + + def _create_test_data(self, data_format): + height = 20 + width = 20 + channel = 3 + batch = 1 + val = np.random.uniform(size=[batch, height, width, channel]).astype(np.float32) + if data_format == "channels_first": + val = np.transpose(val, [0, 3, 1, 2]) + return val + + """ + Because DeformableConv2D layer use built in random_normal initializer to initialize weight, So the output can't be actually tested, So here we just simple compare the result in tf.nn.conv2d and _deformable_conv2d function in deformable_conv2d.py. + """ + + def _forward_simple(self, data_format, use_gpu=False): + with test_utils.device(False): + batch_size = 4 + padding = "SAME" + kernel_h = 3 + kernel_w = 3 + channel = 3 + height = 20 + width = 20 + out_channel = 16 + # input = tf.random.uniform(shape=[batch_size, channel, height, width], maxval=10) + input = tf.convert_to_tensor( + [i for i in range(batch_size * channel * height * width)], + dtype=tf.float32, + ) + input = tf.reshape(input, [batch_size, channel, height, width]) + input_trans = tf.transpose(input, [0, 2, 3, 1]) + # filter = tf.random.uniform( + # shape=[kernel_h, kernel_w, channel, out_channel], maxval=10 + # ) + filter = tf.convert_to_tensor( + np.random.uniform( + 0, 1, [kernel_h, kernel_w, channel, out_channel] + ).astype(np.float32) + ) + filter_deform = tf.transpose(filter, [3, 2, 0, 1]) + offset = tf.constant( + [ + 0.0 + for i in range( + batch_size * kernel_h * kernel_w * 2 * height * width + ) + ], + shape=[batch_size, kernel_h * kernel_w * 2, height, width], + ) + mask = tf.constant( + [1.0 for i in range(batch_size * kernel_h * kernel_w * height * width)], + shape=[batch_size, kernel_h * kernel_w, height, width], + ) + result1 = _deformable_conv2d_ops_so.ops.addons_deformable_conv2d( + input=input, + filter=filter_deform, + offset=offset, + mask=mask, + strides=[1, 1, 1, 1], + num_groups=1, + deformable_groups=1, + im2col_step=1, + no_bias=True, + padding=padding, + data_format="NCHW", + dilations=[1, 1, 1, 1], + ) + result2 = tf.nn.conv2d(input_trans, filter, [1, 1, 1, 1], padding) + result2 = tf.transpose(result2, [0, 3, 1, 2]) + # print("Debug!", tf.reduce_mean(result1 - result2)) + self.assertAllClose(result1, result2, 1e-4, 1e-4) + + """ + def _gradients(self, data_format, use_gpu=False): + with test_utils.device(use_gpu): + with tf.GradientTape(persistent=True) as tape: + val = self._create_test_data(data_format) + input = tf.constant(val, dtype=tf.float32) + if data_format == "channels_last": + input = tf.transpose(input, [0, 3, 1, 2]) + input_trans = tf.transpose(input, [0, 2, 3, 1]) + padding 
= "SAME" + kernel_h = 3 + kernel_w = 3 + channel = 3 + height = 20 + width = 20 + out_channel = 16 + filter = tf.Variable( + np.random.uniform(0, 1, [kernel_h, kernel_w, channel, out_channel] + ).astype(np.float32) + ) + tape.watch(filter) + filter_deform = tf.transpose(filter, [3, 2, 0, 1]) + offset = tf.constant( + [0.0 for i in range(kernel_h * kernel_w * 2 * height * width)], + shape=[1, kernel_h * kernel_w * 2, height, width], + ) + mask = tf.constant( + [1.0 for i in range(kernel_h * kernel_w * height * width)], + shape=[1, kernel_h * kernel_w, height, width], + ) + result1 = _deformable_conv2d_ops_so.ops.addons_deformable_conv2d( + input=input, + filter=filter_deform, + offset=offset, + mask=mask, + strides=[1, 1, 1, 1], + num_groups=1, + deformable_groups=1, + im2col_step=1, + no_bias=True, + padding=padding, + data_format="NCHW", + dilations=[1, 1, 1, 1], + ) + result2 = tf.nn.conv2d(input_trans, filter, [1, 1, 1, 1], "SAME") + grad1 = tape.gradient(result1, filter) + grad2 = tape.gradient(result2, filter) + self.assertAllClose(grad1, grad2, 1e-4, 1e-4) + """ + + def _keras(self, data_format, use_gpu=False): + inputs = self._create_test_data(data_format) + output = self._forward(inputs, 64, data_format=data_format) + + def testForwardNCHW(self): + self._forward_simple(data_format="channels_first", use_gpu=False) + if tf.test.is_gpu_available(): + self._forward_simple(data_format="channels_first", use_gpu=True) + + def testForwardNHWC(self): + self._forward_simple(data_format="channels_last", use_gpu=False) + if tf.test.is_gpu_available(): + self._forward_simple(data_format="channels_last", use_gpu=True) + + """ + def testBackwardNCHW(self): + self._gradients(data_format="channels_first", use_gpu=False) + if tf.test.is_gpu_available(): + self._gradients(data_format="channels_first", use_gpu=True) + + def testBackwardNHWC(self): + self._gradients(data_format="channels_last", use_gpu=False) + if tf.test.is_gpu_available(): + self._gradients(data_format="channels_last", use_gpu=True) + """ + + def testKerasNCHW(self): + self._keras(data_format="channels_first", use_gpu=False) + if tf.test.is_gpu_available(): + self._keras(data_format="channels_first", use_gpu=True) + + def testKerasNHWC(self): + self._keras(data_format="channels_last", use_gpu=False) + if tf.test.is_gpu_available(): + self._keras(data_format="channels_last", use_gpu=True) + + +@test_utils.run_all_in_graph_and_eager_modes +class DeformablePSROIAlignTest(tf.test.TestCase): + def _forward_simple(self, data_format, use_gpu=False): + featuremap = tf.random.normal(shape=[1, 64, 100, 100]) + rois = tf.convert_to_tensor( + [[0, 1, 1, 800, 800], [0, 2, 2, 400, 400]], dtype=tf.float32 + ) + spatial_scale = 1 / 16 + group_size = 1 + pooled_size = 7 + sample_per_part = 4 + no_trans = True + part_size = 7 + trans_std = 1 + ( + offset_t, + top_count, + ) = _deformable_conv2d_ops_so.ops.addons_deformable_psroi_pool( + featuremap, + rois, + tf.convert_to_tensor(0), + pooled_size=pooled_size, + no_trans=True, + spatial_scale=spatial_scale, + output_dim=64, + group_size=group_size, + part_size=part_size, + sample_per_part=sample_per_part, + trans_std=trans_std, + ) + return offset_t + + def _keras(self, data_format, use_gpu=False): + featuremap = tf.random.normal(shape=[1, 64, 100, 100]) + rois = tf.convert_to_tensor( + [[0, 1, 1, 800, 800], [0, 2, 2, 400, 400]], dtype=tf.float32 + ) + spatial_scale = 1 / 16 + group_size = 1 + pooled_size = 7 + sample_per_part = 4 + no_trans = True + part_size = 7 + trans_std = 1 + psroilayer = 
DeformablePSROIAlign(output_dim=64, data_format="channels_first") + ret = psroilayer([featuremap, rois]) + return ret + + def testKerasNCHW(self): + self._keras(data_format="channels_first", use_gpu=False) + if tf.test.is_gpu_available(): + self._keras(data_format="channels_first", use_gpu=True) + + def testKerasNHWC(self): + self._keras(data_format="channels_last", use_gpu=False) + if tf.test.is_gpu_available(): + self._keras(data_format="channels_last", use_gpu=True) + + +if __name__ == "__main__": + tf.test.main() From bec6e34eee6f544064c28edd137fbf310447b927 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 08:55:10 -0500 Subject: [PATCH 03/22] fixed a problem --- tensorflow_addons/layers/deformable_conv2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index 948a48a80d..dd090848e7 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -216,7 +216,7 @@ def compute_output_shape(self, input_shape): dilation=self.dilation_rate[i], ) new_space.append(new_dim) - return tensor_shape.TensorShape( + return tf.TensorShape( [input_shape[0]] + new_space + [self.filters] ) else: From a84a1f07e4a7d09669617fccd923bd4167fe0cc9 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 09:08:27 -0500 Subject: [PATCH 04/22] fix problem --- .../custom_ops/layers/cc/kernels/deformable_conv_op.h | 2 ++ .../custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h index 04413f1f60..484baccd67 100755 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h @@ -24,7 +24,9 @@ #endif // __JETBRAINS_IDE__ #define EIGEN_USE_THREADS +#ifdef GOOGLE_CUDA #define EIGEN_USE_GPU +#endif #include #include diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc index d7f52aaa12..9e88f66b56 100755 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc @@ -7,7 +7,6 @@ #ifdef GOOGLE_CUDA #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/util/gpu_kernel_helper.h" -#endif namespace tensorflow { namespace addons { @@ -123,7 +122,6 @@ __device__ DType DmcnGetCoordinateWeight(DType argmax_h, DType argmax_w, return weight; } -#ifdef GOOGLE_CUDA template __global__ void SwapAxisKernel(const int n, const int cuda_mem_size, const int min_unit_size, DType *input_data, From 30a543414e8c4b8d86c223377e1d790a4d5a0759 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 09:39:06 -0500 Subject: [PATCH 05/22] reformat --- .../layers/.deformable_conv2d_test.py.swp | Bin 0 -> 16384 bytes tensorflow_addons/layers/deformable_conv2d.py | 38 +++++++++--------- .../layers/deformable_conv2d_test.py | 15 +------ 3 files changed, 20 insertions(+), 33 deletions(-) create mode 100644 tensorflow_addons/layers/.deformable_conv2d_test.py.swp diff --git a/tensorflow_addons/layers/.deformable_conv2d_test.py.swp b/tensorflow_addons/layers/.deformable_conv2d_test.py.swp new 
file mode 100644 index 0000000000000000000000000000000000000000..ddef5ff00d52eea8d16a37539521ed627963d517 GIT binary patch literal 16384 zcmeI3ON<;x8OIw;;yB=#!~(=2qC7ri7RIv=Teh(^#N*wyJlNia-L)NSSz0yIH8Y*` zba%R{cd`gMk`)OlA_op|;o<`vqVSMF5f%xLgcC0j9+xNxECnF}5)cIu4oKnuRrmC3 z*1K_Vgo3)I-%fYE>-(y{s;^vbPhDu8V@K;#2A=mC#@q)_eEQXWkKcCh`wW9y1KT|y zMM!a5JYhR@Rd`_f=oV=x>T*)xll3G6lw&P?m6zQ3}M?Y-xGHnMy#B~VJB zlt3wgQUav}N(qz_C?!xz;Qw3#%HLr;iax$G?q@IleMjN<_4v6EhxZHdJL2!f`1kvT z`1rH@DJ4)!pp-xmc1WE~%5-255N}!ZLDS=V~r3C&565y6$+=hC;M-D&!{eR^D zKflK?eh4mu~kHItGYv9Y^QDB3!;1H;QJHab=8phASx4={2v)~e#10Ml_#E)Tr@$#N4?X~PfWO~?{(zr=hrvA93k>l3PQ!Q&{QGv43tk1+z!$(exCBms z`@kOX8U%SA{0La!82H<}4CCkE2S9=&U^jRc2PRK~%U~6}AH0G?n3us{f2&VatKdwqqM zp>3)DrlsMA2D6#%vVgliQQb7n6mFTI(%iFZWIw%bR|>@%Om*wH>LC?(RlcmQ?I;nL z(Up)Wp+c3?_oS#sku|n@u*PO;Y^uhl*Ycgue^J#FbuQJ0FRHFz$K4aI4$h1xqJ&f$ ziXKU+Hj>w{#yWlOLUc9eh02US)(py&eNX{fYI%ct=-PCnWwV|oeP0)WGL?s`E_5p< zpL9PJn{TBKqF~wQzG%GhX4cq%KeEo92JQt={?W6aMbeBKv{l1x*GIRt7K#edQ4ktr z2Yo;*MSbW5aTP*0y*kR1j?IjKqmfvP2I985wMelIjHOtyvC)g zOcZ8XrWdtPR!Y)95Dv2J%n z^%Qrcm^h}B!3Z!D>KpRFf)N||j;KoEbZgAQ)iR11a8+ZW6lTv48>6f`Z>18o;-u(s zBw%yN9kW@wVm8P0-80I|+97OCY}zn;O_$1b5MV2tJFuMrQ(oqi4Yj$fv1N~`z96H3 z3Bo$uy-%^WV7#M3?l>E|cf!)?`XUg>#~uu&Vo_IE#~b(@!Bj>dWEeF9quF)IIN}~0 zgE?WbEzDexW8Kj0DBE+X(PNkAaBr@jh#(PnAXFH*42B>HVtDWkt-x}nMixQQjHc;~ zd7YVsb~h_|)_}g_K_tzt9Y|G)6qhTxI0+!o#;P6^wRqu)r+YFR*0!|SIx8AIopfZg zfxO9FD9!^tygGs4O6UuRCXK4`0fxbt)D%lG(s&X&5!p1;^YfF{vK>xpl6kw79lS{y z3{O0b@~LTbw{2c1!ye3ZJSU%0s=&5HAz@~C^a3yR3(;;kFgFX2Z2;{Y@|8r3My8Z_ z^f$(4Sh}QXiwNmJY04~%*838uZ0q9_zucV1$VKyol z&$g9`7zs487ioq_q>bq`Bo^p+{fRA!WtYvAN;Dgqk!PcdiFGj3oiUMWd1RVFCeP3! z$r!|{(Ls;pnAlffQmYWNzCueMvs@e06B%0@gsuq-W!f$}!2P}nJ7Wso!^UgiVF+RC z*$un_3|QgDZe81I22<<>Dr3EGlsvlN3)`-3b$Y>fRKmDz!!VJi^o(dY%q1B=vAnd{ zYC3k$jYN^gME1P)W}8qolHQCu%ecfq9fd=o{sslPKHMG{y<7R9K*t`~DzXzSbS+wK zuppepV(3ayM}lP$tr2mM-q27I>5GotwL4hK++GOnv7U#S<MEPzh_i#mttu1X!iG%PP%f)>^Kxlg;JUat&picO5D?5D$lTfO6Up zv5y)$w9cVNL#mVa0)aTa`m$kw_1NvznDn{|TLNM1M>+KLmGX~GP7a5|I`?^}FX~>< zn?zY8K{>n1u8?vgPU(6|?-W!Y zox^k$5KKq-decCwxMOp-gY{i+Y4Xa8m6jD$Q@3%X;TpSuJMgraqkXJOimk+%Digh9^`8Kr2UkEF+zb8*fBzfc>);8{0u}HQ{QX~pUxBBA0_VUka0mDceEp}u zW1tC6fcwEc;5vN!uYxXUf`7q}|10J$D1WF0K?Ib|U#b|Rhfc39V7;tJ#)8zwUn%bi=O^Ps0_%i9npYoCG zSvRqh?agEz7)rH$4UsRGde(1IM~cJTC_?hW8}Tm z*wHDx;?G)gT!s^~LllTVYX~hKW*JlfJ=E|UxlV>ofj?_+mIPoRC-C`J03Pm))wL2Vd4(#JlWJErV#M>KL@`7&Ib+d{&!K{F#C<$InZ z=(q~U9GUlgqD|f4sfryA3YE+(& z|AC4B?3or%!O^L`nF&p^C0a!@(i53PI|w~uP#N?TRExOuI4~osT%J^mxe9W+p%^xq zJq^u|!MD#Wy!pe<;vqn4zi!b%K%~0p@AfN4bO=+ffLdPLi|b9VfTQE>k#?f0aF}D+ z!c}km{^TZW+GJ%v$vY3x!TvVel0R{{u6P7R>+v literal 0 HcmV?d00001 diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index dd090848e7..338e714fee 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -143,15 +143,15 @@ class DeformableConv2D(tf.keras.layers.Layer): def __init__( self, filters: int, - kernel_size=(3, 3), - num_groups=1, - deformable_groups=1, - strides=(1, 1), - im2col=1, - use_bias=False, - padding="valid", - data_format="channels_last", - dilations=(1, 1), + kernel_size: tuple = (3, 3), + num_groups: int = 1, + deformable_groups: int = 1, + strides: tuple = (1, 1), + im2col: int = 1, + use_bias: bool = False, + padding: str = "valid", + data_format: str = "channels_last", + dilations: tuple = (1, 1), ): super(DeformableConv2D, self).__init__() self.filters = filters @@ -216,9 +216,7 @@ def compute_output_shape(self, input_shape): dilation=self.dilation_rate[i], ) 
new_space.append(new_dim) - return tf.TensorShape( - [input_shape[0]] + new_space + [self.filters] - ) + return tf.TensorShape([input_shape[0]] + new_space + [self.filters]) else: space = input_shape[2:] new_space = [] @@ -323,14 +321,14 @@ def _deformable_psroi_pool_back_prop(op, *grad): class DeformablePSROIAlign(tf.keras.layers.Layer): def __init__( self, - output_dim=256, - spatial_scale=1 / 16, - group_size=1, - pooled_size=7, - sample_per_part=4, - part_size=7, - trans_std=1, - data_format="channels_last", + output_dim: int = 256, + spatial_scale: float = 1 / 16, + group_size: int = 1, + pooled_size: int = 7, + sample_per_part: int = 4, + part_size: int = 7, + trans_std: int = 1, + data_format: str = "channels_last", ): super(DeformablePSROIAlign, self).__init__() self.spatial_scale = spatial_scale diff --git a/tensorflow_addons/layers/deformable_conv2d_test.py b/tensorflow_addons/layers/deformable_conv2d_test.py index 08e9e0a7b1..da66c6541f 100644 --- a/tensorflow_addons/layers/deformable_conv2d_test.py +++ b/tensorflow_addons/layers/deformable_conv2d_test.py @@ -19,10 +19,7 @@ DeformableConv2D, DeformablePSROIAlign, ) -from tensorflow_addons.layers.deformable_conv2d import ( - _deformable_conv2d, - _deformable_conv2d_ops_so, -) +from tensorflow_addons.layers.deformable_conv2d import _deformable_conv2d_ops_so from tensorflow_addons.utils import test_utils @@ -181,7 +178,7 @@ def _gradients(self, data_format, use_gpu=False): def _keras(self, data_format, use_gpu=False): inputs = self._create_test_data(data_format) - output = self._forward(inputs, 64, data_format=data_format) + self._forward(inputs, 64, data_format=data_format) def testForwardNCHW(self): self._forward_simple(data_format="channels_first", use_gpu=False) @@ -227,7 +224,6 @@ def _forward_simple(self, data_format, use_gpu=False): group_size = 1 pooled_size = 7 sample_per_part = 4 - no_trans = True part_size = 7 trans_std = 1 ( @@ -253,13 +249,6 @@ def _keras(self, data_format, use_gpu=False): rois = tf.convert_to_tensor( [[0, 1, 1, 800, 800], [0, 2, 2, 400, 400]], dtype=tf.float32 ) - spatial_scale = 1 / 16 - group_size = 1 - pooled_size = 7 - sample_per_part = 4 - no_trans = True - part_size = 7 - trans_std = 1 psroilayer = DeformablePSROIAlign(output_dim=64, data_format="channels_first") ret = psroilayer([featuremap, rois]) return ret From 948f8efba83cd9aea30d3fc7e70c0e6c78d6f6db Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 09:41:23 -0500 Subject: [PATCH 06/22] reformat --- .../layers/.deformable_conv2d_test.py.swp | Bin 16384 -> 0 bytes .../layers/deformable_conv2d_test.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 tensorflow_addons/layers/.deformable_conv2d_test.py.swp diff --git a/tensorflow_addons/layers/.deformable_conv2d_test.py.swp b/tensorflow_addons/layers/.deformable_conv2d_test.py.swp deleted file mode 100644 index ddef5ff00d52eea8d16a37539521ed627963d517..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeI3ON<;x8OIw;;yB=#!~(=2qC7ri7RIv=Teh(^#N*wyJlNia-L)NSSz0yIH8Y*` zba%R{cd`gMk`)OlA_op|;o<`vqVSMF5f%xLgcC0j9+xNxECnF}5)cIu4oKnuRrmC3 z*1K_Vgo3)I-%fYE>-(y{s;^vbPhDu8V@K;#2A=mC#@q)_eEQXWkKcCh`wW9y1KT|y zMM!a5JYhR@Rd`_f=oV=x>T*)xll3G6lw&P?m6zQ3}M?Y-xGHnMy#B~VJB zlt3wgQUav}N(qz_C?!xz;Qw3#%HLr;iax$G?q@IleMjN<_4v6EhxZHdJL2!f`1kvT z`1rH@DJ4)!pp-xmc1WE~%5-255N}!ZLDS=V~r3C&565y6$+=hC;M-D&!{eR^D zKflK?eh4mu~kHItGYv9Y^QDB3!;1H;QJHab=8phASx4={2v)~e#10Ml_#E)Tr@$#N4?X~PfWO~?{(zr=hrvA93k>l3PQ!Q&{QGv43tk1+z!$(exCBms 
z`@kOX8U%SA{0La!82H<}4CCkE2S9=&U^jRc2PRK~%U~6}AH0G?n3us{f2&VatKdwqqM zp>3)DrlsMA2D6#%vVgliQQb7n6mFTI(%iFZWIw%bR|>@%Om*wH>LC?(RlcmQ?I;nL z(Up)Wp+c3?_oS#sku|n@u*PO;Y^uhl*Ycgue^J#FbuQJ0FRHFz$K4aI4$h1xqJ&f$ ziXKU+Hj>w{#yWlOLUc9eh02US)(py&eNX{fYI%ct=-PCnWwV|oeP0)WGL?s`E_5p< zpL9PJn{TBKqF~wQzG%GhX4cq%KeEo92JQt={?W6aMbeBKv{l1x*GIRt7K#edQ4ktr z2Yo;*MSbW5aTP*0y*kR1j?IjKqmfvP2I985wMelIjHOtyvC)g zOcZ8XrWdtPR!Y)95Dv2J%n z^%Qrcm^h}B!3Z!D>KpRFf)N||j;KoEbZgAQ)iR11a8+ZW6lTv48>6f`Z>18o;-u(s zBw%yN9kW@wVm8P0-80I|+97OCY}zn;O_$1b5MV2tJFuMrQ(oqi4Yj$fv1N~`z96H3 z3Bo$uy-%^WV7#M3?l>E|cf!)?`XUg>#~uu&Vo_IE#~b(@!Bj>dWEeF9quF)IIN}~0 zgE?WbEzDexW8Kj0DBE+X(PNkAaBr@jh#(PnAXFH*42B>HVtDWkt-x}nMixQQjHc;~ zd7YVsb~h_|)_}g_K_tzt9Y|G)6qhTxI0+!o#;P6^wRqu)r+YFR*0!|SIx8AIopfZg zfxO9FD9!^tygGs4O6UuRCXK4`0fxbt)D%lG(s&X&5!p1;^YfF{vK>xpl6kw79lS{y z3{O0b@~LTbw{2c1!ye3ZJSU%0s=&5HAz@~C^a3yR3(;;kFgFX2Z2;{Y@|8r3My8Z_ z^f$(4Sh}QXiwNmJY04~%*838uZ0q9_zucV1$VKyol z&$g9`7zs487ioq_q>bq`Bo^p+{fRA!WtYvAN;Dgqk!PcdiFGj3oiUMWd1RVFCeP3! z$r!|{(Ls;pnAlffQmYWNzCueMvs@e06B%0@gsuq-W!f$}!2P}nJ7Wso!^UgiVF+RC z*$un_3|QgDZe81I22<<>Dr3EGlsvlN3)`-3b$Y>fRKmDz!!VJi^o(dY%q1B=vAnd{ zYC3k$jYN^gME1P)W}8qolHQCu%ecfq9fd=o{sslPKHMG{y<7R9K*t`~DzXzSbS+wK zuppepV(3ayM}lP$tr2mM-q27I>5GotwL4hK++GOnv7U#S<MEPzh_i#mttu1X!iG%PP%f)>^Kxlg;JUat&picO5D?5D$lTfO6Up zv5y)$w9cVNL#mVa0)aTa`m$kw_1NvznDn{|TLNM1M>+KLmGX~GP7a5|I`?^}FX~>< zn?zY8K{>n1u8?vgPU(6|?-W!Y zox^k$5KKq-decCwxMOp-gY{i+Y4Xa8m6jD$Q@3%X;TpSuJMgraqkXJOimk+%Digh9^`8Kr2UkEF+zb8*fBzfc>);8{0u}HQ{QX~pUxBBA0_VUka0mDceEp}u zW1tC6fcwEc;5vN!uYxXUf`7q}|10J$D1WF0K?Ib|U#b|Rhfc39V7;tJ#)8zwUn%bi=O^Ps0_%i9npYoCG zSvRqh?agEz7)rH$4UsRGde(1IM~cJTC_?hW8}Tm z*wHDx;?G)gT!s^~LllTVYX~hKW*JlfJ=E|UxlV>ofj?_+mIPoRC-C`J03Pm))wL2Vd4(#JlWJErV#M>KL@`7&Ib+d{&!K{F#C<$InZ z=(q~U9GUlgqD|f4sfryA3YE+(& z|AC4B?3or%!O^L`nF&p^C0a!@(i53PI|w~uP#N?TRExOuI4~osT%J^mxe9W+p%^xq zJq^u|!MD#Wy!pe<;vqn4zi!b%K%~0p@AfN4bO=+ffLdPLi|b9VfTQE>k#?f0aF}D+ z!c}km{^TZW+GJ%v$vY3x!TvVel0R{{u6P7R>+v diff --git a/tensorflow_addons/layers/deformable_conv2d_test.py b/tensorflow_addons/layers/deformable_conv2d_test.py index da66c6541f..a66d34bd91 100644 --- a/tensorflow_addons/layers/deformable_conv2d_test.py +++ b/tensorflow_addons/layers/deformable_conv2d_test.py @@ -190,7 +190,7 @@ def testForwardNHWC(self): if tf.test.is_gpu_available(): self._forward_simple(data_format="channels_last", use_gpu=True) - """ + """ def testBackwardNCHW(self): self._gradients(data_format="channels_first", use_gpu=False) if tf.test.is_gpu_available(): From 176fcb12882fbac79bd6bd6e60a2936f6e8acf00 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 09:56:50 -0500 Subject: [PATCH 07/22] fixed problem --- tensorflow_addons/layers/deformable_conv2d.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index 338e714fee..b3a4447ac7 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -19,10 +19,10 @@ from tensorflow_addons.utils.resource_loader import LazySO from tensorflow.python.keras.utils import conv_utils -# _deformable_conv2d_ops_so = LazySO("custom_ops/layers/_deformable_conv2d_ops.so") -_deformable_conv2d_ops_so = LazySO( - "/home/admin-seu/TempData/sss/custom_ops/deformable_conv2d_ops_new/deformable_conv2D.so" -) +_deformable_conv2d_ops_so = LazySO("custom_ops/layers/_deformable_conv2d_ops.so") +# _deformable_conv2d_ops_so = LazySO( +# 
"/home/admin-seu/TempData/sss/custom_ops/deformable_conv2d_ops_new/deformable_conv2D.so" +# ) # _deformable_conv2d_ops_so = LazySO("/home/admin-seu/TempData/sss/SoftWare/addons/bazel-bin/tensorflow_addons/custom_ops/layers/_deformable_conv2d_ops.so") # _deformable_conv2d_ops_so = tf.load_op_library("custom_ops/layers/_deformable_conv2d_ops.so") From 603ffa79a32d2e16cff745c5da5fbfeed6043246 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 10:04:10 -0500 Subject: [PATCH 08/22] fix problem --- .../custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc index 9e88f66b56..d3e0ea8861 100755 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc @@ -987,7 +987,7 @@ template struct DeformablePSROIPoolForward; template struct DeformablePSROIPoolForward; template struct DeformablePSROIPoolBackwardKernel; template struct DeformablePSROIPoolBackwardKernel; -#endif } // namespace functor } // namespace addons } // namespace tensorflow +#endif From 1c7b33ad3cb028835e6043c57a0e1cdb35fde15c Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 10:23:30 -0500 Subject: [PATCH 09/22] fixed cpu don't support NCHW format in nn.conv2d --- tensorflow_addons/layers/deformable_conv2d.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index b3a4447ac7..af753efcc5 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -170,7 +170,7 @@ def __init__( strides=(1, 1), padding=self.padding, use_bias=True, - data_format=data_format, + data_format="channels_last", ) def build(self, input_shape): @@ -238,7 +238,10 @@ def call(self, inputs, **kwargs): :param kwargs: :return: """ - weight_info = self.conv_offset(inputs) + if self.data_format == "channels_first": + weight_info = self.conv_offset(tf.transpose(inputs, [0, 2, 3, 1])) + else: + weight_info = self.conv_offset(inputs) tf_data_format = "NCHW" tf_padding = "VALID" if self.padding == "same": @@ -249,7 +252,7 @@ def call(self, inputs, **kwargs): offset = tf.concat((o1, o2), axis=-1) mask = tf.sigmoid(mask) else: - o1, o2, mask = tf.split(weight_info, 3, axis=1) + o1, o2, mask = tf.split(tf.transpose(weight_info, [0, 3, 1, 2]), 3, axis=1) offset = tf.concat((o1, o2), axis=1) mask = tf.sigmoid(mask) result = _deformable_conv2d( From 527dc81eb17f27ef4b084e133a18fc35dc070265 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 11:02:55 -0500 Subject: [PATCH 10/22] fixed cpu don't support NCHW format in nn.conv2d --- .../layers/cc/kernels/deformable_conv_op.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc index 54e2ab2176..ce4bd2f53a 100644 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc @@ -121,7 +121,15 @@ typedef Eigen::ThreadPoolDevice CPUDevice; Eigen::IndexPair ContractionDims(bool adj_x, bool adj_y) { return {adj_x ? 
0 : 1, adj_y ? 1 : 0}; } - +#ifdef PLATFORM_WINDOWS +#include +template +void AtomicAdd(T *address, T val) { + static std::mutex mu; + std::lock_guard lk(mu); + *address += val; +} +#else void AtomicAdd(float *address, float val) { auto *address_as_ull = reinterpret_cast(address); uInt old = *address_as_ull; @@ -147,6 +155,7 @@ void AtomicAdd(double *address, double val) { *reinterpret_cast(&desired)); } while (assumed != old); } +#endif template void SwapAxisKernel(const CPUDevice &d, const int n, const int cuda_mem_size, From b13201efa98e4c3a02da7c765e5f45669731d10b Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 11:25:59 -0500 Subject: [PATCH 11/22] use WIN32 to support windows --- .../custom_ops/layers/cc/kernels/deformable_conv_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc index ce4bd2f53a..d427da4a31 100644 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc @@ -121,7 +121,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; Eigen::IndexPair ContractionDims(bool adj_x, bool adj_y) { return {adj_x ? 0 : 1, adj_y ? 1 : 0}; } -#ifdef PLATFORM_WINDOWS +#ifdef WIN32 #include template void AtomicAdd(T *address, T val) { From e60191b47c0a9956be8c3ddc43bf10d8186becc2 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 12:09:47 -0500 Subject: [PATCH 12/22] use conditional compile to support windows --- .../custom_ops/layers/cc/kernels/deformable_conv_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc index d427da4a31..685f4dc3d7 100644 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc @@ -3,6 +3,7 @@ #include #include +#include #include "tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv2d_utils.h" @@ -121,8 +122,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; Eigen::IndexPair ContractionDims(bool adj_x, bool adj_y) { return {adj_x ? 0 : 1, adj_y ? 
1 : 0}; } -#ifdef WIN32 -#include +#if PLATFORM_WINDOWS template void AtomicAdd(T *address, T val) { static std::mutex mu; From d1fc3ca967982ce9835dc6eb3cec7b0e902749fc Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 12:43:41 -0500 Subject: [PATCH 13/22] use conditional compile to support windows --- .../custom_ops/layers/cc/kernels/deformable_conv_op.cc | 2 -- .../custom_ops/layers/cc/kernels/deformable_conv_op.h | 6 ------ .../layers/cc/kernels/deformable_conv_op_gpu.cu.cc | 2 -- 3 files changed, 10 deletions(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc index 685f4dc3d7..024adae84a 100644 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc @@ -887,7 +887,6 @@ template struct DeformableConv2DCol2ImCoord; template struct PureAddTo; template struct SetOne; template struct SetZeros; -template struct SwapAxis; template struct SetNumAtIndex; template struct DeformableConv2DIm2Col; @@ -896,7 +895,6 @@ template struct DeformableConv2DCol2ImCoord; template struct PureAddTo; template struct SetOne; template struct SetZeros; -template struct SwapAxis; template struct SetNumAtIndex; template struct LaunchBatchMatMul; diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h index 484baccd67..27376bfb61 100755 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h @@ -111,12 +111,6 @@ struct DeformableConv2DCol2ImCoord { DType *grad_mask); }; template -struct SwapAxis { - void operator()(const Device &d, DType *input_data, - const TShape &origin_shape, const int axis_x, - const int axis_y); -}; -template struct DeformableConv2DCol2Im { void operator()(const Device &d, const DType *data_col, const DType *data_offset, const DType *data_mask, diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc index d3e0ea8861..9d139703f5 100755 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc @@ -762,7 +762,6 @@ template struct DeformableConv2DCol2ImCoord; template struct PureAddTo; template struct SetOne; template struct SetZeros; -template struct SwapAxis; template struct SetNumAtIndex; template struct DeformableConv2DIm2Col; @@ -771,7 +770,6 @@ template struct DeformableConv2DCol2ImCoord; template struct PureAddTo; template struct SetOne; template struct SetZeros; -template struct SwapAxis; template struct SetNumAtIndex; template se::DeviceMemory AsDeviceMemory(const T *cuda_memory) { From 84d1d05cca6da9786e4d4df33df51c71b85289b0 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 13:07:26 -0500 Subject: [PATCH 14/22] fix syntax problem in windows --- .../custom_ops/layers/cc/kernels/deformable_conv_op.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc index 024adae84a..b66f37f947 100644 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc +++ 
b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc @@ -842,9 +842,8 @@ void LaunchBatchMatMul::launch(OpKernelContext *context, contract_pairs[0] = ContractionDims(adj_x, adj_y); auto &device = context->eigen_device(); for (int i = 0; i < t_out.dimension(0); ++i) { - t_out.template chip<0>(i).device(device) = - (t_in_x.template chip<0>(i)) - .template contract(t_in_y.template chip<0>(i), contract_pairs); + t_out.chip(i, 0).device(device) = + (t_in_x.chip(i, 0)).contract(t_in_y.chip(i, 0), contract_pairs); } } From 52ae9015d24f43b1520212ce15d62d031a2b5455 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 22:18:15 -0500 Subject: [PATCH 15/22] remove some redudant code --- .../layers/cc/kernels/deformable_conv_op.h | 21 ------------------- .../layers/cc/ops/deformable_conv2d.cc | 3 --- tensorflow_addons/layers/deformable_conv2d.py | 9 ++------ 3 files changed, 2 insertions(+), 31 deletions(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h index 27376bfb61..2640cac134 100755 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h @@ -2,27 +2,6 @@ #ifndef TF_OPS_DEFORMABLE_CONV2D_H #define TF_OPS_DEFORMABLE_CONV2D_H -#ifdef __JETBRAINS_IDE__ -#define __host__ -#define __device__ -#define __shared__ -#define __constant__ -#define __global__ - -// This is slightly mental, but gets it to properly index device function calls -// like __popc and whatever. -//#define __CUDACC__ - -// These headers are all implicitly present when you compile CUDA with clang. -// Clion doesn't know that, so we include them explicitly to make the indexer -// happy. Doing this when you actually build is, obviously, a terrible idea :D -//#include <__clang_cuda_builtin_vars.h> -//#include <__clang_cuda_intrinsics.h> -//#include <__clang_cuda_math_forward_declares.h> -//#include <__clang_cuda_complex_builtins.h> -//#include <__clang_cuda_cmath.h> -#endif // __JETBRAINS_IDE__ - #define EIGEN_USE_THREADS #ifdef GOOGLE_CUDA #define EIGEN_USE_GPU diff --git a/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc b/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc index c9ec963b13..c6d15fad92 100644 --- a/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc +++ b/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc @@ -1,6 +1,3 @@ -// -// Created by 孙嘉禾 on 2019/12/31. 
-// #include #include diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index af753efcc5..f49fb7a2d3 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -20,11 +20,6 @@ from tensorflow.python.keras.utils import conv_utils _deformable_conv2d_ops_so = LazySO("custom_ops/layers/_deformable_conv2d_ops.so") -# _deformable_conv2d_ops_so = LazySO( -# "/home/admin-seu/TempData/sss/custom_ops/deformable_conv2d_ops_new/deformable_conv2D.so" -# ) -# _deformable_conv2d_ops_so = LazySO("/home/admin-seu/TempData/sss/SoftWare/addons/bazel-bin/tensorflow_addons/custom_ops/layers/_deformable_conv2d_ops.so") -# _deformable_conv2d_ops_so = tf.load_op_library("custom_ops/layers/_deformable_conv2d_ops.so") def _deformable_conv2d( @@ -137,7 +132,7 @@ def _deformable_conv2d_back_prop(op, grad): return data_grad # List of 4 Tensor, since we have 4 input -# @tf.keras.utils.register_keras_serializable(package="Addons") +@tf.keras.utils.register_keras_serializable(package="Addons") class DeformableConv2D(tf.keras.layers.Layer): @typechecked def __init__( @@ -320,7 +315,7 @@ def _deformable_psroi_pool_back_prop(op, *grad): return [data_grad[0], tf.zeros_like(bbox), data_grad[1]] -# @tf.keras.utils.register_keras_serializable(package="Addons") +@tf.keras.utils.register_keras_serializable(package="Addons") class DeformablePSROIAlign(tf.keras.layers.Layer): def __init__( self, From 27aa599a3534af7bbdb7a6895049b7efbb8a5f3f Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sat, 22 Feb 2020 11:17:41 +0800 Subject: [PATCH 16/22] add EquiConv according to https://arxiv.org/pdf/1903.08094.pdf --- docs/tutorials/deformable_conv2d_ops.ipynb | 8 +- tensorflow_addons/layers/BUILD | 13 + tensorflow_addons/layers/__init__.py | 1 + tensorflow_addons/layers/deformable_conv2d.py | 55 +-- tensorflow_addons/layers/equi_conv.py | 333 ++++++++++++++++++ tensorflow_addons/layers/equi_conv_test.py | 23 ++ 6 files changed, 412 insertions(+), 21 deletions(-) create mode 100644 tensorflow_addons/layers/equi_conv.py create mode 100644 tensorflow_addons/layers/equi_conv_test.py diff --git a/docs/tutorials/deformable_conv2d_ops.ipynb b/docs/tutorials/deformable_conv2d_ops.ipynb index 7509feba66..fa9481da2d 100644 --- a/docs/tutorials/deformable_conv2d_ops.ipynb +++ b/docs/tutorials/deformable_conv2d_ops.ipynb @@ -65,7 +65,10 @@ "\n", "(DeformableConv2D is just like normal tf.keras.layers.Conv2D)\n", "\n", - "(DeformablePSROIAlign is something like tf.image.crop_to_bounding_box)" + "(DeformablePSROIAlign is something like tf.image.crop_and_resize)\n", + "\n", + "(EquiConv is proposed by https://arxiv.org/pdf/1903.08094.pdf, which is a special case\n", + "of DeformableConv2D)" ], "metadata": { "collapsed": false @@ -84,7 +87,10 @@ "featuremap = tf.random.uniform(shape=[batch_size, 20, 20, channels], dtype=tf.float32)\n", "\n", "deformable_layer = tfa.layers.DeformableConv2D(filters, kernel_size, padding=padding)\n", + "equi_conv_layer = tfa.layers.EquiConv(64, (3, 3), 1, 1, (1, 1),\n", + " 1, False, \"same\", \"channels_last\")\n", "result = deformable_layer(featuremap)\n", + "equi_result = equi_conv_layer(featuremap)\n", "\n", "image_featuremap = tf.random.normal(shape=[2, 64, 100, 100])\n", "rois = tf.convert_to_tensor([[0, 1, 1, 800, 800], [1, 2, 2, 400, 400]], dtype=tf.float32)\n", diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index 5483f6a64f..df303a61f9 100644 --- 
a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -7,6 +7,7 @@ py_library( srcs = [ "__init__.py", "deformable_conv2d.py", + "equi_conv.py", "gelu.py", "maxout.py", "multihead_attention.py", @@ -168,3 +169,15 @@ py_test( ":layers", ], ) + +py_test( + name = "equi_conv_test", + size = "small", + srcs = [ + "equi_conv_test.py", + ], + main = "equi_conv_test.py", + deps = [ + ":layers", + ], +) diff --git a/tensorflow_addons/layers/__init__.py b/tensorflow_addons/layers/__init__.py index 733cb79c76..a9b8ce4d39 100644 --- a/tensorflow_addons/layers/__init__.py +++ b/tensorflow_addons/layers/__init__.py @@ -28,3 +28,4 @@ from tensorflow_addons.layers.wrappers import WeightNormalization from tensorflow_addons.layers.deformable_conv2d import DeformableConv2D from tensorflow_addons.layers.deformable_conv2d import DeformablePSROIAlign +from tensorflow_addons.layers.equi_conv import EquiConv diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index f49fb7a2d3..bcd250a765 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -16,6 +16,7 @@ import tensorflow as tf from typeguard import typechecked +from tensorflow_addons.utils import types from tensorflow_addons.utils.resource_loader import LazySO from tensorflow.python.keras.utils import conv_utils @@ -93,7 +94,7 @@ def _deformable_conv2d_back_prop(op, grad): num_groups = op.get_attr("num_groups") deformable_groups = op.get_attr("deformable_groups") """ - REGISTER_OP("Addons>DeformableConv2DBackProp") + REGISTER_OP("AddonsDeformableConv2DBackProp") .Input("input: T") .Input("filter: T") .Input("offset: T") @@ -147,6 +148,12 @@ def __init__( padding: str = "valid", data_format: str = "channels_last", dilations: tuple = (1, 1), + kernel_initializer: types.Initializer = None, + bias_initializer: types.Initializer = None, + kernel_regularizer: types.Regularizer = None, + bias_regularizer: types.Regularizer = None, + kernel_constraint: types.Constraint = None, + bias_constraint: types.Constraint = None, ): super(DeformableConv2D, self).__init__() self.filters = filters @@ -159,13 +166,25 @@ def __init__( self.padding = padding self.data_format = data_format self.dilations = dilations + self.kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self.bias_initializer = tf.keras.initializers.get(bias_initializer) + self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self.bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self.kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self.bias_constraint = tf.keras.constraints.get(bias_constraint) self.conv_offset = tf.keras.layers.Conv2D( self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1], kernel_size=self.kernel_size, strides=(1, 1), padding=self.padding, - use_bias=True, + use_bias=self.use_bias, data_format="channels_last", + kernel_initializer=self.kernel_initializer, + bias_initializer=self.bias_initializer, + kernel_regularizer=self.kernel_regularizer, + bias_regularizer=self.bias_regularizer, + kernel_constraint=self.kernel_constraint, + bias_constraint=self.bias_constraint, ) def build(self, input_shape): @@ -174,26 +193,22 @@ def build(self, input_shape): else: channel = int(input_shape[1]) if self.data_format == "channels_last": - self.filter = tf.Variable( - initial_value=tf.random.normal( - shape=[ - self.kernel_size[0], - self.kernel_size[1], - channel, - self.filters, - ] - ) + 
self.filter = self.add_weight( + name="filter", + shape=[self.kernel_size[0], self.kernel_size[1], channel, self.filters], + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + trainable=True, ) else: - self.filter = tf.Variable( - initial_value=tf.random.normal( - shape=[ - self.filters, - channel, - self.kernel_size[0], - self.kernel_size[1], - ] - ) + self.filter = self.add_weight( + name="filter", + shape=[self.filters, channel, self.kernel_size[0], self.kernel_size[1]], + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + trainable=True, ) self.built = True diff --git a/tensorflow_addons/layers/equi_conv.py b/tensorflow_addons/layers/equi_conv.py new file mode 100644 index 0000000000..06773ab0bd --- /dev/null +++ b/tensorflow_addons/layers/equi_conv.py @@ -0,0 +1,333 @@ +import tensorflow as tf +import numpy as np +import math + +from typeguard import typechecked + +from tensorflow_addons.utils import types +from tensorflow.python.keras.utils import conv_utils + +from tensorflow_addons.layers.deformable_conv2d import _deformable_conv2d + + +def rotation_matrix(axis, theta): + """ + Return the rotation matrix associated with counterclockwise rotation about + the given axis by theta radians. + """ + axis = np.asarray(axis) + axis = axis / math.sqrt(np.dot(axis, axis)) + a = math.cos(theta / 2.0) + b, c, d = -axis * math.sin(theta / 2.0) + aa, bb, cc, dd = a * a, b * b, c * c, d * d + bc, ad, ac, ab, bd, cd = b * c, a * d, a * c, a * b, b * d, c * d + return np.array( + [ + [aa + bb - cc - dd, 2 * (bc + ad), 2 * (bd - ac)], + [2 * (bc - ad), aa + cc - bb - dd, 2 * (cd + ab)], + [2 * (bd + ac), 2 * (cd - ab), aa + dd - bb - cc], + ] + ) + + +def equi_coord(pano_W, pano_H, k_W, k_H, u, v): + fov_w = k_W * np.deg2rad(360.0 / float(pano_W)) + focal = (float(k_W) / 2) / np.tan(fov_w / 2) + c_x = 0 + c_y = 0 + + u_r, v_r = u, v + u_r, v_r = u_r - float(pano_W) / 2.0, v_r - float(pano_H) / 2.0 + phi, theta = u_r / (pano_W) * (np.pi) * 2, -v_r / (pano_H) * (np.pi) + + ROT = rotation_matrix((0, 1, 0), phi) + ROT = np.matmul(ROT, rotation_matrix((1, 0, 0), theta)) # np.eye(3) + + h_range = np.array(range(k_H)) + w_range = np.array(range(k_W)) + w_ones = np.ones(k_W) + h_ones = np.ones(k_H) + h_grid = ( + np.matmul(np.expand_dims(h_range, -1), np.expand_dims(w_ones, 0)) + + 0.5 + - float(k_H) / 2 + ) + w_grid = ( + np.matmul(np.expand_dims(h_ones, -1), np.expand_dims(w_range, 0)) + + 0.5 + - float(k_W) / 2 + ) + + K = np.array([[focal, 0, c_x], [0, focal, c_y], [0.0, 0.0, 1.0]]) + inv_K = np.linalg.inv(K) + rays = np.stack([w_grid, h_grid, np.ones(h_grid.shape)], 0) + rays = np.matmul(inv_K, rays.reshape(3, k_H * k_W)) + rays /= np.linalg.norm(rays, axis=0, keepdims=True) + rays = np.matmul(ROT, rays) + rays = rays.reshape((3, k_H, k_W)) + + phi = np.arctan2(rays[0, ...], rays[2, ...]) + theta = np.arcsin(np.clip(rays[1, ...], -1, 1)) + x = (pano_W) / (2.0 * np.pi) * phi + float(pano_W) / 2.0 + y = (pano_H) / (np.pi) * theta + float(pano_H) / 2.0 + + roi_y = h_grid + v_r + float(pano_H) / 2.0 + roi_x = w_grid + u_r + float(pano_W) / 2.0 + + new_roi_y = y + new_roi_x = x + + offsets_x = new_roi_x - roi_x + offsets_y = new_roi_y - roi_y + + return offsets_x, offsets_y + + +def equi_coord_fixed_resoltuion(pano_W, pano_H, k_W, k_H, u, v, pano_Hf=-1, pano_Wf=-1): + pano_Hf = pano_H if pano_Hf <= 0 else pano_H / pano_Hf + pano_Wf = pano_W if pano_Wf <= 0 else pano_W / pano_Wf + 
fov_w = k_W * np.deg2rad(360.0 / float(pano_Wf)) + focal = (float(k_W) / 2) / np.tan(fov_w / 2) + c_x = 0 + c_y = 0 + + u_r, v_r = u, v + u_r, v_r = u_r - float(pano_W) / 2.0, v_r - float(pano_H) / 2.0 + phi, theta = u_r / (pano_W) * (np.pi) * 2, -v_r / (pano_H) * (np.pi) + + ROT = rotation_matrix((0, 1, 0), phi) + ROT = np.matmul(ROT, rotation_matrix((1, 0, 0), theta)) # np.eye(3) + + h_range = np.array(range(k_H)) + w_range = np.array(range(k_W)) + w_ones = np.ones(k_W) + h_ones = np.ones(k_H) + h_grid = ( + np.matmul(np.expand_dims(h_range, -1), np.expand_dims(w_ones, 0)) + + 0.5 + - float(k_H) / 2 + ) + w_grid = ( + np.matmul(np.expand_dims(h_ones, -1), np.expand_dims(w_range, 0)) + + 0.5 + - float(k_W) / 2 + ) + + K = np.array([[focal, 0, c_x], [0, focal, c_y], [0.0, 0.0, 1.0]]) + inv_K = np.linalg.inv(K) + rays = np.stack([w_grid, h_grid, np.ones(h_grid.shape)], 0) + rays = np.matmul(inv_K, rays.reshape(3, k_H * k_W)) + rays /= np.linalg.norm(rays, axis=0, keepdims=True) + rays = np.matmul(ROT, rays) + rays = rays.reshape((3, k_H, k_W)) + + phi = np.arctan2(rays[0, ...], rays[2, ...]) + theta = np.arcsin(np.clip(rays[1, ...], -1, 1)) + x = (pano_W) / (2.0 * np.pi) * phi + float(pano_W) / 2.0 + y = (pano_H) / (np.pi) * theta + float(pano_H) / 2.0 + + roi_y = h_grid + v_r + float(pano_H) / 2.0 + roi_x = w_grid + u_r + float(pano_W) / 2.0 + + new_roi_y = y + new_roi_x = x + + offsets_x = new_roi_x - roi_x + offsets_y = new_roi_y - roi_y + + return offsets_x, offsets_y + + +def distortion_aware_map(pano_W, pano_H, k_W, k_H, s_width=1, s_height=1, bs=16): + offset = np.zeros(shape=[pano_H, pano_W, k_H * k_W * 2]) + + for v in range(0, pano_H, s_height): + for u in range(0, pano_W, s_width): + offsets_x, offsets_y = equi_coord_fixed_resoltuion( + pano_W, pano_H, k_W, k_H, u, v, 1, 1 + ) + offsets = np.concatenate( + (np.expand_dims(offsets_y, -1), np.expand_dims(offsets_x, -1)), axis=-1 + ) + total_offsets = offsets.flatten().astype("float32") + offset[v, u, :] = total_offsets + + offset = tf.constant(offset) + offset = tf.expand_dims(offset, 0) + offset = tf.tile(offset, multiples=[bs, 1, 1, 1]) + offset = tf.cast(offset, tf.float32) + + return offset + + +class EquiConv(tf.keras.layers.Layer): + @typechecked + def __init__( + self, + filters: int, + kernel_size: tuple = (3, 3), + num_groups: int = 1, + deformable_groups: int = 1, + strides: tuple = (1, 1), + im2col: int = 1, + use_bias: bool = False, + padding: str = "valid", + data_format: str = "channels_last", + dilations: tuple = (1, 1), + use_relu: bool = False, + kernel_initializer: types.Initializer = None, + kernel_regularizer: types.Regularizer = None, + kernel_constraint: types.Constraint = None, + **kwargs + ): + super(EquiConv, self).__init__(**kwargs) + self.filters = filters + self.kernel_size = kernel_size + self.num_groups = num_groups + self.deformable_groups = deformable_groups + self.strides = strides + self.im2col = im2col + self.use_bias = use_bias + self.padding = padding + self.data_format = data_format + self.dilations = dilations + self.use_relu = use_relu + self.kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self.kernel_constraint = tf.keras.constraints.get(kernel_constraint) + if self.padding == "valid": + self.tf_pad = "VALID" + else: + self.tf_pad = "SAME" + + def build(self, input_shape): + if self.data_format == "channels_last": + channel = int(input_shape[-1]) + else: + channel = int(input_shape[1]) + 
self.kernel = self.add_weight(
+            shape=[self.filters, channel, self.kernel_size[0], self.kernel_size[1]],
+            initializer=self.kernel_initializer,
+            regularizer=self.kernel_regularizer,
+            constraint=self.kernel_constraint,
+            trainable=True,
+        )
+        if self.use_bias:
+            self.bias = self.add_weight(
+                shape=[1, self.filters, 1, 1],
+                initializer=self.kernel_initializer,
+                regularizer=self.kernel_regularizer,
+                constraint=self.kernel_constraint,
+                trainable=True,
+            )
+        self.built = True
+
+    def compute_output_shape(self, input_shape):
+        input_shape = tf.TensorShape(input_shape).as_list()
+        if self.data_format == "channels_last":
+            space = input_shape[1:-1]
+            new_space = []
+            for i in range(len(space)):
+                new_dim = conv_utils.conv_output_length(
+                    space[i],
+                    self.kernel_size[i],
+                    padding=self.padding,
+                    stride=self.strides[i],
+                    dilation=self.dilations[i],
+                )
+                new_space.append(new_dim)
+            return tf.TensorShape([input_shape[0]] + new_space + [self.filters])
+        else:
+            space = input_shape[2:]
+            new_space = []
+            for i in range(len(space)):
+                new_dim = conv_utils.conv_output_length(
+                    space[i],
+                    self.kernel_size[i],
+                    padding=self.padding,
+                    stride=self.strides[i],
+                    dilation=self.dilations[i],
+                )
+                new_space.append(new_dim)
+            return tf.TensorShape([input_shape[0], self.filters] + new_space)
+
+    def call(self, inputs, **kwargs):
+        if self.data_format == "channels_first":
+            data = tf.transpose(inputs, [0, 2, 3, 1])
+        else:
+            data = inputs
+        n, h, w, c_i = tuple(data.get_shape().as_list())
+        data_shape = tf.shape(data)
+        """
+        The reference implementation from the paper sets bs to self.batch_size. Here we
+        use data_shape[0] instead: self.batch_size would be a fixed constant and cannot
+        change, while the actual batch size can differ between training and inference,
+        so tf.shape is used to obtain the dynamic batch size.
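+
+        For example, for an input created with tf.keras.Input(shape=(20, 20, 3)) the
+        static batch dimension is None, so data.get_shape()[0] is not usable here,
+        while tf.shape(data)[0] is a scalar tensor holding the runtime batch size,
+        which is what lets the offset map built by distortion_aware_map be tiled
+        once per batch element.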
+ """ + offset = tf.stop_gradient( + distortion_aware_map( + w, + h, + self.kernel_size[0], + self.kernel_size[1], + s_width=self.strides[0], + s_height=self.strides[1], + bs=data_shape[0], + ) + ) + mask = tf.stop_gradient( + tf.zeros( + shape=[ + data_shape[0], + data_shape[1], + data_shape[2], + self.kernel_size[0] * self.kernel_size[1], + ] + ) + ) + data = tf.transpose(data, [0, 3, 1, 2]) + offset = tf.transpose(offset, [0, 3, 1, 2]) + mask = tf.transpose(mask, [0, 3, 1, 2]) + res = _deformable_conv2d( + data, + self.kernel, + offset, + mask, + [1, 1, self.strides[0], self.strides[1]], + num_groups=self.num_groups, + deformable_groups=self.deformable_groups, + padding=self.tf_pad, + data_format="NCHW", + ) + if self.use_bias: + res = tf.add(res, self.bias) + if self.use_relu: + res = tf.nn.relu(res) + if self.data_format == "channels_last": + return tf.transpose(res, [0, 2, 3, 1]) + else: + return res + + def get_config(self): + config = { + "filters": self.filters, + "kernel_size": self.kernel_size, + "num_groups": self.num_groups, + "deformable_groups": self.deformable_groups, + "strides": self.strides, + "im2col": self.im2col, + "use_bias": self.use_bias, + "padding": self.padding, + "data_format": self.data_format, + "dilations": self.dilations, + "use_relu": self.use_relu, + "kernel_initializer": tf.keras.initializers.serialize( + self.kernel_initializer + ), + "kernel_regularizer": tf.keras.regularizers.serialize( + self.kernel_regularizer + ), + "kernel_constraint": tf.keras.constraints.serialize(self.kernel_constraint), + "tf_pad": self.tf_pad, + } + base_config = super().get_config() + return {**base_config, **config} diff --git a/tensorflow_addons/layers/equi_conv_test.py b/tensorflow_addons/layers/equi_conv_test.py new file mode 100644 index 0000000000..4438416852 --- /dev/null +++ b/tensorflow_addons/layers/equi_conv_test.py @@ -0,0 +1,23 @@ +import tensorflow as tf +from tensorflow_addons.layers.equi_conv import EquiConv + +from tensorflow_addons.utils import test_utils + + +@test_utils.run_all_in_graph_and_eager_modes +class EquiConvTest(tf.test.TestCase): + def testKerasNHWC(self): + input = tf.ones(shape=[16, 400, 400, 3]) + layer = EquiConv(64, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_last") + res = layer(input) + self.assertAllEqual(tf.shape(input), tf.shape(res)) + + def testKerasNCHW(self): + input = tf.ones(shape=[16, 3, 400, 400]) + layer = EquiConv(64, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_first") + res = layer(input) + self.assertAllEqual(tf.shape(input), tf.shape(res)) + + +if __name__ == "__main__": + tf.test.main() From 057d927b765a5a31251150ea8c9e7ab7c8b5da6c Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sat, 22 Feb 2020 11:41:30 +0800 Subject: [PATCH 17/22] fix timeout problem --- tensorflow_addons/layers/equi_conv_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow_addons/layers/equi_conv_test.py b/tensorflow_addons/layers/equi_conv_test.py index 4438416852..4c61f6a868 100644 --- a/tensorflow_addons/layers/equi_conv_test.py +++ b/tensorflow_addons/layers/equi_conv_test.py @@ -7,14 +7,14 @@ @test_utils.run_all_in_graph_and_eager_modes class EquiConvTest(tf.test.TestCase): def testKerasNHWC(self): - input = tf.ones(shape=[16, 400, 400, 3]) - layer = EquiConv(64, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_last") + input = tf.ones(shape=[1, 10, 10, 3]) + layer = EquiConv(16, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_last") res = layer(input) 
self.assertAllEqual(tf.shape(input), tf.shape(res)) def testKerasNCHW(self): - input = tf.ones(shape=[16, 3, 400, 400]) - layer = EquiConv(64, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_first") + input = tf.ones(shape=[1, 3, 10, 10]) + layer = EquiConv(16, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_first") res = layer(input) self.assertAllEqual(tf.shape(input), tf.shape(res)) From 663622a6c3f394d848d57641c107f521ff0f41a1 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sat, 22 Feb 2020 11:55:06 +0800 Subject: [PATCH 18/22] fix test problem --- tensorflow_addons/layers/equi_conv_test.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tensorflow_addons/layers/equi_conv_test.py b/tensorflow_addons/layers/equi_conv_test.py index 4c61f6a868..5285c5e823 100644 --- a/tensorflow_addons/layers/equi_conv_test.py +++ b/tensorflow_addons/layers/equi_conv_test.py @@ -7,14 +7,20 @@ @test_utils.run_all_in_graph_and_eager_modes class EquiConvTest(tf.test.TestCase): def testKerasNHWC(self): - input = tf.ones(shape=[1, 10, 10, 3]) - layer = EquiConv(16, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_last") + channel = 32 + input = tf.ones(shape=[1, 10, 10, channel]) + layer = EquiConv( + channel, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_last" + ) res = layer(input) self.assertAllEqual(tf.shape(input), tf.shape(res)) def testKerasNCHW(self): - input = tf.ones(shape=[1, 3, 10, 10]) - layer = EquiConv(16, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_first") + channel = 32 + input = tf.ones(shape=[1, channel, 10, 10]) + layer = EquiConv( + channel, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_first" + ) res = layer(input) self.assertAllEqual(tf.shape(input), tf.shape(res)) From 6a72bcc9618ab702a21ff58e6db70010b0515114 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Fri, 27 Mar 2020 10:16:06 +0800 Subject: [PATCH 19/22] fix problem --- tensorflow_addons/layers/BUILD | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index df303a61f9..a2580d1a9e 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -159,6 +159,11 @@ py_test( "multihead_attention_test.py", ], main = "multihead_attention_test.py", + deps = [ + ":layers", + ], +) +py_test( name = "deformable_conv_test", size = "small", srcs = [ From dc3e3abb34a45e9b0fcbf5015c25477c71e0eb28 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sat, 28 Mar 2020 00:34:05 +0800 Subject: [PATCH 20/22] format code --- tensorflow_addons/layers/deformable_conv2d_test.py | 5 +++-- tensorflow_addons/layers/equi_conv_test.py | 5 ++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow_addons/layers/deformable_conv2d_test.py b/tensorflow_addons/layers/deformable_conv2d_test.py index a66d34bd91..53f8283ed4 100644 --- a/tensorflow_addons/layers/deformable_conv2d_test.py +++ b/tensorflow_addons/layers/deformable_conv2d_test.py @@ -15,6 +15,7 @@ import numpy as np import tensorflow as tf +import pytest from tensorflow_addons.layers.deformable_conv2d import ( DeformableConv2D, DeformablePSROIAlign, @@ -23,7 +24,7 @@ from tensorflow_addons.utils import test_utils -@test_utils.run_all_in_graph_and_eager_modes +@pytest.mark.usefixtures("maybe_run_functions_eagerly") class DeformableConv2DTest(tf.test.TestCase): def _forward( self, @@ -213,7 +214,7 @@ def testKerasNHWC(self): self._keras(data_format="channels_last", use_gpu=True) 
-@test_utils.run_all_in_graph_and_eager_modes +@pytest.mark.usefixtures("maybe_run_functions_eagerly") class DeformablePSROIAlignTest(tf.test.TestCase): def _forward_simple(self, data_format, use_gpu=False): featuremap = tf.random.normal(shape=[1, 64, 100, 100]) diff --git a/tensorflow_addons/layers/equi_conv_test.py b/tensorflow_addons/layers/equi_conv_test.py index 5285c5e823..090e242e40 100644 --- a/tensorflow_addons/layers/equi_conv_test.py +++ b/tensorflow_addons/layers/equi_conv_test.py @@ -1,10 +1,9 @@ import tensorflow as tf +import pytest from tensorflow_addons.layers.equi_conv import EquiConv -from tensorflow_addons.utils import test_utils - -@test_utils.run_all_in_graph_and_eager_modes +@pytest.mark.usefixtures("maybe_run_functions_eagerly") class EquiConvTest(tf.test.TestCase): def testKerasNHWC(self): channel = 32 From 4aad322548e1060b752179944588f7ec698f2e90 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sat, 28 Mar 2020 00:54:09 +0800 Subject: [PATCH 21/22] manully format code --- tensorflow_addons/layers/BUILD | 1 + tensorflow_addons/layers/deformable_conv2d_test.py | 7 ++++--- tensorflow_addons/layers/equi_conv_test.py | 6 ++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index a2580d1a9e..cc477b1d0c 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -163,6 +163,7 @@ py_test( ":layers", ], ) + py_test( name = "deformable_conv_test", size = "small", diff --git a/tensorflow_addons/layers/deformable_conv2d_test.py b/tensorflow_addons/layers/deformable_conv2d_test.py index 53f8283ed4..cd3d8897d6 100644 --- a/tensorflow_addons/layers/deformable_conv2d_test.py +++ b/tensorflow_addons/layers/deformable_conv2d_test.py @@ -16,6 +16,7 @@ import numpy as np import tensorflow as tf import pytest +import sys from tensorflow_addons.layers.deformable_conv2d import ( DeformableConv2D, DeformablePSROIAlign, @@ -24,7 +25,7 @@ from tensorflow_addons.utils import test_utils -@pytest.mark.usefixtures("maybe_run_functions_eagerly") +@test_utils.run_all_in_graph_and_eager_modes class DeformableConv2DTest(tf.test.TestCase): def _forward( self, @@ -214,7 +215,7 @@ def testKerasNHWC(self): self._keras(data_format="channels_last", use_gpu=True) -@pytest.mark.usefixtures("maybe_run_functions_eagerly") +@test_utils.run_all_in_graph_and_eager_modes class DeformablePSROIAlignTest(tf.test.TestCase): def _forward_simple(self, data_format, use_gpu=False): featuremap = tf.random.normal(shape=[1, 64, 100, 100]) @@ -266,4 +267,4 @@ def testKerasNHWC(self): if __name__ == "__main__": - tf.test.main() + sys.exit(pytest.main([__file__])) diff --git a/tensorflow_addons/layers/equi_conv_test.py b/tensorflow_addons/layers/equi_conv_test.py index 090e242e40..f618d8d068 100644 --- a/tensorflow_addons/layers/equi_conv_test.py +++ b/tensorflow_addons/layers/equi_conv_test.py @@ -1,9 +1,11 @@ import tensorflow as tf import pytest +import sys from tensorflow_addons.layers.equi_conv import EquiConv +from tensorflow_addons.utils import test_utils -@pytest.mark.usefixtures("maybe_run_functions_eagerly") +@test_utils.run_all_in_graph_and_eager_modes class EquiConvTest(tf.test.TestCase): def testKerasNHWC(self): channel = 32 @@ -25,4 +27,4 @@ def testKerasNCHW(self): if __name__ == "__main__": - tf.test.main() + sys.exit(pytest.main([__file__])) From 430cc6a19a47e797d796d3f5b87dcc8a95d75b23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Libor=20Van=C4=9Bk?= 
<35312583+liborvaneksw@users.noreply.github.com> Date: Mon, 8 Jun 2020 18:38:19 +0200 Subject: [PATCH 22/22] Update deformable_conv2d.py DeformableConv2D: serialization fixes --- tensorflow_addons/layers/deformable_conv2d.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index bcd250a765..3b699dc804 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -19,6 +19,7 @@ from tensorflow_addons.utils import types from tensorflow_addons.utils.resource_loader import LazySO from tensorflow.python.keras.utils import conv_utils +from typing import Union _deformable_conv2d_ops_so = LazySO("custom_ops/layers/_deformable_conv2d_ops.so") @@ -139,23 +140,24 @@ class DeformableConv2D(tf.keras.layers.Layer): def __init__( self, filters: int, - kernel_size: tuple = (3, 3), + kernel_size: Union[tuple, list] = (3, 3), num_groups: int = 1, deformable_groups: int = 1, - strides: tuple = (1, 1), + strides: Union[tuple, list] = (1, 1), im2col: int = 1, use_bias: bool = False, padding: str = "valid", data_format: str = "channels_last", - dilations: tuple = (1, 1), + dilations: Union[tuple, list] = (1, 1), kernel_initializer: types.Initializer = None, bias_initializer: types.Initializer = None, kernel_regularizer: types.Regularizer = None, bias_regularizer: types.Regularizer = None, kernel_constraint: types.Constraint = None, bias_constraint: types.Constraint = None, + **kwargs ): - super(DeformableConv2D, self).__init__() + super(DeformableConv2D, self).__init__(**kwargs) self.filters = filters self.kernel_size = kernel_size self.num_groups = num_groups
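
A minimal usage sketch, not taken from the patches themselves, of what the PATCH 22
serialization fixes enable: a Keras config round trip serializes tuples as lists, and
Keras passes keyword arguments such as name and dtype back into __init__ when it rebuilds
a layer, so the widened Union[tuple, list] hints and the forwarded **kwargs keep the
typeguard checks from rejecting a restored config. The snippet assumes a tensorflow-addons
build that contains these patches; tfa.layers.DeformableConv2D refers to the layer added
above, not a released API.

    import tensorflow_addons as tfa

    # Arguments arrive in the list/keyword form a saved config produces; with the
    # widened type hints and **kwargs this constructor call now succeeds.
    layer = tfa.layers.DeformableConv2D(
        filters=32,
        kernel_size=[3, 3],           # list instead of tuple, as restored from JSON
        strides=[1, 1],
        dilations=[1, 1],
        padding="same",
        name="deform_conv_restored",  # extra kwarg forwarded to tf.keras.layers.Layer
    )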