From eca80f5d45ed9d3f9a3a165a8503d83767f6357e Mon Sep 17 00:00:00 2001 From: Sean Morgan Date: Sun, 13 Jan 2019 20:14:54 -0500 Subject: [PATCH 01/22] Update BUILD --- change.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 change.sh diff --git a/change.sh b/change.sh new file mode 100644 index 0000000000..dcd166fd73 --- /dev/null +++ b/change.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +git filter-branch -f --env-filter ' + +OLD_EMAIL="sunjiahe@sss-2.local" +CORRECT_NAME="public" +CORRECT_EMAIL="975759105@qq.com" + +if [ "$GIT_COMMITTER_EMAIL" = "$OLD_EMAIL" ] +then + export GIT_COMMITTER_NAME="$CORRECT_NAME" + export GIT_COMMITTER_EMAIL="$CORRECT_EMAIL" +fi +if [ "$GIT_AUTHOR_EMAIL" = "$OLD_EMAIL" ] +then + export GIT_AUTHOR_NAME="$CORRECT_NAME" + export GIT_AUTHOR_EMAIL="$CORRECT_EMAIL" +fi +' -- --all \ No newline at end of file From 350d26ca68c36921373d1ab57cf192a16a94b2a3 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 08:41:34 -0500 Subject: [PATCH 02/22] Add DeformableConv2D and DeformablePSROIAlign operator with cpu and gpu --- docs/tutorials/deformable_conv2d_ops.ipynb | 142 ++ tensorflow_addons/custom_ops/layers/BUILD | 18 + .../cc/kernels/deformable_conv2d_utils.h | 250 +++ .../layers/cc/kernels/deformable_conv_op.cc | 1517 +++++++++++++++++ .../layers/cc/kernels/deformable_conv_op.h | 317 ++++ .../cc/kernels/deformable_conv_op_gpu.cu.cc | 995 +++++++++++ .../layers/cc/ops/deformable_conv2d.cc | 287 ++++ tensorflow_addons/layers/BUILD | 8 + tensorflow_addons/layers/__init__.py | 3 + tensorflow_addons/layers/deformable_conv2d.py | 414 +++++ .../layers/deformable_conv2d_test.py | 279 +++ 11 files changed, 4230 insertions(+) create mode 100644 docs/tutorials/deformable_conv2d_ops.ipynb create mode 100755 tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv2d_utils.h create mode 100644 tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc create mode 100755 tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h create mode 100755 tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc create mode 100644 tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc create mode 100644 tensorflow_addons/layers/deformable_conv2d.py create mode 100644 tensorflow_addons/layers/deformable_conv2d_test.py diff --git a/docs/tutorials/deformable_conv2d_ops.ipynb b/docs/tutorials/deformable_conv2d_ops.ipynb new file mode 100644 index 0000000000..7509feba66 --- /dev/null +++ b/docs/tutorials/deformable_conv2d_ops.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "Setup\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%% md\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "try:\n", + " %tensorflow_version 2.x\n", + "except:\n", + " pass\n", + "\n", + "import tensorflow as tf\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "!pip install -q --no-deps tensorflow-addons~=0.9" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import numpy as np\n", + "import tensorflow_addons as tfa\n", + "import matplotlib.pyplot as plt\n" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "Usage 
example\n", + "\n", + "(DeformableConv2D is just like normal tf.keras.layers.Conv2D)\n", + "\n", + "(DeformablePSROIAlign is something like tf.image.crop_to_bounding_box)" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "filters = 64\n", + "batch_size = 4\n", + "channels = 3\n", + "kernel_size = (3, 3)\n", + "padding = \"same\"\n", + "featuremap = tf.random.uniform(shape=[batch_size, 20, 20, channels], dtype=tf.float32)\n", + "\n", + "deformable_layer = tfa.layers.DeformableConv2D(filters, kernel_size, padding=padding)\n", + "result = deformable_layer(featuremap)\n", + "\n", + "image_featuremap = tf.random.normal(shape=[2, 64, 100, 100])\n", + "rois = tf.convert_to_tensor([[0, 1, 1, 800, 800], [1, 2, 2, 400, 400]], dtype=tf.float32)\n", + "out_dim = 64\n", + "spatial_scale = 1 / 16\n", + "group_size = 1\n", + "pooled_size = 7\n", + "sample_per_part = 4\n", + "part_size = 7\n", + "trans_std = 1\n", + "data_format = \"channels_first\"\n", + "psroi_align_layer = tfa.layers.DeformablePSROIAlign(out_dim, spatial_scale, group_size,\n", + " pooled_size, sample_per_part, part_size,\n", + " trans_std, data_format)\n", + "features = psroi_align_layer([image_featuremap, rois])" + ], + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + }, + "pycharm": { + "stem_cell": { + "cell_type": "raw", + "source": [], + "metadata": { + "collapsed": false + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/tensorflow_addons/custom_ops/layers/BUILD b/tensorflow_addons/custom_ops/layers/BUILD index 5ff49efe99..2a59ba9e19 100644 --- a/tensorflow_addons/custom_ops/layers/BUILD +++ b/tensorflow_addons/custom_ops/layers/BUILD @@ -19,3 +19,21 @@ custom_op_library( "cc/kernels/correlation_cost_op_gpu.cu.cc", ], ) + +custom_op_library( + name = "_deformable_conv2d_ops.so", + srcs = [ + "cc/kernels/deformable_conv2d_utils.h", + "cc/kernels/deformable_conv_op.cc", + "cc/kernels/deformable_conv_op.h", + "cc/ops/deformable_conv2d.cc", + ], + cuda_deps = [ + "@cub_archive//:cub", + ], + cuda_srcs = [ + "cc/kernels/deformable_conv2d_utils.h", + "cc/kernels/deformable_conv_op.h", + "cc/kernels/deformable_conv_op_gpu.cu.cc", + ], +) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv2d_utils.h b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv2d_utils.h new file mode 100755 index 0000000000..51943105bb --- /dev/null +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv2d_utils.h @@ -0,0 +1,250 @@ + +#ifndef TF_OPS_DEFORMABLE_CONV2D_UTILS_H +#define TF_OPS_DEFORMABLE_CONV2D_UTILS_H + +#include + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool" + +namespace tensorflow { +namespace addons { + +namespace functor { +typedef Eigen::ThreadPoolDevice CPUDevice; +typedef Eigen::GpuDevice GPUDevice; + +using namespace 
tensorflow::shape_inference; +Status CheckFormatConstraintsOnShape(const TensorFormat tensor_format, + const ShapeHandle shape_handle, + const string &tensor_name, + InferenceContext *ctx) { + if (tensor_format == FORMAT_NCHW_VECT_C) { + const int num_dims = ctx->Rank(shape_handle); + DimensionHandle vect_dim = ctx->Dim( + shape_handle, GetTensorInnerFeatureDimIndex(num_dims, tensor_format)); + DimensionHandle unused_vect_dim; + TF_RETURN_IF_ERROR(ctx->WithValue(vect_dim, 4, &unused_vect_dim)); + } + return Status::OK(); +} +Status DimensionsFromShape(ShapeHandle shape, TensorFormat format, + DimensionHandle *batch_dim, + gtl::MutableArraySlice spatial_dims, + DimensionHandle *filter_dim, InferenceContext *ctx) { + const int32 rank = GetTensorDimsFromSpatialDims(spatial_dims.size(), format); + *batch_dim = ctx->Dim(shape, GetTensorBatchDimIndex(rank, format)); + for (int spatial_dim_index = 0; spatial_dim_index < spatial_dims.size(); + ++spatial_dim_index) { + spatial_dims[spatial_dim_index] = ctx->Dim( + shape, GetTensorSpatialDimIndex(rank, format, spatial_dim_index)); + } + *filter_dim = ctx->Dim(shape, GetTensorFeatureDimIndex(rank, format)); + if (format == FORMAT_NCHW_VECT_C) { + TF_RETURN_IF_ERROR(ctx->Multiply( + *filter_dim, + ctx->Dim(shape, GetTensorInnerFeatureDimIndex(rank, format)), + filter_dim)); + } + return Status::OK(); +} +Status ShapeFromDimensions(DimensionHandle batch_dim, + gtl::ArraySlice spatial_dims, + DimensionHandle filter_dim, TensorFormat format, + InferenceContext *ctx, ShapeHandle *shape) { + const int32 rank = GetTensorDimsFromSpatialDims(spatial_dims.size(), format); + std::vector out_dims(rank); + out_dims[GetTensorBatchDimIndex(rank, format)] = batch_dim; + for (int spatial_dim_index = 0; spatial_dim_index < spatial_dims.size(); + ++spatial_dim_index) { + out_dims[GetTensorSpatialDimIndex(rank, format, spatial_dim_index)] = + spatial_dims[spatial_dim_index]; + } + if (format == FORMAT_NCHW_VECT_C) { + TF_RETURN_IF_ERROR( + ctx->Divide(filter_dim, 4, true, + &out_dims[GetTensorFeatureDimIndex(rank, format)])); + out_dims[GetTensorInnerFeatureDimIndex(rank, format)] = ctx->MakeDim(4); + } else { + out_dims[GetTensorFeatureDimIndex(rank, format)] = filter_dim; + } + *shape = ctx->MakeShape(out_dims); + return Status::OK(); +} +template +EIGEN_ALWAYS_INLINE EIGEN_DEVICE_FUNC bool FastBoundsCheck(const Ta index, + const Tb limit) { + static_assert(std::is_integral::value && std::is_integral::value, + "FastBoundsCheck can only be used on integer types."); + typedef typename std::make_unsigned::type UIndex; + return TF_PREDICT_TRUE(static_cast(index) < + static_cast(limit)); +} + +#define TF_REQUIRES(EXP, STATUS) \ + do { \ + if (!TF_PREDICT_TRUE(EXP)) return (STATUS); \ + } while (false) + +Status InitDeformableConv2DParameters(const OpKernelConstruction *ctx, + DeformableConv2DParameters *params) { + TF_RETURN_IF_ERROR(ctx->GetAttr("dilations", ¶ms->dilations)); + TF_RETURN_IF_ERROR(ctx->GetAttr("strides", ¶ms->strides)); + TF_RETURN_IF_ERROR(ctx->GetAttr("padding", ¶ms->padding)); + string data_format_string; + TF_RETURN_IF_ERROR(ctx->GetAttr("data_format", &data_format_string)); + TF_RETURN_IF_ERROR(ctx->GetAttr("num_groups", ¶ms->num_groups)); + TF_RETURN_IF_ERROR( + ctx->GetAttr("deformable_groups", ¶ms->deformable_groups)); + TF_RETURN_IF_ERROR(ctx->GetAttr("im2col_step", ¶ms->im2col_step)); + TF_RETURN_IF_ERROR(ctx->GetAttr("no_bias", ¶ms->no_bias)); + TF_REQUIRES(FormatFromString(data_format_string, ¶ms->data_format), + 
errors::InvalidArgument("Invalid data format")); + const auto &strides = params->strides; + const auto &dilations = params->dilations; + const auto &data_format = params->data_format; + TF_REQUIRES(dilations.size() == 4, + errors::InvalidArgument("Sliding window dilations field must " + "specify 4 dimensions")); + TF_REQUIRES(strides.size() == 4, + errors::InvalidArgument("Sliding window strides field must " + "specify 4 dimensions")); + const int64 stride_n = GetTensorDim(strides, data_format, 'N'); + const int64 stride_c = GetTensorDim(strides, data_format, 'C'); + const int64 stride_h = GetTensorDim(strides, data_format, 'H'); + const int64 stride_w = GetTensorDim(strides, data_format, 'W'); + TF_REQUIRES( + stride_n == 1 && stride_c == 1, + errors::InvalidArgument("Current implementation does not yet support " + "strides in the batch and depth dimensions.")); + TF_REQUIRES(stride_h > 0 && stride_w > 0, + errors::InvalidArgument( + "Row and column strides should be larger than 0.")); + + const int64 dilation_n = GetTensorDim(dilations, data_format, 'N'); + const int64 dilation_c = GetTensorDim(dilations, data_format, 'C'); + const int64 dilation_h = GetTensorDim(dilations, data_format, 'H'); + const int64 dilation_w = GetTensorDim(dilations, data_format, 'W'); + TF_REQUIRES( + dilation_n == 1 && dilation_c == 1, + errors::InvalidArgument("Current implementation does not yet support " + "dilations in the batch and depth dimensions.")); + TF_REQUIRES( + dilation_h > 0 && dilation_w > 0, + errors::InvalidArgument("Dilated rates should be larger than 0.")); + + return Status::OK(); +} +Status ComputeDeformableConv2DDimension( + const DeformableConv2DParameters ¶ms, const Tensor &input, + const Tensor &filter, DeformableConv2DDimensions *dimensions, int flag) { + TF_REQUIRES(input.dims() == 4, + errors::InvalidArgument("input must be 4-dimensional", + input.shape().DebugString())); + TF_REQUIRES(filter.dims() == 4, + errors::InvalidArgument("filter must be 4-dimensional: ", + filter.shape().DebugString())); + for (int i = 3; i > 0; i--) { + TF_REQUIRES( + FastBoundsCheck(filter.dim_size(i), std::numeric_limits::max()), + errors::InvalidArgument("filter too large")); + } + const int64 in_depth_raw = GetTensorDim(input, params.data_format, 'C'); + const int64 patch_depth_raw = filter.dim_size(1); + TF_REQUIRES(FastBoundsCheck(in_depth_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input depth too large")); + TF_REQUIRES(FastBoundsCheck(patch_depth_raw, std::numeric_limits::max()), + errors::InvalidArgument("Patch depth too large")); + const int in_depth = static_cast(in_depth_raw); + const int patch_depth = static_cast(patch_depth_raw); + TF_REQUIRES(in_depth % patch_depth == 0, + errors::InvalidArgument( + "input depth must be evenly divisible by filter depth: ", + in_depth, " vs ", patch_depth, " flag: ", flag)); + + // The first dimension for filter is out_depth. 
+ const int out_depth = static_cast(filter.dim_size(0)); + const int64 input_rows_raw = GetTensorDim(input, params.data_format, 'H'); + TF_REQUIRES(FastBoundsCheck(input_rows_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input rows too large")); + const int input_rows = static_cast(input_rows_raw); + const int filter_rows = static_cast(filter.dim_size(2)); + const int64 input_cols_raw = GetTensorDim(input, params.data_format, 'W'); + TF_REQUIRES(FastBoundsCheck(input_cols_raw, std::numeric_limits::max()), + errors::InvalidArgument("Input cols too large")); + const int input_cols = static_cast(input_cols_raw); + const int filter_cols = static_cast(filter.dim_size(3)); + const int64 batch_raw = GetTensorDim(input, params.data_format, 'N'); + TF_REQUIRES(FastBoundsCheck(batch_raw, std::numeric_limits::max()), + errors::InvalidArgument("batch is too large")); + const int batch = static_cast(batch_raw); + const int stride_rows = GetTensorDim(params.strides, params.data_format, 'H'); + const int stride_cols = GetTensorDim(params.strides, params.data_format, 'W'); + const int dilation_rows = + GetTensorDim(params.dilations, params.data_format, 'H'); + const int dilation_cols = + GetTensorDim(params.dilations, params.data_format, 'W'); + + // Compute windowed output sizes for rows and columns. + int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; + TF_RETURN_IF_ERROR(GetWindowedOutputSizeV2( + input_rows, filter_rows, dilation_rows, stride_rows, params.padding, + &out_rows, &pad_rows)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeV2( + input_cols, filter_cols, dilation_cols, stride_cols, params.padding, + &out_cols, &pad_cols)); + + dimensions->batch = batch; + dimensions->input_rows = input_rows; + dimensions->input_cols = input_cols; + dimensions->in_depth = in_depth; + dimensions->filter_rows = filter_rows; + dimensions->filter_cols = filter_cols; + dimensions->patch_depth = patch_depth; + dimensions->out_depth = out_depth; + dimensions->stride_rows = stride_rows; + dimensions->stride_cols = stride_cols; + dimensions->dilation_rows = dilation_rows; + dimensions->dilation_cols = dilation_cols; + dimensions->out_rows = out_rows; + dimensions->out_cols = out_cols; + dimensions->pad_rows = pad_rows; + dimensions->pad_cols = pad_cols; + return Status::OK(); +} + +inline TShape ToVector(const TShape &shape) { return shape; } + +inline std::vector ToVector(const TensorShape &shape) { + std::vector res; + for (int i = 0; i < shape.dims(); ++i) { + res.push_back(shape.dim_size(i)); + } + return res; +} + +inline std::vector SubVector(const TensorShape &shape, int start, + int end) { + std::vector res; + for (int i = start; i < end; i++) { + res.push_back(shape.dim_size(i)); + } + return res; +} + +inline TShape SubVector(const TShape &shape, int start, int end) { + TShape res; + for (int i = start; i < end; i++) { + res.push_back(shape[i]); + } + return res; +} + +} // namespace functor +} // namespace addons +} // namespace tensorflow + +#endif // TF_OPS_DEFORMABLE_CONV2D_UTILS_H diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc new file mode 100644 index 0000000000..54e2ab2176 --- /dev/null +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc @@ -0,0 +1,1517 @@ + +#include "tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h" + +#include +#include + +#include "tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv2d_utils.h" + 
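// ---------------------------------------------------------------------------
// Illustrative sketch (editorial; BilinearSample is a hypothetical name, not
// part of the original kernels): every sampling step in this file reduces to
// 2x2 bilinear interpolation at a fractional location (h, w), with
// out-of-range neighbours treated as zero. A minimal standalone version:
//
//   float BilinearSample(const float* img, int height, int width,
//                        float h, float w) {
//     const int h0 = static_cast<int>(std::floor(h));
//     const int w0 = static_cast<int>(std::floor(w));
//     const float lh = h - h0, lw = w - w0;
//     auto at = [&](int y, int x) -> float {
//       return (y < 0 || y >= height || x < 0 || x >= width)
//                  ? 0.0f
//                  : img[y * width + x];
//     };
//     return (1 - lh) * (1 - lw) * at(h0, w0) +
//            (1 - lh) * lw * at(h0, w0 + 1) +
//            lh * (1 - lw) * at(h0 + 1, w0) +
//            lh * lw * at(h0 + 1, w0 + 1);
//   }
//
// DmcnIm2colBilinear below applies this same weighting; DmcnGetGradientWeight
// returns the weight a given integer neighbour received (the derivative of
// the sample with respect to the image values), and DmcnGetCoordinateWeight
// returns the derivative of the sample with respect to h or w.
// ---------------------------------------------------------------------------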
+namespace tensorflow { +namespace addons { + +namespace functor { + +template +DType DmcnIm2colBilinear(const DType *bottom_data, const int data_width, + const int height, const int width, DType h, DType w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + DType lh = h - h_low; + DType lw = w - w_low; + DType hh = 1 - lh, hw = 1 - lw; + + DType v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; + DType v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + DType v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + DType v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + DType w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + DType val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} +template +DType DmcnGetGradientWeight(DType argmax_h, DType argmax_w, const int h, + const int w, const int height, const int width) { + /* + * offset h, offset w, (h, w) coordinate + */ + if (argmax_h <= -1 || argmax_w <= -1 || argmax_h >= height || + argmax_w >= width) { + return 0; + } + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + DType weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} +template +DType DmcnGetCoordinateWeight(DType argmax_h, DType argmax_w, const int height, + const int width, const DType *im_data, + const int data_width, const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + DType weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + 
im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +using ull = unsigned long long int; +using uInt = unsigned int; +typedef Eigen::GpuDevice GPUDevice; +typedef Eigen::ThreadPoolDevice CPUDevice; + +Eigen::IndexPair ContractionDims(bool adj_x, bool adj_y) { + return {adj_x ? 0 : 1, adj_y ? 1 : 0}; +} + +void AtomicAdd(float *address, float val) { + auto *address_as_ull = reinterpret_cast(address); + uInt old = *address_as_ull; + uInt assumed; + float desired; + do { + assumed = old; + desired = *reinterpret_cast(&assumed) + static_cast(val); + old = __sync_val_compare_and_swap(address_as_ull, assumed, + *reinterpret_cast(&desired)); + } while (assumed != old); +} + +void AtomicAdd(double *address, double val) { + auto *address_as_ull = reinterpret_cast(address); + ull old = *address_as_ull; + ull assumed; + double desired; + do { + assumed = old; + desired = *reinterpret_cast(&assumed) + static_cast(val); + old = __sync_val_compare_and_swap(address_as_ull, assumed, + *reinterpret_cast(&desired)); + } while (assumed != old); +} + +template +void SwapAxisKernel(const CPUDevice &d, const int n, const int cuda_mem_size, + const int min_unit_size, DType *input_data, + const int dim_num, const int axis_x_dims, + const int axis_y_dims, const int axis_x, const int axis_y) { + d.parallelFor(n, + Eigen::TensorOpCost(cuda_mem_size, cuda_mem_size, + cuda_mem_size * axis_y_dims * axis_x_dims), + [min_unit_size, input_data, dim_num, axis_x_dims, axis_y_dims, + axis_x, axis_y, cuda_mem_size](int64 start, int64 end) { + for (int64 index = start; index < end; index++) { + auto *device_data = new DType[cuda_mem_size]; + DType *input_data_ptr = input_data + index * cuda_mem_size; + for (int j = 0; j < axis_y_dims; j++) { + for (int i = 0; i < axis_x_dims; i++) { + DType *temp_ptr = input_data_ptr + + (i * axis_x_dims + j) * min_unit_size; + DType *device_data_temp_ptr = + device_data + (j * axis_y_dims + i) * min_unit_size; + for (int k = 0; k < min_unit_size; k++) { + *(device_data_temp_ptr + k) = *(temp_ptr + k); + } + } + } + for (int idx = 0; idx < cuda_mem_size; idx++) { + *(input_data_ptr + idx) = *(device_data + idx); + } + delete[] device_data; + } + }); +} + +template +void DeformablePSROIPoolBackwardCpuAccKernel( + const CPUDevice &d, const int count, const T *top_diff, const T *top_count, + const int num_rois, const T spatial_scale, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int output_dim, T *bottom_data_diff, + T *bottom_trans_diff, const T *bottom_data, const T *bottom_rois, + const T *bottom_trans, const int no_trans, const T trans_std, + const int sample_per_part, const int group_size, const int part_size, + const int num_classes, const int channels_each_class) { + auto f = [count, top_diff, top_count, num_rois, spatial_scale, channels, + height, width, pooled_height, pooled_width, output_dim, + bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois, + bottom_trans, no_trans, trans_std, sample_per_part, group_size, + part_size, num_classes, + channels_each_class](int64 start, int64 end) { + for (int64 index = start; index < end; ++index) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = 
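// (Orientation note, editorial: each bottom_rois row appears to be laid out
// as [batch_index, x1, y1, x2, y2] in input-image coordinates, matching the
// notebook example rows such as [0, 1, 1, 800, 800]; the coordinates are
// mapped onto the feature map by spatial_scale, and the -0.5 shifts align
// the ROI with pixel centres.)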
offset_bottom_rois[0]; + T roi_start_w = (T)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = (T)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = + (T)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = + (T)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + // Force too small ROIs to be 1x1 + T roi_width = + std::max(roi_end_w - roi_start_w, static_cast(0.1)); // avoid 0 + T roi_height = std::max(roi_end_h - roi_start_h, static_cast(0.1)); + + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + + int part_h = floor((T)(ph) / pooled_height * part_size); + int part_w = floor((T)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = + no_trans + ? static_cast(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + T trans_y = no_trans + ? (T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * + part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + + T wstart = (T)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = (T)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = + bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = + bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor((T)(pw)*group_size / pooled_width); + int gh = floor((T)(ph)*group_size / pooled_height); + gw = std::min(std::max(gw, 0), group_size - 1); + gh = std::min(std::max(gh, 0), group_size - 1); + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = std::min(std::max(w, static_cast(0.)), + static_cast(width - 1.)); + h = std::min(std::max(h, static_cast(0.)), + static_cast(height - 1.)); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + AtomicAdd( + offset_bottom_data_diff + bottom_index_base + y0 * width + x0, + q00 * diff_val); + AtomicAdd( + offset_bottom_data_diff + bottom_index_base + y1 * width + x0, + q01 * diff_val); + AtomicAdd( + offset_bottom_data_diff + bottom_index_base + y0 * width + x1, + q10 * diff_val); + AtomicAdd( + offset_bottom_data_diff + bottom_index_base + y1 * width + x1, + q11 * diff_val); + + if (no_trans) { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - + U00 * (1 - dist_y)) * + trans_std * 
diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - + U00 * (1 - dist_x)) * + trans_std * diff_val; + diff_y *= roi_height; + + AtomicAdd( + bottom_trans_diff + + (((n * num_classes + class_id) * 2) * part_size + part_h) * + part_size + + part_w, + diff_x); + AtomicAdd(bottom_trans_diff + + (((n * num_classes + class_id) * 2 + 1) * part_size + + part_h) * + part_size + + part_w, + diff_y); + } + } + } + }; + d.parallelFor(count, Eigen::TensorOpCost(count, count, count), f); +} + +template +void DeformablePSROIPoolForwardCpuKernel( + const CPUDevice &d, const int count, const T *bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, const int no_trans, + const T trans_std, const int sample_per_part, const int output_dim, + const int group_size, const int part_size, const int num_classes, + const int channels_each_class, T *top_data, T *top_count) { + auto f = [count, bottom_data, spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, output_dim, group_size, part_size, + num_classes, channels_each_class, top_data, + top_count](int64 start, int64 end) { + for (int64 index = start; index < end; ++index) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = (T)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = (T)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = + (T)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = + (T)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + // Force too small ROIs to be 1x1 + T roi_width = + std::max(roi_end_w - roi_start_w, static_cast(0.1)); // avoid 0 + T roi_height = std::max(roi_end_h - roi_start_h, static_cast(0.1)); + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = + no_trans + ? (T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + T trans_y = no_trans + ? 
(T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * + part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + T sum = 0; + int total = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = std::min(std::max(gw, 0), group_size - 1); + gh = std::min(std::max(gh, 0), group_size - 1); + const T *offset_bottom_data = + bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ++ih) { + for (int iw = 0; iw < sample_per_part; ++iw) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = std::min(std::max(w, static_cast(0.)), + static_cast(width - 1.)); + h = std::min(std::max(h, static_cast(0.)), + static_cast(height - 1.)); + int c = (ctop * group_size + gh) * group_size + gw; + T val = DmcnIm2colBilinear(offset_bottom_data + c * height * width, w, + h, w, (T)height, (T)width); + sum += val; + total++; + } + } + top_data[index] = total == 0 ? (T)(0) : sum / total; + top_count[index] = total; + } + }; + d.parallelFor(count, Eigen::TensorOpCost(count, count, count), f); +} +template +void DeformableConv2DIm2ColCPUKernel( + const CPUDevice &d, const int n, const DType *data_im, + const DType *data_offset, const DType *data_mask, + + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + + const int + channel_per_deformable_group, // 输入图通道数除以deformable_group的数量, + const int batch_size, const int num_channels, + const int + deformable_group, //这里的batch_size代表的是im2col_step_, 一般就设为1了 + const int height_col, const int width_col, DType *data_col) { + auto f = [n, data_im, data_offset, data_mask, height, width, kernel_h, + kernel_w, pad_h, pad_w, stride_w, stride_h, dilation_w, dilation_h, + channel_per_deformable_group, batch_size, num_channels, + deformable_group, height_col, width_col, + data_col](int64 start, int64 end) { + for (int64 index = start; index < end; index++) { + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + + DType *data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + + w_col; + const DType *data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const DType *data_offset_ptr = + data_offset + (b_col * deformable_group + deformable_group_index) * + 2 * kernel_h * kernel_w * height_col * + width_col; // + + const DType *data_mask_ptr = + data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; // + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col 
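// (Layout note, editorial: per deformable group the offset tensor holds
// 2 * kernel_h * kernel_w channels; channel 2*(i*kernel_w+j) carries the
// vertical offset and channel 2*(i*kernel_w+j)+1 the horizontal offset for
// kernel tap (i, j), while the mask tensor holds kernel_h * kernel_w
// channels, one modulation scalar per tap. The three index expressions here
// select those entries for the current output location (h_col, w_col).)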
+ h_col) * width_col + + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType mask = data_mask_ptr[data_mask_hw_ptr]; + auto val = static_cast(0); + const DType h_im = h_in + i * dilation_h + offset_h; + const DType w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, + w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } + }; + d.parallelFor(n, Eigen::TensorOpCost(n, n, n), f); +} + +template +void DeformableConv2DCol2ImCPUKernel( + const CPUDevice &d, const int n, const DType *data_col, + const DType *data_offset, const DType *data_mask, const int channels, + const int height, const int width, const int kernel_h, const int kernel_w, + const int pad_h, const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int deformable_group, const int height_col, const int width_col, + DType *grad_im) { + auto f = [n, data_col, data_offset, data_mask, channels, height, width, + kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, dilation_h, + dilation_w, channel_per_deformable_group, batch_size, + deformable_group, height_col, width_col, + grad_im](int64 start, int64 end) { + for (int64 index = start; index < end; ++index) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / batch_size / kernel_w / kernel_h; + // compute the start and end of the output + const int deformable_group_index = c / channel_per_deformable_group; + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const DType *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const DType *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType mask = data_mask_ptr[data_mask_hw_ptr]; + const DType cur_inv_h_data = h_in + i * dilation_h + offset_h; + const DType cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const DType cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && 
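// (Editorial note: the dy/dx window of [-2, 2] together with the |delta| < 1
// tests restricts the scatter to the at most four integer neighbours of the
// fractional sample point that received a non-zero bilinear weight in the
// forward pass; DmcnGetGradientWeight recomputes that weight and the
// contribution is accumulated with AtomicAdd.)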
+ abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + DType weight = + DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, + cur_h + dy, cur_w + dx, height, width); + AtomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + // *(grad_im + cur_bottom_grad_pos) += weight * + // cur_top_grad; + } + } + } + } + }; + d.parallelFor(n, Eigen::TensorOpCost(n, n, n), f); +} +template +void DeformableConv2DCol2ImCoordCPUKernel( + const CPUDevice &d, const int n, const DType *data_col, + const DType *data_im, const DType *data_offset, const DType *data_mask, + const int channels, const int height, const int width, // 输入的C, H, W + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, DType *grad_offset, + DType *grad_mask) { + auto f = [n, data_col, data_im, data_offset, data_mask, channels, height, + width, kernel_h, kernel_w, pad_h, pad_w, stride_h, stride_w, + dilation_h, dilation_w, channel_per_deformable_group, batch_size, + offset_channels, deformable_group, height_col, width_col, + grad_offset, grad_mask](int64 start, int64 end) { + for (int64 index = start; index < end; index++) { + DType val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const DType *data_col_ptr = + data_col + deformable_group_index * channel_per_deformable_group * + batch_size * width_col * height_col; + const DType *data_im_ptr = + data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * + height * width; + const DType *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const DType *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = (col_pos / width_col / height_col / batch_size / kernel_w) % + kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const int data_mask_hw_ptr = + (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType mask = 
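// (Editorial note: for each offset channel this loop visits the matching
// column entries; grad_offset accumulates col_grad * mask * d(sample)/d(coord)
// via DmcnGetCoordinateWeight, while grad_mask, written once per kernel tap
// on the even offset channel, accumulates col_grad * sampled_value (mval).)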
data_mask_ptr[data_mask_hw_ptr]; + DType inv_h = h_in + i * dilation_h + offset_h; + DType inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -2; + } else { + mval += data_col_ptr[col_pos] * + DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width, + height, width, inv_h, inv_w); + } + const DType weight = DmcnGetCoordinateWeight( + inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + + grad_offset[index] = val; + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + if (offset_c % 2 == 0) { + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * + kernel_w + + offset_c / 2) * + height_col + + h) * + width_col + + w] = mval; + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + + // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * + // height_col + h) * width_col + w], mask_req, mval); + } + } + }; + d.parallelFor(n, Eigen::TensorOpCost(n, n, n), f); +} +template +void PureAddToKernel(const CPUDevice &d, const int n, DType *result_data, + const DType *right_data) { + auto f = [n, result_data, right_data](int64 start, int64 end) { + for (int64 index = start; index < end; index++) { + *(result_data + index) += (right_data[index]); + } + }; + d.parallelFor(n, Eigen::TensorOpCost(n, n, n), f); +} +template +void SetZeroKernel(const CPUDevice &d, const int n, DType *result_data) { + auto f = [n, result_data](int64 start, int64 end) { + for (int64 index = start; index < end; ++index) { + *(result_data + index) = DType(0); + } + }; + d.parallelFor(n, Eigen::TensorOpCost(n, n, n), f); +} +template +void SetOneKernel(const CPUDevice &d, const int n, DType *result_data) { + auto f = [n, result_data](int64 start, int64 end) { + for (int64 index = start; index < end; ++index) { + *(result_data + index) = DType(1); + } + }; + d.parallelFor(n, Eigen::TensorOpCost(n, n, n), f); +} + +template +void DeformableConv2DCol2ImCoord::operator()( + const Eigen::ThreadPoolDevice &d, const DType *data_col, + const DType *data_im, const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, const TShape &kernel_shape, + const TShape &pad, const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *grad_offset, DType *grad_mask) { + int num_spatial_axes = kernel_shape.size(); + int num_kernels = col_shape[1] * col_shape[2] * col_shape[3] * 2 * + kernel_shape[0] * kernel_shape[1] * deformable_group; + int channel_per_deformable_group = col_shape[0] / deformable_group; + switch (num_spatial_axes) { + case 2: + DeformableConv2DCol2ImCoordCPUKernel( + d, num_kernels, data_col, data_im, data_offset, data_mask, + im_shape[1], im_shape[2], im_shape[3], kernel_shape[0], + kernel_shape[1], pad[0], pad[1], stride[0], stride[1], dilation[0], + dilation[1], channel_per_deformable_group, col_shape[1], + 2 * kernel_shape[0] * kernel_shape[1] * deformable_group, + deformable_group, col_shape[2], col_shape[3], grad_offset, grad_mask); + break; + default: + LOG(FATAL) << "col2im_nd_gpu does not support computation with " + << num_spatial_axes << "spatial axes"; + } +} + +template +void DeformableConv2DCol2Im::operator()( + const Eigen::ThreadPoolDevice &d, const DType *data_col, + const DType *data_offset, const DType *data_mask, const TShape &im_shape, + const TShape &col_shape, const TShape &kernel_shape, const TShape &pad, + const TShape &stride, 
const TShape &dilation, + const int32_t deformable_group, DType *grad_im) { + int num_spatial_axes = kernel_shape.size(); + int channel_per_deformable_group = im_shape[1] / deformable_group; + int num_kernels = ProdShape(col_shape, 0, col_shape.size()); + // num_axes should be smaller than block size + // using namespace mxnet_op; + switch (num_spatial_axes) { + case 2: + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // NOLINT_NEXT_LINE(whitespace/operators) + DeformableConv2DCol2ImCPUKernel( + d, num_kernels, data_col, data_offset, data_mask, im_shape[1], + im_shape[2], im_shape[3], kernel_shape[0], kernel_shape[1], pad[0], + pad[1], stride[0], stride[1], dilation[0], dilation[1], + channel_per_deformable_group, col_shape[1], deformable_group, + col_shape[2], col_shape[3], grad_im); + break; + default: + LOG(FATAL) << "col2im_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + +template +void DeformableConv2DIm2Col::operator()( + const Eigen::ThreadPoolDevice &d, const DType *data_im, + const DType *data_offset, const DType *data_mask, const TShape &im_shape, + const TShape &col_shape, const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *data_col) { + int num_spatial_axes = kernel_shape.size(); + int channel_per_deformable_group = + im_shape[1] / deformable_group; // imshape[1] = 输入图的通道数 + int num_kernels = + im_shape[1] * + ProdShape(col_shape, 1, + col_shape.size()); // K * N / k.Size(), k = filter, col_shape = + // [K, im2col_step_, H, W] + switch (num_spatial_axes) { + case 2: + DeformableConv2DIm2ColCPUKernel( + d, num_kernels, data_im, data_offset, data_mask, im_shape[2], + im_shape[3], kernel_shape[0], kernel_shape[1], pad[0], pad[1], + stride[0], stride[1], dilation[0], dilation[1], + channel_per_deformable_group, col_shape[1], im_shape[1], + deformable_group, col_shape[2], col_shape[3], data_col); + // MSHADOW_CUDA_POST_KERNEL_CHECK(modulated_deformable_im2col_gpu_kernel); + break; + default: + LOG(FATAL) << "im2col_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + +template +void SetZeros::operator()(const Eigen::ThreadPoolDevice &d, + int n, DType *result_data) { + SetZeroKernel(d, n, result_data); +} +template +void PureAddTo::operator()(const Eigen::ThreadPoolDevice &d, + const int n, DType *result_data, + const DType *right_data) { + PureAddToKernel(d, n, result_data, right_data); +} +template +void SetOne::operator()(const Eigen::ThreadPoolDevice &d, + int n, DType *result_data) { + SetOneKernel(d, n, result_data); +} +template +void SetNumAtIndex::operator()( + const Eigen::ThreadPoolDevice &d, DType num, int index, DType *data) { + *(data + index) = num; +} + +template +void LaunchBatchMatMul::launch(OpKernelContext *context, + const TensorShape &in_x_shape, + const TensorShape &in_y_shape, + const T *in_x_ptr, + const T *in_y_ptr, bool adj_x, + bool adj_y, T *out) { + const int64 m = in_x_shape.dim_size(adj_x ? 2 : 1); + const int64 k = in_x_shape.dim_size(adj_x ? 1 : 2); + const int64 n = in_y_shape.dim_size(adj_y ? 
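// (Editorial note: on the CPU the batched matmul is realised as one Eigen
// tensor contraction per batch element; ContractionDims selects the axes to
// contract according to adj_x / adj_y, and each [M x K] * [K x N] product is
// written into the corresponding slice of the output.)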
1 : 2); + const uint64 batch_size = in_x_shape.dim_size(0); + Eigen::TensorMap> t_in_x( + in_x_ptr, in_x_shape.AsEigenDSizes<3, Eigen::DenseIndex>()); + Eigen::TensorMap> t_in_y( + in_y_ptr, in_y_shape.AsEigenDSizes<3, Eigen::DenseIndex>()); + Eigen::TensorMap> t_out(out, batch_size, + m, n); + Eigen::array, 1> contract_pairs; + contract_pairs[0] = ContractionDims(adj_x, adj_y); + auto &device = context->eigen_device(); + for (int i = 0; i < t_out.dimension(0); ++i) { + t_out.template chip<0>(i).device(device) = + (t_in_x.template chip<0>(i)) + .template contract(t_in_y.template chip<0>(i), contract_pairs); + } +} + +template +void DeformablePSROIPoolForward::operator()( + const CPUDevice &d, const int count, const T *bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, const int no_trans, + const T trans_std, const int sample_per_part, const int output_dim, + const int group_size, const int part_size, const int num_classes, + const int channels_each_class, T *top_data, T *top_count) { + DeformablePSROIPoolForwardCpuKernel( + d, count, bottom_data, spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, output_dim, group_size, part_size, + num_classes, channels_each_class, top_data, top_count); +} + +template +void DeformablePSROIPoolBackwardKernel::operator()( + const CPUDevice &d, const int count, const T *top_diff, const T *top_count, + const int num_rois, const T spatial_scale, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int output_dim, T *bottom_data_diff, + T *bottom_trans_diff, const T *bottom_data, const T *bottom_rois, + const T *bottom_trans, const int no_trans, const T trans_std, + const int sample_per_part, const int group_size, const int part_size, + const int num_classes, const int channels_each_class) { + DeformablePSROIPoolBackwardCpuAccKernel( + d, count, top_diff, top_count, num_rois, spatial_scale, channels, height, + width, pooled_height, pooled_width, output_dim, bottom_data_diff, + bottom_trans_diff, bottom_data, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, group_size, part_size, num_classes, + channels_each_class); +} +template struct DeformableConv2DIm2Col; +template struct DeformableConv2DCol2Im; +template struct DeformableConv2DCol2ImCoord; +template struct PureAddTo; +template struct SetOne; +template struct SetZeros; +template struct SwapAxis; +template struct SetNumAtIndex; + +template struct DeformableConv2DIm2Col; +template struct DeformableConv2DCol2Im; +template struct DeformableConv2DCol2ImCoord; +template struct PureAddTo; +template struct SetOne; +template struct SetZeros; +template struct SwapAxis; +template struct SetNumAtIndex; + +template struct LaunchBatchMatMul; +template struct LaunchBatchMatMul; +template struct DeformablePSROIPoolForward; +template struct DeformablePSROIPoolForward; +template struct DeformablePSROIPoolBackwardKernel; +template struct DeformablePSROIPoolBackwardKernel; + +template +class DeformableConv2DOp : public OpKernel { + public: + explicit DeformableConv2DOp(OpKernelConstruction *ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, InitDeformableConv2DParameters(ctx, ¶ms_)); + } + void Compute(OpKernelContext *context) override { + // Input tensor's shape + // [batch, channels, height, weight] + const Tensor &input = 
context->input(0); + const TensorShape &input_shape = input.shape(); + // [out_channels, in_channels, filter_height, filter_weight] + const Tensor &filter = context->input(1); + const TensorShape &filter_shape = filter.shape(); + // [batch, 2 * filter.Size(), out_height, out_weight] + const Tensor &offset = context->input(2); + const TensorShape &offset_shape = offset.shape(); + // [batch, filter.Size(), out_height, out_weight] + const Tensor &mask = context->input(3); + const TensorShape &mask_shape = mask.shape(); + + DeformableConv2DDimensions dimensions; + OP_REQUIRES_OK(context, ComputeDeformableConv2DDimension( + params_, input, filter, &dimensions, 0)); + // data_format = NCHW + // 这个地方我出了bug,原因是shapefromformat的参数必须是data_format, N, H, W, + // C,因为其内部是根据data_format来决定是否需要进行transpose, + // 如何第三个参数给了C, 且第一个参数为NCHW,那最后得到的结果会是NWCH + TensorShape out_shape = ShapeFromFormat( + params_.data_format, dimensions.batch, dimensions.out_rows, + dimensions.out_cols, dimensions.out_depth); + + // Output tensor is of the following dimensions: + // [ in_batch, out_depth, out_rows, out_cols] + // Tensor* output = nullptr; + // OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output)); + VLOG(2) << "DeformableConv2D: in_depth = " << dimensions.in_depth + << ", patch_depth = " << dimensions.patch_depth + << ", input_cols = " << dimensions.input_cols + << ", filter_cols = " << dimensions.filter_cols + << ", input_rows = " << dimensions.input_rows + << ", filter_rows = " << dimensions.filter_rows + << ", stride_rows = " << dimensions.stride_rows + << ", stride_cols = " << dimensions.stride_cols + << ", dilation_rows = " << dimensions.dilation_rows + << ", dilation_cols = " << dimensions.dilation_cols + << ", out_depth = " << dimensions.out_depth; + + // If there is nothing to compute, return. 
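// (Editorial summary of the buffering scheme that follows: the batch is
// processed im2col_step_ images at a time. col_buffer has shape
// [filter.dim_size(1) * filter_rows * filter_cols, im2col_step_, out_rows,
// out_cols]; DeformableConv2DIm2Col fills it with offset- and mask-modulated
// bilinear samples, and LaunchBatchMatMul contracts the filter reshaped to
// [num_groups, M, K] (M = out_channels / num_groups, K = kernel_dim_) with
// the buffer viewed as [num_groups, K, N] (N = im2col_step_ * out_rows *
// out_cols), writing each chunk directly into the pre-zeroed output. The
// Chinese comment above ShapeFromFormat records a pitfall: its arguments
// must be passed as (data_format, N, H, W, C); with NCHW data and the
// dimensions supplied in a different order the resulting shape comes out
// transposed.)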
+ if (out_shape.num_elements() == 0) { + return; + } + + /** + * from here i stop use the traditional convolution implement of the + * official code which was defined in conv_ops.cc and began to use the + * implement of the deformable conv2d of the msra version + * **/ + LayerSetUp(input_shape, filter_shape, offset_shape, mask_shape, out_shape); + // notice the fact that the flat function return a reference of a pointer, + // but in fact we only need a pointer + const T *in_data_ptr = input.template flat().data(); + const T *offset_ptr = offset.template flat().data(); + const T *mask_ptr = mask.template flat().data(); + const Device &d = context->eigen_device(); + int col_buffer_shape_temp[4]; // calculate the shape of col_buffer, + // mxnet源码是 + 1, 多了一个im2col_step_ + col_buffer_shape_temp[0] = ProdShape( + filter_shape, 1, + filter_shape + .dims()); // 卷积核的参数个数,注意卷积核的形状应该是[out_depth, + // in_depth, height, weight] + col_buffer_shape_temp[1] = im2col_step_; + col_buffer_shape_temp[2] = out_shape.dim_size(2); + col_buffer_shape_temp[3] = out_shape.dim_size(3); + TensorShape col_buffer_shape = + TensorShape({col_buffer_shape_temp[0], col_buffer_shape_temp[1], + col_buffer_shape_temp[2], col_buffer_shape_temp[3]}); + + Tensor col_buffer; + OP_REQUIRES_OK(context, + context->allocate_temp(DataTypeToEnum::value, + col_buffer_shape, &col_buffer)); + T *col_buffer_ptr = col_buffer.template flat().data(); + + int32_t M = conv_out_channels_ / group_; // filter的数量 + int32_t N = im2col_step_ * conv_out_spatial_dim_; + int32_t K = kernel_dim_; // 卷积的参数个数 + + Tensor weight_3d; + TensorShape weight_3d_shape = TensorShape({group_, M, K}); + OP_REQUIRES(context, weight_3d.CopyFrom(filter, weight_3d_shape), + errors::InvalidArgument("shape doesn't match")); + T *weight_3d_ptr = weight_3d.template flat().data(); + + Tensor *output_temp_4d = NULL; + OP_REQUIRES_OK(context, + context->allocate_output(0, out_shape, &output_temp_4d)); + auto output_temp_4d_ptr = output_temp_4d->template flat().data(); + // auto output__ptr = output_temp_4d->flat(); + /** + * 这样的话下面计算矩阵乘法的时候直接就写到这个输出里了 + * 但是注意的是作者实现的时候划分step,这个时候其实是往shape为{num_ + * / im2col_step_, group_, M, + * N}的输出里写的,所以最后一定要置换一下维度的位置 + * **/ + SetZeros()(d, ProdShape(out_shape, 0, out_shape.dims()), + output_temp_4d_ptr); + TShape pads; + pads.push_back(dimensions.pad_rows); + pads.push_back(dimensions.pad_cols); + for (int32_t n = 0; n < num_ / im2col_step_; ++n) { // 分batch进行 + // transform image to col_buffer in order to use gemm + DeformableConv2DIm2Col()( + d, + in_data_ptr + + n * im2col_step_ * + input_dim_, // dptr是获取输入数据的指针 + n * im2col_step_* + // input_dim 是让指针向后移动 一张图片的数据 + offset_ptr + n * im2col_step_ * input_offset_dim_, // + mask_ptr + n * im2col_step_ * input_mask_dim_, ToVector(input_shape), + ToVector(col_buffer_shape), SubVector(filter_shape, 2, 4), pads, + SubVector(params_.strides, 2, 4), SubVector(params_.dilations, 2, 4), + params_.deformable_groups, col_buffer_ptr); + TensorShape col_buffer_3d_shape = TensorShape({group_, K, N}); + + auto output_temp_group_ptr = output_temp_4d_ptr + (n * group_ * M * N); + + LaunchBatchMatMul::launch( + context, weight_3d_shape, col_buffer_3d_shape, weight_3d_ptr, + col_buffer_ptr, false, false, output_temp_group_ptr); + } + } + + private: + DeformableConv2DParameters params_; + bool use_cudnn_; + bool cudnn_use_autotune_; + int32_t channel_axis_; // channel axis of the input + int32_t channels_; // number of channels of input image + int32_t num_spatial_axes_; // number of spatial axes + int32_t 
num_; // batch size + int32_t group_; // number of groups + int32_t conv_out_channels_; // number of output channels (num_filter) + int32_t + conv_out_spatial_dim_; // number of pixels of output images per channel + int32_t conv_in_channels_; // number of input channels + int32_t kernel_dim_; // number of input channels per group * kernel size + int32_t weight_offset_; // number of output channels per group * kernel_dim_ + int32_t col_offset_; + int32_t output_offset_; + int32_t col_buffer_size_; + int32_t input_dim_; + int32_t input_offset_dim_; + int32_t input_mask_dim_; + int32_t output_dim_; + int32_t num_kernels_im2col_; + int32_t num_kernels_col2im_; + int32_t im2col_step_; + bool bias_term_; // has bias term? + bool is_1x1_; + void LayerSetUp(const TensorShape &ishape, const TensorShape &filter_shape, + const TensorShape &offset_shape, + const TensorShape &mask_shape, const TensorShape &oshape) { + channel_axis_ = 1; // hard code channel axis, fixed the input data_format + const int32_t first_spatial_axis = channel_axis_ + 1; + const int32_t num_axes = filter_shape.dims(); + num_spatial_axes_ = + num_axes - + first_spatial_axis; //表示的是空间坐标个数,比如说2维卷积里,就是2, + // 3维卷积里就是3 + is_1x1_ = true; // 判断是否为1x1卷积 + for (int32_t i = 2; i < filter_shape.dims(); ++i) { + // is_1x1_ &= filter_shape.dim_size(i) == 1 && params_.stride[i] == 1 && + // params_.pad[i] == 0; + is_1x1_ &= + filter_shape.dim_size(i) == 1; // only judge by the filter's shape + if (!is_1x1_) break; + } + num_ = ishape.dim_size(0); // batch size + channels_ = ishape.dim_size(1); // number of input channels + group_ = params_.num_groups; // + conv_out_channels_ = filter_shape.dim_size(0); // output channel nums + conv_in_channels_ = channels_; // input channel nums + bias_term_ = !params_.no_bias; // + kernel_dim_ = + conv_in_channels_ / group_ * filter_shape.dim_size(2) * + filter_shape.dim_size( + 3); // Size()返回tensor中元素个数,即各维度大小的乘积,所以这里的kernel_dim的意思是卷积核的参数个数了. 
+ conv_out_spatial_dim_ = ProdShape( + oshape, 2, + oshape + .dims()); // ProdShape(dimstart, dimend)返回指定维度大小乘积, + // 这个变量代表每个通道的像素点个数, + // oshape.ndim()返回这个shape的维度,假设是NCHW那么返回4,则为 + // H * W, + // col_offset_ = kernel_dim_ * + // conv_out_spatial_dim_;//kernel_dim代表一个卷积核参数的个数,conv_out_spatial_dim_相当于特征图上的坐标个数,那这个变量相当于总共需要的偏移量 + // weight_offset_ = conv_out_channels_ * kernel_dim_ / + // group_;//这里应该是所有的权重的个数,也就是需要求的权重偏移的个数 + // output_offset_ = conv_out_channels_ * conv_out_spatial_dim_ / + // group_;//这里是输出通道数乘上每个通道的像素点的个数,所以结果应该是输出的总维度,就是C*H*W + im2col_step_ = std::min(params_.im2col_step, num_); + col_buffer_size_ = + kernel_dim_ * group_ * im2col_step_ * + conv_out_spatial_dim_; // 开辟的缓存大小// size of the column buffer + // used for storing im2col-ed pixels + + input_dim_ = ProdShape( + ishape, 1, + ishape.dims()); // input image size (#channels * height * width) + input_offset_dim_ = + ProdShape(offset_shape, 1, offset_shape.dims()); // 18 * H * W + input_mask_dim_ = ProdShape(mask_shape, 1, mask_shape.dims()); // 9 * H * W + output_dim_ = ProdShape(oshape, 1, oshape.dims()); //输出的元素个数 + + num_kernels_im2col_ = + conv_in_channels_ * + conv_out_spatial_dim_; //如果输出和输入的分辨率不变的话,代表输入数据的dim,我个人觉得就是把整个输入展开为一个一维向量,在求其维度大小 + num_kernels_col2im_ = input_dim_; //输入数据的dim + } +}; + +template +class DeformableConv2DBackPropOp : public OpKernel { + public: + explicit DeformableConv2DBackPropOp(OpKernelConstruction *context) + : OpKernel(context) { + OP_REQUIRES_OK(context, InitDeformableConv2DParameters(context, ¶ms_)); + } + void Compute(OpKernelContext *ctx) override { + const Tensor &x = ctx->input(0); + const TensorShape &x_shape = x.shape(); + const Tensor &filter = ctx->input(1); + const TensorShape &filter_shape = filter.shape(); + const Tensor &offset = ctx->input(2); + const TensorShape &offset_shape = offset.shape(); + const Tensor &mask = ctx->input(3); + const TensorShape &mask_shape = mask.shape(); + const Tensor &out_grad = ctx->input(4); + const TensorShape &out_grad_shape = out_grad.shape(); + DeformableConv2DDimensions dimensions; + OP_REQUIRES_OK(ctx, ComputeDeformableConv2DDimension(params_, x, filter, + &dimensions, 1)); + LayerSetUp(x_shape, filter_shape, offset_shape, mask_shape, out_grad_shape); + const Device &d = ctx->eigen_device(); + int col_buffer_shape_temp[4]; + col_buffer_shape_temp[0] = ProdShape(filter_shape, 1, filter_shape.dims()); + col_buffer_shape_temp[1] = im2col_step_; + col_buffer_shape_temp[2] = out_grad_shape.dim_size(2); + col_buffer_shape_temp[3] = out_grad_shape.dim_size(3); + TensorShape col_buffer_shape = + TensorShape({col_buffer_shape_temp[0], col_buffer_shape_temp[1], + col_buffer_shape_temp[2], col_buffer_shape_temp[3]}); + int32_t M = kernel_dim_; + int32_t N = im2col_step_ * conv_out_spatial_dim_; + int32_t K = conv_out_channels_ / group_; + const auto x_ptr = x.template flat().data(); + const auto offset_ptr = offset.template flat().data(); + const auto mask_ptr = mask.template flat().data(); + const auto weight_3d_ptr = filter.template flat().data(); + TensorShape weight_3d_shape = TensorShape({group_, K, M}); + Tensor out_grad_4d; + TensorShape out_grad_4d_shape = + TensorShape({num_ / im2col_step_, im2col_step_, conv_out_channels_, + conv_out_spatial_dim_}); + OP_REQUIRES(ctx, out_grad_4d.CopyFrom(out_grad, out_grad_4d_shape), + errors::InvalidArgument("shape doesn't match")); + auto out_grad_4d_ptr = out_grad_4d.template flat().data(); + out_grad_4d_shape = TensorShape({num_ / im2col_step_, group_, K, N}); + Tensor col_buffer; + 
OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::value, + col_buffer_shape, &col_buffer)); + auto col_buffer_3d_ptr = col_buffer.template flat().data(); + TensorShape col_buffer_3d_shape = TensorShape({group_, M, N}); + Tensor *dweight_3d = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, filter_shape, &dweight_3d)); + T *dweight_3d_ptr = dweight_3d->template flat().data(); + Tensor *x_grad = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, x_shape, &x_grad)); + T *x_grad_ptr = x_grad->template flat().data(); + Tensor *offset_grad = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(2, offset_shape, &offset_grad)); + T *offset_grad_ptr = offset_grad->template flat().data(); + + Tensor *mask_grad = nullptr; + OP_REQUIRES_OK(ctx, ctx->allocate_output(3, mask_shape, &mask_grad)); + T *mask_grad_ptr = mask_grad->template flat().data(); + TShape pads; + pads.push_back(dimensions.pad_rows); + pads.push_back(dimensions.pad_cols); + TShape kernel_shape = SubVector(filter_shape, 2, 4); + TShape stride_shape = SubVector(params_.strides, 2, 4); + TShape dilation_shape = SubVector(params_.dilations, 2, 4); + Tensor dweight_3d_temp; + OP_REQUIRES_OK(ctx, ctx->allocate_temp(DataTypeToEnum::value, + filter_shape, &dweight_3d_temp)); + T *dweight_3d_temp_ptr = dweight_3d_temp.template flat().data(); + SetZeros()(d, group_ * M * N, col_buffer_3d_ptr); + SetZeros()(d, ProdShape(x_shape, 0, x_shape.dims()), x_grad_ptr); + SetZeros()(d, ProdShape(filter_shape, 0, filter_shape.dims()), + dweight_3d_ptr); + SetZeros()(d, ProdShape(filter_shape, 0, filter_shape.dims()), + dweight_3d_temp_ptr); + for (int n = 0; n < num_ / im2col_step_; ++n) { + TensorShape out_grad_3d_shape = TensorShape({group_, K, N}); + T *out_grad_3d_ptr = out_grad_4d_ptr + n * group_ * K * N; + LaunchBatchMatMul::launch( + ctx, weight_3d_shape, out_grad_3d_shape, weight_3d_ptr, + out_grad_3d_ptr, true, false, col_buffer_3d_ptr); + DeformableConv2DCol2ImCoord()( + d, col_buffer_3d_ptr, x_ptr + n * im2col_step_ * input_dim_, + offset_ptr + n * im2col_step_ * input_offset_dim_, + mask_ptr + n * im2col_step_ * input_mask_dim_, ToVector(x_shape), + ToVector(col_buffer_shape), kernel_shape, pads, stride_shape, + dilation_shape, params_.deformable_groups, + offset_grad_ptr + n * im2col_step_ * input_offset_dim_, + mask_grad_ptr + n * im2col_step_ * input_mask_dim_); + DeformableConv2DCol2Im()( + d, col_buffer_3d_ptr, + offset_ptr + n * im2col_step_ * input_offset_dim_, + mask_ptr + n * im2col_step_ * input_mask_dim_, ToVector(x_shape), + ToVector(col_buffer_shape), kernel_shape, pads, stride_shape, + dilation_shape, params_.deformable_groups, + x_grad_ptr + n * im2col_step_ * input_dim_); + DeformableConv2DIm2Col()( + d, x_ptr + n * im2col_step_ * input_dim_, + offset_ptr + n * im2col_step_ * input_offset_dim_, + mask_ptr + n * im2col_step_ * input_mask_dim_, ToVector(x_shape), + ToVector(col_buffer_shape), kernel_shape, pads, stride_shape, + dilation_shape, params_.deformable_groups, col_buffer_3d_ptr); + if (n == 0) { + LaunchBatchMatMul::launch( + ctx, out_grad_3d_shape, col_buffer_3d_shape, out_grad_3d_ptr, + col_buffer_3d_ptr, false, true, dweight_3d_ptr); + } else { + LaunchBatchMatMul::launch( + ctx, out_grad_3d_shape, col_buffer_3d_shape, out_grad_3d_ptr, + col_buffer_3d_ptr, false, true, dweight_3d_temp_ptr); + PureAddTo()(d, + ProdShape(filter_shape, 0, filter_shape.dims()), + dweight_3d_ptr, dweight_3d_temp_ptr); + } + } + } + + private: + DeformableConv2DParameters params_; + // bool use_cudnn_; + // bool 
cudnn_use_autotune_; + int32_t channel_axis_; // channel axis of the input + int32_t channels_; // number of channels of input image + int32_t num_spatial_axes_; // number of spatial axes + int32_t num_; // batch size + int32_t group_; // number of groups + int32_t conv_out_channels_; // number of output channels (num_filter) + int32_t + conv_out_spatial_dim_; // number of pixels of output images per channel + int32_t conv_in_channels_; // number of input channels + int32_t kernel_dim_; // number of input channels per group * kernel size + int32_t weight_offset_; // number of output channels per group * kernel_dim_ + int32_t col_offset_; + int32_t output_offset_; + int32_t col_buffer_size_; + int32_t input_dim_; + int32_t input_offset_dim_; + int32_t input_mask_dim_; + int32_t output_dim_; + int32_t num_kernels_im2col_; + int32_t num_kernels_col2im_; + int32_t im2col_step_; + bool bias_term_; // has bias term? + bool is_1x1_; + void LayerSetUp(const TensorShape &ishape, const TensorShape &filter_shape, + const TensorShape &offset_shape, + const TensorShape &mask_shape, const TensorShape &oshape) { + channel_axis_ = 1; // hard code channel axis, fixed the input data_format + const int32_t first_spatial_axis = channel_axis_ + 1; + const int32_t num_axes = filter_shape.dims(); + num_spatial_axes_ = + num_axes - + first_spatial_axis; //表示的是空间坐标个数,比如说2维卷积里,就是2, + // 3维卷积里就是3 + is_1x1_ = true; // 判断是否为1x1卷积 + for (int32_t i = 2; i < filter_shape.dims(); ++i) { + // is_1x1_ &= filter_shape.dim_size(i) == 1 && params_.stride[i] == 1 && + // params_.pad[i] == 0; + is_1x1_ &= + filter_shape.dim_size(i) == 1; // only judge by the filter's shape + if (!is_1x1_) break; + } + num_ = ishape.dim_size(0); // batch size + channels_ = ishape.dim_size(1); // number of input channels + group_ = params_.num_groups; // + conv_out_channels_ = filter_shape.dim_size(0); // output channel nums + conv_in_channels_ = channels_; // input channel nums + bias_term_ = !params_.no_bias; // + kernel_dim_ = + conv_in_channels_ / group_ * filter_shape.dim_size(2) * + filter_shape.dim_size( + 3); // Size()返回tensor中元素个数,即各维度大小的乘积,所以这里的kernel_dim的意思是卷积核的参数个数了. 
+ conv_out_spatial_dim_ = ProdShape( + oshape, 2, + oshape + .dims()); // ProdShape(dimstart, dimend)返回指定维度大小乘积, + // 这个变量代表每个通道的像素点个数, + // oshape.ndim()返回这个shape的维度,假设是NCHW那么返回4,则为 + // H * W, + col_offset_ = + kernel_dim_ * + conv_out_spatial_dim_; // kernel_dim代表一个卷积核参数的个数,conv_out_spatial_dim_相当于特征图上的坐标个数,那这个变量相当于总共需要的偏移量 + weight_offset_ = + conv_out_channels_ * kernel_dim_ / + group_; //这里应该是所有的权重的个数,也就是需要求的权重偏移的个数 + output_offset_ = + conv_out_channels_ * conv_out_spatial_dim_ / + group_; //这里是输出通道数乘上每个通道的像素点的个数,所以结果应该是输出的总维度,就是C*H*W + im2col_step_ = std::min(params_.im2col_step, num_); + col_buffer_size_ = + kernel_dim_ * group_ * im2col_step_ * + conv_out_spatial_dim_; // 开辟的缓存大小// size of the column buffer + // used for storing im2col-ed pixels + + input_dim_ = ProdShape( + ishape, 1, + ishape.dims()); // input image size (#channels * height * width) + input_offset_dim_ = + ProdShape(offset_shape, 1, offset_shape.dims()); // 18 * H * W + input_mask_dim_ = ProdShape(mask_shape, 1, mask_shape.dims()); // 9 * H * W + output_dim_ = ProdShape(oshape, 1, oshape.dims()); //输出的元素个数 + + num_kernels_im2col_ = + conv_in_channels_ * + conv_out_spatial_dim_; //如果输出和输入的分辨率不变的话,代表输入数据的dim,我个人觉得就是把整个输入展开为一个一维向量,在求其维度大小 + num_kernels_col2im_ = input_dim_; //输入数据的dim + }; +}; + +template +class DeformablePSROIPoolOp : public OpKernel { + public: + explicit DeformablePSROIPoolOp(OpKernelConstruction *ctx) : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("pooled_size", &pool_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("no_trans", &no_trans)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("spatial_scale", &spatial_scale)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_dim", &output_dim)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("group_size", &group_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("part_size", &part_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("sample_per_part", &sample_per_part)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("trans_std", &trans_std)); + } + void Compute(OpKernelContext *ctx) override { + const Tensor &data = ctx->input(0); + const Tensor &bbox = ctx->input(1); + const Tensor &trans = ctx->input(2); + const int batch = data.dim_size(0); + const int channels = data.dim_size(1); + const int height = data.dim_size(2); + const int width = data.dim_size(3); + const int channels_trans = no_trans ? 2 : trans.dim_size(1); + const int num_bbox = bbox.dim_size(0); + Tensor *output; + Tensor *top_count; + const int pooled_width = pool_size; + const int pooled_height = pool_size; + const int count = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = + no_trans ? output_dim : output_dim / num_classes; + TensorShape output_shape{num_bbox, output_dim, pooled_height, pooled_width}; + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output)); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, output_shape, &top_count)); + const Type *bottom_data = data.flat().data(); + const Type *bottom_rois = bbox.flat().data(); + const Type *bottom_trans = no_trans ? 
nullptr : trans.flat().data(); + Type *top_data = output->flat().data(); + Type *top_count_data = top_count->flat().data(); + const Device &d = ctx->eigen_device(); + DeformablePSROIPoolForward()( + d, count, bottom_data, spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, output_dim, group_size, part_size, + num_classes, channels_each_class, top_data, top_count_data); + } + + private: + int pool_size; + int no_trans; + float spatial_scale; + int output_dim; + int group_size; + int part_size; + int sample_per_part; + float trans_std; +}; + +template +class DeformablePSROIPoolBackPropOp : public OpKernel { + public: + explicit DeformablePSROIPoolBackPropOp(OpKernelConstruction *ctx) + : OpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("pooled_size", &pool_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("no_trans", &no_trans)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("spatial_scale", &spatial_scale)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_dim", &output_dim)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("group_size", &group_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("part_size", &part_size)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("sample_per_part", &sample_per_part)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("trans_std", &trans_std)); + } + void Compute(OpKernelContext *ctx) override { + const Tensor &data = ctx->input(0); + const Tensor &bbox = ctx->input(1); + const Tensor &trans = ctx->input(2); + const Tensor &top_count = ctx->input(3); + const Tensor &out_grad = ctx->input(4); + const int batch = data.dim_size(0); + const int channels = data.dim_size(1); + const int height = data.dim_size(2); + const int width = data.dim_size(3); + const int channels_trans = no_trans ? 2 : trans.dim_size(1); + const int num_bbox = bbox.dim_size(0); + const int num_rois = num_bbox; + const int pooled_height = pool_size; + const int pooled_width = pool_size; + const int count = num_bbox * output_dim * pooled_height * pooled_width; + const int num_classes = no_trans ? 1 : channels_trans / 2; + const int channels_each_class = + no_trans ? output_dim : output_dim / num_classes; + Tensor *in_grad = nullptr; + Tensor *trans_grad = nullptr; + const TensorShape &in_grad_shape = data.shape(); + OP_REQUIRES_OK(ctx, ctx->allocate_output(0, in_grad_shape, &in_grad)); + TensorShape trans_grad_shape; + const Type *top_diff = out_grad.flat().data(); + const Type *bottom_data = data.flat().data(); + const Type *bottom_rois = bbox.flat().data(); + trans_grad_shape = trans.shape(); + OP_REQUIRES_OK(ctx, ctx->allocate_output(1, trans_grad_shape, &trans_grad)); + const Type *bottom_trans = no_trans ? nullptr : trans.flat().data(); + Type *bottom_data_diff = in_grad->flat().data(); + Type *bottom_trans_diff = + no_trans ? 
nullptr : trans_grad->flat().data(); + const Type *top_count_data = top_count.flat().data(); + const Device &d = ctx->eigen_device(); + DeformablePSROIPoolBackwardKernel()( + d, count, top_diff, top_count_data, num_rois, spatial_scale, channels, + height, width, pooled_height, pooled_width, output_dim, + bottom_data_diff, bottom_trans_diff, bottom_data, bottom_rois, + bottom_trans, no_trans, trans_std, sample_per_part, group_size, + part_size, num_classes, channels_each_class); + } + + private: + int pool_size; + int no_trans; + float spatial_scale; + int output_dim; + int group_size; + int part_size; + int sample_per_part; + float trans_std; +}; + +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformableConv2D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + DeformableConv2DOp); \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformableConv2DBackProp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + DeformableConv2DBackPropOp); +REGISTER_CPU(float); +REGISTER_CPU(double); +#undef REGISTER_CPU +#define REGISTER_CPU(T) \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformablePsroiPool") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + DeformablePSROIPoolOp); \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformablePsroiPoolBackProp") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T"), \ + DeformablePSROIPoolBackPropOp); +REGISTER_CPU(float); +REGISTER_CPU(double); +#undef REGISTER_CPU + +#ifdef GOOGLE_CUDA +#define REGISTER_GPU(T) \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformableConv2D") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T"), \ + DeformableConv2DOp); \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformableConv2DBackProp") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T"), \ + DeformableConv2DBackPropOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#undef REGISTER_GPU +#define REGISTER_GPU(T) \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformablePsroiPool") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T"), \ + DeformablePSROIPoolOp); \ + REGISTER_KERNEL_BUILDER(Name("AddonsDeformablePsroiPoolBackProp") \ + .Device(DEVICE_GPU) \ + .TypeConstraint("T"), \ + DeformablePSROIPoolBackPropOp); +REGISTER_GPU(float); +REGISTER_GPU(double); +#endif + +} // namespace functor +} // namespace addons +} // namespace tensorflow diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h new file mode 100755 index 0000000000..04413f1f60 --- /dev/null +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h @@ -0,0 +1,317 @@ + +#ifndef TF_OPS_DEFORMABLE_CONV2D_H +#define TF_OPS_DEFORMABLE_CONV2D_H + +#ifdef __JETBRAINS_IDE__ +#define __host__ +#define __device__ +#define __shared__ +#define __constant__ +#define __global__ + +// This is slightly mental, but gets it to properly index device function calls +// like __popc and whatever. +//#define __CUDACC__ + +// These headers are all implicitly present when you compile CUDA with clang. +// Clion doesn't know that, so we include them explicitly to make the indexer +// happy. 
Doing this when you actually build is, obviously, a terrible idea :D +//#include <__clang_cuda_builtin_vars.h> +//#include <__clang_cuda_intrinsics.h> +//#include <__clang_cuda_math_forward_declares.h> +//#include <__clang_cuda_complex_builtins.h> +//#include <__clang_cuda_cmath.h> +#endif // __JETBRAINS_IDE__ + +#define EIGEN_USE_THREADS +#define EIGEN_USE_GPU + +#include +#include + +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/util/padding.h" +#include "tensorflow/core/util/tensor_format.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" + +namespace tensorflow { +namespace addons { + +namespace functor { + +using TShape = std::vector; + +typedef Eigen::GpuDevice GPUDevice; +typedef Eigen::ThreadPoolDevice CPUDevice; + +inline int ProdShape(const TShape &shape, int start, int end) { + int res = 1; + for (int i = start; i < end; ++i) { + res *= shape[i]; + } + return res; +} +inline int ProdShape(const TensorShape &shape, int start, int end) { + int res = 1; + for (int i = start; i < end; ++i) { + res *= shape.dim_size(i); + } + return res; +} + +template +struct PureAddTo { + void operator()(const Device &d, const int n, DType *result_data, + const DType *right_data); +}; +struct DeformableConv2DParameters { + TShape dilations; + TShape strides; + Padding padding; + int32_t num_groups; + int32_t deformable_groups; + int32_t im2col_step; + bool no_bias; + TensorFormat data_format; +}; +struct DeformableConv2DDimensions { + int batch; + int input_rows; + int input_cols; + int in_depth; + int filter_rows; + int filter_cols; + int patch_depth; + int out_depth; + int stride_rows; + int stride_cols; + int dilation_rows; + int dilation_cols; + int out_rows; + int out_cols; + int pad_rows; + int pad_cols; +}; +template +struct LaunchBatchMatMul; + +template +struct DeformableConv2DCol2ImCoord { + void operator()(const Device &d, const DType *data_col, const DType *data_im, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *grad_offset, + DType *grad_mask); +}; +template +struct SwapAxis { + void operator()(const Device &d, DType *input_data, + const TShape &origin_shape, const int axis_x, + const int axis_y); +}; +template +struct DeformableConv2DCol2Im { + void operator()(const Device &d, const DType *data_col, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *grad_im); +}; +template +struct DeformableConv2DIm2Col { + void operator()(const Device &d, const DType *data_im, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *data_col); +}; +template +struct SetZeros { + void operator()(const Device &d, int n, DType *result_data); +}; +template +struct SetOne { + void operator()(const Device &d, int n, DType *result_data); +}; +template +struct SetNumAtIndex { + void operator()(const Device &d, DType num, int index, DType *data); +}; 
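+// Usage sketch (for orientation only; it mirrors the calls made in
+// deformable_conv_op.cc, assuming the shape vectors produced there by
+// LayerSetUp):
+//
+//   DeformableConv2DIm2Col<Device, T>()(
+//       d, in_data_ptr, offset_ptr, mask_ptr, ToVector(input_shape),
+//       ToVector(col_buffer_shape), SubVector(filter_shape, 2, 4), pads,
+//       SubVector(params_.strides, 2, 4), SubVector(params_.dilations, 2, 4),
+//       params_.deformable_groups, col_buffer_ptr);
+//
+// The declarations guarded by GOOGLE_CUDA below are the GPU specializations;
+// the Eigen::ThreadPoolDevice declarations that follow them are the CPU
+// counterparts.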
+#ifdef GOOGLE_CUDA +template +struct DeformableConv2DIm2Col { + void operator()(const Eigen::GpuDevice &d, const DType *data_im, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *data_col); +}; +template +struct DeformableConv2DCol2Im { + void operator()(const Eigen::GpuDevice &d, const DType *data_col, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *grad_im); +}; +template +struct DeformableConv2DCol2ImCoord { + void operator()(const Eigen::GpuDevice &d, const DType *data_col, + const DType *data_im, const DType *data_offset, + const DType *data_mask, const TShape &im_shape, + const TShape &col_shape, const TShape &kernel_shape, + const TShape &pad, const TShape &stride, + const TShape &dilation, const int32_t deformable_group, + DType *grad_offset, DType *grad_mask); +}; +template +struct SetNumAtIndex { + void operator()(const Eigen::GpuDevice &d, DType num, int index, DType *data); +}; +template +struct SetZeros { + void operator()(const Eigen::GpuDevice &d, int n, DType *result_data); +}; +template +struct SetOne { + void operator()(const Eigen::GpuDevice &d, int n, DType *result_data); +}; +template +struct PureAddTo { + void operator()(const Eigen::GpuDevice &d, const int n, DType *result_data, + const DType *right_data); +}; +#endif +template +struct DeformableConv2DIm2Col { + void operator()(const Eigen::ThreadPoolDevice &d, const DType *data_im, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *data_col); +}; +template +struct DeformableConv2DCol2Im { + void operator()(const Eigen::ThreadPoolDevice &d, const DType *data_col, + const DType *data_offset, const DType *data_mask, + const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *grad_im); +}; +template +struct DeformableConv2DCol2ImCoord { + void operator()(const Eigen::ThreadPoolDevice &d, const DType *data_col, + const DType *data_im, const DType *data_offset, + const DType *data_mask, const TShape &im_shape, + const TShape &col_shape, const TShape &kernel_shape, + const TShape &pad, const TShape &stride, + const TShape &dilation, const int32_t deformable_group, + DType *grad_offset, DType *grad_mask); +}; +template +struct SetNumAtIndex { + void operator()(const Eigen::ThreadPoolDevice &d, DType num, int index, + DType *data); +}; +template +struct SetZeros { + void operator()(const Eigen::ThreadPoolDevice &d, int n, DType *result_data); +}; +template +struct SetOne { + void operator()(const Eigen::ThreadPoolDevice &d, int n, DType *result_data); +}; +template +struct PureAddTo { + void operator()(const Eigen::ThreadPoolDevice &d, const int n, + DType *result_data, const DType *right_data); +}; + +template +struct LaunchBatchMatMul { + static void launch(OpKernelContext *context, const TensorShape &in_x_shape, + const TensorShape &in_y_shape, const T *in_x_ptr, + const T *in_y_ptr, bool adj_x, bool adj_y, T *out); +}; +template 
+struct LaunchBatchMatMul { + static void launch(OpKernelContext *context, const TensorShape &in_x_shape, + const TensorShape &in_y_shape, const T *in_x_ptr, + const T *in_y_ptr, bool adj_x, bool adj_y, T *out); +}; + +template +struct DeformablePSROIPoolForward {}; +template +struct DeformablePSROIPoolForward { + void operator()(const CPUDevice &d, const int count, const T *bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, + const int pooled_width, const T *bottom_rois, + const T *bottom_trans, const int no_trans, const T trans_std, + const int sample_per_part, const int output_dim, + const int group_size, const int part_size, + const int num_classes, const int channels_each_class, + T *top_data, T *top_count); +}; +template +struct DeformablePSROIPoolForward { + void operator()(const GPUDevice &d, const int count, const T *bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, + const int pooled_width, const T *bottom_rois, + const T *bottom_trans, const int no_trans, const T trans_std, + const int sample_per_part, const int output_dim, + const int group_size, const int part_size, + const int num_classes, const int channels_each_class, + T *top_data, T *top_count); +}; + +template +struct DeformablePSROIPoolBackwardKernel {}; + +template +struct DeformablePSROIPoolBackwardKernel { + void operator()(const GPUDevice &d, const int count, const T *top_diff, + const T *top_count, const int num_rois, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, T *bottom_data_diff, + T *bottom_trans_diff, const T *bottom_data, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, const T trans_std, + const int sample_per_part, const int group_size, + const int part_size, const int num_classes, + const int channels_each_class); +}; + +template +struct DeformablePSROIPoolBackwardKernel { + void operator()(const CPUDevice &d, const int count, const T *top_diff, + const T *top_count, const int num_rois, const T spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const int output_dim, T *bottom_data_diff, + T *bottom_trans_diff, const T *bottom_data, + const T *bottom_rois, const T *bottom_trans, + const int no_trans, const T trans_std, + const int sample_per_part, const int group_size, + const int part_size, const int num_classes, + const int channels_each_class); +}; +} // namespace functor +} // namespace addons +} // namespace tensorflow + +#endif // TF_OPS_DEFORMABLE_CONV2D_H diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc new file mode 100755 index 0000000000..d7f52aaa12 --- /dev/null +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc @@ -0,0 +1,995 @@ + +#include +#include + +#include "tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h" + +#ifdef GOOGLE_CUDA +#include "tensorflow/core/platform/stream_executor.h" +#include "tensorflow/core/util/gpu_kernel_helper.h" +#endif + +namespace tensorflow { +namespace addons { + +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; +typedef Eigen::ThreadPoolDevice CPUDevice; + +template +__device__ DType DmcnIm2colBilinear(const DType *bottom_data, + const int data_width, 
const int height, + const int width, DType h, DType w) { + int h_low = floor(h); + int w_low = floor(w); + int h_high = h_low + 1; + int w_high = w_low + 1; + + DType lh = h - h_low; + DType lw = w - w_low; + DType hh = 1 - lh, hw = 1 - lw; + + DType v1 = 0; + if (h_low >= 0 && w_low >= 0) v1 = bottom_data[h_low * data_width + w_low]; + DType v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + v2 = bottom_data[h_low * data_width + w_high]; + DType v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + v3 = bottom_data[h_high * data_width + w_low]; + DType v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + v4 = bottom_data[h_high * data_width + w_high]; + + DType w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + DType val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} +template +__device__ DType DmcnGetGradientWeight(DType argmax_h, DType argmax_w, + const int h, const int w, + const int height, const int width) { + /* + * offset h, offset w, (h, w) coordinate + */ + if (argmax_h <= -1 || argmax_w <= -1 || argmax_h >= height || + argmax_w >= width) { + return 0; + } + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + DType weight = 0; + if (h == argmax_h_low && w == argmax_w_low) + weight = (h + 1 - argmax_h) * (w + 1 - argmax_w); + if (h == argmax_h_low && w == argmax_w_high) + weight = (h + 1 - argmax_h) * (argmax_w + 1 - w); + if (h == argmax_h_high && w == argmax_w_low) + weight = (argmax_h + 1 - h) * (w + 1 - argmax_w); + if (h == argmax_h_high && w == argmax_w_high) + weight = (argmax_h + 1 - h) * (argmax_w + 1 - w); + return weight; +} +template +__device__ DType DmcnGetCoordinateWeight(DType argmax_h, DType argmax_w, + const int height, const int width, + const DType *im_data, + const int data_width, + const int bp_dir) { + if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || + argmax_w >= width) { + // empty + return 0; + } + + int argmax_h_low = floor(argmax_h); + int argmax_w_low = floor(argmax_w); + int argmax_h_high = argmax_h_low + 1; + int argmax_w_high = argmax_w_low + 1; + + DType weight = 0; + + if (bp_dir == 0) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += -1 * (argmax_w - argmax_w_low) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += (argmax_w_low + 1 - argmax_w) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_w - argmax_w_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } else if (bp_dir == 1) { + if (argmax_h_low >= 0 && argmax_w_low >= 0) + weight += -1 * (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_low]; + if (argmax_h_low >= 0 && argmax_w_high <= width - 1) + weight += (argmax_h_low + 1 - argmax_h) * + im_data[argmax_h_low * data_width + argmax_w_high]; + if (argmax_h_high <= height - 1 && argmax_w_low >= 0) + weight += -1 * (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_low]; + if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) + weight += (argmax_h - argmax_h_low) * + im_data[argmax_h_high * data_width + argmax_w_high]; + } + + return weight; +} + +#ifdef GOOGLE_CUDA +template +__global__ void 
SwapAxisKernel(const int n, const int cuda_mem_size, + const int min_unit_size, DType *input_data, + const int dim_num, const int axis_x_dims, + const int axis_y_dims, const int axis_x, + const int axis_y) { + CUDA_1D_KERNEL_LOOP(index, n) { + DType *device_data = new DType[cuda_mem_size]; + DType *input_data_ptr = input_data + index * cuda_mem_size; + for (int j = 0; j < axis_y_dims; j++) { + for (int i = 0; i < axis_x_dims; i++) { + DType *temp_ptr = + input_data_ptr + (i * axis_x_dims + j) * min_unit_size; + DType *device_data_temp_ptr = + device_data + (j * axis_y_dims + i) * min_unit_size; + for (int k = 0; k < min_unit_size; k++) { + *(device_data_temp_ptr + k) = *(temp_ptr + k); + } + } + } + for (int i = 0; i < cuda_mem_size; i++) { + *(input_data_ptr + i) = *(device_data + i); + } + delete[] device_data; + } +} +template +__global__ void DeformableConv2DIm2ColKernel( + const int n, const DType *data_im, const DType *data_offset, + const DType *data_mask, const int height, const int width, + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int num_channels, const int deformable_group, + const int height_col, const int width_col, DType *data_col) { + /* + * channel_per_deformable_group // 输入图通道数除以deformable_group的数量, + * //这里的batch_size代表的是im2col_step_, 一般就设为1了 + */ + CUDA_1D_KERNEL_LOOP(index, n) { + const int w_col = index % width_col; + const int h_col = (index / width_col) % height_col; + const int b_col = (index / width_col / height_col) % batch_size; + const int c_im = (index / width_col / height_col) / batch_size; + const int c_col = c_im * kernel_h * kernel_w; + // compute deformable group index + const int deformable_group_index = c_im / channel_per_deformable_group; + const int h_in = h_col * stride_h - pad_h; + const int w_in = w_col * stride_w - pad_w; + DType *data_col_ptr = + data_col + + ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col; + const DType *data_im_ptr = + data_im + (b_col * num_channels + c_im) * height * width; + const DType *data_offset_ptr = + data_offset + (b_col * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const DType *data_mask_ptr = + data_mask + (b_col * deformable_group + deformable_group_index) * + kernel_h * kernel_w * height_col * width_col; + for (int i = 0; i < kernel_h; ++i) { + for (int j = 0; j < kernel_w; ++j) { + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + + w_col; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType mask = data_mask_ptr[data_mask_hw_ptr]; + DType val = static_cast(0); + const DType h_im = h_in + i * dilation_h + offset_h; + const DType w_im = w_in + j * dilation_w + offset_w; + if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); + } + *data_col_ptr = val * mask; + data_col_ptr += batch_size * height_col * width_col; + } + } + } +} +template +__global__ void DeformablePSROIPoolForwardKernel( + const int count, const T *bottom_data, const T 
spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, const T *bottom_rois, + const T *bottom_trans, const int no_trans, const T trans_std, + const int sample_per_part, const int output_dim, const int group_size, + const int part_size, const int num_classes, const int channels_each_class, + T *top_data, T *top_count) { + CUDA_1D_KERNEL_LOOP(index, count) { + // The output is in order (n, ctop, ph, pw) + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = (T)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = (T)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = (T)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = (T)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, static_cast(0.1)); // avoid 0 + T roi_height = max(roi_end_h - roi_start_h, static_cast(0.1)); + // Compute w and h at bottom + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + T sub_bin_size_h = bin_size_h / static_cast(sample_per_part); + T sub_bin_size_w = bin_size_w / static_cast(sample_per_part); + int part_h = floor(static_cast(ph) / pooled_height * part_size); + int part_w = floor(static_cast(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = + no_trans + ? (T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + T trans_y = + no_trans + ? (T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + T wstart = static_cast(pw) * bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = static_cast(ph) * bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + T sum = 0; + int total = 0; + int gw = floor(static_cast(pw) * group_size / pooled_width); + int gh = floor(static_cast(ph) * group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + const T *offset_bottom_data = + bottom_data + (roi_batch_ind * channels) * height * width; + for (int ih = 0; ih < sample_per_part; ++ih) { + for (int iw = 0; iw < sample_per_part; ++iw) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = min(max(w, static_cast(0.)), static_cast(width - 1.)); + h = min(max(h, static_cast(0.)), static_cast(height - 1.)); + int c = (ctop * group_size + gh) * group_size + gw; + T val = DmcnIm2colBilinear(offset_bottom_data + c * height * width, w, + h, w, (T)height, (T)width); + sum += val; + total++; + } + } + top_data[index] = total == 0 ? 
(T)(0) : sum / total; + top_count[index] = total; + } +} +template +__global__ void DeformablePSROIPoolBackwardAccKernel( + const int count, const T *top_diff, const T *top_count, const int num_rois, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int output_dim, T *bottom_data_diff, T *bottom_trans_diff, + const T *bottom_data, const T *bottom_rois, const T *bottom_trans, + const int no_trans, const T trans_std, const int sample_per_part, + const int group_size, const int part_size, const int num_classes, + const int channels_each_class) { + CUDA_1D_KERNEL_LOOP(index, count) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + // [start, end) interval for spatial sampling + const T *offset_bottom_rois = bottom_rois + n * 5; + int roi_batch_ind = offset_bottom_rois[0]; + T roi_start_w = (T)(round(offset_bottom_rois[1])) * spatial_scale - 0.5; + T roi_start_h = (T)(round(offset_bottom_rois[2])) * spatial_scale - 0.5; + T roi_end_w = (T)(round(offset_bottom_rois[3]) + 1.) * spatial_scale - 0.5; + T roi_end_h = (T)(round(offset_bottom_rois[4]) + 1.) * spatial_scale - 0.5; + // Force too small ROIs to be 1x1 + T roi_width = max(roi_end_w - roi_start_w, static_cast(0.1)); // avoid 0 + T roi_height = max(roi_end_h - roi_start_h, static_cast(0.1)); + + // Compute w and h at bottom + T bin_size_h = roi_height / (T)(pooled_height); + T bin_size_w = roi_width / (T)(pooled_width); + + T sub_bin_size_h = bin_size_h / (T)(sample_per_part); + T sub_bin_size_w = bin_size_w / (T)(sample_per_part); + + int part_h = floor((T)(ph) / pooled_height * part_size); + int part_w = floor((T)(pw) / pooled_width * part_size); + int class_id = ctop / channels_each_class; + T trans_x = + no_trans + ? (T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2) * part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + T trans_y = + no_trans + ? 
(T)(0) + : bottom_trans[(((n * num_classes + class_id) * 2 + 1) * part_size + + part_h) * + part_size + + part_w] * + (T)trans_std; + + T wstart = (T)(pw)*bin_size_w + roi_start_w; + wstart += trans_x * roi_width; + T hstart = (T)(ph)*bin_size_h + roi_start_h; + hstart += trans_y * roi_height; + + if (top_count[index] <= 0) { + continue; + } + T diff_val = top_diff[index] / top_count[index]; + const T *offset_bottom_data = + bottom_data + roi_batch_ind * channels * height * width; + T *offset_bottom_data_diff = + bottom_data_diff + roi_batch_ind * channels * height * width; + int gw = floor((T)(pw)*group_size / pooled_width); + int gh = floor((T)(ph)*group_size / pooled_height); + gw = min(max(gw, 0), group_size - 1); + gh = min(max(gh, 0), group_size - 1); + for (int ih = 0; ih < sample_per_part; ih++) { + for (int iw = 0; iw < sample_per_part; iw++) { + T w = wstart + iw * sub_bin_size_w; + T h = hstart + ih * sub_bin_size_h; + // bilinear interpolation + if (w < -0.5 || w > width - 0.5 || h < -0.5 || h > height - 0.5) { + continue; + } + w = min(max(w, 0.), width - 1.); + h = min(max(h, 0.), height - 1.); + int c = (ctop * group_size + gh) * group_size + gw; + // backward on feature + int x0 = floor(w); + int x1 = ceil(w); + int y0 = floor(h); + int y1 = ceil(h); + T dist_x = w - x0, dist_y = h - y0; + T q00 = (1 - dist_x) * (1 - dist_y); + T q01 = (1 - dist_x) * dist_y; + T q10 = dist_x * (1 - dist_y); + T q11 = dist_x * dist_y; + int bottom_index_base = c * height * width; + CudaAtomicAdd( + offset_bottom_data_diff + bottom_index_base + y0 * width + x0, + q00 * diff_val); + CudaAtomicAdd( + offset_bottom_data_diff + bottom_index_base + y1 * width + x0, + q01 * diff_val); + CudaAtomicAdd( + offset_bottom_data_diff + bottom_index_base + y0 * width + x1, + q10 * diff_val); + CudaAtomicAdd( + offset_bottom_data_diff + bottom_index_base + y1 * width + x1, + q11 * diff_val); + + if (no_trans) { + continue; + } + T U00 = offset_bottom_data[bottom_index_base + y0 * width + x0]; + T U01 = offset_bottom_data[bottom_index_base + y1 * width + x0]; + T U10 = offset_bottom_data[bottom_index_base + y0 * width + x1]; + T U11 = offset_bottom_data[bottom_index_base + y1 * width + x1]; + T diff_x = (U11 * dist_y + U10 * (1 - dist_y) - U01 * dist_y - + U00 * (1 - dist_y)) * + trans_std * diff_val; + diff_x *= roi_width; + T diff_y = (U11 * dist_x + U01 * (1 - dist_x) - U10 * dist_x - + U00 * (1 - dist_x)) * + trans_std * diff_val; + diff_y *= roi_height; + + CudaAtomicAdd( + bottom_trans_diff + + (((n * num_classes + class_id) * 2) * part_size + part_h) * + part_size + + part_w, + diff_x); + CudaAtomicAdd( + bottom_trans_diff + + (((n * num_classes + class_id) * 2 + 1) * part_size + part_h) * + part_size + + part_w, + diff_y); + } + } + } +} +template +__global__ void DeformableConv2DCol2ImKernel( + const int n, const DType *data_col, const DType *data_offset, + const DType *data_mask, const int channels, const int height, + const int width, const int kernel_h, const int kernel_w, const int pad_h, + const int pad_w, const int stride_h, const int stride_w, + const int dilation_h, const int dilation_w, + const int channel_per_deformable_group, const int batch_size, + const int deformable_group, const int height_col, const int width_col, + DType *grad_im) { + CUDA_1D_KERNEL_LOOP(index, n) { + const int j = (index / width_col / height_col / batch_size) % kernel_w; + const int i = + (index / width_col / height_col / batch_size / kernel_w) % kernel_h; + const int c = + index / width_col / height_col / 
batch_size / kernel_w / kernel_h; + // compute the start and end of the output + const int deformable_group_index = c / channel_per_deformable_group; + + int w_out = index % width_col; + int h_out = (index / width_col) % height_col; + int b = (index / width_col / height_col) % batch_size; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + + const DType *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const DType *data_mask_ptr = + data_mask + (b * deformable_group + deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + const int data_offset_h_ptr = + ((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out; + const int data_offset_w_ptr = + ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out; + const int data_mask_hw_ptr = + ((i * kernel_w + j) * height_col + h_out) * width_col + w_out; + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType mask = data_mask_ptr[data_mask_hw_ptr]; + const DType cur_inv_h_data = h_in + i * dilation_h + offset_h; + const DType cur_inv_w_data = w_in + j * dilation_w + offset_w; + + const DType cur_top_grad = data_col[index] * mask; + const int cur_h = (int)cur_inv_h_data; + const int cur_w = (int)cur_inv_w_data; + for (int dy = -2; dy <= 2; dy++) { + for (int dx = -2; dx <= 2; dx++) { + if (cur_h + dy >= 0 && cur_h + dy < height && cur_w + dx >= 0 && + cur_w + dx < width && abs(cur_inv_h_data - (cur_h + dy)) < 1 && + abs(cur_inv_w_data - (cur_w + dx)) < 1) { + int cur_bottom_grad_pos = + ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx; + DType weight = + DmcnGetGradientWeight(cur_inv_h_data, cur_inv_w_data, cur_h + dy, + cur_w + dx, height, width); + CudaAtomicAdd(grad_im + cur_bottom_grad_pos, weight * cur_top_grad); + } + } + } + } +} +template +__global__ void DeformableConv2DCol2ImCoordGPUKernel( + const int n, const DType *data_col, const DType *data_im, + const DType *data_offset, const DType *data_mask, const int channels, + const int height, const int width, // 输入的C, H, W + const int kernel_h, const int kernel_w, const int pad_h, const int pad_w, + const int stride_h, const int stride_w, const int dilation_h, + const int dilation_w, const int channel_per_deformable_group, + const int batch_size, const int offset_channels, const int deformable_group, + const int height_col, const int width_col, DType *grad_offset, + DType *grad_mask) { + CUDA_1D_KERNEL_LOOP(index, n) { + DType val = 0, mval = 0; + int w = index % width_col; + int h = (index / width_col) % height_col; + int c = (index / width_col / height_col) % offset_channels; + int b = (index / width_col / height_col) / offset_channels; + // compute the start and end of the output + + const int deformable_group_index = c / (2 * kernel_h * kernel_w); + const int col_step = kernel_h * kernel_w; + int cnt = 0; + const DType *data_col_ptr = + data_col + deformable_group_index * channel_per_deformable_group * + batch_size * width_col * height_col; + const DType *data_im_ptr = + data_im + (b * deformable_group + deformable_group_index) * + channel_per_deformable_group / kernel_h / kernel_w * + height * width; + const DType *data_offset_ptr = + data_offset + (b * deformable_group + deformable_group_index) * 2 * + kernel_h * kernel_w * height_col * width_col; + const DType *data_mask_ptr = + data_mask + (b * deformable_group + 
deformable_group_index) * kernel_h * + kernel_w * height_col * width_col; + + const int offset_c = c - deformable_group_index * 2 * kernel_h * kernel_w; + + for (int col_c = (offset_c / 2); col_c < channel_per_deformable_group; + col_c += col_step) { + const int col_pos = + (((col_c * batch_size + b) * height_col) + h) * width_col + w; + const int bp_dir = offset_c % 2; + + int j = (col_pos / width_col / height_col / batch_size) % kernel_w; + int i = + (col_pos / width_col / height_col / batch_size / kernel_w) % kernel_h; + int w_out = col_pos % width_col; + int h_out = (col_pos / width_col) % height_col; + int w_in = w_out * stride_w - pad_w; + int h_in = h_out * stride_h - pad_h; + const int data_offset_h_ptr = + (((2 * (i * kernel_w + j)) * height_col + h_out) * width_col + w_out); + const int data_offset_w_ptr = + (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + + w_out); + const int data_mask_hw_ptr = + (((i * kernel_w + j) * height_col + h_out) * width_col + w_out); + const DType offset_h = data_offset_ptr[data_offset_h_ptr]; + const DType offset_w = data_offset_ptr[data_offset_w_ptr]; + const DType mask = data_mask_ptr[data_mask_hw_ptr]; + DType inv_h = h_in + i * dilation_h + offset_h; + DType inv_w = w_in + j * dilation_w + offset_w; + if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) { + inv_h = inv_w = -2; + } else { + mval += data_col_ptr[col_pos] * + DmcnIm2colBilinear(data_im_ptr + cnt * height * width, width, + height, width, inv_h, inv_w); + } + const DType weight = DmcnGetCoordinateWeight( + inv_h, inv_w, height, width, data_im_ptr + cnt * height * width, + width, bp_dir); + val += weight * data_col_ptr[col_pos] * mask; + cnt += 1; + } + + grad_offset[index] = val; + // KERNEL_ASSIGN(grad_offset[index], offset_req, val); + if (offset_c % 2 == 0) { + grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h * + kernel_w + + offset_c / 2) * + height_col + + h) * + width_col + + w] = mval; + // KERNEL_ASSIGN(grad_mask[(((b * deformable_group + + // deformable_group_index) * kernel_h * kernel_w + offset_c / 2) * + // height_col + h) * width_col + w], mask_req, mval); + } + } +} +template +__global__ void PureAddToKernel(const int n, DType *result_data, + const DType *right_data) { + CUDA_1D_KERNEL_LOOP(index, n) { + CudaAtomicAdd(result_data + index, right_data[index]); + } +} +template +__global__ void SetZeroKernel(const int n, DType *result_data) { + CUDA_1D_KERNEL_LOOP(index, n) { *(result_data + index) = DType(0); } +} +template +__global__ void SetOneKernel(const int n, DType *result_data) { + CUDA_1D_KERNEL_LOOP(index, n) { *(result_data + index) = DType(1); } +} +template +__global__ void SetNumAtIndexKernel(DType num, int index, DType *data) { + *(data + index) = num; +} +template +void DeformableConv2DCol2ImCoord::operator()( + const Eigen::GpuDevice &d, const DType *data_col, const DType *data_im, + const DType *data_offset, const DType *data_mask, const TShape &im_shape, + const TShape &col_shape, const TShape &kernel_shape, const TShape &pad, + const TShape &stride, const TShape &dilation, + const int32_t deformable_group, DType *grad_offset, DType *grad_mask) { + int num_spatial_axes = kernel_shape.size(); + int num_kernels = col_shape[1] * col_shape[2] * col_shape[3] * 2 * + kernel_shape[0] * kernel_shape[1] * deformable_group; + int channel_per_deformable_group = col_shape[0] / deformable_group; + // num_axes should be smaller than block size + CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); + 
CHECK_LT(num_spatial_axes, config.thread_per_block); + switch (num_spatial_axes) { + case 2: + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. + // NOLINT_NEXT_LINE(whitespace/operators) + + DeformableConv2DCol2ImCoordGPUKernel + <<>>( + num_kernels, data_col, data_im, data_offset, data_mask, + im_shape[1], im_shape[2], im_shape[3], kernel_shape[0], + kernel_shape[1], pad[0], pad[1], stride[0], stride[1], + dilation[0], dilation[1], channel_per_deformable_group, + col_shape[1], + 2 * kernel_shape[0] * kernel_shape[1] * deformable_group, + deformable_group, col_shape[2], col_shape[3], grad_offset, + grad_mask); + // MSHADOW_CUDA_POST_KERNEL_CHECK(DeformableConv2DCol2ImCoordGPUKernel); + break; + default: + LOG(FATAL) << "col2im_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} +template +void DeformableConv2DCol2Im::operator()( + const GPUDevice &d, const DType *data_col, const DType *data_offset, + const DType *data_mask, const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, const TShape &stride, + const TShape &dilation, const int32_t deformable_group, DType *grad_im) { + int num_spatial_axes = kernel_shape.size(); + int im_size = ProdShape(im_shape, 1, im_shape.size()); + int channel_per_deformable_group = im_shape[1] / deformable_group; + int num_kernels = ProdShape(col_shape, 0, col_shape.size()); + // num_axes should be smaller than block size + CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); + CHECK_LT(num_spatial_axes, config.thread_per_block); + // using namespace mxnet_op; + switch (num_spatial_axes) { + case 2: + // To avoid involving atomic operations, we will launch one kernel per + // bottom dimension, and then in the kernel add up the top dimensions. 
+ // NOLINT_NEXT_LINE(whitespace/operators) + DeformableConv2DCol2ImKernel + <<>>( + num_kernels, data_col, data_offset, data_mask, im_shape[1], + im_shape[2], im_shape[3], kernel_shape[0], kernel_shape[1], + pad[0], pad[1], stride[0], stride[1], dilation[0], dilation[1], + channel_per_deformable_group, col_shape[1], deformable_group, + col_shape[2], col_shape[3], grad_im); + // MSHADOW_CUDA_POST_KERNEL_CHECK(modulated_deformable_col2im_gpu_kernel); + break; + default: + LOG(FATAL) << "col2im_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + +template +void DeformableConv2DIm2Col::operator()( + const GPUDevice &d, const DType *data_im, const DType *data_offset, + const DType *data_mask, const TShape &im_shape, const TShape &col_shape, + const TShape &kernel_shape, const TShape &pad, const TShape &stride, + const TShape &dilation, const int32_t deformable_group, DType *data_col) { + int num_spatial_axes = kernel_shape.size(); + int channel_per_deformable_group = + im_shape[1] / deformable_group; // imshape[1] = 输入图的通道数 + int num_kernels = + im_shape[1] * + ProdShape(col_shape, 1, + col_shape.size()); // K * N / k.Size(), k = filter, col_shape = + // [K, im2col_step_, H, W] + CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d); + CHECK_LT(num_spatial_axes, config.thread_per_block); + switch (num_spatial_axes) { + case 2: + DeformableConv2DIm2ColKernel< + DType> // NOLINT_NEXT_LINE(whitespace/operators) + <<>>( + // CUDA对device(GPU + // )的内存管理主要通过cudaMalloc()、cudaFree()、cudaMemcpy() + // 进行管理。另外,从上述代码我们可以看到, add() + // 函数的调用比较奇怪相对于C语言来说,需要用add<<>> + // 这种形式表明这是一个从host(CPU)代码调用device的代码, + //并且括号中的数值表明,M个block,每个block有 N个线程, + //所以这个函数总共有M*N个线程。 + num_kernels, data_im, data_offset, data_mask, im_shape[2], + im_shape[3], kernel_shape[0], kernel_shape[1], pad[0], pad[1], + stride[0], stride[1], dilation[0], dilation[1], + channel_per_deformable_group, col_shape[1], im_shape[1], + deformable_group, col_shape[2], col_shape[3], data_col); + // MSHADOW_CUDA_POST_KERNEL_CHECK(modulated_deformable_im2col_gpu_kernel); + break; + default: + LOG(FATAL) << "im2col_nd_gpu does not support computation with " + << num_spatial_axes << " spatial axes"; + } +} + +template +void SetZeros::operator()(const GPUDevice &d, int n, + DType *result_data) { + CudaLaunchConfig config = GetCudaLaunchConfig(n, d); + SetZeroKernel + <<>>( + n, result_data); +} + +template +void PureAddTo::operator()(const GPUDevice &d, const int n, + DType *result_data, + const DType *right_data) { + CudaLaunchConfig config = GetCudaLaunchConfig(n, d); + PureAddToKernel + <<>>( + n, result_data, right_data); +} + +template +void SetOne::operator()(const GPUDevice &d, int n, + DType *result_data) { + CudaLaunchConfig config = GetCudaLaunchConfig(n, d); + SetOneKernel + <<>>( + n, result_data); +} + +template +void SetNumAtIndex::operator()(const GPUDevice &d, DType num, + int index, DType *data) { + CudaLaunchConfig config = GetCudaLaunchConfig(1, d); + SetNumAtIndexKernel + <<>>( + num, index, data); +} + +// 如果没有在这里实例化的话, 生成的.so会报类似于 undefined symbol: +// _ZN10tensorflow13setNumAtIndexIN5Eigen9GpuDeviceEfEclERKS2_fiPf的错误 I guess +// the reason for instancing the functional structure below is that certifying +// single functor instance for every functor. 
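+// The explicit instantiations below make nvcc emit the GPU definitions of
+// these functors for float and double in this translation unit. The op
+// kernels in deformable_conv_op.cc are compiled separately and only see the
+// declarations in deformable_conv_op.h, so without these instantiations
+// loading the shared library fails with an "undefined symbol" error like the
+// one quoted above.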
+template struct DeformableConv2DIm2Col; +template struct DeformableConv2DCol2Im; +template struct DeformableConv2DCol2ImCoord; +template struct PureAddTo; +template struct SetOne; +template struct SetZeros; +template struct SwapAxis; +template struct SetNumAtIndex; + +template struct DeformableConv2DIm2Col; +template struct DeformableConv2DCol2Im; +template struct DeformableConv2DCol2ImCoord; +template struct PureAddTo; +template struct SetOne; +template struct SetZeros; +template struct SwapAxis; +template struct SetNumAtIndex; +template +se::DeviceMemory AsDeviceMemory(const T *cuda_memory) { + se::DeviceMemoryBase wrapped(const_cast(cuda_memory)); + se::DeviceMemory typed(wrapped); + return typed; +} + +class CublasScratchAllocator : public se::ScratchAllocator { + public: + using Stream = se::Stream; + using DeviceMemoryBytes = se::DeviceMemory; + + CublasScratchAllocator(OpKernelContext *context) : context_(context) {} + + int64 GetMemoryLimitInBytes() override { return -1; } + + se::port::StatusOr AllocateBytes( + int64 byte_size) override { + Tensor temporary_memory; + + Status allocation_status(context_->allocate_temp( + DT_UINT8, TensorShape({byte_size}), &temporary_memory)); + if (!allocation_status.ok()) { + return se::port::StatusOr( + DeviceMemoryBytes::MakeFromByteSize(nullptr, 0)); + } + // Hold the reference of the allocated tensors until the end of the + // allocator. + allocated_tensors_.push_back(temporary_memory); + return se::port::StatusOr( + DeviceMemoryBytes::MakeFromByteSize( + temporary_memory.flat().data(), + temporary_memory.flat().size())); + } + + se::port::StatusOr AllocateBytes(Stream *stream, + int64 byte_size) { + Tensor temporary_memory; + + Status allocation_status(context_->allocate_temp( + DT_UINT8, TensorShape({byte_size}), &temporary_memory)); + if (!allocation_status.ok()) { + return se::port::StatusOr( + DeviceMemoryBytes::MakeFromByteSize(nullptr, 0)); + } + // Hold the reference of the allocated tensors until the end of the + // allocator. + allocated_tensors_.push_back(temporary_memory); + return se::port::StatusOr( + DeviceMemoryBytes::MakeFromByteSize( + temporary_memory.flat().data(), + temporary_memory.flat().size())); + } + + private: + OpKernelContext *context_; + std::vector allocated_tensors_; +}; + +template +void LaunchBatchMatMul::launch( + OpKernelContext *context, const TensorShape &in_x_shape, + const TensorShape &in_y_shape, const Scalar *in_x_ptr, + const Scalar *in_y_ptr, bool adj_x, bool adj_y, Scalar *out) { + constexpr se::blas::Transpose kTranspose = + is_complex::value ? se::blas::Transpose::kConjugateTranspose + : se::blas::Transpose::kTranspose; + se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose, kTranspose}; + + const uint64 m = in_x_shape.dim_size(adj_x ? 2 : 1); + const uint64 k = in_x_shape.dim_size(adj_x ? 1 : 2); + const uint64 n = in_y_shape.dim_size(adj_y ? 
1 : 2); + const uint64 batch_size = in_x_shape.dim_size(0); + auto blas_transpose_a = trans[adj_x]; + auto blas_transpose_b = trans[adj_y]; + + auto *stream = context->op_device_context()->stream(); + OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); + + typedef se::DeviceMemory DeviceMemoryType; + std::vector a_device_memory; + std::vector b_device_memory; + std::vector c_device_memory; + std::vector a_ptrs; + std::vector b_ptrs; + std::vector c_ptrs; + a_device_memory.reserve(batch_size); + b_device_memory.reserve(batch_size); + c_device_memory.reserve(batch_size); + a_ptrs.reserve(batch_size); + b_ptrs.reserve(batch_size); + c_ptrs.reserve(batch_size); + auto *a_base_ptr = in_x_ptr; + auto *b_base_ptr = in_y_ptr; + auto *c_base_ptr = out; + for (int64 i = 0; i < batch_size; ++i) { + a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k)); + b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n)); + c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n)); + a_ptrs.push_back(&a_device_memory.back()); + b_ptrs.push_back(&b_device_memory.back()); + c_ptrs.push_back(&c_device_memory.back()); + } + + typedef Scalar Coefficient; + + // Cublas does + // C = A x B + // where A, B and C are assumed to be in column major. + // We want the output to be in row-major, so we can compute + // C' = B' x A', where ' stands for transpose (not adjoint). + // TODO(yangzihao): Choose the best of the three strategies using autotune. + if (batch_size == 1) { + // This is a regular matrix*matrix or matrix*vector multiply. Avoid the + // overhead of the scratch allocator and the batch interface. + if (n == 1 && + blas_transpose_b != se::blas::Transpose::kConjugateTranspose && + blas_transpose_a != se::blas::Transpose::kConjugateTranspose) { + // This is a matrix*vector multiply so use GEMV to compute A * b. + // Here we are multiplying in the natural order, so we have to flip + // the transposition flag to compensate for the tensor being stored + // row-major. Since GEMV doesn't provide a way to just conjugate an + // argument, we have to defer those cases to GEMM below. + auto gemv_trans_a = blas_transpose_a == se::blas::Transpose::kTranspose + ? se::blas::Transpose::kNoTranspose + : se::blas::Transpose::kTranspose; + bool blas_launch_status = + stream + ->ThenBlasGemv(gemv_trans_a, adj_x ? m : k, adj_x ? k : m, + static_cast(1.0), *(a_ptrs[0]), + adj_x ? m : k, *(b_ptrs[0]), 1, + static_cast(0.0), c_ptrs[0], 1) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas xGEMV launch failed : a.shape=", in_x_shape.DebugString(), + ", b.shape=", in_y_shape.DebugString(), ", m=", m, ", n=", n, + ", k=", k)); + } + } else { + bool blas_launch_status = + stream + ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, + static_cast(1.0), *(b_ptrs[0]), + adj_y ? k : n, *(a_ptrs[0]), adj_x ? m : k, + static_cast(0.0), c_ptrs[0], n) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas xGEMM launch failed : a.shape=", in_x_shape.DebugString(), + ", b.shape=", in_y_shape.DebugString(), ", m=", m, ", n=", n, + ", k=", k)); + } + } + } else { + CublasScratchAllocator scratch_allocator(context); + bool blas_launch_status = + stream + ->ThenBlasGemmBatchedWithScratch( + blas_transpose_b, blas_transpose_a, n, m, k, + static_cast(1.0), b_ptrs, adj_y ? k : n, a_ptrs, + adj_x ? 
m : k, static_cast(0.0), c_ptrs, n, + batch_size, &scratch_allocator) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas xGEMMBatched launch failed : a.shape=", + in_x_shape.DebugString(), ", b.shape=", in_y_shape.DebugString(), + ", m=", m, ", n=", n, ", k=", k, ", batch_size=", batch_size)); + } + } +} +template +void DeformablePSROIPoolForward::operator()( + const GPUDevice &d, const int count, const T *bottom_data, + const T spatial_scale, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const T *bottom_rois, const T *bottom_trans, const int no_trans, + const T trans_std, const int sample_per_part, const int output_dim, + const int group_size, const int part_size, const int num_classes, + const int channels_each_class, T *top_data, T *top_count) { + auto config = GetCudaLaunchConfig(count, d); + DeformablePSROIPoolForwardKernel + <<>>( + count, bottom_data, spatial_scale, channels, height, width, + pooled_height, pooled_width, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, output_dim, group_size, part_size, + num_classes, channels_each_class, top_data, top_count); +} +template +void DeformablePSROIPoolBackwardKernel::operator()( + const GPUDevice &d, const int count, const T *top_diff, const T *top_count, + const int num_rois, const T spatial_scale, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int output_dim, T *bottom_data_diff, + T *bottom_trans_diff, const T *bottom_data, const T *bottom_rois, + const T *bottom_trans, const int no_trans, const T trans_std, + const int sample_per_part, const int group_size, const int part_size, + const int num_classes, const int channels_each_class) { + auto config = GetCudaLaunchConfig(count, d); + DeformablePSROIPoolBackwardAccKernel + <<>>( + count, top_diff, top_count, num_rois, spatial_scale, channels, height, + width, pooled_height, pooled_width, output_dim, bottom_data_diff, + bottom_trans_diff, bottom_data, bottom_rois, bottom_trans, no_trans, + trans_std, sample_per_part, group_size, part_size, num_classes, + channels_each_class); +} +template struct LaunchBatchMatMul; +template struct LaunchBatchMatMul; +template struct DeformablePSROIPoolForward; +template struct DeformablePSROIPoolForward; +template struct DeformablePSROIPoolBackwardKernel; +template struct DeformablePSROIPoolBackwardKernel; +#endif +} // namespace functor +} // namespace addons +} // namespace tensorflow diff --git a/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc b/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc new file mode 100644 index 0000000000..c9ec963b13 --- /dev/null +++ b/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc @@ -0,0 +1,287 @@ +// +// Created by 孙嘉禾 on 2019/12/31. 
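+// This file registers the deformable conv2d and deformable PS-ROI pooling ops +// together with their shape inference functions.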
+// + +#include +#include +#include +#include + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { +namespace addons { + +namespace functor { + +using shape_inference::DimensionHandle; +using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; + +REGISTER_OP("AddonsDeformableConv2D") + .Input("input: T") + .Input("filter: T") + .Input("offset: T") + .Input("mask: T") + .Output("output: T") + .Attr("T: {float, double}") + .Attr("strides: list(int)") + // .Attr("use_cudnn_on_gpu: bool = true") + .Attr("num_groups: int") + .Attr("deformable_groups: int") + .Attr("im2col_step: int") + .Attr("no_bias: bool = true") + .Attr(GetPaddingAttrString()) + .Attr("data_format: {'NCHW' } = 'NCHW' ") + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext *c) { + ShapeHandle input_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &input_shape)); + ShapeHandle filter_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &filter_shape)); + ShapeHandle offset_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 4, &offset_shape)); + ShapeHandle mask_shape; + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 4, &mask_shape)); + + std::vector strides; + TF_RETURN_IF_ERROR(c->GetAttr("strides", &strides)); + if (strides.size() != 4) { + return errors::InvalidArgument( + "Deformconv requires the stride attribute to contain 4 values, but " + "got: ", + strides.size()); + } + + std::vector rates; + TF_RETURN_IF_ERROR(c->GetAttr("dilations", &rates)); + if (rates.size() != 4) { + return errors::InvalidArgument( + "Deformconv requires the dilations attribute to contain 4 values, " + "but " + "got: ", + rates.size()); + } + string data_format; + TensorFormat data_format_; + TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format)); + FormatFromString(data_format, &data_format_); + const int32 stride_rows = GetTensorDim(strides, data_format_, 'H'); + const int32 stride_cols = GetTensorDim(strides, data_format_, 'W'); + + const int32 rate_rows = GetTensorDim(rates, data_format_, 'H'); + const int32 rate_cols = GetTensorDim(rates, data_format_, 'W'); + + int groups; + TF_RETURN_IF_ERROR(c->GetAttr("num_groups", &groups)); + int deform_groups; + TF_RETURN_IF_ERROR(c->GetAttr("deformable_groups", &deform_groups)); + + DimensionHandle batch_size_dim = c->Dim(input_shape, 0); + DimensionHandle in_depths_dim = c->Dim(input_shape, 1); + DimensionHandle in_rows_dim = c->Dim(input_shape, 2); + DimensionHandle in_cols_dim = c->Dim(input_shape, 3); + DimensionHandle filter_rows_dim = c->Dim(filter_shape, 2); + DimensionHandle filter_cols_dim = c->Dim(filter_shape, 3); + DimensionHandle filter_depth_dim = c->Dim(filter_shape, 1); + DimensionHandle output_depth_dim = c->Dim(filter_shape, 0); + DimensionHandle multiplied_depth; + DimensionHandle depth_per_dfgps; + auto filter_row = c->Value(filter_rows_dim); + auto filter_col = c->Value(filter_cols_dim); + auto offset_dpt = c->Value(c->Dim(offset_shape, 1)); + if ((offset_dpt % (filter_row * filter_col) != 0) || + (offset_dpt / (2 * filter_row * filter_col) != deform_groups)) { + return errors::InvalidArgument( + "Deformconv requires the offset compatible with filter, but " + "got: ", + c->DebugString(offset_shape)); + } + + auto mask_dpt = c->Value(c->Dim(mask_shape, 1)); + if ((mask_dpt % (filter_row * filter_col) != 0) || + (mask_dpt / (filter_row * filter_col) != deform_groups)) { + return 
errors::InvalidArgument( + "Deformconv requires the mask compatible with filter, but " + "got: ", + c->DebugString(offset_shape)); + } + + TF_RETURN_IF_ERROR( + c->Multiply(filter_depth_dim, groups, &multiplied_depth)); + TF_RETURN_IF_ERROR( + c->Divide(filter_depth_dim, deform_groups, true, &depth_per_dfgps)); + TF_RETURN_IF_ERROR( + c->Divide(in_depths_dim, deform_groups, true, &depth_per_dfgps)); + + if (!c->ValueKnown(in_rows_dim) || !c->ValueKnown(in_cols_dim) || + !c->ValueKnown(filter_rows_dim) || !c->ValueKnown(filter_cols_dim)) { + ShapeHandle output_shape = c->MakeShape( + {batch_size_dim, output_depth_dim, InferenceContext::kUnknownDim, + InferenceContext::kUnknownDim}); + c->set_output(0, output_shape); + return Status::OK(); + } + DimensionHandle unused; + TF_RETURN_IF_ERROR( + c->Merge(c->Dim(input_shape, 1), multiplied_depth, &unused)); + + auto in_rows = c->Value(in_rows_dim); + auto in_cols = c->Value(in_cols_dim); + auto filter_rows = c->Value(filter_rows_dim); + auto filter_cols = c->Value(filter_cols_dim); + auto filter_rows_eff = filter_rows + (filter_rows - 1) * (rate_rows - 1); + auto filter_cols_eff = filter_cols + (filter_cols - 1) * (rate_cols - 1); + + Padding padding; + TF_RETURN_IF_ERROR(c->GetAttr("padding", &padding)); + + int64 output_rows, output_cols; + int64 padding_before, padding_after; + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_rows, filter_rows_eff, stride_rows, padding, &output_rows, + &padding_before, &padding_after)); + TF_RETURN_IF_ERROR(GetWindowedOutputSizeVerbose( + in_cols, filter_cols_eff, stride_cols, padding, &output_cols, + &padding_before, &padding_after)); + + ShapeHandle output_shape = c->MakeShape( + {batch_size_dim, output_depth_dim, output_rows, output_cols}); + c->set_output(0, output_shape); + // shape_inference::ShapeHandle offset_shape = c->input(2); + // shape_inference::ShapeHandle mask_shape = c->input(3); + shape_inference::DimensionHandle offset_batch = c->Dim(offset_shape, 0); + shape_inference::DimensionHandle offset_channel = c->Dim(offset_shape, 1); + shape_inference::DimensionHandle offset_height = c->Dim(offset_shape, 2); + shape_inference::DimensionHandle offset_weight = c->Dim(offset_shape, 3); + shape_inference::DimensionHandle mask_channel = c->Dim(mask_shape, 1); + shape_inference::DimensionHandle mask_height = c->Dim(mask_shape, 2); + shape_inference::DimensionHandle mask_weight = c->Dim(mask_shape, 3); + shape_inference::DimensionHandle mask_batch = c->Dim(mask_shape, 0); + TF_RETURN_IF_ERROR(c->WithRank(offset_shape, 4, &offset_shape)); + TF_RETURN_IF_ERROR(c->WithRank(mask_shape, 4, &mask_shape)); + TF_RETURN_IF_ERROR( + c->WithValue(offset_batch, c->Value(batch_size_dim), &offset_batch)); + TF_RETURN_IF_ERROR(c->WithValue( + offset_channel, + 2 * c->Value(filter_rows_dim) * c->Value(filter_cols_dim), + &offset_channel)); + TF_RETURN_IF_ERROR( + c->WithValue(offset_height, output_rows, &offset_height)); + TF_RETURN_IF_ERROR( + c->WithValue(offset_weight, output_cols, &offset_weight)); + TF_RETURN_IF_ERROR( + c->WithValue(mask_batch, c->Value(batch_size_dim), &mask_batch)); + TF_RETURN_IF_ERROR(c->WithValue( + mask_channel, c->Value(filter_rows_dim) * c->Value(filter_cols_dim), + &mask_channel)); + TF_RETURN_IF_ERROR(c->WithValue(mask_height, output_rows, &mask_height)); + TF_RETURN_IF_ERROR(c->WithValue(mask_weight, output_cols, &mask_weight)); + return Status::OK(); + }) + .Doc(R"doc( + DeformableConv2D is a new convolution operation with the deformable kernel locations. 
+ The inputs should have format NCHW, which is faster on GPUs. + The offset and mask should have the same spatial resolution as the input. + Also, the output shape depends on the stride; only the case of dilation rate = 1 is considered. + )doc"); + +// OpKernel definition. +// The template parameter is the data type of the tensors. +// In my opinion, the deformable convolution op ought to be implemented by +// extending Conv2DOp; however, conv_ops.h is not available when the op is +// dynamically linked. + +REGISTER_OP("AddonsDeformableConv2DBackProp") + .Input("input: T") + .Input("filter: T") + .Input("offset: T") + .Input("mask: T") + .Input("out_grad: T") + .Output("x_grad: T") + .Output("filter_grad: T") + .Output("offset_grad: T") + .Output("mask_grad: T") + .Attr("T: {float, double}") + .Attr("strides: list(int)") + // .Attr("use_cudnn_on_gpu: bool = true") + .Attr("num_groups: int") + .Attr("deformable_groups: int") + .Attr("im2col_step: int") + .Attr("no_bias: bool = true") + .Attr(GetPaddingAttrString()) + .Attr("data_format: { 'NCHW' } = 'NCHW' ") + .Attr("dilations: list(int) = [1, 1, 1, 1]") + .SetShapeFn([](InferenceContext *c) { + c->set_output(0, c->input(0)); + c->set_output(1, c->input(1)); + c->set_output(2, c->input(2)); + c->set_output(3, c->input(3)); + return Status::OK(); + }) + .Doc(R"doc(only support NCHW now)doc"); + +REGISTER_OP("AddonsDeformablePsroiPool") + .Input("input: T") + .Input("bbox: T") + .Input("trans: T") + .Output("output: T") + .Output("top_count: T") + .Attr("T: {float, double}") + .Attr("pooled_size: int") + .Attr("no_trans: int") + .Attr("spatial_scale: float") + .Attr("output_dim: int") + .Attr("group_size: int") + .Attr("part_size: int") + .Attr("sample_per_part: int") + .Attr("trans_std: float") + .SetShapeFn([](InferenceContext *ctx) { + int pooled_size; + int output_dim; + TF_RETURN_IF_ERROR(ctx->GetAttr("pooled_size", &pooled_size)); + TF_RETURN_IF_ERROR(ctx->GetAttr("output_dim", &output_dim)); + auto input_handle = ctx->input(0); + auto batch = ctx->Dim(input_handle, 0); + auto output_dim_handle = ctx->MakeDim(output_dim); + auto pooled_size_handle = ctx->MakeDim(pooled_size); + ctx->set_output( + 0, ctx->MakeShape({batch, output_dim_handle, pooled_size_handle, + pooled_size_handle})); + ctx->set_output( + 1, ctx->MakeShape({batch, output_dim_handle, pooled_size_handle, + pooled_size_handle})); + return Status::OK(); + }) + .Doc( + R"doc(DeformablePsROIPool is a new pooling operation with the deformable +kernel locations.
The inputs should have format NCHW, which is faster on GPUs.)doc"); +REGISTER_OP("AddonsDeformablePsroiPoolBackProp") + .Input("data: T") + .Input("bbox: T") + .Input("trans: T") + .Input("top_count: T") + .Input("out_grad: T") + .Output("in_grad: T") + .Output("trans_grad: T") + .Attr("pooled_size: int") + .Attr("T: {float, double}") + .Attr("no_trans: int") + .Attr("spatial_scale: float") + .Attr("output_dim: int") + .Attr("group_size: int") + .Attr("part_size: int") + .Attr("sample_per_part: int") + .Attr("trans_std: float") + .SetShapeFn([](InferenceContext *ctx) { + ctx->set_output(0, ctx->input(0)); + ctx->set_output(1, ctx->input(2)); + return Status::OK(); + }) + .Doc(R"doc("BackProp operation for DeformablePSROIPool")doc"); + +} // namespace functor +} // namespace addons +} // namespace tensorflow diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index 2ee52d4f6b..5483f6a64f 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -6,6 +6,7 @@ py_library( name = "layers", srcs = [ "__init__.py", + "deformable_conv2d.py", "gelu.py", "maxout.py", "multihead_attention.py", @@ -20,6 +21,7 @@ py_library( ], data = [ "//tensorflow_addons/custom_ops/layers:_correlation_cost_ops.so", + "//tensorflow_addons/custom_ops/layers:_deformable_conv2d_ops.so", ], deps = [ "//tensorflow_addons/activations", @@ -156,6 +158,12 @@ py_test( "multihead_attention_test.py", ], main = "multihead_attention_test.py", + name = "deformable_conv_test", + size = "small", + srcs = [ + "deformable_conv2d_test.py", + ], + main = "deformable_conv2d_test.py", deps = [ ":layers", ], diff --git a/tensorflow_addons/layers/__init__.py b/tensorflow_addons/layers/__init__.py index 9606352b2d..733cb79c76 100644 --- a/tensorflow_addons/layers/__init__.py +++ b/tensorflow_addons/layers/__init__.py @@ -14,6 +14,7 @@ # ============================================================================== """Additional layers that conform to Keras API.""" + from tensorflow_addons.layers.gelu import GELU from tensorflow_addons.layers.maxout import Maxout from tensorflow_addons.layers.multihead_attention import MultiHeadAttention @@ -25,3 +26,5 @@ from tensorflow_addons.layers.sparsemax import Sparsemax from tensorflow_addons.layers.tlu import TLU from tensorflow_addons.layers.wrappers import WeightNormalization +from tensorflow_addons.layers.deformable_conv2d import DeformableConv2D +from tensorflow_addons.layers.deformable_conv2d import DeformablePSROIAlign diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py new file mode 100644 index 0000000000..948a48a80d --- /dev/null +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -0,0 +1,414 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# ============================================================================ +"""Tensorflow ops performing deformable convolution and deformable PS-ROI pooling.""" + +import tensorflow as tf +from typeguard import typechecked +from tensorflow_addons.utils.resource_loader import LazySO +from tensorflow.python.keras.utils import conv_utils + +# _deformable_conv2d_ops_so = LazySO("custom_ops/layers/_deformable_conv2d_ops.so") +_deformable_conv2d_ops_so = LazySO( + "/home/admin-seu/TempData/sss/custom_ops/deformable_conv2d_ops_new/deformable_conv2D.so" +) +# _deformable_conv2d_ops_so = LazySO("/home/admin-seu/TempData/sss/SoftWare/addons/bazel-bin/tensorflow_addons/custom_ops/layers/_deformable_conv2d_ops.so") +# _deformable_conv2d_ops_so = tf.load_op_library("custom_ops/layers/_deformable_conv2d_ops.so") + + +def _deformable_conv2d( + input, + filter, + offset, + mask, + strides=[1, 1, 1, 1], + num_groups=1, + deformable_groups=1, + im2col_step=1, + no_bias=True, + padding="VALID", + data_format="NHWC", + dilations=[1, 1, 1, 1], +): + if data_format == "NHWC": + input = tf.transpose(input, [0, 3, 1, 2]) + filter = tf.transpose(filter, [3, 2, 0, 1]) + offset = tf.transpose(offset, [0, 3, 1, 2]) + mask = tf.transpose(mask, [0, 3, 1, 2]) + ret = _deformable_conv2d_ops_so.ops.addons_deformable_conv2d( + input=input, + filter=filter, + offset=offset, + mask=mask, + strides=strides, + num_groups=num_groups, + deformable_groups=deformable_groups, + im2col_step=im2col_step, + no_bias=no_bias, + padding=padding, + data_format="NCHW", + dilations=dilations, + ) + if data_format == "NHWC": + return tf.transpose(ret, [0, 2, 3, 1]) + return ret + + +@tf.RegisterGradient("AddonsDeformableConv2D") +def _deformable_conv2d_back_prop(op, grad): + """The gradients for `deform_conv`. + Args: + op: The `deform_conv` `Operation` that we are differentiating, which we can use + to find the inputs and outputs of the original op. + grad: Gradient with respect to the output of the `deform_conv` op. + Returns: + Gradients with respect to the input of `deform_conv`.
+ """ + data = op.inputs[0] + filter = op.inputs[1] + offset = op.inputs[2] + mask = op.inputs[3] + """ + .Attr("strides: list(int)") + .Attr("num_groups: int") + .Attr("deformable_groups: int") + .Attr("im2col_step: int") + .Attr("no_bias: bool = true") + .Attr(GetPaddingAttrString()) + .Attr("data_format: {'NCHW' } = 'NCHW' ") + .Attr("dilations: list(int) = [1, 1, 1, 1]") + """ + strides = op.get_attr("strides") + dilations = op.get_attr("dilations") + data_format = op.get_attr("data_format") + im2col_step = op.get_attr("im2col_step") + no_bias = op.get_attr("no_bias") + pads = op.get_attr("padding") + num_groups = op.get_attr("num_groups") + deformable_groups = op.get_attr("deformable_groups") + """ + REGISTER_OP("Addons>DeformableConv2DBackProp") + .Input("input: T") + .Input("filter: T") + .Input("offset: T") + .Input("mask: T") + .Input("out_grad: T") + .Output("x_grad: T") + .Output("filter_grad: T") + .Output("offset_grad: T") + .Output("mask_grad: T") + .Attr("T: {float, double}") + .Attr("strides: list(int)") + .Attr("num_groups: int") + .Attr("deformable_groups: int") + .Attr("im2col_step: int") + .Attr("no_bias: bool = true") + .Attr(GetPaddingAttrString()) + .Attr("data_format: { 'NCHW' } = 'NCHW' ") + .Attr("dilations: list(int) = [1, 1, 1, 1]") + """ + # compute gradient + data_grad = _deformable_conv2d_ops_so.ops.addons_deformable_conv2d_back_prop( + data, + filter, + offset, + mask, + grad, + strides=strides, + num_groups=num_groups, + deformable_groups=deformable_groups, + im2col_step=im2col_step, + no_bias=no_bias, + padding=pads, + data_format=data_format, + dilations=dilations, + ) + return data_grad # List of 4 Tensor, since we have 4 input + + +# @tf.keras.utils.register_keras_serializable(package="Addons") +class DeformableConv2D(tf.keras.layers.Layer): + @typechecked + def __init__( + self, + filters: int, + kernel_size=(3, 3), + num_groups=1, + deformable_groups=1, + strides=(1, 1), + im2col=1, + use_bias=False, + padding="valid", + data_format="channels_last", + dilations=(1, 1), + ): + super(DeformableConv2D, self).__init__() + self.filters = filters + self.kernel_size = kernel_size + self.num_groups = num_groups + self.deformable_groups = deformable_groups + self.strides = strides + self.im2col = im2col + self.use_bias = use_bias + self.padding = padding + self.data_format = data_format + self.dilations = dilations + self.conv_offset = tf.keras.layers.Conv2D( + self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1], + kernel_size=self.kernel_size, + strides=(1, 1), + padding=self.padding, + use_bias=True, + data_format=data_format, + ) + + def build(self, input_shape): + if self.data_format == "channels_last": + channel = int(input_shape[-1]) + else: + channel = int(input_shape[1]) + if self.data_format == "channels_last": + self.filter = tf.Variable( + initial_value=tf.random.normal( + shape=[ + self.kernel_size[0], + self.kernel_size[1], + channel, + self.filters, + ] + ) + ) + else: + self.filter = tf.Variable( + initial_value=tf.random.normal( + shape=[ + self.filters, + channel, + self.kernel_size[0], + self.kernel_size[1], + ] + ) + ) + self.built = True + + def compute_output_shape(self, input_shape): + input_shape = tf.TensorShape(input_shape).as_list() + if self.data_format == "channels_last": + space = input_shape[1:-1] + new_space = [] + for i in range(len(space)): + new_dim = conv_utils.conv_output_length( + space[i], + self.kernel_size[i], + padding=self.padding, + stride=self.strides[i], + dilation=self.dilation_rate[i], + ) + 
new_space.append(new_dim) + return tensor_shape.TensorShape( + [input_shape[0]] + new_space + [self.filters] + ) + else: + space = input_shape[2:] + new_space = [] + for i in range(len(space)): + new_dim = conv_utils.conv_output_length( + space[i], + self.kernel_size[i], + padding=self.padding, + stride=self.strides[i], + dilation=self.dilation_rate[i], + ) + new_space.append(new_dim) + return tf.TensorShape([input_shape[0], self.filters] + new_space) + + def call(self, inputs, **kwargs): + """ + Build static Graph + :param inputs: [B, Height, Width, Channel] + :param kwargs: + :return: + """ + weight_info = self.conv_offset(inputs) + tf_data_format = "NCHW" + tf_padding = "VALID" + if self.padding == "same": + tf_padding = "SAME" + if self.data_format == "channels_last": + tf_data_format = "NHWC" + o1, o2, mask = tf.split(weight_info, 3, axis=-1) + offset = tf.concat((o1, o2), axis=-1) + mask = tf.sigmoid(mask) + else: + o1, o2, mask = tf.split(weight_info, 3, axis=1) + offset = tf.concat((o1, o2), axis=1) + mask = tf.sigmoid(mask) + result = _deformable_conv2d( + input=inputs, + filter=self.filter, + offset=offset, + mask=mask, + strides=[1, self.strides[0], self.strides[1], 1], + num_groups=self.num_groups, + deformable_groups=self.deformable_groups, + im2col_step=self.im2col, + no_bias=(not self.use_bias), + padding=tf_padding, + data_format=tf_data_format, + dilations=[1, self.dilations[0], self.dilations[1], 1], + ) + return result + + def get_config(self): + config = { + "kernel_size": self.kernel_size, + "filters": self.filters, + "num_groups": self.num_groups, + "deformable_groups": self.deformable_groups, + "strides": self.strides, + "im2col": self.im2col, + "use_bias": self.use_bias, + "padding": self.padding, + "data_format": self.data_format, + "dilations": self.dilations, + } + base_config = super().get_config() + return {**base_config, **config} + + +@tf.RegisterGradient("AddonsDeformablePsroiPool") +def _deformable_psroi_pool_back_prop(op, *grad): + data = op.inputs[0] + bbox = op.inputs[1] + trans = op.inputs[2] + top_count = op.outputs[1] + pooled_size = op.get_attr("pooled_size") + no_trans = op.get_attr("no_trans") + spatial_scale = op.get_attr("spatial_scale") + output_dim = op.get_attr("output_dim") + group_size = op.get_attr("group_size") + part_size = op.get_attr("part_size") + sample_per_part = op.get_attr("sample_per_part") + trans_std = op.get_attr("trans_std") + data_grad = _deformable_conv2d_ops_so.ops.addons_deformable_psroi_pool_back_prop( + data, + bbox, + trans, + top_count, + grad[0], + pooled_size=pooled_size, + no_trans=no_trans, + spatial_scale=spatial_scale, + output_dim=output_dim, + group_size=group_size, + part_size=part_size, + sample_per_part=sample_per_part, + trans_std=trans_std, + ) + return [data_grad[0], tf.zeros_like(bbox), data_grad[1]] + + +# @tf.keras.utils.register_keras_serializable(package="Addons") +class DeformablePSROIAlign(tf.keras.layers.Layer): + def __init__( + self, + output_dim=256, + spatial_scale=1 / 16, + group_size=1, + pooled_size=7, + sample_per_part=4, + part_size=7, + trans_std=1, + data_format="channels_last", + ): + super(DeformablePSROIAlign, self).__init__() + self.spatial_scale = spatial_scale + self.group_size = group_size + self.output_dim = output_dim + self.pooled_size = pooled_size + self.sample_per_part = sample_per_part + self.part_size = part_size + self.trans_std = trans_std + self.data_format = data_format + self.flat = tf.keras.layers.Flatten(data_format="channels_first") + self.fully_connect = 
tf.keras.layers.Dense( + self.pooled_size * self.pooled_size * 2 + ) + + def compute_output_shape(self, input_shape): + data_shape = input_shape[0] + batch_size = data_shape[0] + if self.data_format == "channels_last": + return tf.TensorShape( + [batch_size, self.pooled_size, self.pooled_size, self.output_dim] + ) + else: + return tf.TensorShape( + [batch_size, self.output_dim, self.pooled_size, self.pooled_size] + ) + + def call(self, inputs, **kwargs): + featuremap = inputs[0] + rois = inputs[1] + if self.data_format == "channels_last": + featuremap = tf.transpose(featuremap, perm=[0, 3, 1, 2]) + ( + offset_t, + top_count, + ) = _deformable_conv2d_ops_so.ops.addons_deformable_psroi_pool( + featuremap, + rois, + tf.convert_to_tensor(0.0), + pooled_size=self.pooled_size, + no_trans=True, + spatial_scale=self.spatial_scale, + output_dim=self.output_dim, + group_size=self.group_size, + part_size=self.part_size, + sample_per_part=self.sample_per_part, + trans_std=1.0, + ) + offset_flat = self.flat(offset_t) + offset = self.fully_connect(offset_flat) + offset_reshape = tf.reshape(offset, shape=[-1, 2, 7, 7], name="offset_reshape") + ret, ret_count = _deformable_conv2d_ops_so.ops.addons_deformable_psroi_pool( + featuremap, + rois, + offset_reshape, + pooled_size=self.pooled_size, + no_trans=False, + spatial_scale=self.spatial_scale, + output_dim=self.output_dim, + group_size=self.group_size, + part_size=self.part_size, + sample_per_part=self.sample_per_part, + trans_std=self.trans_std, + ) + if self.data_format == "channels_last": + ret = tf.transpose(ret, [0, 2, 3, 1]) + return ret + + def get_config(self): + config = { + "spatial_scale": self.spatial_scale, + "group_size": self.group_size, + "output_dim": self.output_dim, + "pooled_size": self.pooled_size, + "sample_per_part": self.sample_per_part, + "part_size": self.part_size, + "trans_std": self.trans_std, + "data_format": self.data_format, + } + base_config = super().get_config() + return {**config, **base_config} diff --git a/tensorflow_addons/layers/deformable_conv2d_test.py b/tensorflow_addons/layers/deformable_conv2d_test.py new file mode 100644 index 0000000000..08e9e0a7b1 --- /dev/null +++ b/tensorflow_addons/layers/deformable_conv2d_test.py @@ -0,0 +1,279 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +import numpy as np +import tensorflow as tf +from tensorflow_addons.layers.deformable_conv2d import ( + DeformableConv2D, + DeformablePSROIAlign, +) +from tensorflow_addons.layers.deformable_conv2d import ( + _deformable_conv2d, + _deformable_conv2d_ops_so, +) +from tensorflow_addons.utils import test_utils + + +@test_utils.run_all_in_graph_and_eager_modes +class DeformableConv2DTest(tf.test.TestCase): + def _forward( + self, + input, + filters, + kernel_size=(3, 3), + num_groups=1, + deformable_groups=1, + strides=(1, 1), + im2col=1, + use_bias=False, + padding="valid", + data_format="channels_last", + dilations=(1, 1), + ): + input_op = tf.convert_to_tensor(input) + output = DeformableConv2D( + filters, + kernel_size, + num_groups, + deformable_groups, + strides, + im2col, + use_bias, + padding, + data_format, + dilations, + )(input_op) + return output + + def _create_test_data(self, data_format): + height = 20 + width = 20 + channel = 3 + batch = 1 + val = np.random.uniform(size=[batch, height, width, channel]).astype(np.float32) + if data_format == "channels_first": + val = np.transpose(val, [0, 3, 1, 2]) + return val + + """ + Because DeformableConv2D layer use built in random_normal initializer to initialize weight, So the output can't be actually tested, So here we just simple compare the result in tf.nn.conv2d and _deformable_conv2d function in deformable_conv2d.py. + """ + + def _forward_simple(self, data_format, use_gpu=False): + with test_utils.device(False): + batch_size = 4 + padding = "SAME" + kernel_h = 3 + kernel_w = 3 + channel = 3 + height = 20 + width = 20 + out_channel = 16 + # input = tf.random.uniform(shape=[batch_size, channel, height, width], maxval=10) + input = tf.convert_to_tensor( + [i for i in range(batch_size * channel * height * width)], + dtype=tf.float32, + ) + input = tf.reshape(input, [batch_size, channel, height, width]) + input_trans = tf.transpose(input, [0, 2, 3, 1]) + # filter = tf.random.uniform( + # shape=[kernel_h, kernel_w, channel, out_channel], maxval=10 + # ) + filter = tf.convert_to_tensor( + np.random.uniform( + 0, 1, [kernel_h, kernel_w, channel, out_channel] + ).astype(np.float32) + ) + filter_deform = tf.transpose(filter, [3, 2, 0, 1]) + offset = tf.constant( + [ + 0.0 + for i in range( + batch_size * kernel_h * kernel_w * 2 * height * width + ) + ], + shape=[batch_size, kernel_h * kernel_w * 2, height, width], + ) + mask = tf.constant( + [1.0 for i in range(batch_size * kernel_h * kernel_w * height * width)], + shape=[batch_size, kernel_h * kernel_w, height, width], + ) + result1 = _deformable_conv2d_ops_so.ops.addons_deformable_conv2d( + input=input, + filter=filter_deform, + offset=offset, + mask=mask, + strides=[1, 1, 1, 1], + num_groups=1, + deformable_groups=1, + im2col_step=1, + no_bias=True, + padding=padding, + data_format="NCHW", + dilations=[1, 1, 1, 1], + ) + result2 = tf.nn.conv2d(input_trans, filter, [1, 1, 1, 1], padding) + result2 = tf.transpose(result2, [0, 3, 1, 2]) + # print("Debug!", tf.reduce_mean(result1 - result2)) + self.assertAllClose(result1, result2, 1e-4, 1e-4) + + """ + def _gradients(self, data_format, use_gpu=False): + with test_utils.device(use_gpu): + with tf.GradientTape(persistent=True) as tape: + val = self._create_test_data(data_format) + input = tf.constant(val, dtype=tf.float32) + if data_format == "channels_last": + input = tf.transpose(input, [0, 3, 1, 2]) + input_trans = tf.transpose(input, [0, 2, 3, 1]) + padding 
= "SAME" + kernel_h = 3 + kernel_w = 3 + channel = 3 + height = 20 + width = 20 + out_channel = 16 + filter = tf.Variable( + np.random.uniform(0, 1, [kernel_h, kernel_w, channel, out_channel] + ).astype(np.float32) + ) + tape.watch(filter) + filter_deform = tf.transpose(filter, [3, 2, 0, 1]) + offset = tf.constant( + [0.0 for i in range(kernel_h * kernel_w * 2 * height * width)], + shape=[1, kernel_h * kernel_w * 2, height, width], + ) + mask = tf.constant( + [1.0 for i in range(kernel_h * kernel_w * height * width)], + shape=[1, kernel_h * kernel_w, height, width], + ) + result1 = _deformable_conv2d_ops_so.ops.addons_deformable_conv2d( + input=input, + filter=filter_deform, + offset=offset, + mask=mask, + strides=[1, 1, 1, 1], + num_groups=1, + deformable_groups=1, + im2col_step=1, + no_bias=True, + padding=padding, + data_format="NCHW", + dilations=[1, 1, 1, 1], + ) + result2 = tf.nn.conv2d(input_trans, filter, [1, 1, 1, 1], "SAME") + grad1 = tape.gradient(result1, filter) + grad2 = tape.gradient(result2, filter) + self.assertAllClose(grad1, grad2, 1e-4, 1e-4) + """ + + def _keras(self, data_format, use_gpu=False): + inputs = self._create_test_data(data_format) + output = self._forward(inputs, 64, data_format=data_format) + + def testForwardNCHW(self): + self._forward_simple(data_format="channels_first", use_gpu=False) + if tf.test.is_gpu_available(): + self._forward_simple(data_format="channels_first", use_gpu=True) + + def testForwardNHWC(self): + self._forward_simple(data_format="channels_last", use_gpu=False) + if tf.test.is_gpu_available(): + self._forward_simple(data_format="channels_last", use_gpu=True) + + """ + def testBackwardNCHW(self): + self._gradients(data_format="channels_first", use_gpu=False) + if tf.test.is_gpu_available(): + self._gradients(data_format="channels_first", use_gpu=True) + + def testBackwardNHWC(self): + self._gradients(data_format="channels_last", use_gpu=False) + if tf.test.is_gpu_available(): + self._gradients(data_format="channels_last", use_gpu=True) + """ + + def testKerasNCHW(self): + self._keras(data_format="channels_first", use_gpu=False) + if tf.test.is_gpu_available(): + self._keras(data_format="channels_first", use_gpu=True) + + def testKerasNHWC(self): + self._keras(data_format="channels_last", use_gpu=False) + if tf.test.is_gpu_available(): + self._keras(data_format="channels_last", use_gpu=True) + + +@test_utils.run_all_in_graph_and_eager_modes +class DeformablePSROIAlignTest(tf.test.TestCase): + def _forward_simple(self, data_format, use_gpu=False): + featuremap = tf.random.normal(shape=[1, 64, 100, 100]) + rois = tf.convert_to_tensor( + [[0, 1, 1, 800, 800], [0, 2, 2, 400, 400]], dtype=tf.float32 + ) + spatial_scale = 1 / 16 + group_size = 1 + pooled_size = 7 + sample_per_part = 4 + no_trans = True + part_size = 7 + trans_std = 1 + ( + offset_t, + top_count, + ) = _deformable_conv2d_ops_so.ops.addons_deformable_psroi_pool( + featuremap, + rois, + tf.convert_to_tensor(0), + pooled_size=pooled_size, + no_trans=True, + spatial_scale=spatial_scale, + output_dim=64, + group_size=group_size, + part_size=part_size, + sample_per_part=sample_per_part, + trans_std=trans_std, + ) + return offset_t + + def _keras(self, data_format, use_gpu=False): + featuremap = tf.random.normal(shape=[1, 64, 100, 100]) + rois = tf.convert_to_tensor( + [[0, 1, 1, 800, 800], [0, 2, 2, 400, 400]], dtype=tf.float32 + ) + spatial_scale = 1 / 16 + group_size = 1 + pooled_size = 7 + sample_per_part = 4 + no_trans = True + part_size = 7 + trans_std = 1 + psroilayer = 
DeformablePSROIAlign(output_dim=64, data_format="channels_first") + ret = psroilayer([featuremap, rois]) + return ret + + def testKerasNCHW(self): + self._keras(data_format="channels_first", use_gpu=False) + if tf.test.is_gpu_available(): + self._keras(data_format="channels_first", use_gpu=True) + + def testKerasNHWC(self): + self._keras(data_format="channels_last", use_gpu=False) + if tf.test.is_gpu_available(): + self._keras(data_format="channels_last", use_gpu=True) + + +if __name__ == "__main__": + tf.test.main() From bec6e34eee6f544064c28edd137fbf310447b927 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 08:55:10 -0500 Subject: [PATCH 03/22] fixed a problem --- tensorflow_addons/layers/deformable_conv2d.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index 948a48a80d..dd090848e7 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -216,7 +216,7 @@ def compute_output_shape(self, input_shape): dilation=self.dilation_rate[i], ) new_space.append(new_dim) - return tensor_shape.TensorShape( + return tf.TensorShape( [input_shape[0]] + new_space + [self.filters] ) else: From a84a1f07e4a7d09669617fccd923bd4167fe0cc9 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 09:08:27 -0500 Subject: [PATCH 04/22] fix problem --- .../custom_ops/layers/cc/kernels/deformable_conv_op.h | 2 ++ .../custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h index 04413f1f60..484baccd67 100755 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h @@ -24,7 +24,9 @@ #endif // __JETBRAINS_IDE__ #define EIGEN_USE_THREADS +#ifdef GOOGLE_CUDA #define EIGEN_USE_GPU +#endif #include #include diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc index d7f52aaa12..9e88f66b56 100755 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc @@ -7,7 +7,6 @@ #ifdef GOOGLE_CUDA #include "tensorflow/core/platform/stream_executor.h" #include "tensorflow/core/util/gpu_kernel_helper.h" -#endif namespace tensorflow { namespace addons { @@ -123,7 +122,6 @@ __device__ DType DmcnGetCoordinateWeight(DType argmax_h, DType argmax_w, return weight; } -#ifdef GOOGLE_CUDA template __global__ void SwapAxisKernel(const int n, const int cuda_mem_size, const int min_unit_size, DType *input_data, From 30a543414e8c4b8d86c223377e1d790a4d5a0759 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 09:39:06 -0500 Subject: [PATCH 05/22] reformat --- .../layers/.deformable_conv2d_test.py.swp | Bin 0 -> 16384 bytes tensorflow_addons/layers/deformable_conv2d.py | 38 +++++++++--------- .../layers/deformable_conv2d_test.py | 15 +------ 3 files changed, 20 insertions(+), 33 deletions(-) create mode 100644 tensorflow_addons/layers/.deformable_conv2d_test.py.swp diff --git a/tensorflow_addons/layers/.deformable_conv2d_test.py.swp b/tensorflow_addons/layers/.deformable_conv2d_test.py.swp new 
file mode 100644 index 0000000000000000000000000000000000000000..ddef5ff00d52eea8d16a37539521ed627963d517 GIT binary patch literal 16384 zcmeI3ON<;x8OIw;;yB=#!~(=2qC7ri7RIv=Teh(^#N*wyJlNia-L)NSSz0yIH8Y*` zba%R{cd`gMk`)OlA_op|;o<`vqVSMF5f%xLgcC0j9+xNxECnF}5)cIu4oKnuRrmC3 z*1K_Vgo3)I-%fYE>-(y{s;^vbPhDu8V@K;#2A=mC#@q)_eEQXWkKcCh`wW9y1KT|y zMM!a5JYhR@Rd`_f=oV=x>T*)xll3G6lw&P?m6zQ3}M?Y-xGHnMy#B~VJB zlt3wgQUav}N(qz_C?!xz;Qw3#%HLr;iax$G?q@IleMjN<_4v6EhxZHdJL2!f`1kvT z`1rH@DJ4)!pp-xmc1WE~%5-255N}!ZLDS=V~r3C&565y6$+=hC;M-D&!{eR^D zKflK?eh4mu~kHItGYv9Y^QDB3!;1H;QJHab=8phASx4={2v)~e#10Ml_#E)Tr@$#N4?X~PfWO~?{(zr=hrvA93k>l3PQ!Q&{QGv43tk1+z!$(exCBms z`@kOX8U%SA{0La!82H<}4CCkE2S9=&U^jRc2PRK~%U~6}AH0G?n3us{f2&VatKdwqqM zp>3)DrlsMA2D6#%vVgliQQb7n6mFTI(%iFZWIw%bR|>@%Om*wH>LC?(RlcmQ?I;nL z(Up)Wp+c3?_oS#sku|n@u*PO;Y^uhl*Ycgue^J#FbuQJ0FRHFz$K4aI4$h1xqJ&f$ ziXKU+Hj>w{#yWlOLUc9eh02US)(py&eNX{fYI%ct=-PCnWwV|oeP0)WGL?s`E_5p< zpL9PJn{TBKqF~wQzG%GhX4cq%KeEo92JQt={?W6aMbeBKv{l1x*GIRt7K#edQ4ktr z2Yo;*MSbW5aTP*0y*kR1j?IjKqmfvP2I985wMelIjHOtyvC)g zOcZ8XrWdtPR!Y)95Dv2J%n z^%Qrcm^h}B!3Z!D>KpRFf)N||j;KoEbZgAQ)iR11a8+ZW6lTv48>6f`Z>18o;-u(s zBw%yN9kW@wVm8P0-80I|+97OCY}zn;O_$1b5MV2tJFuMrQ(oqi4Yj$fv1N~`z96H3 z3Bo$uy-%^WV7#M3?l>E|cf!)?`XUg>#~uu&Vo_IE#~b(@!Bj>dWEeF9quF)IIN}~0 zgE?WbEzDexW8Kj0DBE+X(PNkAaBr@jh#(PnAXFH*42B>HVtDWkt-x}nMixQQjHc;~ zd7YVsb~h_|)_}g_K_tzt9Y|G)6qhTxI0+!o#;P6^wRqu)r+YFR*0!|SIx8AIopfZg zfxO9FD9!^tygGs4O6UuRCXK4`0fxbt)D%lG(s&X&5!p1;^YfF{vK>xpl6kw79lS{y z3{O0b@~LTbw{2c1!ye3ZJSU%0s=&5HAz@~C^a3yR3(;;kFgFX2Z2;{Y@|8r3My8Z_ z^f$(4Sh}QXiwNmJY04~%*838uZ0q9_zucV1$VKyol z&$g9`7zs487ioq_q>bq`Bo^p+{fRA!WtYvAN;Dgqk!PcdiFGj3oiUMWd1RVFCeP3! z$r!|{(Ls;pnAlffQmYWNzCueMvs@e06B%0@gsuq-W!f$}!2P}nJ7Wso!^UgiVF+RC z*$un_3|QgDZe81I22<<>Dr3EGlsvlN3)`-3b$Y>fRKmDz!!VJi^o(dY%q1B=vAnd{ zYC3k$jYN^gME1P)W}8qolHQCu%ecfq9fd=o{sslPKHMG{y<7R9K*t`~DzXzSbS+wK zuppepV(3ayM}lP$tr2mM-q27I>5GotwL4hK++GOnv7U#S<MEPzh_i#mttu1X!iG%PP%f)>^Kxlg;JUat&picO5D?5D$lTfO6Up zv5y)$w9cVNL#mVa0)aTa`m$kw_1NvznDn{|TLNM1M>+KLmGX~GP7a5|I`?^}FX~>< zn?zY8K{>n1u8?vgPU(6|?-W!Y zox^k$5KKq-decCwxMOp-gY{i+Y4Xa8m6jD$Q@3%X;TpSuJMgraqkXJOimk+%Digh9^`8Kr2UkEF+zb8*fBzfc>);8{0u}HQ{QX~pUxBBA0_VUka0mDceEp}u zW1tC6fcwEc;5vN!uYxXUf`7q}|10J$D1WF0K?Ib|U#b|Rhfc39V7;tJ#)8zwUn%bi=O^Ps0_%i9npYoCG zSvRqh?agEz7)rH$4UsRGde(1IM~cJTC_?hW8}Tm z*wHDx;?G)gT!s^~LllTVYX~hKW*JlfJ=E|UxlV>ofj?_+mIPoRC-C`J03Pm))wL2Vd4(#JlWJErV#M>KL@`7&Ib+d{&!K{F#C<$InZ z=(q~U9GUlgqD|f4sfryA3YE+(& z|AC4B?3or%!O^L`nF&p^C0a!@(i53PI|w~uP#N?TRExOuI4~osT%J^mxe9W+p%^xq zJq^u|!MD#Wy!pe<;vqn4zi!b%K%~0p@AfN4bO=+ffLdPLi|b9VfTQE>k#?f0aF}D+ z!c}km{^TZW+GJ%v$vY3x!TvVel0R{{u6P7R>+v literal 0 HcmV?d00001 diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index dd090848e7..338e714fee 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -143,15 +143,15 @@ class DeformableConv2D(tf.keras.layers.Layer): def __init__( self, filters: int, - kernel_size=(3, 3), - num_groups=1, - deformable_groups=1, - strides=(1, 1), - im2col=1, - use_bias=False, - padding="valid", - data_format="channels_last", - dilations=(1, 1), + kernel_size: tuple = (3, 3), + num_groups: int = 1, + deformable_groups: int = 1, + strides: tuple = (1, 1), + im2col: int = 1, + use_bias: bool = False, + padding: str = "valid", + data_format: str = "channels_last", + dilations: tuple = (1, 1), ): super(DeformableConv2D, self).__init__() self.filters = filters @@ -216,9 +216,7 @@ def compute_output_shape(self, input_shape): dilation=self.dilation_rate[i], ) 
new_space.append(new_dim) - return tf.TensorShape( - [input_shape[0]] + new_space + [self.filters] - ) + return tf.TensorShape([input_shape[0]] + new_space + [self.filters]) else: space = input_shape[2:] new_space = [] @@ -323,14 +321,14 @@ def _deformable_psroi_pool_back_prop(op, *grad): class DeformablePSROIAlign(tf.keras.layers.Layer): def __init__( self, - output_dim=256, - spatial_scale=1 / 16, - group_size=1, - pooled_size=7, - sample_per_part=4, - part_size=7, - trans_std=1, - data_format="channels_last", + output_dim: int = 256, + spatial_scale: float = 1 / 16, + group_size: int = 1, + pooled_size: int = 7, + sample_per_part: int = 4, + part_size: int = 7, + trans_std: int = 1, + data_format: str = "channels_last", ): super(DeformablePSROIAlign, self).__init__() self.spatial_scale = spatial_scale diff --git a/tensorflow_addons/layers/deformable_conv2d_test.py b/tensorflow_addons/layers/deformable_conv2d_test.py index 08e9e0a7b1..da66c6541f 100644 --- a/tensorflow_addons/layers/deformable_conv2d_test.py +++ b/tensorflow_addons/layers/deformable_conv2d_test.py @@ -19,10 +19,7 @@ DeformableConv2D, DeformablePSROIAlign, ) -from tensorflow_addons.layers.deformable_conv2d import ( - _deformable_conv2d, - _deformable_conv2d_ops_so, -) +from tensorflow_addons.layers.deformable_conv2d import _deformable_conv2d_ops_so from tensorflow_addons.utils import test_utils @@ -181,7 +178,7 @@ def _gradients(self, data_format, use_gpu=False): def _keras(self, data_format, use_gpu=False): inputs = self._create_test_data(data_format) - output = self._forward(inputs, 64, data_format=data_format) + self._forward(inputs, 64, data_format=data_format) def testForwardNCHW(self): self._forward_simple(data_format="channels_first", use_gpu=False) @@ -227,7 +224,6 @@ def _forward_simple(self, data_format, use_gpu=False): group_size = 1 pooled_size = 7 sample_per_part = 4 - no_trans = True part_size = 7 trans_std = 1 ( @@ -253,13 +249,6 @@ def _keras(self, data_format, use_gpu=False): rois = tf.convert_to_tensor( [[0, 1, 1, 800, 800], [0, 2, 2, 400, 400]], dtype=tf.float32 ) - spatial_scale = 1 / 16 - group_size = 1 - pooled_size = 7 - sample_per_part = 4 - no_trans = True - part_size = 7 - trans_std = 1 psroilayer = DeformablePSROIAlign(output_dim=64, data_format="channels_first") ret = psroilayer([featuremap, rois]) return ret From 948f8efba83cd9aea30d3fc7e70c0e6c78d6f6db Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 09:41:23 -0500 Subject: [PATCH 06/22] reformat --- .../layers/.deformable_conv2d_test.py.swp | Bin 16384 -> 0 bytes .../layers/deformable_conv2d_test.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 tensorflow_addons/layers/.deformable_conv2d_test.py.swp diff --git a/tensorflow_addons/layers/.deformable_conv2d_test.py.swp b/tensorflow_addons/layers/.deformable_conv2d_test.py.swp deleted file mode 100644 index ddef5ff00d52eea8d16a37539521ed627963d517..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 16384 zcmeI3ON<;x8OIw;;yB=#!~(=2qC7ri7RIv=Teh(^#N*wyJlNia-L)NSSz0yIH8Y*` zba%R{cd`gMk`)OlA_op|;o<`vqVSMF5f%xLgcC0j9+xNxECnF}5)cIu4oKnuRrmC3 z*1K_Vgo3)I-%fYE>-(y{s;^vbPhDu8V@K;#2A=mC#@q)_eEQXWkKcCh`wW9y1KT|y zMM!a5JYhR@Rd`_f=oV=x>T*)xll3G6lw&P?m6zQ3}M?Y-xGHnMy#B~VJB zlt3wgQUav}N(qz_C?!xz;Qw3#%HLr;iax$G?q@IleMjN<_4v6EhxZHdJL2!f`1kvT z`1rH@DJ4)!pp-xmc1WE~%5-255N}!ZLDS=V~r3C&565y6$+=hC;M-D&!{eR^D zKflK?eh4mu~kHItGYv9Y^QDB3!;1H;QJHab=8phASx4={2v)~e#10Ml_#E)Tr@$#N4?X~PfWO~?{(zr=hrvA93k>l3PQ!Q&{QGv43tk1+z!$(exCBms 
z`@kOX8U%SA{0La!82H<}4CCkE2S9=&U^jRc2PRK~%U~6}AH0G?n3us{f2&VatKdwqqM zp>3)DrlsMA2D6#%vVgliQQb7n6mFTI(%iFZWIw%bR|>@%Om*wH>LC?(RlcmQ?I;nL z(Up)Wp+c3?_oS#sku|n@u*PO;Y^uhl*Ycgue^J#FbuQJ0FRHFz$K4aI4$h1xqJ&f$ ziXKU+Hj>w{#yWlOLUc9eh02US)(py&eNX{fYI%ct=-PCnWwV|oeP0)WGL?s`E_5p< zpL9PJn{TBKqF~wQzG%GhX4cq%KeEo92JQt={?W6aMbeBKv{l1x*GIRt7K#edQ4ktr z2Yo;*MSbW5aTP*0y*kR1j?IjKqmfvP2I985wMelIjHOtyvC)g zOcZ8XrWdtPR!Y)95Dv2J%n z^%Qrcm^h}B!3Z!D>KpRFf)N||j;KoEbZgAQ)iR11a8+ZW6lTv48>6f`Z>18o;-u(s zBw%yN9kW@wVm8P0-80I|+97OCY}zn;O_$1b5MV2tJFuMrQ(oqi4Yj$fv1N~`z96H3 z3Bo$uy-%^WV7#M3?l>E|cf!)?`XUg>#~uu&Vo_IE#~b(@!Bj>dWEeF9quF)IIN}~0 zgE?WbEzDexW8Kj0DBE+X(PNkAaBr@jh#(PnAXFH*42B>HVtDWkt-x}nMixQQjHc;~ zd7YVsb~h_|)_}g_K_tzt9Y|G)6qhTxI0+!o#;P6^wRqu)r+YFR*0!|SIx8AIopfZg zfxO9FD9!^tygGs4O6UuRCXK4`0fxbt)D%lG(s&X&5!p1;^YfF{vK>xpl6kw79lS{y z3{O0b@~LTbw{2c1!ye3ZJSU%0s=&5HAz@~C^a3yR3(;;kFgFX2Z2;{Y@|8r3My8Z_ z^f$(4Sh}QXiwNmJY04~%*838uZ0q9_zucV1$VKyol z&$g9`7zs487ioq_q>bq`Bo^p+{fRA!WtYvAN;Dgqk!PcdiFGj3oiUMWd1RVFCeP3! z$r!|{(Ls;pnAlffQmYWNzCueMvs@e06B%0@gsuq-W!f$}!2P}nJ7Wso!^UgiVF+RC z*$un_3|QgDZe81I22<<>Dr3EGlsvlN3)`-3b$Y>fRKmDz!!VJi^o(dY%q1B=vAnd{ zYC3k$jYN^gME1P)W}8qolHQCu%ecfq9fd=o{sslPKHMG{y<7R9K*t`~DzXzSbS+wK zuppepV(3ayM}lP$tr2mM-q27I>5GotwL4hK++GOnv7U#S<MEPzh_i#mttu1X!iG%PP%f)>^Kxlg;JUat&picO5D?5D$lTfO6Up zv5y)$w9cVNL#mVa0)aTa`m$kw_1NvznDn{|TLNM1M>+KLmGX~GP7a5|I`?^}FX~>< zn?zY8K{>n1u8?vgPU(6|?-W!Y zox^k$5KKq-decCwxMOp-gY{i+Y4Xa8m6jD$Q@3%X;TpSuJMgraqkXJOimk+%Digh9^`8Kr2UkEF+zb8*fBzfc>);8{0u}HQ{QX~pUxBBA0_VUka0mDceEp}u zW1tC6fcwEc;5vN!uYxXUf`7q}|10J$D1WF0K?Ib|U#b|Rhfc39V7;tJ#)8zwUn%bi=O^Ps0_%i9npYoCG zSvRqh?agEz7)rH$4UsRGde(1IM~cJTC_?hW8}Tm z*wHDx;?G)gT!s^~LllTVYX~hKW*JlfJ=E|UxlV>ofj?_+mIPoRC-C`J03Pm))wL2Vd4(#JlWJErV#M>KL@`7&Ib+d{&!K{F#C<$InZ z=(q~U9GUlgqD|f4sfryA3YE+(& z|AC4B?3or%!O^L`nF&p^C0a!@(i53PI|w~uP#N?TRExOuI4~osT%J^mxe9W+p%^xq zJq^u|!MD#Wy!pe<;vqn4zi!b%K%~0p@AfN4bO=+ffLdPLi|b9VfTQE>k#?f0aF}D+ z!c}km{^TZW+GJ%v$vY3x!TvVel0R{{u6P7R>+v diff --git a/tensorflow_addons/layers/deformable_conv2d_test.py b/tensorflow_addons/layers/deformable_conv2d_test.py index da66c6541f..a66d34bd91 100644 --- a/tensorflow_addons/layers/deformable_conv2d_test.py +++ b/tensorflow_addons/layers/deformable_conv2d_test.py @@ -190,7 +190,7 @@ def testForwardNHWC(self): if tf.test.is_gpu_available(): self._forward_simple(data_format="channels_last", use_gpu=True) - """ + """ def testBackwardNCHW(self): self._gradients(data_format="channels_first", use_gpu=False) if tf.test.is_gpu_available(): From 176fcb12882fbac79bd6bd6e60a2936f6e8acf00 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 09:56:50 -0500 Subject: [PATCH 07/22] fixed problem --- tensorflow_addons/layers/deformable_conv2d.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index 338e714fee..b3a4447ac7 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -19,10 +19,10 @@ from tensorflow_addons.utils.resource_loader import LazySO from tensorflow.python.keras.utils import conv_utils -# _deformable_conv2d_ops_so = LazySO("custom_ops/layers/_deformable_conv2d_ops.so") -_deformable_conv2d_ops_so = LazySO( - "/home/admin-seu/TempData/sss/custom_ops/deformable_conv2d_ops_new/deformable_conv2D.so" -) +_deformable_conv2d_ops_so = LazySO("custom_ops/layers/_deformable_conv2d_ops.so") +# _deformable_conv2d_ops_so = LazySO( +# 
"/home/admin-seu/TempData/sss/custom_ops/deformable_conv2d_ops_new/deformable_conv2D.so" +# ) # _deformable_conv2d_ops_so = LazySO("/home/admin-seu/TempData/sss/SoftWare/addons/bazel-bin/tensorflow_addons/custom_ops/layers/_deformable_conv2d_ops.so") # _deformable_conv2d_ops_so = tf.load_op_library("custom_ops/layers/_deformable_conv2d_ops.so") From 603ffa79a32d2e16cff745c5da5fbfeed6043246 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 10:04:10 -0500 Subject: [PATCH 08/22] fix problem --- .../custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc index 9e88f66b56..d3e0ea8861 100755 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc @@ -987,7 +987,7 @@ template struct DeformablePSROIPoolForward; template struct DeformablePSROIPoolForward; template struct DeformablePSROIPoolBackwardKernel; template struct DeformablePSROIPoolBackwardKernel; -#endif } // namespace functor } // namespace addons } // namespace tensorflow +#endif From 1c7b33ad3cb028835e6043c57a0e1cdb35fde15c Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 10:23:30 -0500 Subject: [PATCH 09/22] fixed cpu don't support NCHW format in nn.conv2d --- tensorflow_addons/layers/deformable_conv2d.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index b3a4447ac7..af753efcc5 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -170,7 +170,7 @@ def __init__( strides=(1, 1), padding=self.padding, use_bias=True, - data_format=data_format, + data_format="channels_last", ) def build(self, input_shape): @@ -238,7 +238,10 @@ def call(self, inputs, **kwargs): :param kwargs: :return: """ - weight_info = self.conv_offset(inputs) + if self.data_format == "channels_first": + weight_info = self.conv_offset(tf.transpose(inputs, [0, 2, 3, 1])) + else: + weight_info = self.conv_offset(inputs) tf_data_format = "NCHW" tf_padding = "VALID" if self.padding == "same": @@ -249,7 +252,7 @@ def call(self, inputs, **kwargs): offset = tf.concat((o1, o2), axis=-1) mask = tf.sigmoid(mask) else: - o1, o2, mask = tf.split(weight_info, 3, axis=1) + o1, o2, mask = tf.split(tf.transpose(weight_info, [0, 3, 1, 2]), 3, axis=1) offset = tf.concat((o1, o2), axis=1) mask = tf.sigmoid(mask) result = _deformable_conv2d( From 527dc81eb17f27ef4b084e133a18fc35dc070265 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 11:02:55 -0500 Subject: [PATCH 10/22] fixed cpu don't support NCHW format in nn.conv2d --- .../layers/cc/kernels/deformable_conv_op.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc index 54e2ab2176..ce4bd2f53a 100644 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc @@ -121,7 +121,15 @@ typedef Eigen::ThreadPoolDevice CPUDevice; Eigen::IndexPair ContractionDims(bool adj_x, bool adj_y) { return {adj_x ? 
0 : 1, adj_y ? 1 : 0}; } - +#ifdef PLATFORM_WINDOWS +#include +template +void AtomicAdd(T *address, T val) { + static std::mutex mu; + std::lock_guard lk(mu); + *address += val; +} +#else void AtomicAdd(float *address, float val) { auto *address_as_ull = reinterpret_cast(address); uInt old = *address_as_ull; @@ -147,6 +155,7 @@ void AtomicAdd(double *address, double val) { *reinterpret_cast(&desired)); } while (assumed != old); } +#endif template void SwapAxisKernel(const CPUDevice &d, const int n, const int cuda_mem_size, From b13201efa98e4c3a02da7c765e5f45669731d10b Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 11:25:59 -0500 Subject: [PATCH 11/22] use WIN32 to support windows --- .../custom_ops/layers/cc/kernels/deformable_conv_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc index ce4bd2f53a..d427da4a31 100644 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc @@ -121,7 +121,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; Eigen::IndexPair ContractionDims(bool adj_x, bool adj_y) { return {adj_x ? 0 : 1, adj_y ? 1 : 0}; } -#ifdef PLATFORM_WINDOWS +#ifdef WIN32 #include template void AtomicAdd(T *address, T val) { From e60191b47c0a9956be8c3ddc43bf10d8186becc2 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 12:09:47 -0500 Subject: [PATCH 12/22] use conditional compile to support windows --- .../custom_ops/layers/cc/kernels/deformable_conv_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc index d427da4a31..685f4dc3d7 100644 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc @@ -3,6 +3,7 @@ #include #include +#include #include "tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv2d_utils.h" @@ -121,8 +122,7 @@ typedef Eigen::ThreadPoolDevice CPUDevice; Eigen::IndexPair ContractionDims(bool adj_x, bool adj_y) { return {adj_x ? 0 : 1, adj_y ? 
1 : 0}; } -#ifdef WIN32 -#include +#if PLATFORM_WINDOWS template void AtomicAdd(T *address, T val) { static std::mutex mu; From d1fc3ca967982ce9835dc6eb3cec7b0e902749fc Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 12:43:41 -0500 Subject: [PATCH 13/22] use conditional compile to support windows --- .../custom_ops/layers/cc/kernels/deformable_conv_op.cc | 2 -- .../custom_ops/layers/cc/kernels/deformable_conv_op.h | 6 ------ .../layers/cc/kernels/deformable_conv_op_gpu.cu.cc | 2 -- 3 files changed, 10 deletions(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc index 685f4dc3d7..024adae84a 100644 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc @@ -887,7 +887,6 @@ template struct DeformableConv2DCol2ImCoord; template struct PureAddTo; template struct SetOne; template struct SetZeros; -template struct SwapAxis; template struct SetNumAtIndex; template struct DeformableConv2DIm2Col; @@ -896,7 +895,6 @@ template struct DeformableConv2DCol2ImCoord; template struct PureAddTo; template struct SetOne; template struct SetZeros; -template struct SwapAxis; template struct SetNumAtIndex; template struct LaunchBatchMatMul; diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h index 484baccd67..27376bfb61 100755 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h @@ -111,12 +111,6 @@ struct DeformableConv2DCol2ImCoord { DType *grad_mask); }; template -struct SwapAxis { - void operator()(const Device &d, DType *input_data, - const TShape &origin_shape, const int axis_x, - const int axis_y); -}; -template struct DeformableConv2DCol2Im { void operator()(const Device &d, const DType *data_col, const DType *data_offset, const DType *data_mask, diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc index d3e0ea8861..9d139703f5 100755 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op_gpu.cu.cc @@ -762,7 +762,6 @@ template struct DeformableConv2DCol2ImCoord; template struct PureAddTo; template struct SetOne; template struct SetZeros; -template struct SwapAxis; template struct SetNumAtIndex; template struct DeformableConv2DIm2Col; @@ -771,7 +770,6 @@ template struct DeformableConv2DCol2ImCoord; template struct PureAddTo; template struct SetOne; template struct SetZeros; -template struct SwapAxis; template struct SetNumAtIndex; template se::DeviceMemory AsDeviceMemory(const T *cuda_memory) { From 84d1d05cca6da9786e4d4df33df51c71b85289b0 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 13:07:26 -0500 Subject: [PATCH 14/22] fix syntax problem in windows --- .../custom_ops/layers/cc/kernels/deformable_conv_op.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc index 024adae84a..b66f37f947 100644 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc +++ 
b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.cc @@ -842,9 +842,8 @@ void LaunchBatchMatMul::launch(OpKernelContext *context, contract_pairs[0] = ContractionDims(adj_x, adj_y); auto &device = context->eigen_device(); for (int i = 0; i < t_out.dimension(0); ++i) { - t_out.template chip<0>(i).device(device) = - (t_in_x.template chip<0>(i)) - .template contract(t_in_y.template chip<0>(i), contract_pairs); + t_out.chip(i, 0).device(device) = + (t_in_x.chip(i, 0)).contract(t_in_y.chip(i, 0), contract_pairs); } } From 52ae9015d24f43b1520212ce15d62d031a2b5455 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sun, 9 Feb 2020 22:18:15 -0500 Subject: [PATCH 15/22] remove some redudant code --- .../layers/cc/kernels/deformable_conv_op.h | 21 ------------------- .../layers/cc/ops/deformable_conv2d.cc | 3 --- tensorflow_addons/layers/deformable_conv2d.py | 9 ++------ 3 files changed, 2 insertions(+), 31 deletions(-) diff --git a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h index 27376bfb61..2640cac134 100755 --- a/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h +++ b/tensorflow_addons/custom_ops/layers/cc/kernels/deformable_conv_op.h @@ -2,27 +2,6 @@ #ifndef TF_OPS_DEFORMABLE_CONV2D_H #define TF_OPS_DEFORMABLE_CONV2D_H -#ifdef __JETBRAINS_IDE__ -#define __host__ -#define __device__ -#define __shared__ -#define __constant__ -#define __global__ - -// This is slightly mental, but gets it to properly index device function calls -// like __popc and whatever. -//#define __CUDACC__ - -// These headers are all implicitly present when you compile CUDA with clang. -// Clion doesn't know that, so we include them explicitly to make the indexer -// happy. Doing this when you actually build is, obviously, a terrible idea :D -//#include <__clang_cuda_builtin_vars.h> -//#include <__clang_cuda_intrinsics.h> -//#include <__clang_cuda_math_forward_declares.h> -//#include <__clang_cuda_complex_builtins.h> -//#include <__clang_cuda_cmath.h> -#endif // __JETBRAINS_IDE__ - #define EIGEN_USE_THREADS #ifdef GOOGLE_CUDA #define EIGEN_USE_GPU diff --git a/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc b/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc index c9ec963b13..c6d15fad92 100644 --- a/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc +++ b/tensorflow_addons/custom_ops/layers/cc/ops/deformable_conv2d.cc @@ -1,6 +1,3 @@ -// -// Created by 孙嘉禾 on 2019/12/31. 
-// #include #include diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index af753efcc5..f49fb7a2d3 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -20,11 +20,6 @@ from tensorflow.python.keras.utils import conv_utils _deformable_conv2d_ops_so = LazySO("custom_ops/layers/_deformable_conv2d_ops.so") -# _deformable_conv2d_ops_so = LazySO( -# "/home/admin-seu/TempData/sss/custom_ops/deformable_conv2d_ops_new/deformable_conv2D.so" -# ) -# _deformable_conv2d_ops_so = LazySO("/home/admin-seu/TempData/sss/SoftWare/addons/bazel-bin/tensorflow_addons/custom_ops/layers/_deformable_conv2d_ops.so") -# _deformable_conv2d_ops_so = tf.load_op_library("custom_ops/layers/_deformable_conv2d_ops.so") def _deformable_conv2d( @@ -137,7 +132,7 @@ def _deformable_conv2d_back_prop(op, grad): return data_grad # List of 4 Tensor, since we have 4 input -# @tf.keras.utils.register_keras_serializable(package="Addons") +@tf.keras.utils.register_keras_serializable(package="Addons") class DeformableConv2D(tf.keras.layers.Layer): @typechecked def __init__( @@ -320,7 +315,7 @@ def _deformable_psroi_pool_back_prop(op, *grad): return [data_grad[0], tf.zeros_like(bbox), data_grad[1]] -# @tf.keras.utils.register_keras_serializable(package="Addons") +@tf.keras.utils.register_keras_serializable(package="Addons") class DeformablePSROIAlign(tf.keras.layers.Layer): def __init__( self, From 27aa599a3534af7bbdb7a6895049b7efbb8a5f3f Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sat, 22 Feb 2020 11:17:41 +0800 Subject: [PATCH 16/22] add EquiConv according to https://arxiv.org/pdf/1903.08094.pdf --- docs/tutorials/deformable_conv2d_ops.ipynb | 8 +- tensorflow_addons/layers/BUILD | 13 + tensorflow_addons/layers/__init__.py | 1 + tensorflow_addons/layers/deformable_conv2d.py | 55 +-- tensorflow_addons/layers/equi_conv.py | 333 ++++++++++++++++++ tensorflow_addons/layers/equi_conv_test.py | 23 ++ 6 files changed, 412 insertions(+), 21 deletions(-) create mode 100644 tensorflow_addons/layers/equi_conv.py create mode 100644 tensorflow_addons/layers/equi_conv_test.py diff --git a/docs/tutorials/deformable_conv2d_ops.ipynb b/docs/tutorials/deformable_conv2d_ops.ipynb index 7509feba66..fa9481da2d 100644 --- a/docs/tutorials/deformable_conv2d_ops.ipynb +++ b/docs/tutorials/deformable_conv2d_ops.ipynb @@ -65,7 +65,10 @@ "\n", "(DeformableConv2D is just like normal tf.keras.layers.Conv2D)\n", "\n", - "(DeformablePSROIAlign is something like tf.image.crop_to_bounding_box)" + "(DeformablePSROIAlign is something like tf.image.crop_and_resize)\n", + "\n", + "(EquiConv is proposed by https://arxiv.org/pdf/1903.08094.pdf, which is a special case\n", + "of DeformableConv2D)" ], "metadata": { "collapsed": false @@ -84,7 +87,10 @@ "featuremap = tf.random.uniform(shape=[batch_size, 20, 20, channels], dtype=tf.float32)\n", "\n", "deformable_layer = tfa.layers.DeformableConv2D(filters, kernel_size, padding=padding)\n", + "equi_conv_layer = tfa.layers.EquiConv(64, (3, 3), 1, 1, (1, 1),\n", + " 1, False, \"same\", \"channels_last\")\n", "result = deformable_layer(featuremap)\n", + "equi_result = equi_conv_layer(featuremap)\n", "\n", "image_featuremap = tf.random.normal(shape=[2, 64, 100, 100])\n", "rois = tf.convert_to_tensor([[0, 1, 1, 800, 800], [1, 2, 2, 400, 400]], dtype=tf.float32)\n", diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index 5483f6a64f..df303a61f9 100644 --- 
a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -7,6 +7,7 @@ py_library( srcs = [ "__init__.py", "deformable_conv2d.py", + "equi_conv.py", "gelu.py", "maxout.py", "multihead_attention.py", @@ -168,3 +169,15 @@ py_test( ":layers", ], ) + +py_test( + name = "equi_conv_test", + size = "small", + srcs = [ + "equi_conv_test.py", + ], + main = "equi_conv_test.py", + deps = [ + ":layers", + ], +) diff --git a/tensorflow_addons/layers/__init__.py b/tensorflow_addons/layers/__init__.py index 733cb79c76..a9b8ce4d39 100644 --- a/tensorflow_addons/layers/__init__.py +++ b/tensorflow_addons/layers/__init__.py @@ -28,3 +28,4 @@ from tensorflow_addons.layers.wrappers import WeightNormalization from tensorflow_addons.layers.deformable_conv2d import DeformableConv2D from tensorflow_addons.layers.deformable_conv2d import DeformablePSROIAlign +from tensorflow_addons.layers.equi_conv import EquiConv diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index f49fb7a2d3..bcd250a765 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -16,6 +16,7 @@ import tensorflow as tf from typeguard import typechecked +from tensorflow_addons.utils import types from tensorflow_addons.utils.resource_loader import LazySO from tensorflow.python.keras.utils import conv_utils @@ -93,7 +94,7 @@ def _deformable_conv2d_back_prop(op, grad): num_groups = op.get_attr("num_groups") deformable_groups = op.get_attr("deformable_groups") """ - REGISTER_OP("Addons>DeformableConv2DBackProp") + REGISTER_OP("AddonsDeformableConv2DBackProp") .Input("input: T") .Input("filter: T") .Input("offset: T") @@ -147,6 +148,12 @@ def __init__( padding: str = "valid", data_format: str = "channels_last", dilations: tuple = (1, 1), + kernel_initializer: types.Initializer = None, + bias_initializer: types.Initializer = None, + kernel_regularizer: types.Regularizer = None, + bias_regularizer: types.Regularizer = None, + kernel_constraint: types.Constraint = None, + bias_constraint: types.Constraint = None, ): super(DeformableConv2D, self).__init__() self.filters = filters @@ -159,13 +166,25 @@ def __init__( self.padding = padding self.data_format = data_format self.dilations = dilations + self.kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self.bias_initializer = tf.keras.initializers.get(bias_initializer) + self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self.bias_regularizer = tf.keras.regularizers.get(bias_regularizer) + self.kernel_constraint = tf.keras.constraints.get(kernel_constraint) + self.bias_constraint = tf.keras.constraints.get(bias_constraint) self.conv_offset = tf.keras.layers.Conv2D( self.deformable_groups * 3 * self.kernel_size[0] * self.kernel_size[1], kernel_size=self.kernel_size, strides=(1, 1), padding=self.padding, - use_bias=True, + use_bias=self.use_bias, data_format="channels_last", + kernel_initializer=self.kernel_initializer, + bias_initializer=self.bias_initializer, + kernel_regularizer=self.kernel_regularizer, + bias_regularizer=self.bias_regularizer, + kernel_constraint=self.kernel_constraint, + bias_constraint=self.bias_constraint, ) def build(self, input_shape): @@ -174,26 +193,22 @@ def build(self, input_shape): else: channel = int(input_shape[1]) if self.data_format == "channels_last": - self.filter = tf.Variable( - initial_value=tf.random.normal( - shape=[ - self.kernel_size[0], - self.kernel_size[1], - channel, - self.filters, - ] - ) + 
self.filter = self.add_weight( + name="filter", + shape=[self.kernel_size[0], self.kernel_size[1], channel, self.filters], + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + trainable=True, ) else: - self.filter = tf.Variable( - initial_value=tf.random.normal( - shape=[ - self.filters, - channel, - self.kernel_size[0], - self.kernel_size[1], - ] - ) + self.filter = self.add_weight( + name="filter", + shape=[self.filters, channel, self.kernel_size[0], self.kernel_size[1]], + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + constraint=self.kernel_constraint, + trainable=True, ) self.built = True diff --git a/tensorflow_addons/layers/equi_conv.py b/tensorflow_addons/layers/equi_conv.py new file mode 100644 index 0000000000..06773ab0bd --- /dev/null +++ b/tensorflow_addons/layers/equi_conv.py @@ -0,0 +1,333 @@ +import tensorflow as tf +import numpy as np +import math + +from typeguard import typechecked + +from tensorflow_addons.utils import types +from tensorflow.python.keras.utils import conv_utils + +from tensorflow_addons.layers.deformable_conv2d import _deformable_conv2d + + +def rotation_matrix(axis, theta): + """ + Return the rotation matrix associated with counterclockwise rotation about + the given axis by theta radians. + """ + axis = np.asarray(axis) + axis = axis / math.sqrt(np.dot(axis, axis)) + a = math.cos(theta / 2.0) + b, c, d = -axis * math.sin(theta / 2.0) + aa, bb, cc, dd = a * a, b * b, c * c, d * d + bc, ad, ac, ab, bd, cd = b * c, a * d, a * c, a * b, b * d, c * d + return np.array( + [ + [aa + bb - cc - dd, 2 * (bc + ad), 2 * (bd - ac)], + [2 * (bc - ad), aa + cc - bb - dd, 2 * (cd + ab)], + [2 * (bd + ac), 2 * (cd - ab), aa + dd - bb - cc], + ] + ) + + +def equi_coord(pano_W, pano_H, k_W, k_H, u, v): + fov_w = k_W * np.deg2rad(360.0 / float(pano_W)) + focal = (float(k_W) / 2) / np.tan(fov_w / 2) + c_x = 0 + c_y = 0 + + u_r, v_r = u, v + u_r, v_r = u_r - float(pano_W) / 2.0, v_r - float(pano_H) / 2.0 + phi, theta = u_r / (pano_W) * (np.pi) * 2, -v_r / (pano_H) * (np.pi) + + ROT = rotation_matrix((0, 1, 0), phi) + ROT = np.matmul(ROT, rotation_matrix((1, 0, 0), theta)) # np.eye(3) + + h_range = np.array(range(k_H)) + w_range = np.array(range(k_W)) + w_ones = np.ones(k_W) + h_ones = np.ones(k_H) + h_grid = ( + np.matmul(np.expand_dims(h_range, -1), np.expand_dims(w_ones, 0)) + + 0.5 + - float(k_H) / 2 + ) + w_grid = ( + np.matmul(np.expand_dims(h_ones, -1), np.expand_dims(w_range, 0)) + + 0.5 + - float(k_W) / 2 + ) + + K = np.array([[focal, 0, c_x], [0, focal, c_y], [0.0, 0.0, 1.0]]) + inv_K = np.linalg.inv(K) + rays = np.stack([w_grid, h_grid, np.ones(h_grid.shape)], 0) + rays = np.matmul(inv_K, rays.reshape(3, k_H * k_W)) + rays /= np.linalg.norm(rays, axis=0, keepdims=True) + rays = np.matmul(ROT, rays) + rays = rays.reshape((3, k_H, k_W)) + + phi = np.arctan2(rays[0, ...], rays[2, ...]) + theta = np.arcsin(np.clip(rays[1, ...], -1, 1)) + x = (pano_W) / (2.0 * np.pi) * phi + float(pano_W) / 2.0 + y = (pano_H) / (np.pi) * theta + float(pano_H) / 2.0 + + roi_y = h_grid + v_r + float(pano_H) / 2.0 + roi_x = w_grid + u_r + float(pano_W) / 2.0 + + new_roi_y = y + new_roi_x = x + + offsets_x = new_roi_x - roi_x + offsets_y = new_roi_y - roi_y + + return offsets_x, offsets_y + + +def equi_coord_fixed_resoltuion(pano_W, pano_H, k_W, k_H, u, v, pano_Hf=-1, pano_Wf=-1): + pano_Hf = pano_H if pano_Hf <= 0 else pano_H / pano_Hf + pano_Wf = pano_W if pano_Wf <= 0 else pano_W / pano_Wf + 
fov_w = k_W * np.deg2rad(360.0 / float(pano_Wf)) + focal = (float(k_W) / 2) / np.tan(fov_w / 2) + c_x = 0 + c_y = 0 + + u_r, v_r = u, v + u_r, v_r = u_r - float(pano_W) / 2.0, v_r - float(pano_H) / 2.0 + phi, theta = u_r / (pano_W) * (np.pi) * 2, -v_r / (pano_H) * (np.pi) + + ROT = rotation_matrix((0, 1, 0), phi) + ROT = np.matmul(ROT, rotation_matrix((1, 0, 0), theta)) # np.eye(3) + + h_range = np.array(range(k_H)) + w_range = np.array(range(k_W)) + w_ones = np.ones(k_W) + h_ones = np.ones(k_H) + h_grid = ( + np.matmul(np.expand_dims(h_range, -1), np.expand_dims(w_ones, 0)) + + 0.5 + - float(k_H) / 2 + ) + w_grid = ( + np.matmul(np.expand_dims(h_ones, -1), np.expand_dims(w_range, 0)) + + 0.5 + - float(k_W) / 2 + ) + + K = np.array([[focal, 0, c_x], [0, focal, c_y], [0.0, 0.0, 1.0]]) + inv_K = np.linalg.inv(K) + rays = np.stack([w_grid, h_grid, np.ones(h_grid.shape)], 0) + rays = np.matmul(inv_K, rays.reshape(3, k_H * k_W)) + rays /= np.linalg.norm(rays, axis=0, keepdims=True) + rays = np.matmul(ROT, rays) + rays = rays.reshape((3, k_H, k_W)) + + phi = np.arctan2(rays[0, ...], rays[2, ...]) + theta = np.arcsin(np.clip(rays[1, ...], -1, 1)) + x = (pano_W) / (2.0 * np.pi) * phi + float(pano_W) / 2.0 + y = (pano_H) / (np.pi) * theta + float(pano_H) / 2.0 + + roi_y = h_grid + v_r + float(pano_H) / 2.0 + roi_x = w_grid + u_r + float(pano_W) / 2.0 + + new_roi_y = y + new_roi_x = x + + offsets_x = new_roi_x - roi_x + offsets_y = new_roi_y - roi_y + + return offsets_x, offsets_y + + +def distortion_aware_map(pano_W, pano_H, k_W, k_H, s_width=1, s_height=1, bs=16): + offset = np.zeros(shape=[pano_H, pano_W, k_H * k_W * 2]) + + for v in range(0, pano_H, s_height): + for u in range(0, pano_W, s_width): + offsets_x, offsets_y = equi_coord_fixed_resoltuion( + pano_W, pano_H, k_W, k_H, u, v, 1, 1 + ) + offsets = np.concatenate( + (np.expand_dims(offsets_y, -1), np.expand_dims(offsets_x, -1)), axis=-1 + ) + total_offsets = offsets.flatten().astype("float32") + offset[v, u, :] = total_offsets + + offset = tf.constant(offset) + offset = tf.expand_dims(offset, 0) + offset = tf.tile(offset, multiples=[bs, 1, 1, 1]) + offset = tf.cast(offset, tf.float32) + + return offset + + +class EquiConv(tf.keras.layers.Layer): + @typechecked + def __init__( + self, + filters: int, + kernel_size: tuple = (3, 3), + num_groups: int = 1, + deformable_groups: int = 1, + strides: tuple = (1, 1), + im2col: int = 1, + use_bias: bool = False, + padding: str = "valid", + data_format: str = "channels_last", + dilations: tuple = (1, 1), + use_relu: bool = False, + kernel_initializer: types.Initializer = None, + kernel_regularizer: types.Regularizer = None, + kernel_constraint: types.Constraint = None, + **kwargs + ): + super(EquiConv, self).__init__(**kwargs) + self.filters = filters + self.kernel_size = kernel_size + self.num_groups = num_groups + self.deformable_groups = deformable_groups + self.strides = strides + self.im2col = im2col + self.use_bias = use_bias + self.padding = padding + self.data_format = data_format + self.dilations = dilations + self.use_relu = use_relu + self.kernel_initializer = tf.keras.initializers.get(kernel_initializer) + self.kernel_regularizer = tf.keras.regularizers.get(kernel_regularizer) + self.kernel_constraint = tf.keras.constraints.get(kernel_constraint) + if self.padding == "valid": + self.tf_pad = "VALID" + else: + self.tf_pad = "SAME" + + def build(self, input_shape): + if self.data_format == "channels_last": + channel = int(input_shape[-1]) + else: + channel = int(input_shape[1]) + 
self.kernel = self.add_weight(
+            shape=[self.filters, channel, self.kernel_size[0], self.kernel_size[1]],
+            initializer=self.kernel_initializer,
+            regularizer=self.kernel_regularizer,
+            constraint=self.kernel_constraint,
+            trainable=True,
+        )
+        if self.use_bias:
+            self.bias = self.add_weight(
+                shape=[1, self.filters, 1, 1],
+                initializer=self.kernel_initializer,
+                regularizer=self.kernel_regularizer,
+                constraint=self.kernel_constraint,
+                trainable=True,
+            )
+        self.built = True
+
+    def compute_output_shape(self, input_shape):
+        input_shape = tf.TensorShape(input_shape).as_list()
+        if self.data_format == "channels_last":
+            space = input_shape[1:-1]
+            new_space = []
+            for i in range(len(space)):
+                new_dim = conv_utils.conv_output_length(
+                    space[i],
+                    self.kernel_size[i],
+                    padding=self.padding,
+                    stride=self.strides[i],
+                    dilation=self.dilations[i],
+                )
+                new_space.append(new_dim)
+            return tf.TensorShape([input_shape[0]] + new_space + [self.filters])
+        else:
+            space = input_shape[2:]
+            new_space = []
+            for i in range(len(space)):
+                new_dim = conv_utils.conv_output_length(
+                    space[i],
+                    self.kernel_size[i],
+                    padding=self.padding,
+                    stride=self.strides[i],
+                    dilation=self.dilations[i],
+                )
+                new_space.append(new_dim)
+            return tf.TensorShape([input_shape[0], self.filters] + new_space)
+
+    def call(self, inputs, **kwargs):
+        if self.data_format == "channels_first":
+            data = tf.transpose(inputs, [0, 2, 3, 1])
+        else:
+            data = inputs
+        n, h, w, c_i = tuple(data.get_shape().as_list())
+        data_shape = tf.shape(data)
+        """
+        The reference implementation from the paper sets bs to self.batch_size. Here we
+        use data_shape[0] instead: self.batch_size would be a fixed constant and cannot
+        change, while the actual batch size can differ between training and inference,
+        so tf.shape is used to obtain the dynamic batch size.
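+
+        For example, for an input created with tf.keras.Input(shape=(20, 20, 3)) the
+        static batch dimension is None, so data.get_shape()[0] is not usable here,
+        while tf.shape(data)[0] is a scalar tensor holding the runtime batch size,
+        which is what lets the offset map built by distortion_aware_map be tiled
+        once per batch element.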
+ """ + offset = tf.stop_gradient( + distortion_aware_map( + w, + h, + self.kernel_size[0], + self.kernel_size[1], + s_width=self.strides[0], + s_height=self.strides[1], + bs=data_shape[0], + ) + ) + mask = tf.stop_gradient( + tf.zeros( + shape=[ + data_shape[0], + data_shape[1], + data_shape[2], + self.kernel_size[0] * self.kernel_size[1], + ] + ) + ) + data = tf.transpose(data, [0, 3, 1, 2]) + offset = tf.transpose(offset, [0, 3, 1, 2]) + mask = tf.transpose(mask, [0, 3, 1, 2]) + res = _deformable_conv2d( + data, + self.kernel, + offset, + mask, + [1, 1, self.strides[0], self.strides[1]], + num_groups=self.num_groups, + deformable_groups=self.deformable_groups, + padding=self.tf_pad, + data_format="NCHW", + ) + if self.use_bias: + res = tf.add(res, self.bias) + if self.use_relu: + res = tf.nn.relu(res) + if self.data_format == "channels_last": + return tf.transpose(res, [0, 2, 3, 1]) + else: + return res + + def get_config(self): + config = { + "filters": self.filters, + "kernel_size": self.kernel_size, + "num_groups": self.num_groups, + "deformable_groups": self.deformable_groups, + "strides": self.strides, + "im2col": self.im2col, + "use_bias": self.use_bias, + "padding": self.padding, + "data_format": self.data_format, + "dilations": self.dilations, + "use_relu": self.use_relu, + "kernel_initializer": tf.keras.initializers.serialize( + self.kernel_initializer + ), + "kernel_regularizer": tf.keras.regularizers.serialize( + self.kernel_regularizer + ), + "kernel_constraint": tf.keras.constraints.serialize(self.kernel_constraint), + "tf_pad": self.tf_pad, + } + base_config = super().get_config() + return {**base_config, **config} diff --git a/tensorflow_addons/layers/equi_conv_test.py b/tensorflow_addons/layers/equi_conv_test.py new file mode 100644 index 0000000000..4438416852 --- /dev/null +++ b/tensorflow_addons/layers/equi_conv_test.py @@ -0,0 +1,23 @@ +import tensorflow as tf +from tensorflow_addons.layers.equi_conv import EquiConv + +from tensorflow_addons.utils import test_utils + + +@test_utils.run_all_in_graph_and_eager_modes +class EquiConvTest(tf.test.TestCase): + def testKerasNHWC(self): + input = tf.ones(shape=[16, 400, 400, 3]) + layer = EquiConv(64, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_last") + res = layer(input) + self.assertAllEqual(tf.shape(input), tf.shape(res)) + + def testKerasNCHW(self): + input = tf.ones(shape=[16, 3, 400, 400]) + layer = EquiConv(64, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_first") + res = layer(input) + self.assertAllEqual(tf.shape(input), tf.shape(res)) + + +if __name__ == "__main__": + tf.test.main() From 057d927b765a5a31251150ea8c9e7ab7c8b5da6c Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sat, 22 Feb 2020 11:41:30 +0800 Subject: [PATCH 17/22] fix timeout problem --- tensorflow_addons/layers/equi_conv_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow_addons/layers/equi_conv_test.py b/tensorflow_addons/layers/equi_conv_test.py index 4438416852..4c61f6a868 100644 --- a/tensorflow_addons/layers/equi_conv_test.py +++ b/tensorflow_addons/layers/equi_conv_test.py @@ -7,14 +7,14 @@ @test_utils.run_all_in_graph_and_eager_modes class EquiConvTest(tf.test.TestCase): def testKerasNHWC(self): - input = tf.ones(shape=[16, 400, 400, 3]) - layer = EquiConv(64, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_last") + input = tf.ones(shape=[1, 10, 10, 3]) + layer = EquiConv(16, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_last") res = layer(input) 
self.assertAllEqual(tf.shape(input), tf.shape(res)) def testKerasNCHW(self): - input = tf.ones(shape=[16, 3, 400, 400]) - layer = EquiConv(64, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_first") + input = tf.ones(shape=[1, 3, 10, 10]) + layer = EquiConv(16, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_first") res = layer(input) self.assertAllEqual(tf.shape(input), tf.shape(res)) From 663622a6c3f394d848d57641c107f521ff0f41a1 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sat, 22 Feb 2020 11:55:06 +0800 Subject: [PATCH 18/22] fix test problem --- tensorflow_addons/layers/equi_conv_test.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tensorflow_addons/layers/equi_conv_test.py b/tensorflow_addons/layers/equi_conv_test.py index 4c61f6a868..5285c5e823 100644 --- a/tensorflow_addons/layers/equi_conv_test.py +++ b/tensorflow_addons/layers/equi_conv_test.py @@ -7,14 +7,20 @@ @test_utils.run_all_in_graph_and_eager_modes class EquiConvTest(tf.test.TestCase): def testKerasNHWC(self): - input = tf.ones(shape=[1, 10, 10, 3]) - layer = EquiConv(16, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_last") + channel = 32 + input = tf.ones(shape=[1, 10, 10, channel]) + layer = EquiConv( + channel, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_last" + ) res = layer(input) self.assertAllEqual(tf.shape(input), tf.shape(res)) def testKerasNCHW(self): - input = tf.ones(shape=[1, 3, 10, 10]) - layer = EquiConv(16, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_first") + channel = 32 + input = tf.ones(shape=[1, channel, 10, 10]) + layer = EquiConv( + channel, (3, 3), 1, 1, (1, 1), 1, False, "same", "channels_first" + ) res = layer(input) self.assertAllEqual(tf.shape(input), tf.shape(res)) From 6a72bcc9618ab702a21ff58e6db70010b0515114 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Fri, 27 Mar 2020 10:16:06 +0800 Subject: [PATCH 19/22] fix problem --- tensorflow_addons/layers/BUILD | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index df303a61f9..a2580d1a9e 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -159,6 +159,11 @@ py_test( "multihead_attention_test.py", ], main = "multihead_attention_test.py", + deps = [ + ":layers", + ], +) +py_test( name = "deformable_conv_test", size = "small", srcs = [ From dc3e3abb34a45e9b0fcbf5015c25477c71e0eb28 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sat, 28 Mar 2020 00:34:05 +0800 Subject: [PATCH 20/22] format code --- tensorflow_addons/layers/deformable_conv2d_test.py | 5 +++-- tensorflow_addons/layers/equi_conv_test.py | 5 ++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow_addons/layers/deformable_conv2d_test.py b/tensorflow_addons/layers/deformable_conv2d_test.py index a66d34bd91..53f8283ed4 100644 --- a/tensorflow_addons/layers/deformable_conv2d_test.py +++ b/tensorflow_addons/layers/deformable_conv2d_test.py @@ -15,6 +15,7 @@ import numpy as np import tensorflow as tf +import pytest from tensorflow_addons.layers.deformable_conv2d import ( DeformableConv2D, DeformablePSROIAlign, @@ -23,7 +24,7 @@ from tensorflow_addons.utils import test_utils -@test_utils.run_all_in_graph_and_eager_modes +@pytest.mark.usefixtures("maybe_run_functions_eagerly") class DeformableConv2DTest(tf.test.TestCase): def _forward( self, @@ -213,7 +214,7 @@ def testKerasNHWC(self): self._keras(data_format="channels_last", use_gpu=True) 
-@test_utils.run_all_in_graph_and_eager_modes +@pytest.mark.usefixtures("maybe_run_functions_eagerly") class DeformablePSROIAlignTest(tf.test.TestCase): def _forward_simple(self, data_format, use_gpu=False): featuremap = tf.random.normal(shape=[1, 64, 100, 100]) diff --git a/tensorflow_addons/layers/equi_conv_test.py b/tensorflow_addons/layers/equi_conv_test.py index 5285c5e823..090e242e40 100644 --- a/tensorflow_addons/layers/equi_conv_test.py +++ b/tensorflow_addons/layers/equi_conv_test.py @@ -1,10 +1,9 @@ import tensorflow as tf +import pytest from tensorflow_addons.layers.equi_conv import EquiConv -from tensorflow_addons.utils import test_utils - -@test_utils.run_all_in_graph_and_eager_modes +@pytest.mark.usefixtures("maybe_run_functions_eagerly") class EquiConvTest(tf.test.TestCase): def testKerasNHWC(self): channel = 32 From 4aad322548e1060b752179944588f7ec698f2e90 Mon Sep 17 00:00:00 2001 From: public <975759105@qq.com> Date: Sat, 28 Mar 2020 00:54:09 +0800 Subject: [PATCH 21/22] manully format code --- tensorflow_addons/layers/BUILD | 1 + tensorflow_addons/layers/deformable_conv2d_test.py | 7 ++++--- tensorflow_addons/layers/equi_conv_test.py | 6 ++++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tensorflow_addons/layers/BUILD b/tensorflow_addons/layers/BUILD index a2580d1a9e..cc477b1d0c 100644 --- a/tensorflow_addons/layers/BUILD +++ b/tensorflow_addons/layers/BUILD @@ -163,6 +163,7 @@ py_test( ":layers", ], ) + py_test( name = "deformable_conv_test", size = "small", diff --git a/tensorflow_addons/layers/deformable_conv2d_test.py b/tensorflow_addons/layers/deformable_conv2d_test.py index 53f8283ed4..cd3d8897d6 100644 --- a/tensorflow_addons/layers/deformable_conv2d_test.py +++ b/tensorflow_addons/layers/deformable_conv2d_test.py @@ -16,6 +16,7 @@ import numpy as np import tensorflow as tf import pytest +import sys from tensorflow_addons.layers.deformable_conv2d import ( DeformableConv2D, DeformablePSROIAlign, @@ -24,7 +25,7 @@ from tensorflow_addons.utils import test_utils -@pytest.mark.usefixtures("maybe_run_functions_eagerly") +@test_utils.run_all_in_graph_and_eager_modes class DeformableConv2DTest(tf.test.TestCase): def _forward( self, @@ -214,7 +215,7 @@ def testKerasNHWC(self): self._keras(data_format="channels_last", use_gpu=True) -@pytest.mark.usefixtures("maybe_run_functions_eagerly") +@test_utils.run_all_in_graph_and_eager_modes class DeformablePSROIAlignTest(tf.test.TestCase): def _forward_simple(self, data_format, use_gpu=False): featuremap = tf.random.normal(shape=[1, 64, 100, 100]) @@ -266,4 +267,4 @@ def testKerasNHWC(self): if __name__ == "__main__": - tf.test.main() + sys.exit(pytest.main([__file__])) diff --git a/tensorflow_addons/layers/equi_conv_test.py b/tensorflow_addons/layers/equi_conv_test.py index 090e242e40..f618d8d068 100644 --- a/tensorflow_addons/layers/equi_conv_test.py +++ b/tensorflow_addons/layers/equi_conv_test.py @@ -1,9 +1,11 @@ import tensorflow as tf import pytest +import sys from tensorflow_addons.layers.equi_conv import EquiConv +from tensorflow_addons.utils import test_utils -@pytest.mark.usefixtures("maybe_run_functions_eagerly") +@test_utils.run_all_in_graph_and_eager_modes class EquiConvTest(tf.test.TestCase): def testKerasNHWC(self): channel = 32 @@ -25,4 +27,4 @@ def testKerasNCHW(self): if __name__ == "__main__": - tf.test.main() + sys.exit(pytest.main([__file__])) From 430cc6a19a47e797d796d3f5b87dcc8a95d75b23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Libor=20Van=C4=9Bk?= 
<35312583+liborvaneksw@users.noreply.github.com> Date: Mon, 8 Jun 2020 18:38:19 +0200 Subject: [PATCH 22/22] Update deformable_conv2d.py DeformableConv2D: serialization fixes --- tensorflow_addons/layers/deformable_conv2d.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tensorflow_addons/layers/deformable_conv2d.py b/tensorflow_addons/layers/deformable_conv2d.py index bcd250a765..3b699dc804 100644 --- a/tensorflow_addons/layers/deformable_conv2d.py +++ b/tensorflow_addons/layers/deformable_conv2d.py @@ -19,6 +19,7 @@ from tensorflow_addons.utils import types from tensorflow_addons.utils.resource_loader import LazySO from tensorflow.python.keras.utils import conv_utils +from typing import Union _deformable_conv2d_ops_so = LazySO("custom_ops/layers/_deformable_conv2d_ops.so") @@ -139,23 +140,24 @@ class DeformableConv2D(tf.keras.layers.Layer): def __init__( self, filters: int, - kernel_size: tuple = (3, 3), + kernel_size: Union[tuple, list] = (3, 3), num_groups: int = 1, deformable_groups: int = 1, - strides: tuple = (1, 1), + strides: Union[tuple, list] = (1, 1), im2col: int = 1, use_bias: bool = False, padding: str = "valid", data_format: str = "channels_last", - dilations: tuple = (1, 1), + dilations: Union[tuple, list] = (1, 1), kernel_initializer: types.Initializer = None, bias_initializer: types.Initializer = None, kernel_regularizer: types.Regularizer = None, bias_regularizer: types.Regularizer = None, kernel_constraint: types.Constraint = None, bias_constraint: types.Constraint = None, + **kwargs ): - super(DeformableConv2D, self).__init__() + super(DeformableConv2D, self).__init__(**kwargs) self.filters = filters self.kernel_size = kernel_size self.num_groups = num_groups
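
A minimal usage sketch, not taken from the patches themselves, of what the PATCH 22
serialization fixes enable: a Keras config round trip serializes tuples as lists, and
Keras passes keyword arguments such as name and dtype back into __init__ when it rebuilds
a layer, so the widened Union[tuple, list] hints and the forwarded **kwargs keep the
typeguard checks from rejecting a restored config. The snippet assumes a tensorflow-addons
build that contains these patches; tfa.layers.DeformableConv2D refers to the layer added
above, not a released API.

    import tensorflow_addons as tfa

    # Arguments arrive in the list/keyword form a saved config produces; with the
    # widened type hints and **kwargs this constructor call now succeeds.
    layer = tfa.layers.DeformableConv2D(
        filters=32,
        kernel_size=[3, 3],           # list instead of tuple, as restored from JSON
        strides=[1, 1],
        dilations=[1, 1],
        padding="same",
        name="deform_conv_restored",  # extra kwarg forwarded to tf.keras.layers.Layer
    )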