From f3b6ee4c5e9f19e3cdc875824fb38af749881fa7 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 28 Oct 2021 08:37:17 +0000 Subject: [PATCH 01/30] add cpu version, using set: sum, min, max --- .../fused/fused_gather_scatter_op.cc | 181 +++++++++++++++++ .../operators/fused/fused_gather_scatter_op.h | 184 ++++++++++++++++++ 2 files changed, 365 insertions(+) create mode 100644 paddle/fluid/operators/fused/fused_gather_scatter_op.cc create mode 100644 paddle/fluid/operators/fused/fused_gather_scatter_op.h diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc new file mode 100644 index 00000000000000..0eea9b90a4a94e --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc @@ -0,0 +1,181 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused/fused_gather_scatter_op.h" + +namespace paddle { +namespace operators { + +class FusedGatherScatterOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::InvalidArgument( + "Input(X) of FusedGatherScatterOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Gather_index"), true, + platform::errors::InvalidArgument( + "Input(Gather_indx) of FusedGatherScatterOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Scatter_index"), true, + platform::errors::InvalidArgument( + "Input(Scatter_index) of FusedGatherScatterOp should " + "not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("Out"), true, + platform::errors::InvalidArgument( + "Output(Out) of FusedGatherScatterOp should not be null.")); + + auto gather_index_dims = ctx->GetInputDim("Gather_index"); + if (gather_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(gather_index_dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of gather_index should be 1 when it " + "is 2D, but we get %d", + gather_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + gather_index_dims.size(), 1, + platform::errors::InvalidArgument( + "The gather_index should be 1D, when it is not 2D, but we get %d", + gather_index_dims.size())); + } + + auto scatter_index_dims = ctx->GetInputDim("Scatter_index"); + if (scatter_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(scatter_index_dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of scatter_index should be 1 when it " + "is 2D, but we get %d", + scatter_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + scatter_index_dims.size(), 1, + platform::errors::InvalidArgument("The scatter_index should be 1D, " + "when it is not 2D, but we get %d", + scatter_index_dims.size())); + } + + auto dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) 
const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.device_context()); + } +}; + +class FusedGatherScatterGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto in_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), in_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.device_context()); + } +}; + +class FusedGatherScatterOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor with data type float32, " + "float64 or float16"); + AddInput("Gather_index", "The gather index tensor."); + AddInput("Scatter_index", "The scatter index tensor."); + AddOutput("Out", "Output tensor of fused_gather_scatter op."); + AddAttr( + "pool_type", + "(string, default 'SUM')" + "We use Gather_index to gather correspoinding place of X. " + "Then we need to use different pool type to scatter the result.") + .SetDefault("SUM") + .InEnum({"SUM", "MEAN", "MIN", "MAX"}); + // TODO(daisiming): Add a simple example here. + AddComment(R"DOC( +Fused Gather Scatter Operator. + +$Out = Scatter(Gather(X, Gather_index), Scatter_index, pool_type)$ + +This operator helps perform fused computation of gather operator and scatter operator, so as to +decrease intermediate GPU memory occupation of using gather op and scatter op successively. + +Example: + +pass +)DOC"); + } +}; + +template +class FusedGatherScatterGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fused_gather_scatter_grad"); + op->SetInput("Gather_index", this->Input("Gather_index")); + op->SetInput("Scatter_index", this->Input("Scatter_index")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR( + fused_gather_scatter, ops::FusedGatherScatterOP, + ops::FusedGatherScatterOpMaker, + ops::FusedGatherScatterGradOpMaker, + ops::FusedGatherScatterGradOpMaker); +REGISTER_OPERATOR(fused_gather_scatter_grad, ops::FusedGatherScatterGradOp); +REGISTER_OP_CPU_KERNEL(fused_gather_scatter, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel); +REGISTER_OP_CPU_KERNEL( + fused_gather_scatter_grad, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h 
b/paddle/fluid/operators/fused/fused_gather_scatter_op.h new file mode 100644 index 00000000000000..b782a478753338 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct FusedGatherScatterSumFunctor { + void operator()(const int& first_flag, const Tensor& src_slice, + Tensor* dst_slice) { + auto eigen_src = framework::EigenVector::Flatten(src_slice); + auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); + eigen_dst += eigen_src; + } +}; + +template +struct FusedGatherScatterMinFunctor { + void operator()(const int& first_flag, const Tensor& src_slice, + Tensor* dst_slice) { + auto eigen_src = framework::EigenVector::Flatten(src_slice); + auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); + if (first_flag == 0) { + eigen_dst += eigen_src; + } else { + eigen_dst = eigen_dst.cwiseMin(eigen_src); + } + } +}; + +template +struct FusedGatherScatterMaxFunctor { + void operator()(const int& first_flag, const Tensor& src_slice, + Tensor* dst_slice) { + auto eigen_src = framework::EigenVector::Flatten(src_slice); + auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); + if (first_flag == 0) { + eigen_dst += eigen_src; + } else { + eigen_dst = eigen_dst.cwiseMax(eigen_src); + } + } +}; + +template +void elementwise_inner_operation(const Tensor& src, Tensor* dst, + const IndexT& src_index, + const IndexT& dst_index, + const bool& first_flag, Functor functor) { + auto src_slice = src.Slice(src_index, src_index + 1); + auto dst_slice = dst->Slice(dst_index, dst_index + 1); + + functor(first_flag, src_slice, &dst_slice); +} + +template +void gather_scatter_cpu_for_loop(const int index_size, const IndexT* g_index, + const IndexT* s_index, const Tensor& src, + Tensor* dst, const std::string& pool_type) { + Functor functor; + if (pool_type == "MIN" || pool_type == "MAX") { + std::set existed_dst; + for (int i = 0; i < index_size; ++i) { + IndexT src_ptr = g_index[i]; + IndexT dst_ptr = s_index[i]; + int nRet = std::count(existed_dst.begin(), existed_dst.end(), dst_ptr); + if (nRet == 0) { + elementwise_inner_operation(src, dst, src_ptr, + dst_ptr, 0, functor); + existed_dst.insert(dst_ptr); + } else { + elementwise_inner_operation(src, dst, src_ptr, + dst_ptr, 1, functor); + } + } + } else if (pool_type == "SUM" || pool_type == "MEAN") { + for (int i = 0; i < index_size; ++i) { + IndexT src_ptr = g_index[i]; + IndexT dst_ptr = s_index[i]; + elementwise_inner_operation(src, dst, src_ptr, + dst_ptr, 0, functor); + } + } +} + +template +class FusedGatherScatterOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) 
const override { + auto* X = ctx.Input("X"); + auto* gather_index = ctx.Input("Gather_index"); + auto* scatter_index = ctx.Input("Scatter_index"); + auto* Y = ctx.Output("Out"); + + int index_size = gather_index->dims()[0]; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + auto src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; + const size_t& memset_bytes = memset_size * sizeof(T); + memset(p_output, 0, memset_bytes); + + const IndexT* g_index = gather_index->data(); + const IndexT* s_index = scatter_index->data(); + + std::string pool_type = ctx.Attr("pool_type"); + if (pool_type == "SUM") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + } else if (pool_type == "MIN") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + } else if (pool_type == "MAX") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + } else if (pool_type == "MEAN") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + // TODO(daisiming): Add mean operation. + } + } +}; + +template +class FusedGatherScatterGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input(framework::GradVarName("Out")); + auto* gather_index = ctx.Input("Gather_index"); + auto* scatter_index = ctx.Input("Scatter_index"); + auto* Y = ctx.Output(framework::GradVarName("X")); + + int index_size = gather_index->dims()[0]; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + auto src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; + const size_t& memset_bytes = memset_size * sizeof(T); + memset(p_output, 0, memset_bytes); + + const IndexT* g_index = gather_index->data(); + const IndexT* s_index = scatter_index->data(); + + std::string pool_type = ctx.Attr("pool_type"); + if (pool_type == "SUM") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + } else if (pool_type == "MIN") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + } else if (pool_type == "MAX") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + } else if (pool_type == "MEAN") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + // TODO(daisiming): Add mean operation. 
+ } + } +}; + +} // namespace operators +} // namespace paddle From 470aa0fd23f4b26faba65a3faab67f97e2a4148f Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Fri, 29 Oct 2021 07:40:39 +0000 Subject: [PATCH 02/30] add cpu version: mean --- .../operators/fused/fused_gather_scatter_op.h | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index b782a478753338..563ae656fc9b8d 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -73,7 +73,7 @@ void elementwise_inner_operation(const Tensor& src, Tensor* dst, } template -void gather_scatter_cpu_for_loop(const int index_size, const IndexT* g_index, +void gather_scatter_cpu_for_loop(const int& index_size, const IndexT* g_index, const IndexT* s_index, const Tensor& src, Tensor* dst, const std::string& pool_type) { Functor functor; @@ -102,6 +102,16 @@ void gather_scatter_cpu_for_loop(const int index_size, const IndexT* g_index, } } +template +void cal_mean_cpu_for_loop(const int& size, Tensor* dst, int* count) { + for (int i = 0; i < size; ++i) { + if (count[i] == 0) continue; + auto dst_slice = dst->Slice(i, i + 1); + auto eigen_dst = framework::EigenVector::Flatten(dst_slice); + eigen_dst = eigen_dst / static_cast(count[i]); + } +} + template class FusedGatherScatterOpKernel : public framework::OpKernel { public: @@ -136,7 +146,16 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { } else if (pool_type == "MEAN") { gather_scatter_cpu_for_loop>( index_size, g_index, s_index, *X, Y, pool_type); - // TODO(daisiming): Add mean operation. + int count[src_dims[0]]; + memset(count, 0, src_dims[0] * sizeof(int)); + for (int i = 0; i < index_size; ++i) { + IndexT dst_ptr = s_index[i]; + count[dst_ptr] += 1; + } + for (int i = 0; i < src_dims[0]; ++i) { + VLOG(0) << count[i]; + } + cal_mean_cpu_for_loop(src_dims[0], Y, count); } } }; @@ -175,7 +194,16 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { } else if (pool_type == "MEAN") { gather_scatter_cpu_for_loop>( index_size, g_index, s_index, *X, Y, pool_type); - // TODO(daisiming): Add mean operation. 
+ int count[src_dims[0]]; + memset(count, 0, src_dims[0] * sizeof(int)); + for (int i = 0; i < index_size; ++i) { + IndexT dst_ptr = s_index[i]; + count[dst_ptr] += 1; + } + for (int i = 0; i < src_dims[0]; ++i) { + VLOG(0) << count[i]; + } + cal_mean_cpu_for_loop(src_dims[0], Y, count); } } }; From b31111c87b37b1df7c2a47cd9fdc1acd4d5c086b Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Fri, 29 Oct 2021 10:13:15 +0000 Subject: [PATCH 03/30] improve cpu code and fix dynamic memory allcation problem --- .../fused/fused_gather_scatter_op.cc | 32 +++-- .../operators/fused/fused_gather_scatter_op.h | 120 ++++++++---------- 2 files changed, 75 insertions(+), 77 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc index 0eea9b90a4a94e..789ee7e3f217ed 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc @@ -160,22 +160,28 @@ REGISTER_OPERATOR( ops::FusedGatherScatterGradOpMaker, ops::FusedGatherScatterGradOpMaker); REGISTER_OPERATOR(fused_gather_scatter_grad, ops::FusedGatherScatterGradOp); -REGISTER_OP_CPU_KERNEL(fused_gather_scatter, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel); +REGISTER_OP_CPU_KERNEL( + fused_gather_scatter, ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel); + REGISTER_OP_CPU_KERNEL( fused_gather_scatter_grad, ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel); + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index 563ae656fc9b8d..401657a4a5d5a4 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -25,7 +25,7 @@ using Tensor = framework::Tensor; template struct FusedGatherScatterSumFunctor { - void operator()(const int& first_flag, const Tensor& src_slice, + void operator()(const bool& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); @@ -35,11 +35,11 @@ struct FusedGatherScatterSumFunctor { template struct FusedGatherScatterMinFunctor { - void operator()(const int& first_flag, const Tensor& src_slice, + void operator()(const bool& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if 
(first_flag == 0) { + if (first_flag) { eigen_dst += eigen_src; } else { eigen_dst = eigen_dst.cwiseMin(eigen_src); @@ -53,7 +53,7 @@ struct FusedGatherScatterMaxFunctor { Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag == 0) { + if (first_flag) { eigen_dst += eigen_src; } else { eigen_dst = eigen_dst.cwiseMax(eigen_src); @@ -73,42 +73,54 @@ void elementwise_inner_operation(const Tensor& src, Tensor* dst, } template -void gather_scatter_cpu_for_loop(const int& index_size, const IndexT* g_index, - const IndexT* s_index, const Tensor& src, - Tensor* dst, const std::string& pool_type) { +void gather_scatter_cpu_for_loop(const int& index_size, const int& input_size, + const IndexT* g_index, const IndexT* s_index, + const Tensor& src, Tensor* dst, + const std::string& pool_type) { Functor functor; if (pool_type == "MIN" || pool_type == "MAX") { std::set existed_dst; + bool in_set = false; for (int i = 0; i < index_size; ++i) { - IndexT src_ptr = g_index[i]; - IndexT dst_ptr = s_index[i]; - int nRet = std::count(existed_dst.begin(), existed_dst.end(), dst_ptr); - if (nRet == 0) { - elementwise_inner_operation(src, dst, src_ptr, - dst_ptr, 0, functor); - existed_dst.insert(dst_ptr); + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + in_set = existed_dst.find(dst_idx) != existed_dst.end(); + if (!in_set) { + elementwise_inner_operation(src, dst, src_idx, + dst_idx, true, functor); + existed_dst.emplace(dst_idx); } else { - elementwise_inner_operation(src, dst, src_ptr, - dst_ptr, 1, functor); + elementwise_inner_operation( + src, dst, src_idx, dst_idx, false, functor); } } - } else if (pool_type == "SUM" || pool_type == "MEAN") { + } else if (pool_type == "SUM") { for (int i = 0; i < index_size; ++i) { - IndexT src_ptr = g_index[i]; - IndexT dst_ptr = s_index[i]; - elementwise_inner_operation(src, dst, src_ptr, - dst_ptr, 0, functor); + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + elementwise_inner_operation(src, dst, src_idx, + dst_idx, false, functor); } - } -} - -template -void cal_mean_cpu_for_loop(const int& size, Tensor* dst, int* count) { - for (int i = 0; i < size; ++i) { - if (count[i] == 0) continue; - auto dst_slice = dst->Slice(i, i + 1); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(count[i]); + } else if (pool_type == "MEAN") { + for (int i = 0; i < index_size; ++i) { + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + elementwise_inner_operation(src, dst, src_idx, + dst_idx, false, functor); + } + int* count = new int[input_size]; + memset(count, 0, input_size * sizeof(int)); + for (int i = 0; i < index_size; ++i) { + IndexT dst_idx = s_index[i]; + count[dst_idx] += 1; + } + for (int i = 0; i < input_size; ++i) { + if (count[i] == 0) continue; + auto dst_slice = dst->Slice(i, i + 1); + auto eigen_dst = framework::EigenVector::Flatten(dst_slice); + eigen_dst = eigen_dst / static_cast(count[i]); + } + delete[] count; } } @@ -121,10 +133,10 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { auto* scatter_index = ctx.Input("Scatter_index"); auto* Y = ctx.Output("Out"); - int index_size = gather_index->dims()[0]; + const int& index_size = gather_index->dims()[0]; T* p_output = Y->mutable_data(ctx.GetPlace()); - auto src_dims = X->dims(); + const auto& src_dims = X->dims(); int64_t memset_size = 1; for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; 
const size_t& memset_bytes = memset_size * sizeof(T); @@ -136,26 +148,16 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { std::string pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MIN") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MAX") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); - int count[src_dims[0]]; - memset(count, 0, src_dims[0] * sizeof(int)); - for (int i = 0; i < index_size; ++i) { - IndexT dst_ptr = s_index[i]; - count[dst_ptr] += 1; - } - for (int i = 0; i < src_dims[0]; ++i) { - VLOG(0) << count[i]; - } - cal_mean_cpu_for_loop(src_dims[0], Y, count); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } } }; @@ -169,10 +171,10 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { auto* scatter_index = ctx.Input("Scatter_index"); auto* Y = ctx.Output(framework::GradVarName("X")); - int index_size = gather_index->dims()[0]; + const int& index_size = gather_index->dims()[0]; T* p_output = Y->mutable_data(ctx.GetPlace()); - auto src_dims = X->dims(); + const auto& src_dims = X->dims(); int64_t memset_size = 1; for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; const size_t& memset_bytes = memset_size * sizeof(T); @@ -181,29 +183,19 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { const IndexT* g_index = gather_index->data(); const IndexT* s_index = scatter_index->data(); - std::string pool_type = ctx.Attr("pool_type"); + const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MIN") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MAX") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); - int count[src_dims[0]]; - memset(count, 0, src_dims[0] * sizeof(int)); - for (int i = 0; i < index_size; ++i) { - IndexT dst_ptr = s_index[i]; - count[dst_ptr] += 1; - } - for (int i = 0; i < src_dims[0]; ++i) { - VLOG(0) << count[i]; - } - cal_mean_cpu_for_loop(src_dims[0], Y, count); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } } }; From a54860c7c1c5e776e2f59135130b45c7730d1396 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 3 Nov 2021 12:50:22 +0000 Subject: [PATCH 04/30] fix arg error, add index judge, delete fp16 --- .../fused/fused_gather_scatter_op.cc | 37 +++++++++---------- .../operators/fused/fused_gather_scatter_op.h | 3 +- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc index 
789ee7e3f217ed..cd7459ac70f3ed 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc @@ -29,7 +29,7 @@ class FusedGatherScatterOP : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("Gather_index"), true, platform::errors::InvalidArgument( - "Input(Gather_indx) of FusedGatherScatterOp should not be null.")); + "Input(Gather_index) of FusedGatherScatterOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasInput("Scatter_index"), true, platform::errors::InvalidArgument( "Input(Scatter_index) of FusedGatherScatterOp should " @@ -43,14 +43,14 @@ class FusedGatherScatterOP : public framework::OperatorWithKernel { if (gather_index_dims.size() == 2) { PADDLE_ENFORCE_EQ(gather_index_dims[1], 1, platform::errors::InvalidArgument( - "The last dim of gather_index should be 1 when it " + "The last dim of Gather_index should be 1 when it " "is 2D, but we get %d", gather_index_dims[1])); } else { PADDLE_ENFORCE_EQ( gather_index_dims.size(), 1, platform::errors::InvalidArgument( - "The gather_index should be 1D, when it is not 2D, but we get %d", + "The Gather_index should be 1D, when it is not 2D, but we get %d", gather_index_dims.size())); } @@ -58,17 +58,19 @@ class FusedGatherScatterOP : public framework::OperatorWithKernel { if (scatter_index_dims.size() == 2) { PADDLE_ENFORCE_EQ(scatter_index_dims[1], 1, platform::errors::InvalidArgument( - "The last dim of scatter_index should be 1 when it " + "The last dim of Scatter_index should be 1 when it " "is 2D, but we get %d", scatter_index_dims[1])); } else { PADDLE_ENFORCE_EQ( scatter_index_dims.size(), 1, - platform::errors::InvalidArgument("The scatter_index should be 1D, " + platform::errors::InvalidArgument("The Scatter_index should be 1D, " "when it is not 2D, but we get %d", scatter_index_dims.size())); } + // TODO(daisiming): If the shape of scatter_index and gather_index should be + // same? 
auto dims = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", dims); } @@ -160,17 +162,15 @@ REGISTER_OPERATOR( ops::FusedGatherScatterGradOpMaker, ops::FusedGatherScatterGradOpMaker); REGISTER_OPERATOR(fused_gather_scatter_grad, ops::FusedGatherScatterGradOp); -REGISTER_OP_CPU_KERNEL( - fused_gather_scatter, ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel); +REGISTER_OP_CPU_KERNEL(fused_gather_scatter, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel); REGISTER_OP_CPU_KERNEL( fused_gather_scatter_grad, @@ -181,7 +181,4 @@ REGISTER_OP_CPU_KERNEL( ops::FusedGatherScatterGradOpKernel, ops::FusedGatherScatterGradOpKernel, ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel); + ops::FusedGatherScatterGradOpKernel); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index 401657a4a5d5a4..8e901aac98961c 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -73,7 +73,7 @@ void elementwise_inner_operation(const Tensor& src, Tensor* dst, } template -void gather_scatter_cpu_for_loop(const int& index_size, const int& input_size, +void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, const IndexT* g_index, const IndexT* s_index, const Tensor& src, Tensor* dst, const std::string& pool_type) { @@ -134,6 +134,7 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { auto* Y = ctx.Output("Out"); const int& index_size = gather_index->dims()[0]; + if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); From 2f79165017f8202e16c6a660490b42ca317e1323 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 4 Nov 2021 09:10:18 +0000 Subject: [PATCH 05/30] fix bug in CudaAtomicMax and CudaAtomicMin --- paddle/fluid/platform/cuda_primitives.h | 28 +++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index 4708a99e8fc4ca..d443e78ed874f3 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -178,9 +178,17 @@ CUDA_ATOMIC_WRAPPER(Max, int64_t) { // Here, we check long long int must be int64_t. 
  static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
                "long long should be int64");
-  return CudaAtomicMax(
-      reinterpret_cast<unsigned long long int *>(address),  // NOLINT
-      static_cast<unsigned long long int>(val));            // NOLINT
+  long long int res = *address;  // NOLINT
+  while (val > res) {
+    long long int old = res;  // NOLINT
+    res = (long long int)atomicCAS((unsigned long long int *)address,  // NOLINT
+                                   (unsigned long long int)old,        // NOLINT
+                                   (unsigned long long int)val);       // NOLINT
+    if (res == old) {
+      break;
+    }
+  }
+  return res;
 }
 
 CUDA_ATOMIC_WRAPPER(Max, float) {
@@ -254,9 +262,17 @@ CUDA_ATOMIC_WRAPPER(Min, int64_t) {
   // Here, we check long long int must be int64_t.
   static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
                 "long long should be int64");
-  return CudaAtomicMin(
-      reinterpret_cast<unsigned long long int *>(address),  // NOLINT
-      static_cast<unsigned long long int>(val));            // NOLINT
+  long long int res = *address;  // NOLINT
+  while (val < res) {
+    long long int old = res;  // NOLINT
+    res = (long long int)atomicCAS((unsigned long long int *)address,  // NOLINT
+                                   (unsigned long long int)old,        // NOLINT
+                                   (unsigned long long int)val);       // NOLINT
+    if (res == old) {
+      break;
+    }
+  }
+  return res;
 }
 
 CUDA_ATOMIC_WRAPPER(Min, float) {

From 8316d6eb8fa2f25476dd570374f831f7ca516144 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Mon, 8 Nov 2021 07:42:07 +0000
Subject: [PATCH 06/30] add CUDA version

---
 .../fused/fused_gather_scatter_op.cu          | 349 ++++++++++++++++++
 .../operators/fused/fused_gather_scatter_op.h |   2 +-
 2 files changed, 350 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/operators/fused/fused_gather_scatter_op.cu

diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu
new file mode 100644
index 00000000000000..4f0514ebf47f46
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu
@@ -0,0 +1,349 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/fused/fused_gather_scatter_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct GatherScatterSumCUDAFunctor { + DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, + const IndexT& out_i) { + paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); + } +}; + +template +struct GatherScatterMaxCUDAFunctor { + DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, + const IndexT& out_i) { + paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); + } +}; + +template +struct GatherScatterMinCUDAFunctor { + DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, + const IndexT& out_i) { + paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); + } +}; + +template +__global__ void GatherScatterCUDAKernel(const T* params, + const IndexT* gather_indices, + const IndexT* scatter_indices, + T* output, size_t index_size, + size_t slice_size, Functor functor) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; + IndexT gather_i = gather_indices[indices_i]; + IndexT scatter_i = scatter_indices[indices_i]; + int64_t in_i = gather_i * slice_size + slice_i; + int64_t out_i = scatter_i * slice_size + slice_i; + functor(params, output, in_i, out_i); + } +} + +template +__global__ void InputResetCUDAKernel(T* output, size_t input_size, + size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { + if (*(output + i) == std::numeric_limits::min() || + *(output + i) == std::numeric_limits::max()) { + *(output + i) = 0; + } + } +} + +template +__global__ void ComputeCountCUDAKernel(int* count, + const IndexT* scatter_indices, + size_t index_size) { + CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { + IndexT scatter_i = scatter_indices[i]; + paddle::platform::CudaAtomicAdd(count + scatter_i, 1); + } +} + +template +__global__ void ManipulateMeanCUDAKernel(T* output, int* count, + size_t input_size, size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { + int64_t c_index = i / slice_size; + if (*(count + c_index) > 1) { + *(output + i) = *(output + i) / *(count + c_index); + } + } +} + +template +class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* gather_index = ctx.Input("Gather_index"); + auto* scatter_index = ctx.Input("Scatter_index"); + auto* Y = ctx.Output("Out"); + std::string pool_type = ctx.Attr("pool_type"); + + const int& index_size = gather_index->dims()[0]; + if (index_size == 0) return; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + const auto& src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); + if (pool_type == "SUM" || pool_type == "MEAN") { + cudaMemset(p_output, 0, memset_bytes); + } else if (pool_type == "MAX") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, + 
std::numeric_limits::min()); + } else if (pool_type == "MIN") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, + std::numeric_limits::max()); + } + + int64_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) { + slice_size *= src_dims[i]; + } + const T* p_src = X->data(); + const IndexT* g_index = gather_index->data(); + const IndexT* s_index = scatter_index->data(); + + int block = 512; + int64_t n = slice_size * index_size; + int64_t grid = (n + block - 1) / block; + int64_t input_size = src_dims[0]; + if (pool_type == "SUM") { + GatherScatterSumCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + } else if (pool_type == "MAX") { + GatherScatterMaxCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + + int64_t grid_max = (input_size * slice_size + block - 1) / block; + InputResetCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, input_size, slice_size); + } else if (pool_type == "MIN") { + GatherScatterMinCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + + int64_t grid_min = (input_size * slice_size + block - 1) / block; + InputResetCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, input_size, slice_size); + } else if (pool_type == "MEAN") { + GatherScatterSumCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + + Tensor count; + int* count_ptr = count.mutable_data({input_size}, ctx.GetPlace()); + cudaMemset(count_ptr, 0, input_size * sizeof(int)); + int64_t grid_count = (index_size + block - 1) / block; + ComputeCountCUDAKernel< + T, IndexT><<( + ctx.device_context()) + .stream()>>>(count_ptr, s_index, index_size); + + int64_t grid_mean = (input_size * slice_size + block - 1) / block; + ManipulateMeanCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, count_ptr, input_size, slice_size); + } + } +}; + +template +class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input(framework::GradVarName("Out")); + auto* gather_index = ctx.Input("Gather_index"); + auto* scatter_index = ctx.Input("Scatter_index"); + auto* Y = ctx.Output(framework::GradVarName("X")); + std::string pool_type = ctx.Attr("pool_type"); + + const int& index_size = gather_index->dims()[0]; + if (index_size == 0) return; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + const auto& src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); + if (pool_type == "SUM" || pool_type == "MEAN") { + cudaMemset(p_output, 0, memset_bytes); + } else if (pool_type == "MAX") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, + std::numeric_limits::min()); + } else if (pool_type == "MIN") { + thrust::device_ptr 
p_output_ptr(p_output); + thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, + std::numeric_limits::max()); + } + + int64_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) { + slice_size *= src_dims[i]; + } + const T* p_src = X->data(); + const IndexT* g_index = gather_index->data(); + const IndexT* s_index = scatter_index->data(); + + int block = 512; + int64_t n = slice_size * index_size; + int64_t grid = (n + block - 1) / block; + int64_t input_size = src_dims[0]; + if (pool_type == "SUM") { + GatherScatterSumCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + } else if (pool_type == "MAX") { + GatherScatterMaxCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + + int64_t grid_max = (input_size * slice_size + block - 1) / block; + InputResetCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, input_size, slice_size); + } else if (pool_type == "MIN") { + GatherScatterMinCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + + int64_t grid_min = (input_size * slice_size + block - 1) / block; + InputResetCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, input_size, slice_size); + } else if (pool_type == "MEAN") { + GatherScatterSumCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + + Tensor count; + int* count_ptr = count.mutable_data({input_size}, ctx.GetPlace()); + cudaMemset(count_ptr, 0, input_size * sizeof(int)); + int64_t grid_count = (index_size + block - 1) / block; + ComputeCountCUDAKernel< + T, IndexT><<( + ctx.device_context()) + .stream()>>>(count_ptr, s_index, index_size); + + int64_t grid_mean = (input_size * slice_size + block - 1) / block; + ManipulateMeanCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, count_ptr, input_size, slice_size); + } + } +}; + +} // namespace operators +} // namespace paddle + +using CUDA = paddle::platform::CUDADeviceContext; +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + fused_gather_scatter, ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL( + fused_gather_scatter_grad, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index 8e901aac98961c..8a8b9eaf9654fc 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ 
b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -115,7 +115,7 @@ void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, count[dst_idx] += 1; } for (int i = 0; i < input_size; ++i) { - if (count[i] == 0) continue; + if (count[i] <= 1) continue; auto dst_slice = dst->Slice(i, i + 1); auto eigen_dst = framework::EigenVector::Flatten(dst_slice); eigen_dst = eigen_dst / static_cast(count[i]); From 80accd81f522287605978890c6c138de1d799b5e Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 9 Nov 2021 06:13:28 +0000 Subject: [PATCH 07/30] fix grad_op bug for index --- .../fluid/operators/fused/fused_gather_scatter_op.cu | 4 ++-- paddle/fluid/operators/fused/fused_gather_scatter_op.h | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu index 4f0514ebf47f46..b82c4e7da835ee 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu @@ -215,8 +215,8 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* gather_index = ctx.Input("Gather_index"); - auto* scatter_index = ctx.Input("Scatter_index"); + auto* gather_index = ctx.Input("Scatter_index"); + auto* scatter_index = ctx.Input("Gather_index"); auto* Y = ctx.Output(framework::GradVarName("X")); std::string pool_type = ctx.Attr("pool_type"); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index 8a8b9eaf9654fc..e74a861d63e86b 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -80,11 +80,10 @@ void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, Functor functor; if (pool_type == "MIN" || pool_type == "MAX") { std::set existed_dst; - bool in_set = false; for (int i = 0; i < index_size; ++i) { IndexT src_idx = g_index[i]; IndexT dst_idx = s_index[i]; - in_set = existed_dst.find(dst_idx) != existed_dst.end(); + bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); if (!in_set) { elementwise_inner_operation(src, dst, src_idx, dst_idx, true, functor); @@ -146,7 +145,7 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { const IndexT* g_index = gather_index->data(); const IndexT* s_index = scatter_index->data(); - std::string pool_type = ctx.Attr("pool_type"); + const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { gather_scatter_cpu_for_loop>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); @@ -168,11 +167,12 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* gather_index = ctx.Input("Gather_index"); - auto* scatter_index = ctx.Input("Scatter_index"); + auto* gather_index = ctx.Input("Scatter_index"); + auto* scatter_index = ctx.Input("Gather_index"); auto* Y = ctx.Output(framework::GradVarName("X")); const int& index_size = gather_index->dims()[0]; + if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); From 868a50a16b82b3bfe26af0f262ea332c9acb76e9 Mon Sep 17 00:00:00 2001 From: DesmonDay 
<908660116@qq.com>
Date: Thu, 11 Nov 2021 04:56:07 +0000
Subject: [PATCH 08/30] add op test, add correct cpu grad op

---
 .../fused/fused_gather_scatter_op.cc          |  43 +++--
 .../operators/fused/fused_gather_scatter_op.h |  77 ++++++--
 .../unittests/test_fused_gather_scatter_op.py | 182 ++++++++++++++++++
 3 files changed, 273 insertions(+), 29 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py

diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc
index cd7459ac70f3ed..e55216e2e82a2d 100644
--- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc
+++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc
@@ -22,22 +22,13 @@ class FusedGatherScatterOP : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"), true,
-        platform::errors::InvalidArgument(
-            "Input(X) of FusedGatherScatterOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Gather_index"), true,
-        platform::errors::InvalidArgument(
-            "Input(Gather_index) of FusedGatherScatterOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Scatter_index"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(Scatter_index) of FusedGatherScatterOp should "
-                          "not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"), true,
-        platform::errors::InvalidArgument(
-            "Output(Out) of FusedGatherScatterOp should not be null."));
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGatherScatter");
+    OP_INOUT_CHECK(ctx->HasInput("Gather_index"), "Input", "Gather_index",
+                   "FusedGatherScatter");
+    OP_INOUT_CHECK(ctx->HasInput("Scatter_index"), "Input", "Scatter_index",
+                   "FusedGatherScatter");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
+                   "FusedGatherScatter");
 
     auto gather_index_dims = ctx->GetInputDim("Gather_index");
     if (gather_index_dims.size() == 2) {
       PADDLE_ENFORCE_EQ(gather_index_dims[1], 1,
                         platform::errors::InvalidArgument(
@@ -73,6 +64,12 @@ class FusedGatherScatterOP : public framework::OperatorWithKernel {
   // same?
auto dims = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", dims); + + if (ctx->Attrs().Get("pool_type") == "MEAN") { + OP_INOUT_CHECK(ctx->HasOutput("Scatter_count"), "Output", "Scatter_count", + "FusedGatherScatter"); + ctx->SetOutputDim("Scatter_count", {dims[0]}); + } } protected: @@ -111,6 +108,9 @@ class FusedGatherScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Gather_index", "The gather index tensor."); AddInput("Scatter_index", "The scatter index tensor."); AddOutput("Out", "Output tensor of fused_gather_scatter op."); + AddOutput("Scatter_count", + "Count tensor of Scatter index, mainly for MEAN pool_type.") + .AsIntermediate(); AddAttr( "pool_type", "(string, default 'SUM')" @@ -144,6 +144,17 @@ class FusedGatherScatterGradOpMaker : public framework::SingleGradOpMaker { op->SetType("fused_gather_scatter_grad"); op->SetInput("Gather_index", this->Input("Gather_index")); op->SetInput("Scatter_index", this->Input("Scatter_index")); + + if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") { + op->SetInput("Scatter_count", this->Output("Scatter_count")); + } + + if (BOOST_GAT_CONST(std::string, this->GetAttr("pool_type")) == "MIN" || + BOOST_GAT_CONST(std::string, this->GetAttr("pool_type")) == "MAX") { + op->SetInput("X", this->Input("X")); + op->SetInput("Out", this->Output("Out")); + } + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); op->SetAttrMap(this->Attrs()); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index e74a861d63e86b..cac1309c06e33b 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -76,7 +76,8 @@ template void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, const IndexT* g_index, const IndexT* s_index, const Tensor& src, Tensor* dst, - const std::string& pool_type) { + const std::string& pool_type, + int* scatter_count = NULL) { Functor functor; if (pool_type == "MIN" || pool_type == "MAX") { std::set existed_dst; @@ -107,19 +108,59 @@ void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } - int* count = new int[input_size]; - memset(count, 0, input_size * sizeof(int)); for (int i = 0; i < index_size; ++i) { IndexT dst_idx = s_index[i]; - count[dst_idx] += 1; + *(scatter_count + dst_idx) += 1; } for (int i = 0; i < input_size; ++i) { - if (count[i] <= 1) continue; + if (*(scatter_count + i) == 0) continue; auto dst_slice = dst->Slice(i, i + 1); auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(count[i]); + eigen_dst = eigen_dst / static_cast(*(scatter_count + i)); + } + } +} + +template +void gather_scatter_cpu_for_loop_grad(const int& input_size, + const int& index_size, + const IndexT* g_index, + const IndexT* s_index, const Tensor& src, + Tensor* dst, const std::string& pool_type, + const int* scatter_count = NULL) { + Functor functor; + if (pool_type == "SUM") { + for (int i = 0; i < index_size; ++i) { + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + elementwise_inner_operation(src, dst, src_idx, + dst_idx, false, functor); + } + } else if (pool_type == "MEAN") { + for (int i = 0; i < index_size; ++i) { + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + auto src_slice = 
src.Slice(src_idx, src_idx + 1); + auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); + auto eigen_src = framework::EigenVector::Flatten(src_slice); + auto eigen_dst = framework::EigenVector::Flatten(dst_slice); + eigen_dst += (eigen_src / static_cast(scatter_count[src_idx])); + } + } else if (pool_type == "MIN" || pool_type == "MAX") { + std::set existed_dst; + for (int i = 0; i < index_size; ++i) { + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); + if (!in_set) { + elementwise_inner_operation(src, dst, src_idx, + dst_idx, true, functor); + existed_dst.emplace(dst_idx); + } else { + elementwise_inner_operation( + src, dst, src_idx, dst_idx, false, functor); + } } - delete[] count; } } @@ -156,8 +197,12 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { gather_scatter_cpu_for_loop>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { + auto* scatter_count = ctx.Output("Scatter_count"); + int* p_scatter_count = scatter_count->mutable_data(ctx.GetPlace()); + memset(p_scatter_count, 0, src_dims[0] * sizeof(int)); gather_scatter_cpu_for_loop>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, + p_scatter_count); } } }; @@ -186,17 +231,23 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MIN") { - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MAX") { - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { - gather_scatter_cpu_for_loop>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); + auto* scatter_count = ctx.Input("Scatter_count"); + const int* s_count = scatter_count->data(); + gather_scatter_cpu_for_loop_grad>( + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, s_count); } } }; diff --git a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py new file mode 100644 index 00000000000000..cbd5590d6750db --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py @@ -0,0 +1,182 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +""" +class TestFusedGatherScatterMaxOp(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "fused_gather_scatter" + x = np.random.random((10, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)) + gather_index = index[:, 0] + scatter_index = index[:, 1] + + self.inputs = { + 'X': x, + 'Gather_index': gather_index, + 'Scatter_index': scatter_index + } + + self.attrs = {'pool_type': 'MAX'} + + out, scatter_count = compute_gather_scatter(self.inputs, self.attrs) + + self.outputs = {'Out': out, 'Scatter_count': scatter_count} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestFusedGatherScatterMinOp(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "fused_gather_scatter" + x = np.random.random((10, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)) + gather_index = index[:, 0] + scatter_index = index[:, 1] + + self.inputs = { + 'X': x, + 'Gather_index': gather_index, + 'Scatter_index': scatter_index + } + + self.attrs = {'pool_type': 'MIN'} + + out, scatter_count = compute_gather_scatter(self.inputs, self.attrs) + + self.outputs = {'Out': out, 'Scatter_count': scatter_count} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') +""" + + +class TestFusedGatherScatterSumOp(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "fused_gather_scatter" + x = np.random.random((10, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)) + gather_index = index[:, 0] + scatter_index = index[:, 1] + + self.inputs = { + 'X': x, + 'Gather_index': gather_index, + 'Scatter_index': scatter_index + } + + self.attrs = {'pool_type': 'SUM'} + + out, _ = compute_gather_scatter(self.inputs, self.attrs) + + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestFusedGatherScatterMeanOp(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "fused_gather_scatter" + x = np.random.random((10, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)) + gather_index = index[:, 0] + scatter_index = index[:, 1] + + self.inputs = { + 'X': x, + 'Gather_index': gather_index, + 'Scatter_index': scatter_index + } + + self.attrs = {'pool_type': 'MEAN'} + + out, scatter_count = compute_gather_scatter(self.inputs, self.attrs) + + self.outputs = {'Out': out, 'Scatter_count': scatter_count} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +def compute_gather_scatter(inputs, attributes): + x = inputs['X'] + gather_index = inputs['Gather_index'] + scatter_index = inputs['Scatter_index'] + + pool_type = attributes['pool_type'] + + gather_x = x[gather_index] + target_shape = list(x.shape) + if pool_type == 'SUM': + results = np.zeros(target_shape, dtype=x.dtype) + for index, s_id in enumerate(scatter_index): + results[s_id, :] += gather_x[index, :] + elif pool_type == 'MEAN': + results = np.zeros(target_shape, dtype=x.dtype) + count = np.zeros(target_shape[0], dtype=np.int32) + for index, s_id in enumerate(scatter_index): + results[s_id, :] += gather_x[index, :] + count[s_id] += 1 + results = results / count.reshape([-1, 1]) + results[np.isnan(results)] = 0 + elif pool_type == 'MAX': + results = 
np.zeros(target_shape, dtype=x.dtype) + first_set = set() + for index, s_id in enumerate(scatter_index): + if s_id not in first_set: + results[s_id, :] += gather_x[index, :] + first_set.add(s_id) + else: + results[s_id, :] = np.maximum(results[s_id, :], + gather_x[index, :]) + elif pool_type == 'MIN': + results = np.zeros(target_shape, dtype=x.dtype) + first_set = set() + for index, s_id in enumerate(scatter_index): + if s_id not in first_set: + results[s_id, :] += gather_x[index, :] + first_set.add(s_id) + else: + results[s_id, :] = np.minimum(results[s_id, :], + gather_x[index, :]) + else: + raise ValueError( + "Invalid pool_type, only SUM, MEAN, MAX, MIN supported!") + + count = np.zeros(target_shape[0], dtype=np.int32) + for index, s_id in enumerate(scatter_index): + count[s_id] += 1 + + return results, count From d2da1ee74f7fbfbda143f75129f0b00e7a5af54c Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 11 Nov 2021 12:10:53 +0000 Subject: [PATCH 09/30] Add correct CUDA Mean grad --- .../fused/fused_gather_scatter_op.cc | 4 +- .../fused/fused_gather_scatter_op.cu | 59 +++++++++++-------- 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc index e55216e2e82a2d..dc5b7a95265751 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc @@ -149,8 +149,8 @@ class FusedGatherScatterGradOpMaker : public framework::SingleGradOpMaker { op->SetInput("Scatter_count", this->Output("Scatter_count")); } - if (BOOST_GAT_CONST(std::string, this->GetAttr("pool_type")) == "MIN" || - BOOST_GAT_CONST(std::string, this->GetAttr("pool_type")) == "MAX") { + if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" || + BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MAX") { op->SetInput("X", this->Input("X")); op->SetInput("Out", this->Output("Out")); } diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu index b82c4e7da835ee..21247599300065 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu @@ -67,6 +67,7 @@ __global__ void GatherScatterCUDAKernel(const T* params, } } +// For min and max template __global__ void InputResetCUDAKernel(T* output, size_t input_size, size_t slice_size) { @@ -78,6 +79,7 @@ __global__ void InputResetCUDAKernel(T* output, size_t input_size, } } +// Get scatter_count template __global__ void ComputeCountCUDAKernel(int* count, const IndexT* scatter_indices, @@ -88,6 +90,7 @@ __global__ void ComputeCountCUDAKernel(int* count, } } +// For forward mean template __global__ void ManipulateMeanCUDAKernel(T* output, int* count, size_t input_size, size_t slice_size) { @@ -99,6 +102,26 @@ __global__ void ManipulateMeanCUDAKernel(T* output, int* count, } } +// For backward mean +template +__global__ void ManipulateMeanGradCUDAKernel(const T* params, + const IndexT* gather_indices, + const IndexT* scatter_indices, + T* output, size_t index_size, + size_t slice_size, + const int* scatter_count) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; + IndexT gather_i = gather_indices[indices_i]; + IndexT scatter_i = scatter_indices[indices_i]; + int64_t in_i = gather_i * slice_size + slice_i; + int64_t out_i = 
scatter_i * slice_size + slice_i; + paddle::platform::CudaAtomicAdd(output + out_i, + *(params + in_i) / scatter_count[gather_i]); + } +} + template class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { public: @@ -190,22 +213,23 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { .stream()>>>(p_src, g_index, s_index, p_output, index_size, slice_size, functor); - Tensor count; - int* count_ptr = count.mutable_data({input_size}, ctx.GetPlace()); - cudaMemset(count_ptr, 0, input_size * sizeof(int)); + auto* scatter_count = ctx.Output("Scatter_count"); + int* p_scatter_count = scatter_count->mutable_data(ctx.GetPlace()); + cudaMemset(p_scatter_count, 0, input_size * sizeof(int)); int64_t grid_count = (index_size + block - 1) / block; ComputeCountCUDAKernel< T, IndexT><<( ctx.device_context()) - .stream()>>>(count_ptr, s_index, index_size); + .stream()>>>(p_scatter_count, s_index, index_size); int64_t grid_mean = (input_size * slice_size + block - 1) / block; ManipulateMeanCUDAKernel< T><<( ctx.device_context()) - .stream()>>>(p_output, count_ptr, input_size, slice_size); + .stream()>>>(p_output, p_scatter_count, input_size, + slice_size); } } }; @@ -293,30 +317,13 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { ctx.device_context()) .stream()>>>(p_output, input_size, slice_size); } else if (pool_type == "MEAN") { - GatherScatterSumCUDAFunctor functor; - GatherScatterCUDAKernel><<< + auto* scatter_count = ctx.Input("Scatter_count"); + const int* s_count = scatter_count->data(); + ManipulateMeanGradCUDAKernel<<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, - index_size, slice_size, functor); - - Tensor count; - int* count_ptr = count.mutable_data({input_size}, ctx.GetPlace()); - cudaMemset(count_ptr, 0, input_size * sizeof(int)); - int64_t grid_count = (index_size + block - 1) / block; - ComputeCountCUDAKernel< - T, IndexT><<( - ctx.device_context()) - .stream()>>>(count_ptr, s_index, index_size); - - int64_t grid_mean = (input_size * slice_size + block - 1) / block; - ManipulateMeanCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, count_ptr, input_size, slice_size); + index_size, slice_size, s_count); } } }; From 0235923e63c5c0ac4296ef153e90db12ff44dfb9 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Fri, 12 Nov 2021 09:59:35 +0000 Subject: [PATCH 10/30] [Add] Successful MEAN and SUM --- .../operators/fused/fused_gather_scatter_op.h | 85 +++++++++---------- .../unittests/test_fused_gather_scatter_op.py | 70 ++++++++++----- 2 files changed, 90 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index cac1309c06e33b..73ae5a168a30e7 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -79,22 +79,7 @@ void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, const std::string& pool_type, int* scatter_count = NULL) { Functor functor; - if (pool_type == "MIN" || pool_type == "MAX") { - std::set existed_dst; - for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; - bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); - if (!in_set) { - elementwise_inner_operation(src, dst, src_idx, - dst_idx, true, functor); - existed_dst.emplace(dst_idx); - } else { - elementwise_inner_operation( - src, dst, 
src_idx, dst_idx, false, functor); - } - } - } else if (pool_type == "SUM") { + if (pool_type == "SUM") { for (int i = 0; i < index_size; ++i) { IndexT src_idx = g_index[i]; IndexT dst_idx = s_index[i]; @@ -118,18 +103,32 @@ void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, auto eigen_dst = framework::EigenVector::Flatten(dst_slice); eigen_dst = eigen_dst / static_cast(*(scatter_count + i)); } + } else if (pool_type == "MIN" || pool_type == "MAX") { + std::set existed_dst; + for (int i = 0; i < index_size; ++i) { + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); + if (!in_set) { + elementwise_inner_operation(src, dst, src_idx, + dst_idx, true, functor); + existed_dst.emplace(dst_idx); + } else { + elementwise_inner_operation( + src, dst, src_idx, dst_idx, false, functor); + } + } } } template -void gather_scatter_cpu_for_loop_grad(const int& input_size, - const int& index_size, - const IndexT* g_index, - const IndexT* s_index, const Tensor& src, - Tensor* dst, const std::string& pool_type, - const int* scatter_count = NULL) { - Functor functor; +void gather_scatter_cpu_for_loop_grad( + const int& input_size, const int& index_size, const IndexT* g_index, + const IndexT* s_index, const Tensor& src, Tensor* dst, + const std::string& pool_type, const int* scatter_count = nullptr, + const Tensor* input = nullptr, const Tensor* output = nullptr) { if (pool_type == "SUM") { + Functor functor; for (int i = 0; i < index_size; ++i) { IndexT src_idx = g_index[i]; IndexT dst_idx = s_index[i]; @@ -147,19 +146,19 @@ void gather_scatter_cpu_for_loop_grad(const int& input_size, eigen_dst += (eigen_src / static_cast(scatter_count[src_idx])); } } else if (pool_type == "MIN" || pool_type == "MAX") { - std::set existed_dst; for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; - bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); - if (!in_set) { - elementwise_inner_operation(src, dst, src_idx, - dst_idx, true, functor); - existed_dst.emplace(dst_idx); - } else { - elementwise_inner_operation( - src, dst, src_idx, dst_idx, false, functor); - } + auto forward_src_idx = s_index[i]; + auto forward_dst_idx = g_index[i]; + auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); + auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); + auto eigen_input = framework::EigenVector::Flatten(input_slice); + auto eigen_output = framework::EigenVector::Flatten(output_slice); + + auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1); + auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1); + auto eigen_src = framework::EigenVector::Flatten(src_slice); + auto eigen_dst = framework::EigenVector::Flatten(dst_slice); + eigen_dst += eigen_src * (eigen_output == eigen_input); } } } @@ -234,20 +233,20 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); - } else if (pool_type == "MIN") { - gather_scatter_cpu_for_loop_grad>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); - } else if (pool_type == "MAX") { - gather_scatter_cpu_for_loop_grad>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { auto* scatter_count = ctx.Input("Scatter_count"); const int* s_count = scatter_count->data(); gather_scatter_cpu_for_loop_grad>( src_dims[0], 
index_size, g_index, s_index, *X, Y, pool_type, s_count); + } else if (pool_type == "MIN" || pool_type == "MAX") { + auto* input = ctx.Input("X"); + auto* output = ctx.Input("Out"); + // Functor not used here. + gather_scatter_cpu_for_loop_grad>( + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, nullptr, + input, output); } } }; diff --git a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py index cbd5590d6750db..63c4161fb53d40 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py @@ -16,7 +16,8 @@ from op_test import OpTest import paddle import paddle.fluid as fluid -""" + + class TestFusedGatherScatterMaxOp(OpTest): def setUp(self): paddle.enable_static() @@ -34,9 +35,10 @@ def setUp(self): self.attrs = {'pool_type': 'MAX'} - out, scatter_count = compute_gather_scatter(self.inputs, self.attrs) + out, gradient = compute_gather_scatter_for_min_max(self.inputs, + self.attrs) - self.outputs = {'Out': out, 'Scatter_count': scatter_count} + self.outputs = {'Out': out} def test_check_output(self): self.check_output() @@ -62,16 +64,16 @@ def setUp(self): self.attrs = {'pool_type': 'MIN'} - out, scatter_count = compute_gather_scatter(self.inputs, self.attrs) + out, self.gradient = compute_gather_scatter_for_min_max(self.inputs, + self.attrs) - self.outputs = {'Out': out, 'Scatter_count': scatter_count} + self.outputs = {'Out': out} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Out') -""" + self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) class TestFusedGatherScatterSumOp(OpTest): @@ -91,7 +93,7 @@ def setUp(self): self.attrs = {'pool_type': 'SUM'} - out, _ = compute_gather_scatter(self.inputs, self.attrs) + out, _ = compute_gather_scatter_for_sum_mean(self.inputs, self.attrs) self.outputs = {'Out': out} @@ -119,7 +121,8 @@ def setUp(self): self.attrs = {'pool_type': 'MEAN'} - out, scatter_count = compute_gather_scatter(self.inputs, self.attrs) + out, scatter_count = compute_gather_scatter_for_sum_mean(self.inputs, + self.attrs) self.outputs = {'Out': out, 'Scatter_count': scatter_count} @@ -130,7 +133,7 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -def compute_gather_scatter(inputs, attributes): +def compute_gather_scatter_for_sum_mean(inputs, attributes): x = inputs['X'] gather_index = inputs['Gather_index'] scatter_index = inputs['Scatter_index'] @@ -139,20 +142,41 @@ def compute_gather_scatter(inputs, attributes): gather_x = x[gather_index] target_shape = list(x.shape) + results = np.zeros(target_shape, dtype=x.dtype) if pool_type == 'SUM': - results = np.zeros(target_shape, dtype=x.dtype) for index, s_id in enumerate(scatter_index): results[s_id, :] += gather_x[index, :] elif pool_type == 'MEAN': - results = np.zeros(target_shape, dtype=x.dtype) count = np.zeros(target_shape[0], dtype=np.int32) for index, s_id in enumerate(scatter_index): results[s_id, :] += gather_x[index, :] count[s_id] += 1 results = results / count.reshape([-1, 1]) results[np.isnan(results)] = 0 - elif pool_type == 'MAX': - results = np.zeros(target_shape, dtype=x.dtype) + else: + raise ValueError("Invalid pool_type, only SUM, MEAN supported!") + + count = np.zeros(target_shape[0], dtype=np.int32) + for index, s_id in enumerate(scatter_index): + count[s_id] += 1 + + return results, count + + +def compute_gather_scatter_for_min_max(inputs, 
attributes): + x = inputs['X'] + gather_index = inputs['Gather_index'] + scatter_index = inputs['Scatter_index'] + + pool_type = attributes['pool_type'] + + gather_x = x[gather_index] + target_shape = list(x.shape) + results = np.zeros(target_shape, dtype=x.dtype) + gradient = np.zeros_like(x) + + # Calculate forward output + if pool_type == "MAX": first_set = set() for index, s_id in enumerate(scatter_index): if s_id not in first_set: @@ -161,8 +185,7 @@ def compute_gather_scatter(inputs, attributes): else: results[s_id, :] = np.maximum(results[s_id, :], gather_x[index, :]) - elif pool_type == 'MIN': - results = np.zeros(target_shape, dtype=x.dtype) + elif pool_type == "MIN": first_set = set() for index, s_id in enumerate(scatter_index): if s_id not in first_set: @@ -172,11 +195,14 @@ def compute_gather_scatter(inputs, attributes): results[s_id, :] = np.minimum(results[s_id, :], gather_x[index, :]) else: - raise ValueError( - "Invalid pool_type, only SUM, MEAN, MAX, MIN supported!") + raise ValueError("Invalid pool_type, only MAX, MIN supported!") - count = np.zeros(target_shape[0], dtype=np.int32) - for index, s_id in enumerate(scatter_index): - count[s_id] += 1 + # Calculate backward gradient + index_size = len(gather_index) + for i in range(index_size): + forward_src_idx = scatter_index[i] + forward_dst_idx = gather_index[i] + gradient[forward_src_idx] += 1 * ( + x[forward_src_idx] == results[forward_dst_idx]) - return results, count + return results, gradient From 484a84bed837c8905fd4cd68fc625b515565adba Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 15 Nov 2021 03:43:00 +0000 Subject: [PATCH 11/30] [Add] Successful MIN and MAX in CPU --- .../operators/fused/fused_gather_scatter_op.h | 1 + .../unittests/test_fused_gather_scatter_op.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index 73ae5a168a30e7..0303030f701308 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -236,6 +236,7 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { } else if (pool_type == "MEAN") { auto* scatter_count = ctx.Input("Scatter_count"); const int* s_count = scatter_count->data(); + // Functor not used here. 
gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, s_count); diff --git a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py index 63c4161fb53d40..49c7a6894beaed 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py @@ -22,7 +22,7 @@ class TestFusedGatherScatterMaxOp(OpTest): def setUp(self): paddle.enable_static() self.op_type = "fused_gather_scatter" - x = np.random.random((10, 20)).astype("float64") + x = np.random.randint(0, 50, (10, 10)).astype("float64") index = np.random.randint(0, 10, (15, 2)) gather_index = index[:, 0] scatter_index = index[:, 1] @@ -35,16 +35,15 @@ def setUp(self): self.attrs = {'pool_type': 'MAX'} - out, gradient = compute_gather_scatter_for_min_max(self.inputs, - self.attrs) - + out, self.gradient = compute_gather_scatter_for_min_max(self.inputs, + self.attrs) self.outputs = {'Out': out} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) class TestFusedGatherScatterMinOp(OpTest): @@ -200,9 +199,9 @@ def compute_gather_scatter_for_min_max(inputs, attributes): # Calculate backward gradient index_size = len(gather_index) for i in range(index_size): - forward_src_idx = scatter_index[i] - forward_dst_idx = gather_index[i] + forward_src_idx = gather_index[i] + forward_dst_idx = scatter_index[i] gradient[forward_src_idx] += 1 * ( x[forward_src_idx] == results[forward_dst_idx]) - return results, gradient + return results, gradient / results.size From a16e4129269b5859fc2b4622e1b86b2a025f31b4 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 15 Nov 2021 07:34:16 +0000 Subject: [PATCH 12/30] [Add] Successful MIN and MAX in CUDA --- .../fused/fused_gather_scatter_op.cu | 72 ++++++++----------- .../operators/fused/fused_gather_scatter_op.h | 4 +- .../unittests/test_fused_gather_scatter_op.py | 2 +- 3 files changed, 34 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu index 21247599300065..e0774158477296 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu @@ -122,6 +122,25 @@ __global__ void ManipulateMeanGradCUDAKernel(const T* params, } } +// For backward min and max +template +__global__ void ManipulateMinMaxGradCUDAKernel( + const T* params, const IndexT* gather_indices, + const IndexT* scatter_indices, T* output, size_t index_size, + size_t slice_size, const T* ptr_input, const T* ptr_output) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; + IndexT gather_i = gather_indices[indices_i]; + IndexT scatter_i = scatter_indices[indices_i]; + int64_t in_i = gather_i * slice_size + slice_i; + int64_t out_i = scatter_i * slice_size + slice_i; + paddle::platform::CudaAtomicAdd( + output + out_i, + *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); + } +} + template class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { public: @@ -254,17 +273,7 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { memset_size *= src_dims[i]; } const size_t& memset_bytes = memset_size * 
sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { - cudaMemset(p_output, 0, memset_bytes); - } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); - } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::max()); - } + cudaMemset(p_output, 0, memset_bytes); int64_t slice_size = 1; for (int i = 1; i < src_dims.size(); ++i) { @@ -286,36 +295,6 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, index_size, slice_size, functor); - } else if (pool_type == "MAX") { - GatherScatterMaxCUDAFunctor functor; - GatherScatterCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, - index_size, slice_size, functor); - - int64_t grid_max = (input_size * slice_size + block - 1) / block; - InputResetCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MIN") { - GatherScatterMinCUDAFunctor functor; - GatherScatterCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, - index_size, slice_size, functor); - - int64_t grid_min = (input_size * slice_size + block - 1) / block; - InputResetCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); } else if (pool_type == "MEAN") { auto* scatter_count = ctx.Input("Scatter_count"); const int* s_count = scatter_count->data(); @@ -324,6 +303,17 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, index_size, slice_size, s_count); + } else if (pool_type == "MAX" || pool_type == "MIN") { + auto* input = ctx.Input("X"); + auto* output = ctx.Input("Out"); + const T* ptr_input = input->data(); + const T* ptr_output = output->data(); + ManipulateMinMaxGradCUDAKernel<<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, ptr_input, + ptr_output); } } }; diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index 0303030f701308..15d241eb5105e5 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -241,8 +241,8 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { FusedGatherScatterSumFunctor>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, s_count); } else if (pool_type == "MIN" || pool_type == "MAX") { - auto* input = ctx.Input("X"); - auto* output = ctx.Input("Out"); + const auto* input = ctx.Input("X"); + const auto* output = ctx.Input("Out"); // Functor not used here. 
gather_scatter_cpu_for_loop_grad>( diff --git a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py index 49c7a6894beaed..f3b307296ec45a 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py @@ -22,7 +22,7 @@ class TestFusedGatherScatterMaxOp(OpTest): def setUp(self): paddle.enable_static() self.op_type = "fused_gather_scatter" - x = np.random.randint(0, 50, (10, 10)).astype("float64") + x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)) gather_index = index[:, 0] scatter_index = index[:, 1] From 071425f74f0d758793057ba606ea034eff34e68c Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 15 Nov 2021 09:28:20 +0000 Subject: [PATCH 13/30] fix windows dtype ci --- .../fluid/tests/unittests/test_fused_gather_scatter_op.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py index f3b307296ec45a..fc21dfacf79f15 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py @@ -23,7 +23,7 @@ def setUp(self): paddle.enable_static() self.op_type = "fused_gather_scatter" x = np.random.random((10, 20)).astype("float64") - index = np.random.randint(0, 10, (15, 2)) + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] scatter_index = index[:, 1] @@ -51,7 +51,7 @@ def setUp(self): paddle.enable_static() self.op_type = "fused_gather_scatter" x = np.random.random((10, 20)).astype("float64") - index = np.random.randint(0, 10, (15, 2)) + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] scatter_index = index[:, 1] @@ -80,7 +80,7 @@ def setUp(self): paddle.enable_static() self.op_type = "fused_gather_scatter" x = np.random.random((10, 20)).astype("float64") - index = np.random.randint(0, 10, (15, 2)) + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] scatter_index = index[:, 1] @@ -108,7 +108,7 @@ def setUp(self): paddle.enable_static() self.op_type = "fused_gather_scatter" x = np.random.random((10, 20)).astype("float64") - index = np.random.randint(0, 10, (15, 2)) + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] scatter_index = index[:, 1] From f9666b9b6d7da56d423bdeb39ebb32c664797903 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 15 Nov 2021 11:27:40 +0000 Subject: [PATCH 14/30] fix ROCM ci by adding HIP flag --- .../operators/fused/fused_gather_scatter_op.cu | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu index e0774158477296..2f2d885a4a9670 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu @@ -162,7 +162,12 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { } const size_t& memset_bytes = memset_size * sizeof(T); if (pool_type == "SUM" || pool_type == "MEAN") { +#ifdef PADDLE_WITH_HIP + hipMemset(p_output, 0, memset_bytes); +#else cudaMemset(p_output, 0, memset_bytes); +#endif + } else if (pool_type == "MAX") { 
thrust::device_ptr p_output_ptr(p_output); thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, @@ -234,7 +239,13 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { auto* scatter_count = ctx.Output("Scatter_count"); int* p_scatter_count = scatter_count->mutable_data(ctx.GetPlace()); + +#ifdef PADDLE_WITH_HIP + hipMemset(p_scatter_count, 0, input_size * sizeof(int)); +#else cudaMemset(p_scatter_count, 0, input_size * sizeof(int)); +#endif + int64_t grid_count = (index_size + block - 1) / block; ComputeCountCUDAKernel< T, IndexT><< { memset_size *= src_dims[i]; } const size_t& memset_bytes = memset_size * sizeof(T); + +#ifdef PADDLE_WITH_HIP + hipMemset(p_output, 0, memset_bytes); +#else cudaMemset(p_output, 0, memset_bytes); +#endif int64_t slice_size = 1; for (int i = 1; i < src_dims.size(); ++i) { From 66e9af05c5e9c5454f9f445f292eea879a014191 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 16 Nov 2021 02:43:19 +0000 Subject: [PATCH 15/30] rename fused_gather_scatter to send_recv --- ...d_gather_scatter_op.cc => send_recv_op.cc} | 128 +++++++++--------- ...d_gather_scatter_op.cu => send_recv_op.cu} | 93 ++++++------- ...sed_gather_scatter_op.h => send_recv_op.h} | 35 +++-- ...her_scatter_op.py => test_send_recv_op.py} | 40 +++--- 4 files changed, 140 insertions(+), 156 deletions(-) rename paddle/fluid/operators/{fused/fused_gather_scatter_op.cc => send_recv_op.cc} (51%) rename paddle/fluid/operators/{fused/fused_gather_scatter_op.cu => send_recv_op.cu} (81%) rename paddle/fluid/operators/{fused/fused_gather_scatter_op.h => send_recv_op.h} (89%) rename python/paddle/fluid/tests/unittests/{test_fused_gather_scatter_op.py => test_send_recv_op.py} (87%) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc b/paddle/fluid/operators/send_recv_op.cc similarity index 51% rename from paddle/fluid/operators/fused/fused_gather_scatter_op.cc rename to paddle/fluid/operators/send_recv_op.cc index dc5b7a95265751..06cbff6328c113 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -12,62 +12,60 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/fused/fused_gather_scatter_op.h" +#include "paddle/fluid/operators/send_recv_op.h" namespace paddle { namespace operators { -class FusedGatherScatterOP : public framework::OperatorWithKernel { +class SendRecvOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGatherScatter"); - OP_INOUT_CHECK(ctx->HasInput("Gather_index"), "Input", "Gather_index", - "FusedGatherScatter"); - OP_INOUT_CHECK(ctx->HasInput("Scatter_index"), "Input", "Scatter_index", - "FusedGatherScatter"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", - "FusedGatherScatter"); - - auto gather_index_dims = ctx->GetInputDim("Gather_index"); - if (gather_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(gather_index_dims[1], 1, + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SendRecv"); + OP_INOUT_CHECK(ctx->HasInput("Src_index"), "Input", "Src_index", + "SendRecv"); + OP_INOUT_CHECK(ctx->HasInput("Dst_index"), "Input", "Dst_index", + "SendRecv"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SendRecv"); + + auto src_index_dims = ctx->GetInputDim("Src_index"); + if (src_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(src_index_dims[1], 1, platform::errors::InvalidArgument( - "The last dim of Gather_index should be 1 when it " + "The last dim of Src_index should be 1 when it " "is 2D, but we get %d", - gather_index_dims[1])); + src_index_dims[1])); } else { PADDLE_ENFORCE_EQ( - gather_index_dims.size(), 1, + src_index_dims.size(), 1, platform::errors::InvalidArgument( - "The Gather_index should be 1D, when it is not 2D, but we get %d", - gather_index_dims.size())); + "The Src_index should be 1D, when it is not 2D, but we get %d", + src_index_dims.size())); } - auto scatter_index_dims = ctx->GetInputDim("Scatter_index"); - if (scatter_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(scatter_index_dims[1], 1, + auto dst_index_dims = ctx->GetInputDim("Dst_index"); + if (dst_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(dst_index_dims[1], 1, platform::errors::InvalidArgument( - "The last dim of Scatter_index should be 1 when it " + "The last dim of Dst_index should be 1 when it " "is 2D, but we get %d", - scatter_index_dims[1])); + dst_index_dims[1])); } else { PADDLE_ENFORCE_EQ( - scatter_index_dims.size(), 1, - platform::errors::InvalidArgument("The Scatter_index should be 1D, " + dst_index_dims.size(), 1, + platform::errors::InvalidArgument("The Dst_index should be 1D, " "when it is not 2D, but we get %d", - scatter_index_dims.size())); + dst_index_dims.size())); } - // TODO(daisiming): If the shape of scatter_index and gather_index should be - // same? + // TODO(daisiming): If the shape of src_index and dst_index should be same? 
auto dims = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", dims); if (ctx->Attrs().Get("pool_type") == "MEAN") { OP_INOUT_CHECK(ctx->HasOutput("Scatter_count"), "Output", "Scatter_count", - "FusedGatherScatter"); + "SendRecv"); ctx->SetOutputDim("Scatter_count", {dims[0]}); } } @@ -81,7 +79,7 @@ class FusedGatherScatterOP : public framework::OperatorWithKernel { } }; -class FusedGatherScatterGradOp : public framework::OperatorWithKernel { +class SendRecvGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -99,28 +97,28 @@ class FusedGatherScatterGradOp : public framework::OperatorWithKernel { } }; -class FusedGatherScatterOpMaker : public framework::OpProtoAndCheckerMaker { +class SendRecvOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "The input tensor with data type float32, " "float64 or float16"); - AddInput("Gather_index", "The gather index tensor."); - AddInput("Scatter_index", "The scatter index tensor."); - AddOutput("Out", "Output tensor of fused_gather_scatter op."); + AddInput("Src_index", "The source index tensor."); + AddInput("Dst_index", "The destination index tensor."); + AddOutput("Out", "Output tensor of send_recv op."); AddOutput("Scatter_count", - "Count tensor of Scatter index, mainly for MEAN pool_type.") + "Count tensor of Dst_index, mainly for MEAN pool_type.") .AsIntermediate(); AddAttr( "pool_type", "(string, default 'SUM')" - "We use Gather_index to gather correspoinding place of X. " + "We use Src_index to gather correspoinding place of X. " "Then we need to use different pool type to scatter the result.") .SetDefault("SUM") .InEnum({"SUM", "MEAN", "MIN", "MAX"}); // TODO(daisiming): Add a simple example here. AddComment(R"DOC( -Fused Gather Scatter Operator. +SendRecv Operator. 
$Out = Scatter(Gather(X, Gather_index), Scatter_index, pool_type)$ @@ -135,15 +133,15 @@ pass }; template -class FusedGatherScatterGradOpMaker : public framework::SingleGradOpMaker { +class SendRecvGradOpMaker : public framework::SingleGradOpMaker { public: using framework::SingleGradOpMaker::SingleGradOpMaker; protected: void Apply(GradOpPtr op) const override { - op->SetType("fused_gather_scatter_grad"); - op->SetInput("Gather_index", this->Input("Gather_index")); - op->SetInput("Scatter_index", this->Input("Scatter_index")); + op->SetType("send_recv_grad"); + op->SetInput("Src_index", this->Input("Src_index")); + op->SetInput("Dst_index", this->Input("Dst_index")); if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") { op->SetInput("Scatter_count", this->Output("Scatter_count")); @@ -167,29 +165,25 @@ class FusedGatherScatterGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; -REGISTER_OPERATOR( - fused_gather_scatter, ops::FusedGatherScatterOP, - ops::FusedGatherScatterOpMaker, - ops::FusedGatherScatterGradOpMaker, - ops::FusedGatherScatterGradOpMaker); -REGISTER_OPERATOR(fused_gather_scatter_grad, ops::FusedGatherScatterGradOp); -REGISTER_OP_CPU_KERNEL(fused_gather_scatter, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel); - -REGISTER_OP_CPU_KERNEL( - fused_gather_scatter_grad, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel); +REGISTER_OPERATOR(send_recv, ops::SendRecvOP, ops::SendRecvOpMaker, + ops::SendRecvGradOpMaker, + ops::SendRecvGradOpMaker); +REGISTER_OPERATOR(send_recv_grad, ops::SendRecvGradOp); +REGISTER_OP_CPU_KERNEL(send_recv, ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel); + +REGISTER_OP_CPU_KERNEL(send_recv_grad, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu b/paddle/fluid/operators/send_recv_op.cu similarity index 81% rename from paddle/fluid/operators/fused/fused_gather_scatter_op.cu rename to paddle/fluid/operators/send_recv_op.cu index 2f2d885a4a9670..1014794c8bc3ec 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/fused/fused_gather_scatter_op.h" +#include "paddle/fluid/operators/send_recv_op.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/place.h" @@ -27,7 +27,7 @@ namespace operators { using Tensor = framework::Tensor; template -struct GatherScatterSumCUDAFunctor { +struct SendRecvSumCUDAFunctor { DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, const IndexT& out_i) { paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); @@ -35,7 +35,7 @@ struct GatherScatterSumCUDAFunctor { }; template -struct GatherScatterMaxCUDAFunctor { +struct SendRecvMaxCUDAFunctor { DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, const IndexT& out_i) { paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); @@ -43,7 +43,7 @@ struct GatherScatterMaxCUDAFunctor { }; template -struct GatherScatterMinCUDAFunctor { +struct SendRecvMinCUDAFunctor { DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, const IndexT& out_i) { paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); @@ -51,11 +51,11 @@ struct GatherScatterMinCUDAFunctor { }; template -__global__ void GatherScatterCUDAKernel(const T* params, - const IndexT* gather_indices, - const IndexT* scatter_indices, - T* output, size_t index_size, - size_t slice_size, Functor functor) { +__global__ void SendRecvCUDAKernel(const T* params, + const IndexT* gather_indices, + const IndexT* scatter_indices, T* output, + size_t index_size, size_t slice_size, + Functor functor) { CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { int64_t indices_i = i / slice_size; int64_t slice_i = i - indices_i * slice_size; @@ -142,12 +142,12 @@ __global__ void ManipulateMinMaxGradCUDAKernel( } template -class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { +class SendRecvOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input("X"); - auto* gather_index = ctx.Input("Gather_index"); - auto* scatter_index = ctx.Input("Scatter_index"); + auto* gather_index = ctx.Input("Src_index"); + auto* scatter_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); std::string pool_type = ctx.Attr("pool_type"); @@ -191,17 +191,15 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { int64_t grid = (n + block - 1) / block; int64_t input_size = src_dims[0]; if (pool_type == "SUM") { - GatherScatterSumCUDAFunctor functor; - GatherScatterCUDAKernel><<< + SendRecvSumCUDAFunctor functor; + SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, index_size, slice_size, functor); } else if (pool_type == "MAX") { - GatherScatterMaxCUDAFunctor functor; - GatherScatterCUDAKernel><<< + SendRecvMaxCUDAFunctor functor; + SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, @@ -214,9 +212,8 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { ctx.device_context()) .stream()>>>(p_output, input_size, slice_size); } else if (pool_type == "MIN") { - GatherScatterMinCUDAFunctor functor; - GatherScatterCUDAKernel><<< + SendRecvMinCUDAFunctor functor; + SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( 
ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, @@ -229,9 +226,8 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { ctx.device_context()) .stream()>>>(p_output, input_size, slice_size); } else if (pool_type == "MEAN") { - GatherScatterSumCUDAFunctor functor; - GatherScatterCUDAKernel><<< + SendRecvSumCUDAFunctor functor; + SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, @@ -265,12 +261,12 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { }; template -class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { +class SendRecvGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* gather_index = ctx.Input("Scatter_index"); - auto* scatter_index = ctx.Input("Gather_index"); + auto* gather_index = ctx.Input("Dst_index"); + auto* scatter_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); std::string pool_type = ctx.Attr("pool_type"); @@ -304,9 +300,8 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { int64_t grid = (n + block - 1) / block; int64_t input_size = src_dims[0]; if (pool_type == "SUM") { - GatherScatterSumCUDAFunctor functor; - GatherScatterCUDAKernel><<< + SendRecvSumCUDAFunctor functor; + SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, @@ -340,23 +335,21 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { using CUDA = paddle::platform::CUDADeviceContext; namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fused_gather_scatter, ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - fused_gather_scatter_grad, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(send_recv, ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(send_recv_grad, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/send_recv_op.h similarity index 89% rename from paddle/fluid/operators/fused/fused_gather_scatter_op.h rename to paddle/fluid/operators/send_recv_op.h index 15d241eb5105e5..b4c2c0fd003b1b 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/send_recv_op.h @@ -24,7 +24,7 @@ namespace operators { using Tensor = 
framework::Tensor; template -struct FusedGatherScatterSumFunctor { +struct SendRecvSumFunctor { void operator()(const bool& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); @@ -34,7 +34,7 @@ struct FusedGatherScatterSumFunctor { }; template -struct FusedGatherScatterMinFunctor { +struct SendRecvMinFunctor { void operator()(const bool& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); @@ -48,7 +48,7 @@ struct FusedGatherScatterMinFunctor { }; template -struct FusedGatherScatterMaxFunctor { +struct SendRecvMaxFunctor { void operator()(const int& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); @@ -164,12 +164,12 @@ void gather_scatter_cpu_for_loop_grad( } template -class FusedGatherScatterOpKernel : public framework::OpKernel { +class SendRecvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input("X"); - auto* gather_index = ctx.Input("Gather_index"); - auto* scatter_index = ctx.Input("Scatter_index"); + auto* gather_index = ctx.Input("Src_index"); + auto* scatter_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); const int& index_size = gather_index->dims()[0]; @@ -187,19 +187,19 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MIN") { - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MAX") { - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { auto* scatter_count = ctx.Output("Scatter_count"); int* p_scatter_count = scatter_count->mutable_data(ctx.GetPlace()); memset(p_scatter_count, 0, src_dims[0] * sizeof(int)); - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, p_scatter_count); } @@ -207,12 +207,12 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { }; template -class FusedGatherScatterGradOpKernel : public framework::OpKernel { +class SendRecvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* gather_index = ctx.Input("Scatter_index"); - auto* scatter_index = ctx.Input("Gather_index"); + auto* gather_index = ctx.Input("Dst_index"); + auto* scatter_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); const int& index_size = gather_index->dims()[0]; @@ -230,22 +230,19 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - gather_scatter_cpu_for_loop_grad>( + gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { auto* scatter_count = ctx.Input("Scatter_count"); const int* s_count = scatter_count->data(); // Functor not used here. 
- gather_scatter_cpu_for_loop_grad>( + gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, s_count); } else if (pool_type == "MIN" || pool_type == "MAX") { const auto* input = ctx.Input("X"); const auto* output = ctx.Input("Out"); // Functor not used here. - gather_scatter_cpu_for_loop_grad>( + gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, nullptr, input, output); } diff --git a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py b/python/paddle/fluid/tests/unittests/test_send_recv_op.py similarity index 87% rename from python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py rename to python/paddle/fluid/tests/unittests/test_send_recv_op.py index fc21dfacf79f15..0ad1603c1e5d8a 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_send_recv_op.py @@ -18,10 +18,10 @@ import paddle.fluid as fluid -class TestFusedGatherScatterMaxOp(OpTest): +class TestSendRecvMaxOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "fused_gather_scatter" + self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] @@ -29,8 +29,8 @@ def setUp(self): self.inputs = { 'X': x, - 'Gather_index': gather_index, - 'Scatter_index': scatter_index + 'Src_index': gather_index, + 'Dst_index': scatter_index } self.attrs = {'pool_type': 'MAX'} @@ -46,10 +46,10 @@ def test_check_grad(self): self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) -class TestFusedGatherScatterMinOp(OpTest): +class TestSendRecvMinOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "fused_gather_scatter" + self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] @@ -57,8 +57,8 @@ def setUp(self): self.inputs = { 'X': x, - 'Gather_index': gather_index, - 'Scatter_index': scatter_index + 'Src_index': gather_index, + 'Dst_index': scatter_index } self.attrs = {'pool_type': 'MIN'} @@ -75,10 +75,10 @@ def test_check_grad(self): self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) -class TestFusedGatherScatterSumOp(OpTest): +class TestSendRecvSumOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "fused_gather_scatter" + self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] @@ -86,8 +86,8 @@ def setUp(self): self.inputs = { 'X': x, - 'Gather_index': gather_index, - 'Scatter_index': scatter_index + 'Src_index': gather_index, + 'Dst_index': scatter_index } self.attrs = {'pool_type': 'SUM'} @@ -103,10 +103,10 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -class TestFusedGatherScatterMeanOp(OpTest): +class TestSendRecvMeanOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "fused_gather_scatter" + self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] @@ -114,8 +114,8 @@ def setUp(self): self.inputs = { 'X': x, - 'Gather_index': gather_index, - 'Scatter_index': scatter_index + 'Src_index': gather_index, + 'Dst_index': scatter_index } self.attrs = {'pool_type': 'MEAN'} @@ -134,8 +134,8 @@ def test_check_grad(self): def 
compute_gather_scatter_for_sum_mean(inputs, attributes): x = inputs['X'] - gather_index = inputs['Gather_index'] - scatter_index = inputs['Scatter_index'] + gather_index = inputs['Src_index'] + scatter_index = inputs['Dst_index'] pool_type = attributes['pool_type'] @@ -164,8 +164,8 @@ def compute_gather_scatter_for_sum_mean(inputs, attributes): def compute_gather_scatter_for_min_max(inputs, attributes): x = inputs['X'] - gather_index = inputs['Gather_index'] - scatter_index = inputs['Scatter_index'] + gather_index = inputs['Src_index'] + scatter_index = inputs['Dst_index'] pool_type = attributes['pool_type'] From aa3042d32893296144debb65405b266e1f98dada Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 16 Nov 2021 10:09:32 +0000 Subject: [PATCH 16/30] unify name as send and recv --- paddle/fluid/operators/send_recv_op.cc | 16 ++- paddle/fluid/operators/send_recv_op.cu | 106 ++++++++-------- paddle/fluid/operators/send_recv_op.h | 116 +++++++++--------- .../tests/unittests/test_send_recv_op.py | 87 ++++++------- 4 files changed, 151 insertions(+), 174 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/send_recv_op.cc index 06cbff6328c113..e19575833597f9 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -64,9 +64,9 @@ class SendRecvOP : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", dims); if (ctx->Attrs().Get("pool_type") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("Scatter_count"), "Output", "Scatter_count", + OP_INOUT_CHECK(ctx->HasOutput("Dst_count"), "Output", "Dst_count", "SendRecv"); - ctx->SetOutputDim("Scatter_count", {dims[0]}); + ctx->SetOutputDim("Dst_count", {dims[0]}); } } @@ -106,24 +106,22 @@ class SendRecvOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Src_index", "The source index tensor."); AddInput("Dst_index", "The destination index tensor."); AddOutput("Out", "Output tensor of send_recv op."); - AddOutput("Scatter_count", + AddOutput("Dst_count", "Count tensor of Dst_index, mainly for MEAN pool_type.") .AsIntermediate(); AddAttr( "pool_type", "(string, default 'SUM')" - "We use Src_index to gather correspoinding place of X. " - "Then we need to use different pool type to scatter the result.") + "Define different pool types to receive the result tensors") .SetDefault("SUM") .InEnum({"SUM", "MEAN", "MIN", "MAX"}); // TODO(daisiming): Add a simple example here. AddComment(R"DOC( SendRecv Operator. -$Out = Scatter(Gather(X, Gather_index), Scatter_index, pool_type)$ +$Out = Recv(Send(X, Src_index), Dst_index, pool_type)$ -This operator helps perform fused computation of gather operator and scatter operator, so as to -decrease intermediate GPU memory occupation of using gather op and scatter op successively. 
+This operator Example: @@ -144,7 +142,7 @@ class SendRecvGradOpMaker : public framework::SingleGradOpMaker { op->SetInput("Dst_index", this->Input("Dst_index")); if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") { - op->SetInput("Scatter_count", this->Output("Scatter_count")); + op->SetInput("Dst_count", this->Output("Dst_count")); } if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" || diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/send_recv_op.cu index 1014794c8bc3ec..1fcff7e568ce06 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -51,18 +51,17 @@ struct SendRecvMinCUDAFunctor { }; template -__global__ void SendRecvCUDAKernel(const T* params, - const IndexT* gather_indices, - const IndexT* scatter_indices, T* output, +__global__ void SendRecvCUDAKernel(const T* params, const IndexT* src_indices, + const IndexT* dst_indices, T* output, size_t index_size, size_t slice_size, Functor functor) { CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { int64_t indices_i = i / slice_size; int64_t slice_i = i - indices_i * slice_size; - IndexT gather_i = gather_indices[indices_i]; - IndexT scatter_i = scatter_indices[indices_i]; - int64_t in_i = gather_i * slice_size + slice_i; - int64_t out_i = scatter_i * slice_size + slice_i; + IndexT src_i = src_indices[indices_i]; + IndexT dst_i = dst_indices[indices_i]; + int64_t in_i = src_i * slice_size + slice_i; + int64_t out_i = dst_i * slice_size + slice_i; functor(params, output, in_i, out_i); } } @@ -79,14 +78,13 @@ __global__ void InputResetCUDAKernel(T* output, size_t input_size, } } -// Get scatter_count +// Get dst_count template -__global__ void ComputeCountCUDAKernel(int* count, - const IndexT* scatter_indices, +__global__ void ComputeCountCUDAKernel(int* count, const IndexT* dst_indices, size_t index_size) { CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { - IndexT scatter_i = scatter_indices[i]; - paddle::platform::CudaAtomicAdd(count + scatter_i, 1); + IndexT dst_i = dst_indices[i]; + paddle::platform::CudaAtomicAdd(count + dst_i, 1); } } @@ -104,37 +102,34 @@ __global__ void ManipulateMeanCUDAKernel(T* output, int* count, // For backward mean template -__global__ void ManipulateMeanGradCUDAKernel(const T* params, - const IndexT* gather_indices, - const IndexT* scatter_indices, - T* output, size_t index_size, - size_t slice_size, - const int* scatter_count) { +__global__ void ManipulateMeanGradCUDAKernel( + const T* params, const IndexT* src_indices, const IndexT* dst_indices, + T* output, size_t index_size, size_t slice_size, const int* dst_count) { CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { int64_t indices_i = i / slice_size; int64_t slice_i = i - indices_i * slice_size; - IndexT gather_i = gather_indices[indices_i]; - IndexT scatter_i = scatter_indices[indices_i]; - int64_t in_i = gather_i * slice_size + slice_i; - int64_t out_i = scatter_i * slice_size + slice_i; + IndexT src_i = src_indices[indices_i]; + IndexT dst_i = dst_indices[indices_i]; + int64_t in_i = src_i * slice_size + slice_i; + int64_t out_i = dst_i * slice_size + slice_i; paddle::platform::CudaAtomicAdd(output + out_i, - *(params + in_i) / scatter_count[gather_i]); + *(params + in_i) / dst_count[src_i]); } } // For backward min and max template __global__ void ManipulateMinMaxGradCUDAKernel( - const T* params, const IndexT* gather_indices, - const IndexT* scatter_indices, T* output, size_t index_size, - size_t slice_size, const 
T* ptr_input, const T* ptr_output) { + const T* params, const IndexT* src_indices, const IndexT* dst_indices, + T* output, size_t index_size, size_t slice_size, const T* ptr_input, + const T* ptr_output) { CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { int64_t indices_i = i / slice_size; int64_t slice_i = i - indices_i * slice_size; - IndexT gather_i = gather_indices[indices_i]; - IndexT scatter_i = scatter_indices[indices_i]; - int64_t in_i = gather_i * slice_size + slice_i; - int64_t out_i = scatter_i * slice_size + slice_i; + IndexT src_i = src_indices[indices_i]; + IndexT dst_i = dst_indices[indices_i]; + int64_t in_i = src_i * slice_size + slice_i; + int64_t out_i = dst_i * slice_size + slice_i; paddle::platform::CudaAtomicAdd( output + out_i, *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); @@ -146,12 +141,12 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input("X"); - auto* gather_index = ctx.Input("Src_index"); - auto* scatter_index = ctx.Input("Dst_index"); + auto* src_index = ctx.Input("Src_index"); + auto* dst_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); std::string pool_type = ctx.Attr("pool_type"); - const int& index_size = gather_index->dims()[0]; + const int& index_size = src_index->dims()[0]; if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); @@ -183,8 +178,8 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { slice_size *= src_dims[i]; } const T* p_src = X->data(); - const IndexT* g_index = gather_index->data(); - const IndexT* s_index = scatter_index->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); int block = 512; int64_t n = slice_size * index_size; @@ -195,14 +190,14 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); } else if (pool_type == "MAX") { SendRecvMaxCUDAFunctor functor; SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); int64_t grid_max = (input_size * slice_size + block - 1) / block; @@ -216,7 +211,7 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); int64_t grid_min = (input_size * slice_size + block - 1) / block; @@ -230,16 +225,16 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); - auto* scatter_count = ctx.Output("Scatter_count"); - int* p_scatter_count = scatter_count->mutable_data(ctx.GetPlace()); + auto* dst_count = ctx.Output("Dst_count"); + int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); #ifdef PADDLE_WITH_HIP - hipMemset(p_scatter_count, 0, input_size * sizeof(int)); + hipMemset(p_dst_count, 0, input_size * sizeof(int)); #else - cudaMemset(p_scatter_count, 0, input_size * sizeof(int)); 
+ cudaMemset(p_dst_count, 0, input_size * sizeof(int)); #endif int64_t grid_count = (index_size + block - 1) / block; @@ -247,15 +242,14 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { T, IndexT><<( ctx.device_context()) - .stream()>>>(p_scatter_count, s_index, index_size); + .stream()>>>(p_dst_count, d_index, index_size); int64_t grid_mean = (input_size * slice_size + block - 1) / block; ManipulateMeanCUDAKernel< T><<( ctx.device_context()) - .stream()>>>(p_output, p_scatter_count, input_size, - slice_size); + .stream()>>>(p_output, p_dst_count, input_size, slice_size); } } }; @@ -265,12 +259,12 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* gather_index = ctx.Input("Dst_index"); - auto* scatter_index = ctx.Input("Src_index"); + auto* src_index = ctx.Input("Dst_index"); + auto* dst_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); std::string pool_type = ctx.Attr("pool_type"); - const int& index_size = gather_index->dims()[0]; + const int& index_size = src_index->dims()[0]; if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); @@ -292,8 +286,8 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { slice_size *= src_dims[i]; } const T* p_src = X->data(); - const IndexT* g_index = gather_index->data(); - const IndexT* s_index = scatter_index->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); int block = 512; int64_t n = slice_size * index_size; @@ -304,15 +298,15 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); } else if (pool_type == "MEAN") { - auto* scatter_count = ctx.Input("Scatter_count"); - const int* s_count = scatter_count->data(); + auto* dst_count = ctx.Input("Dst_count"); + const int* s_count = dst_count->data(); ManipulateMeanGradCUDAKernel<<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, s_count); } else if (pool_type == "MAX" || pool_type == "MIN") { auto* input = ctx.Input("X"); @@ -322,7 +316,7 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { ManipulateMinMaxGradCUDAKernel<<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, ptr_input, ptr_output); } diff --git a/paddle/fluid/operators/send_recv_op.h b/paddle/fluid/operators/send_recv_op.h index b4c2c0fd003b1b..9141953c70fd22 100644 --- a/paddle/fluid/operators/send_recv_op.h +++ b/paddle/fluid/operators/send_recv_op.h @@ -73,41 +73,41 @@ void elementwise_inner_operation(const Tensor& src, Tensor* dst, } template -void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, - const IndexT* g_index, const IndexT* s_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - int* scatter_count = NULL) { +void send_recv_cpu_for_loop(const int& input_size, const int& index_size, + const IndexT* s_index, const IndexT* d_index, + const Tensor& src, Tensor* dst, + const std::string& pool_type, + int* dst_count = NULL) { 
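// Editor's note (descriptive comment added for clarity; not in the original patch):
// send_recv_cpu_for_loop visits each index pair i once, reading row s_index[i] of
// src and combining it into row d_index[i] of dst:
//   - SUM accumulates source rows into destination rows;
//   - MEAN accumulates like SUM, counts hits per destination in dst_count, then
//     divides every visited destination row by its count;
//   - MIN/MAX copy the source row on the first visit to a destination and apply
//     the elementwise min/max functor on later visits.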
Functor functor; if (pool_type == "SUM") { for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; + IndexT src_idx = s_index[i]; + IndexT dst_idx = d_index[i]; elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } } else if (pool_type == "MEAN") { for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; + IndexT src_idx = s_index[i]; + IndexT dst_idx = d_index[i]; elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } for (int i = 0; i < index_size; ++i) { - IndexT dst_idx = s_index[i]; - *(scatter_count + dst_idx) += 1; + IndexT dst_idx = d_index[i]; + *(dst_count + dst_idx) += 1; } for (int i = 0; i < input_size; ++i) { - if (*(scatter_count + i) == 0) continue; + if (*(dst_count + i) == 0) continue; auto dst_slice = dst->Slice(i, i + 1); auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(*(scatter_count + i)); + eigen_dst = eigen_dst / static_cast(*(dst_count + i)); } } else if (pool_type == "MIN" || pool_type == "MAX") { std::set existed_dst; for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; + IndexT src_idx = s_index[i]; + IndexT dst_idx = d_index[i]; bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); if (!in_set) { elementwise_inner_operation(src, dst, src_idx, @@ -122,33 +122,35 @@ void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, } template -void gather_scatter_cpu_for_loop_grad( - const int& input_size, const int& index_size, const IndexT* g_index, - const IndexT* s_index, const Tensor& src, Tensor* dst, - const std::string& pool_type, const int* scatter_count = nullptr, - const Tensor* input = nullptr, const Tensor* output = nullptr) { +void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, + const IndexT* s_index, const IndexT* d_index, + const Tensor& src, Tensor* dst, + const std::string& pool_type, + const int* dst_count = nullptr, + const Tensor* input = nullptr, + const Tensor* output = nullptr) { if (pool_type == "SUM") { Functor functor; for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; + IndexT src_idx = s_index[i]; + IndexT dst_idx = d_index[i]; elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } } else if (pool_type == "MEAN") { for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; + IndexT src_idx = s_index[i]; + IndexT dst_idx = d_index[i]; auto src_slice = src.Slice(src_idx, src_idx + 1); auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); auto eigen_src = framework::EigenVector::Flatten(src_slice); auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += (eigen_src / static_cast(scatter_count[src_idx])); + eigen_dst += (eigen_src / static_cast(dst_count[src_idx])); } } else if (pool_type == "MIN" || pool_type == "MAX") { for (int i = 0; i < index_size; ++i) { - auto forward_src_idx = s_index[i]; - auto forward_dst_idx = g_index[i]; + auto forward_src_idx = d_index[i]; + auto forward_dst_idx = s_index[i]; auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); auto eigen_input = framework::EigenVector::Flatten(input_slice); @@ -168,11 +170,11 @@ class SendRecvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& 
ctx) const override { auto* X = ctx.Input("X"); - auto* gather_index = ctx.Input("Src_index"); - auto* scatter_index = ctx.Input("Dst_index"); + auto* src_index = ctx.Input("Src_index"); + auto* dst_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); - const int& index_size = gather_index->dims()[0]; + const int& index_size = src_index->dims()[0]; if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); @@ -182,26 +184,26 @@ class SendRecvOpKernel : public framework::OpKernel { const size_t& memset_bytes = memset_size * sizeof(T); memset(p_output, 0, memset_bytes); - const IndexT* g_index = gather_index->data(); - const IndexT* s_index = scatter_index->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - gather_scatter_cpu_for_loop>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MIN") { - gather_scatter_cpu_for_loop>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MAX") { - gather_scatter_cpu_for_loop>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { - auto* scatter_count = ctx.Output("Scatter_count"); - int* p_scatter_count = scatter_count->mutable_data(ctx.GetPlace()); - memset(p_scatter_count, 0, src_dims[0] * sizeof(int)); - gather_scatter_cpu_for_loop>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, - p_scatter_count); + auto* dst_count = ctx.Output("Dst_count"); + int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); + memset(p_dst_count, 0, src_dims[0] * sizeof(int)); + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, + p_dst_count); } } }; @@ -211,11 +213,11 @@ class SendRecvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* gather_index = ctx.Input("Dst_index"); - auto* scatter_index = ctx.Input("Src_index"); + auto* src_index = ctx.Input("Dst_index"); + auto* dst_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); - const int& index_size = gather_index->dims()[0]; + const int& index_size = src_index->dims()[0]; if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); @@ -225,25 +227,25 @@ class SendRecvGradOpKernel : public framework::OpKernel { const size_t& memset_bytes = memset_size * sizeof(T); memset(p_output, 0, memset_bytes); - const IndexT* g_index = gather_index->data(); - const IndexT* s_index = scatter_index->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - gather_scatter_cpu_for_loop_grad>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); + send_recv_cpu_for_loop_grad>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { - auto* scatter_count = ctx.Input("Scatter_count"); - const int* s_count = scatter_count->data(); + auto* dst_count = ctx.Input("Dst_count"); + const int* s_count = 
dst_count->data(); // Functor not used here. - gather_scatter_cpu_for_loop_grad>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, s_count); + send_recv_cpu_for_loop_grad>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); } else if (pool_type == "MIN" || pool_type == "MAX") { const auto* input = ctx.Input("X"); const auto* output = ctx.Input("Out"); // Functor not used here. - gather_scatter_cpu_for_loop_grad>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, nullptr, + send_recv_cpu_for_loop_grad>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, input, output); } } diff --git a/python/paddle/fluid/tests/unittests/test_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_send_recv_op.py index 0ad1603c1e5d8a..aa02942f3736a5 100644 --- a/python/paddle/fluid/tests/unittests/test_send_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_send_recv_op.py @@ -24,19 +24,15 @@ def setUp(self): self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) - gather_index = index[:, 0] - scatter_index = index[:, 1] + src_index = index[:, 0] + dst_index = index[:, 1] - self.inputs = { - 'X': x, - 'Src_index': gather_index, - 'Dst_index': scatter_index - } + self.inputs = {'X': x, 'Src_index': src_index, 'Dst_index': dst_index} self.attrs = {'pool_type': 'MAX'} - out, self.gradient = compute_gather_scatter_for_min_max(self.inputs, - self.attrs) + out, self.gradient = compute_send_recv_for_min_max(self.inputs, + self.attrs) self.outputs = {'Out': out} def test_check_output(self): @@ -52,19 +48,15 @@ def setUp(self): self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) - gather_index = index[:, 0] - scatter_index = index[:, 1] + src_index = index[:, 0] + dst_index = index[:, 1] - self.inputs = { - 'X': x, - 'Src_index': gather_index, - 'Dst_index': scatter_index - } + self.inputs = {'X': x, 'Src_index': src_index, 'Dst_index': dst_index} self.attrs = {'pool_type': 'MIN'} - out, self.gradient = compute_gather_scatter_for_min_max(self.inputs, - self.attrs) + out, self.gradient = compute_send_recv_for_min_max(self.inputs, + self.attrs) self.outputs = {'Out': out} @@ -81,18 +73,14 @@ def setUp(self): self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) - gather_index = index[:, 0] - scatter_index = index[:, 1] + src_index = index[:, 0] + dst_index = index[:, 1] - self.inputs = { - 'X': x, - 'Src_index': gather_index, - 'Dst_index': scatter_index - } + self.inputs = {'X': x, 'Src_index': src_index, 'Dst_index': dst_index} self.attrs = {'pool_type': 'SUM'} - out, _ = compute_gather_scatter_for_sum_mean(self.inputs, self.attrs) + out, _ = compute_send_recv_for_sum_mean(self.inputs, self.attrs) self.outputs = {'Out': out} @@ -109,21 +97,16 @@ def setUp(self): self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) - gather_index = index[:, 0] - scatter_index = index[:, 1] + src_index = index[:, 0] + dst_index = index[:, 1] - self.inputs = { - 'X': x, - 'Src_index': gather_index, - 'Dst_index': scatter_index - } + self.inputs = {'X': x, 'Src_index': src_index, 'Dst_index': dst_index} self.attrs = {'pool_type': 'MEAN'} - out, scatter_count = compute_gather_scatter_for_sum_mean(self.inputs, - self.attrs) + out, dst_count 
= compute_send_recv_for_sum_mean(self.inputs, self.attrs) - self.outputs = {'Out': out, 'Scatter_count': scatter_count} + self.outputs = {'Out': out, 'Dst_count': dst_count} def test_check_output(self): self.check_output() @@ -132,22 +115,22 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -def compute_gather_scatter_for_sum_mean(inputs, attributes): +def compute_send_recv_for_sum_mean(inputs, attributes): x = inputs['X'] - gather_index = inputs['Src_index'] - scatter_index = inputs['Dst_index'] + src_index = inputs['Src_index'] + dst_index = inputs['Dst_index'] pool_type = attributes['pool_type'] - gather_x = x[gather_index] + gather_x = x[src_index] target_shape = list(x.shape) results = np.zeros(target_shape, dtype=x.dtype) if pool_type == 'SUM': - for index, s_id in enumerate(scatter_index): + for index, s_id in enumerate(dst_index): results[s_id, :] += gather_x[index, :] elif pool_type == 'MEAN': count = np.zeros(target_shape[0], dtype=np.int32) - for index, s_id in enumerate(scatter_index): + for index, s_id in enumerate(dst_index): results[s_id, :] += gather_x[index, :] count[s_id] += 1 results = results / count.reshape([-1, 1]) @@ -156,20 +139,20 @@ def compute_gather_scatter_for_sum_mean(inputs, attributes): raise ValueError("Invalid pool_type, only SUM, MEAN supported!") count = np.zeros(target_shape[0], dtype=np.int32) - for index, s_id in enumerate(scatter_index): + for index, s_id in enumerate(dst_index): count[s_id] += 1 return results, count -def compute_gather_scatter_for_min_max(inputs, attributes): +def compute_send_recv_for_min_max(inputs, attributes): x = inputs['X'] - gather_index = inputs['Src_index'] - scatter_index = inputs['Dst_index'] + src_index = inputs['Src_index'] + dst_index = inputs['Dst_index'] pool_type = attributes['pool_type'] - gather_x = x[gather_index] + gather_x = x[src_index] target_shape = list(x.shape) results = np.zeros(target_shape, dtype=x.dtype) gradient = np.zeros_like(x) @@ -177,7 +160,7 @@ def compute_gather_scatter_for_min_max(inputs, attributes): # Calculate forward output if pool_type == "MAX": first_set = set() - for index, s_id in enumerate(scatter_index): + for index, s_id in enumerate(dst_index): if s_id not in first_set: results[s_id, :] += gather_x[index, :] first_set.add(s_id) @@ -186,7 +169,7 @@ def compute_gather_scatter_for_min_max(inputs, attributes): gather_x[index, :]) elif pool_type == "MIN": first_set = set() - for index, s_id in enumerate(scatter_index): + for index, s_id in enumerate(dst_index): if s_id not in first_set: results[s_id, :] += gather_x[index, :] first_set.add(s_id) @@ -197,10 +180,10 @@ def compute_gather_scatter_for_min_max(inputs, attributes): raise ValueError("Invalid pool_type, only MAX, MIN supported!") # Calculate backward gradient - index_size = len(gather_index) + index_size = len(src_index) for i in range(index_size): - forward_src_idx = gather_index[i] - forward_dst_idx = scatter_index[i] + forward_src_idx = src_index[i] + forward_dst_idx = dst_index[i] gradient[forward_src_idx] += 1 * ( x[forward_src_idx] == results[forward_dst_idx]) From 05c4acf0731e44d8bd75bb385e4deb63efaed997 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 16 Nov 2021 11:43:53 +0000 Subject: [PATCH 17/30] change zero index return time --- paddle/fluid/operators/send_recv_op.cc | 13 +++++-------- paddle/fluid/operators/send_recv_op.cu | 3 ++- paddle/fluid/operators/send_recv_op.h | 3 ++- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cc 
b/paddle/fluid/operators/send_recv_op.cc index e19575833597f9..a4bc51170de504 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -109,10 +109,10 @@ class SendRecvOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Dst_count", "Count tensor of Dst_index, mainly for MEAN pool_type.") .AsIntermediate(); - AddAttr( - "pool_type", - "(string, default 'SUM')" - "Define different pool types to receive the result tensors") + AddAttr("pool_type", + "(string, default 'SUM')" + "Define different pool types to receive the result " + "tensors of Dst_index.") .SetDefault("SUM") .InEnum({"SUM", "MEAN", "MIN", "MAX"}); // TODO(daisiming): Add a simple example here. @@ -121,11 +121,8 @@ SendRecv Operator. $Out = Recv(Send(X, Src_index), Dst_index, pool_type)$ -This operator +This operator is mainly used in Graph domain. We use this operator to perform message passing process. We can gather feature according to Src_index, and then use different pool types to define how to receive the result tensors of Dst_index. -Example: - -pass )DOC"); } }; diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/send_recv_op.cu index 1fcff7e568ce06..9cdf8acdbe3501 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -147,7 +147,6 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { std::string pool_type = ctx.Attr("pool_type"); const int& index_size = src_index->dims()[0]; - if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); @@ -173,6 +172,8 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { std::numeric_limits::max()); } + if (index_size == 0) return; + int64_t slice_size = 1; for (int i = 1; i < src_dims.size(); ++i) { slice_size *= src_dims[i]; diff --git a/paddle/fluid/operators/send_recv_op.h b/paddle/fluid/operators/send_recv_op.h index 9141953c70fd22..64dc474b60e0b7 100644 --- a/paddle/fluid/operators/send_recv_op.h +++ b/paddle/fluid/operators/send_recv_op.h @@ -175,7 +175,6 @@ class SendRecvOpKernel : public framework::OpKernel { auto* Y = ctx.Output("Out"); const int& index_size = src_index->dims()[0]; - if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); @@ -184,6 +183,8 @@ class SendRecvOpKernel : public framework::OpKernel { const size_t& memset_bytes = memset_size * sizeof(T); memset(p_output, 0, memset_bytes); + if (index_size == 0) return; + const IndexT* s_index = src_index->data(); const IndexT* d_index = dst_index->data(); From c4408b53e714545a4511d9150c2a96bb24033a57 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 16 Nov 2021 13:52:21 +0000 Subject: [PATCH 18/30] add send_recv incubate api --- paddle/fluid/operators/send_recv_op.cc | 2 +- python/paddle/incubate/__init__.py | 2 + python/paddle/incubate/operators/__init__.py | 1 + python/paddle/incubate/operators/send_recv.py | 87 +++++++++++++++++++ 4 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 python/paddle/incubate/operators/send_recv.py diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/send_recv_op.cc index a4bc51170de504..24bd47a6bd8265 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -121,7 +121,7 @@ SendRecv Operator. $Out = Recv(Send(X, Src_index), Dst_index, pool_type)$ -This operator is mainly used in Graph domain. 
We use this operator to perform message passing process. We can gather feature according to Src_index, and then use different pool types to define how to receive the result tensors of Dst_index. +This operator is mainly used in Graph domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` to gather the corresponding positions, and then scatter the corresponding output tensor in different pooling types, like sum, mean, max, or min. )DOC"); } diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index f44e38347e5383..6445ee74c0d065 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -18,6 +18,7 @@ from ..fluid.layer_helper import LayerHelper # noqa: F401 from .operators import softmax_mask_fuse_upper_triangle # noqa: F401 from .operators import softmax_mask_fuse # noqa: F401 +from .operators import send_recv from .tensor import segment_sum from .tensor import segment_mean from .tensor import segment_max @@ -30,6 +31,7 @@ 'ModelAverage', 'softmax_mask_fuse_upper_triangle', 'softmax_mask_fuse', + 'send_recv', 'segment_sum', 'segment_mean', 'segment_max', diff --git a/python/paddle/incubate/operators/__init__.py b/python/paddle/incubate/operators/__init__.py index 9a6710d0950974..90bcaddd79f5da 100644 --- a/python/paddle/incubate/operators/__init__.py +++ b/python/paddle/incubate/operators/__init__.py @@ -15,3 +15,4 @@ from .softmax_mask_fuse_upper_triangle import softmax_mask_fuse_upper_triangle # noqa: F401 from .softmax_mask_fuse import softmax_mask_fuse # noqa: F401 from .resnet_unit import ResNetUnit #noqa: F401 +from .send_recv import send_recv #noqa: F401 diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/send_recv.py new file mode 100644 index 00000000000000..2f145780e02a5d --- /dev/null +++ b/python/paddle/incubate/operators/send_recv.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle.fluid import core + + +def send_recv(x, src_index, dst_index, pool_type, name=None): + r""" + + Send Recv Operator. + + This operator is mainly used in Graph domain, and the main purpose is to reduce intermediate memory + consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` + to gather the corresponding positions, and then scatter the corresponding output tensor in different + pooling types, like sum, mean, max, or min. + + Args: + x (Tensor): The input tensor, and the available data type is float32, float64, int32, int64. + src_index (Tensor): An 1-d tensor, and the available data type is int32, int64. + dst_index (Tensor): An 1-d tensor, and should have the same shape as `src_index`. 
+ The available data type is int32, int64. + pool_type (str): The pooling type of send_recv, including `sum`, `mean`, `max`, `min`. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): The output tensor, should have the same shape as input tensor `x`. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") + indexes = paddle.to_tensor(np.array([[0, 1], [1, 2], [2, 1], [0, 0]]), dtype="int32") + src_index = indexes[:, 0] + dst_index = indexes[:, 1] + out = paddle.incubate.send_recv(x, src_index, dst_index, pool_type="sum") + # Outputs: [[0., 2., 3.], [2., 8., 10.], [2., 6., 7.]] + + """ + + if pool_type not in ["sum", "mean", "max", "min"]: + raise ValueError( + "pool_type should be `sum`, `mean`, `max` or `min`, but received %s" + % pool_type) + + if in_dygraph_mode(): + out, tmp = core.ops.send_recv(x, src_index, dst_index, 'pool_type', + pool_type.upper()) + return out + + check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"), + "send_recv") + check_variable_and_dtype(src_index, "Src_index", ("int32", "int64"), + "send_recv") + check_variable_and_dtype(dst_index, "Dst_index", ("int32", "int64"), + "send_recv") + + helper = LayerHelper("send_recv", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + dst_count = helper.create_variable_for_type_inference( + dtype="int32", stop_gradient=True) + helper.append_op( + type="send_recv", + inputs={"X": x, + "Src_index": src_index, + "Dst_index": dst_index}, + outputs={"Out": out, + "Dst_count": dst_count}, + attrs={"pool_type": pool_type.upper()}) + return out From 8e435f52053804408bd7cd3406da2ac62efc7961 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 03:45:53 +0000 Subject: [PATCH 19/30] fix index data type, add unittest case for API --- paddle/fluid/operators/send_recv_op.cc | 33 +- paddle/fluid/operators/send_recv_op.cu | 379 +++++++++--------- paddle/fluid/operators/send_recv_op.h | 170 ++++---- .../tests/unittests/test_send_recv_op.py | 81 +++- 4 files changed, 396 insertions(+), 267 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/send_recv_op.cc index 24bd47a6bd8265..64bb1fb13beaa6 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -121,7 +121,11 @@ SendRecv Operator. $Out = Recv(Send(X, Src_index), Dst_index, pool_type)$ -This operator is mainly used in Graph domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` to gather the corresponding positions, and then scatter the corresponding output tensor in different pooling types, like sum, mean, max, or min. +This operator is mainly used in Graph domain, and the main purpose is to reduce +intermediate memory consumption in the process of message passing. +Take `x` as the input tensor, we first use `src_index` to gather corresponding +positions, and then scatter the corresponding output tensor in different pooling +types, like sum, mean, max, or min. 
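Editor's sketch of a worked example (added for clarity, mirroring the Python docstring example above; not part of the patch): with

    X = [[0, 2, 3], [1, 4, 5], [2, 6, 7]]
    Src_index = [0, 1, 2, 0]
    Dst_index = [1, 2, 1, 0]
    pool_type = "SUM"

row X[Src_index[i]] is sent to output row Dst_index[i], and rows received at the same destination are summed:

    Out[0] = X[0]        = [0, 2, 3]
    Out[1] = X[0] + X[2] = [2, 8, 10]
    Out[2] = X[1]        = [1, 4, 5]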
)DOC"); } @@ -164,21 +168,12 @@ REGISTER_OPERATOR(send_recv, ops::SendRecvOP, ops::SendRecvOpMaker, ops::SendRecvGradOpMaker, ops::SendRecvGradOpMaker); REGISTER_OPERATOR(send_recv_grad, ops::SendRecvGradOp); -REGISTER_OP_CPU_KERNEL(send_recv, ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel); - -REGISTER_OP_CPU_KERNEL(send_recv_grad, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel); +REGISTER_OP_CPU_KERNEL(send_recv, ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel); + +REGISTER_OP_CPU_KERNEL(send_recv_grad, ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel); diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/send_recv_op.cu index 9cdf8acdbe3501..15dcda7e1cb077 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -137,189 +137,218 @@ __global__ void ManipulateMinMaxGradCUDAKernel( } template -class SendRecvOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* src_index = ctx.Input("Src_index"); - auto* dst_index = ctx.Input("Dst_index"); - auto* Y = ctx.Output("Out"); - std::string pool_type = ctx.Attr("pool_type"); +void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx) { + auto* X = ctx.Input("X"); + auto* src_index = ctx.Input("Src_index"); + auto* dst_index = ctx.Input("Dst_index"); + auto* Y = ctx.Output("Out"); + std::string pool_type = ctx.Attr("pool_type"); + + const int& index_size = src_index->dims()[0]; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + const auto& src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); + if (pool_type == "SUM" || pool_type == "MEAN") { +#ifdef PADDLE_WITH_HIP + hipMemset(p_output, 0, memset_bytes); +#else + cudaMemset(p_output, 0, memset_bytes); +#endif + } else if (pool_type == "MAX") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, + std::numeric_limits::min()); + } else if (pool_type == "MIN") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, + std::numeric_limits::max()); + } - const int& index_size = src_index->dims()[0]; + if (index_size == 0) return; + + int64_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) { + slice_size *= src_dims[i]; + } + const T* p_src = X->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); + + int block = 512; + int64_t n = slice_size * index_size; + int64_t grid = (n + block - 1) / block; + int64_t input_size = src_dims[0]; + if (pool_type == "SUM") { + SendRecvSumCUDAFunctor functor; + SendRecvCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, functor); + } else if (pool_type == "MAX") { + SendRecvMaxCUDAFunctor functor; + SendRecvCUDAKernel><<< + grid, 
block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, functor); + + int64_t grid_max = (input_size * slice_size + block - 1) / block; + InputResetCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, input_size, slice_size); + } else if (pool_type == "MIN") { + SendRecvMinCUDAFunctor functor; + SendRecvCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, functor); + + int64_t grid_min = (input_size * slice_size + block - 1) / block; + InputResetCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, input_size, slice_size); + } else if (pool_type == "MEAN") { + SendRecvSumCUDAFunctor functor; + SendRecvCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, functor); + + auto* dst_count = ctx.Output("Dst_count"); + int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { #ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); + hipMemset(p_dst_count, 0, input_size * sizeof(int)); #else - cudaMemset(p_output, 0, memset_bytes); + cudaMemset(p_dst_count, 0, input_size * sizeof(int)); #endif - } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); - } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::max()); - } - - if (index_size == 0) return; + int64_t grid_count = (index_size + block - 1) / block; + ComputeCountCUDAKernel< + T, IndexT><<( + ctx.device_context()) + .stream()>>>(p_dst_count, d_index, index_size); + + int64_t grid_mean = (input_size * slice_size + block - 1) / block; + ManipulateMeanCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, p_dst_count, input_size, slice_size); + } +} - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index->data(); - const IndexT* d_index = dst_index->data(); - - int block = 512; - int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - SendRecvSumCUDAFunctor functor; - SendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MAX") { - SendRecvMaxCUDAFunctor functor; - SendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_max = (input_size * slice_size + block - 1) / block; - InputResetCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MIN") { - SendRecvMinCUDAFunctor functor; - SendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - 
.stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_min = (input_size * slice_size + block - 1) / block; - InputResetCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MEAN") { - SendRecvSumCUDAFunctor functor; - SendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); +template +void SendRecvGradOpCUDAKernelLaunchHelper( + const framework::ExecutionContext& ctx) { + auto* X = ctx.Input(framework::GradVarName("Out")); + auto* src_index = ctx.Input("Dst_index"); + auto* dst_index = ctx.Input("Src_index"); + auto* Y = ctx.Output(framework::GradVarName("X")); + std::string pool_type = ctx.Attr("pool_type"); + + const int& index_size = src_index->dims()[0]; + if (index_size == 0) return; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + const auto& src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); #ifdef PADDLE_WITH_HIP - hipMemset(p_dst_count, 0, input_size * sizeof(int)); + hipMemset(p_output, 0, memset_bytes); #else - cudaMemset(p_dst_count, 0, input_size * sizeof(int)); + cudaMemset(p_output, 0, memset_bytes); #endif - int64_t grid_count = (index_size + block - 1) / block; - ComputeCountCUDAKernel< - T, IndexT><<( - ctx.device_context()) - .stream()>>>(p_dst_count, d_index, index_size); - - int64_t grid_mean = (input_size * slice_size + block - 1) / block; - ManipulateMeanCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, p_dst_count, input_size, slice_size); + int64_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) { + slice_size *= src_dims[i]; + } + const T* p_src = X->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); + + int block = 512; + int64_t n = slice_size * index_size; + int64_t grid = (n + block - 1) / block; + int64_t input_size = src_dims[0]; + + if (pool_type == "SUM") { + SendRecvSumCUDAFunctor functor; + SendRecvCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, functor); + } else if (pool_type == "MEAN") { + auto* dst_count = ctx.Input("Dst_count"); + const int* s_count = dst_count->data(); + ManipulateMeanGradCUDAKernel<<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, s_count); + } else if (pool_type == "MAX" || pool_type == "MIN") { + auto* input = ctx.Input("X"); + auto* output = ctx.Input("Out"); + const T* ptr_input = input->data(); + const T* ptr_output = output->data(); + ManipulateMinMaxGradCUDAKernel<<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, ptr_input, + ptr_output); + } +} + +template +class SendRecvOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* src_index = ctx.Input("Src_index"); + auto index_type = src_index->type(); + + if (index_type == framework::proto::VarType::INT32) { + SendRecvOpCUDAKernelLaunchHelper(ctx); + } else if (index_type == 
framework::proto::VarType::INT64) { + SendRecvOpCUDAKernelLaunchHelper(ctx); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported Src_index or Dst_index type, Expected int, int64, but " + "got %s.", + index_type)); } } }; -template +template class SendRecvGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input(framework::GradVarName("Out")); auto* src_index = ctx.Input("Dst_index"); - auto* dst_index = ctx.Input("Src_index"); - auto* Y = ctx.Output(framework::GradVarName("X")); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index->dims()[0]; - if (index_size == 0) return; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index->data(); - const IndexT* d_index = dst_index->data(); - - int block = 512; - int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - SendRecvSumCUDAFunctor functor; - SendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - ManipulateMeanGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, s_count); - } else if (pool_type == "MAX" || pool_type == "MIN") { - auto* input = ctx.Input("X"); - auto* output = ctx.Input("Out"); - const T* ptr_input = input->data(); - const T* ptr_output = output->data(); - ManipulateMinMaxGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, ptr_input, - ptr_output); + auto index_type = src_index->type(); + if (index_type == framework::proto::VarType::INT32) { + SendRecvGradOpCUDAKernelLaunchHelper(ctx); + } else if (index_type == framework::proto::VarType::INT64) { + SendRecvGradOpCUDAKernelLaunchHelper(ctx); } } }; @@ -330,21 +359,13 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { using CUDA = paddle::platform::CUDADeviceContext; namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(send_recv, ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(send_recv, ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel); REGISTER_OP_CUDA_KERNEL(send_recv_grad, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel); + 
ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/send_recv_op.h b/paddle/fluid/operators/send_recv_op.h index 64dc474b60e0b7..151b10f26eead0 100644 --- a/paddle/fluid/operators/send_recv_op.h +++ b/paddle/fluid/operators/send_recv_op.h @@ -166,88 +166,122 @@ void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, } template -class SendRecvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* src_index = ctx.Input("Src_index"); - auto* dst_index = ctx.Input("Dst_index"); - auto* Y = ctx.Output("Out"); +void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { + auto* X = ctx.Input("X"); + auto* src_index = ctx.Input("Src_index"); + auto* dst_index = ctx.Input("Dst_index"); + auto* Y = ctx.Output("Out"); + + const int& index_size = src_index->dims()[0]; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + const auto& src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; + const size_t& memset_bytes = memset_size * sizeof(T); + memset(p_output, 0, memset_bytes); + + if (index_size == 0) return; + + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); + const std::string& pool_type = ctx.Attr("pool_type"); + if (pool_type == "SUM") { + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); + } else if (pool_type == "MIN") { + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); + } else if (pool_type == "MAX") { + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); + } else if (pool_type == "MEAN") { + auto* dst_count = ctx.Output("Dst_count"); + int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); + memset(p_dst_count, 0, src_dims[0] * sizeof(int)); + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, + p_dst_count); + } +} - const int& index_size = src_index->dims()[0]; +template +void SendRecvGradOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { + auto* X = ctx.Input(framework::GradVarName("Out")); + auto* src_index = ctx.Input("Dst_index"); + auto* dst_index = ctx.Input("Src_index"); + auto* Y = ctx.Output(framework::GradVarName("X")); + + const int& index_size = src_index->dims()[0]; - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); + T* p_output = Y->mutable_data(ctx.GetPlace()); + const auto& src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; + const size_t& memset_bytes = memset_size * sizeof(T); + memset(p_output, 0, memset_bytes); - if (index_size == 0) return; + if (index_size == 0) return; - const IndexT* s_index = src_index->data(); - const IndexT* d_index = dst_index->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MIN") { 
- send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MAX") { - send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - memset(p_dst_count, 0, src_dims[0] * sizeof(int)); - send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, - p_dst_count); + const std::string& pool_type = ctx.Attr("pool_type"); + if (pool_type == "SUM") { + send_recv_cpu_for_loop_grad>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); + } else if (pool_type == "MEAN") { + auto* dst_count = ctx.Input("Dst_count"); + const int* s_count = dst_count->data(); + // Functor not used here. + send_recv_cpu_for_loop_grad>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); + } else if (pool_type == "MIN" || pool_type == "MAX") { + const auto* input = ctx.Input("X"); + const auto* output = ctx.Input("Out"); + // Functor not used here. + send_recv_cpu_for_loop_grad>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, + input, output); + } +} + +template +class SendRecvOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* src_index = ctx.Input("Src_index"); + auto index_type = src_index->type(); + + if (index_type == framework::proto::VarType::INT32) { + SendRecvOpKernelLaunchHelper(ctx); + } else if (index_type == framework::proto::VarType::INT64) { + SendRecvOpKernelLaunchHelper(ctx); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported Src_index or Dst_index type, Expected int, int64, but " + "got %s.", + index_type)); } } }; -template +template class SendRecvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input(framework::GradVarName("Out")); auto* src_index = ctx.Input("Dst_index"); - auto* dst_index = ctx.Input("Src_index"); - auto* Y = ctx.Output(framework::GradVarName("X")); - - const int& index_size = src_index->dims()[0]; - if (index_size == 0) return; + auto index_type = src_index->type(); - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - const IndexT* s_index = src_index->data(); - const IndexT* d_index = dst_index->data(); - - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - // Functor not used here. - send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); - } else if (pool_type == "MIN" || pool_type == "MAX") { - const auto* input = ctx.Input("X"); - const auto* output = ctx.Input("Out"); - // Functor not used here. 
- send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, - input, output); + if (index_type == framework::proto::VarType::INT32) { + SendRecvGradOpKernelLaunchHelper(ctx); + } else if (index_type == framework::proto::VarType::INT64) { + SendRecvGradOpKernelLaunchHelper(ctx); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported Src_index or Dst_index type, Expected int, int64, but " + "got %s.", + index_type)); } } }; diff --git a/python/paddle/fluid/tests/unittests/test_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_send_recv_op.py index aa02942f3736a5..2f87357e2b26b8 100644 --- a/python/paddle/fluid/tests/unittests/test_send_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_send_recv_op.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import unittest + import numpy as np -from op_test import OpTest import paddle import paddle.fluid as fluid +from op_test import OpTest + class TestSendRecvMaxOp(OpTest): def setUp(self): @@ -188,3 +191,79 @@ def compute_send_recv_for_min_max(inputs, attributes): x[forward_src_idx] == results[forward_dst_idx]) return results, gradient / results.size + + +class API_SendRecvOpTest(unittest.TestCase): + def test_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name="x", shape=[3, 3], dtype="float32") + src_index = paddle.static.data(name="src", shape=[4], dtype="int32") + dst_index = paddle.static.data(name="dst", shape=[4], dtype="int32") + + res_sum = paddle.incubate.send_recv(x, src_index, dst_index, "sum") + res_mean = paddle.incubate.send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.send_recv(x, src_index, dst_index, "max") + res_min = paddle.incubate.send_recv(x, src_index, dst_index, "min") + + exe = paddle.static.Executor(paddle.CPUPlace()) + data1 = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype='float32') + data2 = np.array([0, 1, 2, 0], dtype="int32") + data3 = np.array([1, 2, 1, 0], dtype="int32") + + np_sum = np.array( + [[0, 2, 3], [2, 8, 10], [1, 4, 5]], dtype="float32") + np_mean = np.array( + [[0, 2, 3], [1, 4, 5], [1, 4, 5]], dtype="float32") + np_max = np.array( + [[0, 2, 3], [2, 6, 7], [1, 4, 5]], dtype="float32") + np_min = np.array( + [[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="float32") + + ret = exe.run(feed={'x': data1, + 'src': data2, + 'dst': data3}, + fetch_list=[res_sum, res_mean, res_max, res_min]) + + for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret): + self.assertTrue( + np.allclose( + np_res, ret_res, atol=1e-6), + "two value is\ + {}\n{}, check diff!".format(np_res, ret_res)) + + def test_dygraph(self): + device = paddle.CPUPlace() + with paddle.fluid.dygraph.guard(device): + x = paddle.to_tensor( + np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") + res_sum = paddle.incubate.send_recv(x, src_index, dst_index, "sum") + res_mean = paddle.incubate.send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.send_recv(x, src_index, dst_index, "max") + res_min = paddle.incubate.send_recv(x, src_index, dst_index, "min") + + np_sum = np.array( + [[0, 2, 3], [2, 8, 10], [1, 4, 5]], dtype="float32") + np_mean = np.array( + [[0, 2, 3], [1, 4, 5], [1, 4, 5]], dtype="float32") + np_max = np.array( 
+ [[0, 2, 3], [2, 6, 7], [1, 4, 5]], dtype="float32") + np_min = np.array( + [[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="float32") + + ret = [res_sum, res_mean, res_max, res_min] + + for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret): + self.assertTrue( + np.allclose( + np_res, ret_res, atol=1e-6), + "two value is\ + {}\n{}, check diff!".format(np_res, ret_res)) + + +if __name__ == '__main__': + unittest.main() From e7a3c0d32d245aa7b3b841bf55166b24d4c613a6 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 04:03:06 +0000 Subject: [PATCH 20/30] delete redundant input tensor --- paddle/fluid/operators/send_recv_op.cu | 26 ++++++++++++++------------ paddle/fluid/operators/send_recv_op.h | 25 +++++++++++++------------ 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/send_recv_op.cu index 15dcda7e1cb077..1a8fff782b05a2 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -137,14 +137,14 @@ __global__ void ManipulateMinMaxGradCUDAKernel( } template -void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx) { +void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, + const Tensor& src_index) { auto* X = ctx.Input("X"); - auto* src_index = ctx.Input("Src_index"); auto* dst_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); std::string pool_type = ctx.Attr("pool_type"); - const int& index_size = src_index->dims()[0]; + const int& index_size = src_index.dims()[0]; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); @@ -176,7 +176,7 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx) { slice_size *= src_dims[i]; } const T* p_src = X->data(); - const IndexT* s_index = src_index->data(); + const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index->data(); int block = 512; @@ -253,14 +253,13 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx) { template void SendRecvGradOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx) { + const framework::ExecutionContext& ctx, const Tensor& src_index) { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* src_index = ctx.Input("Dst_index"); auto* dst_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); std::string pool_type = ctx.Attr("pool_type"); - const int& index_size = src_index->dims()[0]; + const int& index_size = src_index.dims()[0]; if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); @@ -282,7 +281,7 @@ void SendRecvGradOpCUDAKernelLaunchHelper( slice_size *= src_dims[i]; } const T* p_src = X->data(); - const IndexT* s_index = src_index->data(); + const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index->data(); int block = 512; @@ -327,9 +326,10 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvOpCUDAKernelLaunchHelper(ctx); + SendRecvOpCUDAKernelLaunchHelper(ctx, *src_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvOpCUDAKernelLaunchHelper(ctx); + SendRecvOpCUDAKernelLaunchHelper(ctx, + *src_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index type, Expected int, int64, but " @@ -346,9 +346,11 @@ class SendRecvGradOpCUDAKernel : public 
framework::OpKernel { auto* src_index = ctx.Input("Dst_index"); auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvGradOpCUDAKernelLaunchHelper(ctx); + SendRecvGradOpCUDAKernelLaunchHelper(ctx, + *src_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvGradOpCUDAKernelLaunchHelper(ctx); + SendRecvGradOpCUDAKernelLaunchHelper( + ctx, *src_index); } } }; diff --git a/paddle/fluid/operators/send_recv_op.h b/paddle/fluid/operators/send_recv_op.h index 151b10f26eead0..1e47a944799cee 100644 --- a/paddle/fluid/operators/send_recv_op.h +++ b/paddle/fluid/operators/send_recv_op.h @@ -166,13 +166,13 @@ void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, } template -void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { +void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, + const Tensor& src_index) { auto* X = ctx.Input("X"); - auto* src_index = ctx.Input("Src_index"); auto* dst_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); - const int& index_size = src_index->dims()[0]; + const int& index_size = src_index.dims()[0]; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); @@ -183,7 +183,7 @@ void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { if (index_size == 0) return; - const IndexT* s_index = src_index->data(); + const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index->data(); const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { @@ -206,13 +206,13 @@ void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { } template -void SendRecvGradOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { +void SendRecvGradOpKernelLaunchHelper(const framework::ExecutionContext& ctx, + const Tensor& src_index) { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* src_index = ctx.Input("Dst_index"); auto* dst_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); - const int& index_size = src_index->dims()[0]; + const int& index_size = src_index.dims()[0]; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); @@ -223,7 +223,7 @@ void SendRecvGradOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { if (index_size == 0) return; - const IndexT* s_index = src_index->data(); + const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index->data(); const std::string& pool_type = ctx.Attr("pool_type"); @@ -254,9 +254,9 @@ class SendRecvOpKernel : public framework::OpKernel { auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvOpKernelLaunchHelper(ctx); + SendRecvOpKernelLaunchHelper(ctx, *src_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvOpKernelLaunchHelper(ctx); + SendRecvOpKernelLaunchHelper(ctx, *src_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index type, Expected int, int64, but " @@ -274,9 +274,10 @@ class SendRecvGradOpKernel : public framework::OpKernel { auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvGradOpKernelLaunchHelper(ctx); + SendRecvGradOpKernelLaunchHelper(ctx, *src_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvGradOpKernelLaunchHelper(ctx); + SendRecvGradOpKernelLaunchHelper(ctx, + *src_index); } else { 
PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index type, Expected int, int64, but " From 42708f752d4442923c3775bd1852ee1157055d50 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 06:04:47 +0000 Subject: [PATCH 21/30] fix en example and docs, add default value in pool_type --- python/paddle/incubate/operators/send_recv.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/send_recv.py index 2f145780e02a5d..56fc2a8982ce96 100644 --- a/python/paddle/incubate/operators/send_recv.py +++ b/python/paddle/incubate/operators/send_recv.py @@ -18,14 +18,14 @@ from paddle.fluid import core -def send_recv(x, src_index, dst_index, pool_type, name=None): +def send_recv(x, src_index, dst_index, pool_type="sum", name=None): r""" Send Recv Operator. This operator is mainly used in Graph domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` - to gather the corresponding positions, and then scatter the corresponding output tensor in different + to gather the corresponding positions, and then use `dst_index` to scatter the corresponding output tensor in different pooling types, like sum, mean, max, or min. Args: @@ -34,6 +34,7 @@ def send_recv(x, src_index, dst_index, pool_type, name=None): dst_index (Tensor): An 1-d tensor, and should have the same shape as `src_index`. The available data type is int32, int64. pool_type (str): The pooling type of send_recv, including `sum`, `mean`, `max`, `min`. + Default value is `sum`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -51,7 +52,7 @@ def send_recv(x, src_index, dst_index, pool_type, name=None): src_index = indexes[:, 0] dst_index = indexes[:, 1] out = paddle.incubate.send_recv(x, src_index, dst_index, pool_type="sum") - # Outputs: [[0., 2., 3.], [2., 8., 10.], [2., 6., 7.]] + # Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]] """ From a1adbf1aed03bcf432d9c02d42dcc1270ec72f3f Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 08:07:19 +0000 Subject: [PATCH 22/30] add shape judge and max grid judge --- paddle/fluid/operators/send_recv_op.cc | 12 +- paddle/fluid/operators/send_recv_op.cu | 108 ++++++++++++------ python/paddle/incubate/operators/send_recv.py | 28 ++++- 3 files changed, 106 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/send_recv_op.cc index 64bb1fb13beaa6..e9eb36ac0b95ea 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -59,7 +59,11 @@ class SendRecvOP : public framework::OperatorWithKernel { dst_index_dims.size())); } - // TODO(daisiming): If the shape of src_index and dst_index should be same? + PADDLE_ENFORCE_EQ( + src_index_dims[0], dst_index_dims[0], + platform::errors::InvalidArgument( + "Src_index and Dst_index should have the same shape.")); + auto dims = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", dims); @@ -117,14 +121,14 @@ class SendRecvOpMaker : public framework::OpProtoAndCheckerMaker { .InEnum({"SUM", "MEAN", "MIN", "MAX"}); // TODO(daisiming): Add a simple example here. AddComment(R"DOC( -SendRecv Operator. +Graph Learning Send_Recv combine operator. 
$Out = Recv(Send(X, Src_index), Dst_index, pool_type)$ -This operator is mainly used in Graph domain, and the main purpose is to reduce +This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` to gather corresponding -positions, and then scatter the corresponding output tensor in different pooling +positions, and then use `dst_index` to scatter the corresponding output tensor in different pooling types, like sum, mean, max, or min. )DOC"); diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/send_recv_op.cu index 1a8fff782b05a2..410cc06e6e9223 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -66,13 +66,23 @@ __global__ void SendRecvCUDAKernel(const T* params, const IndexT* src_indices, } } -// For min and max +// For max template -__global__ void InputResetCUDAKernel(T* output, size_t input_size, - size_t slice_size) { +__global__ void InputResetMaxCUDAKernel(T* output, size_t input_size, + size_t slice_size) { CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::min() || - *(output + i) == std::numeric_limits::max()) { + if (*(output + i) == std::numeric_limits::min()) { + *(output + i) = 0; + } + } +} + +// For min +template +__global__ void InputResetMinCUDAKernel(T* output, size_t input_size, + size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { + if (*(output + i) == std::numeric_limits::max()) { *(output + i) = 0; } } @@ -138,9 +148,9 @@ __global__ void ManipulateMinMaxGradCUDAKernel( template void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { + const Tensor& src_index, + const Tensor& dst_index) { auto* X = ctx.Input("X"); - auto* dst_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); std::string pool_type = ctx.Attr("pool_type"); @@ -177,11 +187,18 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, } const T* p_src = X->data(); const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); + const IndexT* d_index = dst_index.data(); - int block = 512; +#ifdef PADDLE_WITH_HIP + int block = 256; +#else + int block = 1024; +#endif int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; + const auto& dev_ctx = ctx.cuda_device_context(); + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t grid_tmp = (n + block - 1) / block; + int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; if (pool_type == "SUM") { SendRecvSumCUDAFunctor functor; @@ -198,8 +215,10 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); - int64_t grid_max = (input_size * slice_size + block - 1) / block; - InputResetCUDAKernel< + int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; + int64_t grid_max = + grid_max_tmp < max_grid_dimx ? 
grid_max_tmp : max_grid_dimx; + InputResetMaxCUDAKernel< T><<( ctx.device_context()) @@ -212,8 +231,10 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); - int64_t grid_min = (input_size * slice_size + block - 1) / block; - InputResetCUDAKernel< + int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; + int64_t grid_min = + grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx; + InputResetMinCUDAKernel< T><<( ctx.device_context()) @@ -242,7 +263,9 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, ctx.device_context()) .stream()>>>(p_dst_count, d_index, index_size); - int64_t grid_mean = (input_size * slice_size + block - 1) / block; + int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block; + int64_t grid_mean = + grid_mean_tmp < max_grid_dimx ? grid_mean_tmp : max_grid_dimx; ManipulateMeanCUDAKernel< T><<( @@ -253,14 +276,13 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, template void SendRecvGradOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index) { + const framework::ExecutionContext& ctx, const Tensor& src_index, + const Tensor& dst_index) { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* dst_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); std::string pool_type = ctx.Attr("pool_type"); const int& index_size = src_index.dims()[0]; - if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); @@ -276,19 +298,27 @@ void SendRecvGradOpCUDAKernelLaunchHelper( cudaMemset(p_output, 0, memset_bytes); #endif + if (index_size == 0) return; + int64_t slice_size = 1; for (int i = 1; i < src_dims.size(); ++i) { slice_size *= src_dims[i]; } const T* p_src = X->data(); const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); + const IndexT* d_index = dst_index.data(); - int block = 512; +#ifdef PADDLE_WITH_HIP + int block = 256; +#else + int block = 1024; +#endif int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; + const auto& dev_ctx = ctx.cuda_device_context(); + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t grid_tmp = (n + block - 1) / block; + int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { SendRecvSumCUDAFunctor functor; SendRecvCUDAKernel><<< @@ -323,18 +353,21 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Src_index"); - auto index_type = src_index->type(); - - if (index_type == framework::proto::VarType::INT32) { - SendRecvOpCUDAKernelLaunchHelper(ctx, *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - SendRecvOpCUDAKernelLaunchHelper(ctx, - *src_index); + auto* dst_index = ctx.Input("Dst_index"); + auto src_index_type = src_index->type(); + auto dst_index_type = dst_index->type(); + + if (src_index_type == framework::proto::VarType::INT32) { + SendRecvOpCUDAKernelLaunchHelper(ctx, *src_index, + *dst_index); + } else if (src_index_type == framework::proto::VarType::INT64) { + SendRecvOpCUDAKernelLaunchHelper( + ctx, *src_index, *dst_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " + "Unsupported Src_index or Dst_index dtype, expected int, int64, but " "got %s.", - index_type)); + src_index_type)); } } }; @@ -344,13 +377,20 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Dst_index"); + auto* dst_index = ctx.Input("Src_index"); auto index_type = src_index->type(); + if (index_type == framework::proto::VarType::INT32) { - SendRecvGradOpCUDAKernelLaunchHelper(ctx, - *src_index); + SendRecvGradOpCUDAKernelLaunchHelper( + ctx, *src_index, *dst_index); } else if (index_type == framework::proto::VarType::INT64) { SendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index); + ctx, *src_index, *dst_index); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported Src_index or Dst_index dtype, expected int, int64, but " + "got %s.", + src_index_type)); } } }; diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/send_recv.py index 56fc2a8982ce96..52a0ea23955253 100644 --- a/python/paddle/incubate/operators/send_recv.py +++ b/python/paddle/incubate/operators/send_recv.py @@ -21,13 +21,33 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): r""" - Send Recv Operator. + Graph Learning Send_Recv combine operator. - This operator is mainly used in Graph domain, and the main purpose is to reduce intermediate memory + This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` to gather the corresponding positions, and then use `dst_index` to scatter the corresponding output tensor in different pooling types, like sum, mean, max, or min. + .. code-block:: text + + Given: + + X = [[0, 2, 3], + [1, 4, 5], + [2, 6, 7]] + + src_index = [0, 1, 2, 0] + + dst_index = [1, 2, 1, 0] + + pool_type = "sum" + + Then: + + Out = [[0, 2, 3], + [2, 8, 10], + [1, 4, 5]] + Args: x (Tensor): The input tensor, and the available data type is float32, float64, int32, int64. src_index (Tensor): An 1-d tensor, and the available data type is int32, int64. @@ -39,14 +59,14 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): For more information, please refer to :ref:`api_guide_Name`. 
Returns: - out (Tensor): The output tensor, should have the same shape as input tensor `x`. + out (Tensor): The output tensor, should have the same shape and same dtype as input tensor `x`. Examples: .. code-block:: python - import paddle import numpy as np + import paddle x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") indexes = paddle.to_tensor(np.array([[0, 1], [1, 2], [2, 1], [0, 0]]), dtype="int32") src_index = indexes[:, 0] From ca1a3c141d21d5d50892a90c96eb61e510dadf88 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 08:29:59 +0000 Subject: [PATCH 23/30] fix comment --- paddle/fluid/operators/send_recv_op.cc | 1 - python/paddle/incubate/operators/send_recv.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/send_recv_op.cc index e9eb36ac0b95ea..5f51cba578a2a3 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -119,7 +119,6 @@ class SendRecvOpMaker : public framework::OpProtoAndCheckerMaker { "tensors of Dst_index.") .SetDefault("SUM") .InEnum({"SUM", "MEAN", "MIN", "MAX"}); - // TODO(daisiming): Add a simple example here. AddComment(R"DOC( Graph Learning Send_Recv combine operator. diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/send_recv.py index 52a0ea23955253..0ee8e2ab2b83c9 100644 --- a/python/paddle/incubate/operators/send_recv.py +++ b/python/paddle/incubate/operators/send_recv.py @@ -50,8 +50,8 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): Args: x (Tensor): The input tensor, and the available data type is float32, float64, int32, int64. - src_index (Tensor): An 1-d tensor, and the available data type is int32, int64. - dst_index (Tensor): An 1-d tensor, and should have the same shape as `src_index`. + src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. + dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. pool_type (str): The pooling type of send_recv, including `sum`, `mean`, `max`, `min`. Default value is `sum`. 
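# A minimal NumPy sketch of the pooling semantics documented above, matching the
# worked example in the docstring (sum/mean/max/min reductions of gathered rows).
# The helper name `numpy_send_recv` is made up for this illustration only and is
# not an API introduced by these patches.
import numpy as np

def numpy_send_recv(x, src_index, dst_index, pool_type="sum"):
    # Gather rows of `x` with `src_index` and reduce them into the rows given by
    # `dst_index`; output rows that receive no message stay zero.
    out = np.zeros_like(x)
    count = np.zeros(x.shape[0], dtype=np.int64)
    first = np.ones(x.shape[0], dtype=bool)
    for s, d in zip(src_index, dst_index):
        if pool_type in ("sum", "mean"):
            out[d] += x[s]
            count[d] += 1
        elif pool_type == "max":
            out[d] = x[s] if first[d] else np.maximum(out[d], x[s])
            first[d] = False
        elif pool_type == "min":
            out[d] = x[s] if first[d] else np.minimum(out[d], x[s])
            first[d] = False
    if pool_type == "mean":
        nonzero = count > 0
        out[nonzero] = out[nonzero] / count[nonzero][:, None]
    return out

# With the docstring example (x = [[0, 2, 3], [1, 4, 5], [2, 6, 7]],
# src_index = [0, 1, 2, 0], dst_index = [1, 2, 1, 0]) and pool_type="sum",
# this yields [[0, 2, 3], [2, 8, 10], [1, 4, 5]], the output shown above.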
@@ -67,6 +67,7 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): import numpy as np import paddle + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") indexes = paddle.to_tensor(np.array([[0, 1], [1, 2], [2, 1], [0, 0]]), dtype="int32") src_index = indexes[:, 0] From f1c9c22a7c47caeac4b0c39666292f9d3842c924 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 08:35:22 +0000 Subject: [PATCH 24/30] fix index type bug --- paddle/fluid/operators/send_recv_op.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/send_recv_op.cu index 410cc06e6e9223..c9725740d5737e 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -354,20 +354,19 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Src_index"); auto* dst_index = ctx.Input("Dst_index"); - auto src_index_type = src_index->type(); - auto dst_index_type = dst_index->type(); + auto index_type = src_index->type(); - if (src_index_type == framework::proto::VarType::INT32) { + if (index_type == framework::proto::VarType::INT32) { SendRecvOpCUDAKernelLaunchHelper(ctx, *src_index, *dst_index); - } else if (src_index_type == framework::proto::VarType::INT64) { + } else if (index_type == framework::proto::VarType::INT64) { SendRecvOpCUDAKernelLaunchHelper( ctx, *src_index, *dst_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index dtype, expected int, int64, but " "got %s.", - src_index_type)); + index_type)); } } }; @@ -390,7 +389,7 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index dtype, expected int, int64, but " "got %s.", - src_index_type)); + index_type)); } } }; From 6d7d4fb81808033d4fe76b11723d6f8d48dc96e1 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 08:51:21 +0000 Subject: [PATCH 25/30] add const & --- paddle/fluid/operators/send_recv_op.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.h b/paddle/fluid/operators/send_recv_op.h index 1e47a944799cee..c5f19a1aefed07 100644 --- a/paddle/fluid/operators/send_recv_op.h +++ b/paddle/fluid/operators/send_recv_op.h @@ -77,19 +77,19 @@ void send_recv_cpu_for_loop(const int& input_size, const int& index_size, const IndexT* s_index, const IndexT* d_index, const Tensor& src, Tensor* dst, const std::string& pool_type, - int* dst_count = NULL) { + int* dst_count = nullptr) { Functor functor; if (pool_type == "SUM") { for (int i = 0; i < index_size; ++i) { - IndexT src_idx = s_index[i]; - IndexT dst_idx = d_index[i]; + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } } else if (pool_type == "MEAN") { for (int i = 0; i < index_size; ++i) { - IndexT src_idx = s_index[i]; - IndexT dst_idx = d_index[i]; + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } @@ -106,8 +106,8 @@ void send_recv_cpu_for_loop(const int& input_size, const int& index_size, } else if (pool_type == "MIN" || pool_type == "MAX") { std::set existed_dst; for (int i = 
0; i < index_size; ++i) { - IndexT src_idx = s_index[i]; - IndexT dst_idx = d_index[i]; + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); if (!in_set) { elementwise_inner_operation(src, dst, src_idx, @@ -132,15 +132,15 @@ void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, if (pool_type == "SUM") { Functor functor; for (int i = 0; i < index_size; ++i) { - IndexT src_idx = s_index[i]; - IndexT dst_idx = d_index[i]; + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } } else if (pool_type == "MEAN") { for (int i = 0; i < index_size; ++i) { - IndexT src_idx = s_index[i]; - IndexT dst_idx = d_index[i]; + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; auto src_slice = src.Slice(src_idx, src_idx + 1); auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); auto eigen_src = framework::EigenVector::Flatten(src_slice); @@ -149,8 +149,8 @@ void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, } } else if (pool_type == "MIN" || pool_type == "MAX") { for (int i = 0; i < index_size; ++i) { - auto forward_src_idx = d_index[i]; - auto forward_dst_idx = s_index[i]; + const IndexT& forward_src_idx = d_index[i]; + const IndexT& forward_dst_idx = s_index[i]; auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); auto eigen_input = framework::EigenVector::Flatten(input_slice); From ed0c10e1dc67b08309dccc0a18c480f28212fc78 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 13:55:46 +0000 Subject: [PATCH 26/30] fix en docs --- python/paddle/incubate/operators/send_recv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/send_recv.py index 0ee8e2ab2b83c9..2ee5594021d8e5 100644 --- a/python/paddle/incubate/operators/send_recv.py +++ b/python/paddle/incubate/operators/send_recv.py @@ -25,8 +25,8 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` - to gather the corresponding positions, and then use `dst_index` to scatter the corresponding output tensor in different - pooling types, like sum, mean, max, or min. + to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor + in different pooling types, like sum, mean, max, or min. .. code-block:: text From 6c59c790993f6dd5ee119374f0cb3bf673e43688 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 18 Nov 2021 02:26:47 +0000 Subject: [PATCH 27/30] delete numpy in examples --- python/paddle/incubate/operators/send_recv.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/send_recv.py index 2ee5594021d8e5..7ef2a34b05c9ec 100644 --- a/python/paddle/incubate/operators/send_recv.py +++ b/python/paddle/incubate/operators/send_recv.py @@ -65,11 +65,10 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): .. 
code-block:: python - import numpy as np import paddle - x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") - indexes = paddle.to_tensor(np.array([[0, 1], [1, 2], [2, 1], [0, 0]]), dtype="int32") + x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] out = paddle.incubate.send_recv(x, src_index, dst_index, pool_type="sum") From 52a48a13f1564104b6a8eb7701f8deedb78e12de Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 18 Nov 2021 02:43:19 +0000 Subject: [PATCH 28/30] add unittest for int input --- .../tests/unittests/test_send_recv_op.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_send_recv_op.py index 2f87357e2b26b8..098d1a2d28cb5f 100644 --- a/python/paddle/fluid/tests/unittests/test_send_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_send_recv_op.py @@ -264,6 +264,36 @@ def test_dygraph(self): "two value is\ {}\n{}, check diff!".format(np_res, ret_res)) + def test_int32_input(self): + device = paddle.CPUPlace() + with paddle.fluid.dygraph.guard(device): + x = paddle.to_tensor( + np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), dtype="int32") + src_index = paddle.to_tensor( + np.array([0, 1, 2, 0, 1]), dtype="int32") + dst_index = paddle.to_tensor( + np.array([1, 2, 1, 0, 1]), dtype="int32") + res_sum = paddle.incubate.send_recv(x, src_index, dst_index, "sum") + res_mean = paddle.incubate.send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.send_recv(x, src_index, dst_index, "max") + res_min = paddle.incubate.send_recv(x, src_index, dst_index, "min") + + np_sum = np.array( + [[0, 2, 3], [3, 12, 14], [1, 4, 5]], dtype="int32") + np_mean = np.array([[0, 2, 3], [1, 4, 4], [1, 4, 5]], dtype="int32") + np_max = np.array([[0, 2, 3], [2, 6, 6], [1, 4, 5]], dtype="int32") + np_min = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="int32") + + ret = [res_sum, res_mean, res_max, res_min] + + for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret): + self.assertTrue( + np.allclose( + np_res, ret_res, atol=1e-6), + "two value is\ + {}\n{}, check diff!".format(np_res, ret_res)) + if __name__ == '__main__': unittest.main() From 4bb5669d360906e816ed1384388713035757d586 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 18 Nov 2021 02:52:05 +0000 Subject: [PATCH 29/30] fix send_recv comment --- paddle/fluid/operators/send_recv_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/send_recv_op.cc index 5f51cba578a2a3..151fd1eedaa64c 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -126,9 +126,9 @@ Graph Learning Send_Recv combine operator. This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. -Take `x` as the input tensor, we first use `src_index` to gather corresponding -positions, and then use `dst_index` to scatter the corresponding output tensor in different pooling -types, like sum, mean, max, or min. 
+Take `x` as the input tensor, we first use `src_index` to gather corresponding data, +and then use `dst_index` to update the corresponding position of output tensor in different +pooling types, like sum, mean, max, or min. )DOC"); } From f0137ecb0e1539dfd18b776fa4cacbe10274c604 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 18 Nov 2021 07:31:16 +0000 Subject: [PATCH 30/30] change send_recv to graph_send_recv --- ...{send_recv_op.cc => graph_send_recv_op.cc} | 55 ++++++------ ...{send_recv_op.cu => graph_send_recv_op.cu} | 85 ++++++++++--------- .../{send_recv_op.h => graph_send_recv_op.h} | 66 +++++++------- ..._recv_op.py => test_graph_send_recv_op.py} | 74 +++++++++------- python/paddle/incubate/__init__.py | 4 +- python/paddle/incubate/operators/__init__.py | 2 +- .../{send_recv.py => graph_send_recv.py} | 20 ++--- 7 files changed, 162 insertions(+), 144 deletions(-) rename paddle/fluid/operators/{send_recv_op.cc => graph_send_recv_op.cc} (77%) rename paddle/fluid/operators/{send_recv_op.cu => graph_send_recv_op.cu} (84%) rename paddle/fluid/operators/{send_recv_op.h => graph_send_recv_op.h} (82%) rename python/paddle/fluid/tests/unittests/{test_send_recv_op.py => test_graph_send_recv_op.py} (77%) rename python/paddle/incubate/operators/{send_recv.py => graph_send_recv.py} (84%) diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc similarity index 77% rename from paddle/fluid/operators/send_recv_op.cc rename to paddle/fluid/operators/graph_send_recv_op.cc index 151fd1eedaa64c..6af8388d9eba4e 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -12,22 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/send_recv_op.h" +#include "paddle/fluid/operators/graph_send_recv_op.h" namespace paddle { namespace operators { -class SendRecvOP : public framework::OperatorWithKernel { +class GraphSendRecvOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SendRecv"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GraphSendRecv"); OP_INOUT_CHECK(ctx->HasInput("Src_index"), "Input", "Src_index", - "SendRecv"); + "GraphSendRecv"); OP_INOUT_CHECK(ctx->HasInput("Dst_index"), "Input", "Dst_index", - "SendRecv"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SendRecv"); + "GraphSendRecv"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GraphSendRecv"); auto src_index_dims = ctx->GetInputDim("Src_index"); if (src_index_dims.size() == 2) { @@ -69,7 +69,7 @@ class SendRecvOP : public framework::OperatorWithKernel { if (ctx->Attrs().Get("pool_type") == "MEAN") { OP_INOUT_CHECK(ctx->HasOutput("Dst_count"), "Output", "Dst_count", - "SendRecv"); + "GraphSendRecv"); ctx->SetOutputDim("Dst_count", {dims[0]}); } } @@ -83,7 +83,7 @@ class SendRecvOP : public framework::OperatorWithKernel { } }; -class SendRecvGradOp : public framework::OperatorWithKernel { +class GraphSendRecvGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -101,15 +101,14 @@ class SendRecvGradOp : public framework::OperatorWithKernel { } }; -class SendRecvOpMaker : public framework::OpProtoAndCheckerMaker { +class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "The input tensor with data type float32, " - "float64 or float16"); + "The input tensor with data type float32, float64, int32, int64."); AddInput("Src_index", "The source index tensor."); AddInput("Dst_index", "The destination index tensor."); - AddOutput("Out", "Output tensor of send_recv op."); + AddOutput("Out", "Output tensor of graph_send_recv op."); AddOutput("Dst_count", "Count tensor of Dst_index, mainly for MEAN pool_type.") .AsIntermediate(); @@ -135,13 +134,13 @@ pooling types, like sum, mean, max, or min. 
}; template -class SendRecvGradOpMaker : public framework::SingleGradOpMaker { +class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker { public: using framework::SingleGradOpMaker::SingleGradOpMaker; protected: void Apply(GradOpPtr op) const override { - op->SetType("send_recv_grad"); + op->SetType("graph_send_recv_grad"); op->SetInput("Src_index", this->Input("Src_index")); op->SetInput("Dst_index", this->Input("Dst_index")); @@ -167,16 +166,18 @@ class SendRecvGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; -REGISTER_OPERATOR(send_recv, ops::SendRecvOP, ops::SendRecvOpMaker, - ops::SendRecvGradOpMaker, - ops::SendRecvGradOpMaker); -REGISTER_OPERATOR(send_recv_grad, ops::SendRecvGradOp); -REGISTER_OP_CPU_KERNEL(send_recv, ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel); - -REGISTER_OP_CPU_KERNEL(send_recv_grad, ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel); +REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP, + ops::GraphSendRecvOpMaker, + ops::GraphSendRecvGradOpMaker, + ops::GraphSendRecvGradOpMaker); +REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp); +REGISTER_OP_CPU_KERNEL(graph_send_recv, ops::GraphSendRecvOpKernel, + ops::GraphSendRecvOpKernel, + ops::GraphSendRecvOpKernel, + ops::GraphSendRecvOpKernel); + +REGISTER_OP_CPU_KERNEL(graph_send_recv_grad, + ops::GraphSendRecvGradOpKernel, + ops::GraphSendRecvGradOpKernel, + ops::GraphSendRecvGradOpKernel, + ops::GraphSendRecvGradOpKernel); diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu similarity index 84% rename from paddle/fluid/operators/send_recv_op.cu rename to paddle/fluid/operators/graph_send_recv_op.cu index c9725740d5737e..d9f56ec4dc0388 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/graph_send_recv_op.cu @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/send_recv_op.h" +#include "paddle/fluid/operators/graph_send_recv_op.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/place.h" @@ -27,7 +27,7 @@ namespace operators { using Tensor = framework::Tensor; template -struct SendRecvSumCUDAFunctor { +struct GraphSendRecvSumCUDAFunctor { DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, const IndexT& out_i) { paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); @@ -35,7 +35,7 @@ struct SendRecvSumCUDAFunctor { }; template -struct SendRecvMaxCUDAFunctor { +struct GraphSendRecvMaxCUDAFunctor { DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, const IndexT& out_i) { paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); @@ -43,7 +43,7 @@ struct SendRecvMaxCUDAFunctor { }; template -struct SendRecvMinCUDAFunctor { +struct GraphSendRecvMinCUDAFunctor { DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, const IndexT& out_i) { paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); @@ -51,10 +51,11 @@ struct SendRecvMinCUDAFunctor { }; template -__global__ void SendRecvCUDAKernel(const T* params, const IndexT* src_indices, - const IndexT* dst_indices, T* output, - size_t index_size, size_t slice_size, - Functor functor) { +__global__ void GraphSendRecvCUDAKernel(const T* params, + const IndexT* src_indices, + const IndexT* dst_indices, T* output, + size_t index_size, size_t slice_size, + Functor functor) { CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { int64_t indices_i = i / slice_size; int64_t slice_i = i - indices_i * slice_size; @@ -147,9 +148,9 @@ __global__ void ManipulateMinMaxGradCUDAKernel( } template -void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index, - const Tensor& dst_index) { +void GraphSendRecvOpCUDAKernelLaunchHelper( + const framework::ExecutionContext& ctx, const Tensor& src_index, + const Tensor& dst_index) { auto* X = ctx.Input("X"); auto* Y = ctx.Output("Out"); std::string pool_type = ctx.Attr("pool_type"); @@ -201,15 +202,17 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; if (pool_type == "SUM") { - SendRecvSumCUDAFunctor functor; - SendRecvCUDAKernel><<< + GraphSendRecvSumCUDAFunctor functor; + GraphSendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); } else if (pool_type == "MAX") { - SendRecvMaxCUDAFunctor functor; - SendRecvCUDAKernel><<< + GraphSendRecvMaxCUDAFunctor functor; + GraphSendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, s_index, d_index, p_output, @@ -224,8 +227,9 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, ctx.device_context()) .stream()>>>(p_output, input_size, slice_size); } else if (pool_type == "MIN") { - SendRecvMinCUDAFunctor functor; - SendRecvCUDAKernel><<< + GraphSendRecvMinCUDAFunctor functor; + GraphSendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, s_index, d_index, p_output, @@ -240,8 +244,9 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, ctx.device_context()) .stream()>>>(p_output, input_size, slice_size); } else if (pool_type == "MEAN") { - SendRecvSumCUDAFunctor functor; - SendRecvCUDAKernel><<< + GraphSendRecvSumCUDAFunctor functor; + GraphSendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, s_index, d_index, p_output, @@ -275,7 +280,7 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, } template -void SendRecvGradOpCUDAKernelLaunchHelper( +void GraphSendRecvGradOpCUDAKernelLaunchHelper( const framework::ExecutionContext& ctx, const Tensor& src_index, const Tensor& dst_index) { auto* X = ctx.Input(framework::GradVarName("Out")); @@ -320,8 +325,9 @@ void SendRecvGradOpCUDAKernelLaunchHelper( int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; if (pool_type == "SUM") { - SendRecvSumCUDAFunctor functor; - SendRecvCUDAKernel><<< + GraphSendRecvSumCUDAFunctor functor; + GraphSendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, s_index, d_index, p_output, @@ -349,7 +355,7 @@ void SendRecvGradOpCUDAKernelLaunchHelper( } template -class SendRecvOpCUDAKernel : public framework::OpKernel { +class GraphSendRecvOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Src_index"); @@ -357,10 +363,10 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvOpCUDAKernelLaunchHelper(ctx, *src_index, - *dst_index); + GraphSendRecvOpCUDAKernelLaunchHelper( + ctx, *src_index, *dst_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvOpCUDAKernelLaunchHelper( + GraphSendRecvOpCUDAKernelLaunchHelper( ctx, *src_index, *dst_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -372,7 +378,7 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { }; template -class SendRecvGradOpCUDAKernel : public framework::OpKernel { +class GraphSendRecvGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Dst_index"); @@ -380,10 +386,10 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvGradOpCUDAKernelLaunchHelper( + GraphSendRecvGradOpCUDAKernelLaunchHelper( ctx, *src_index, *dst_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvGradOpCUDAKernelLaunchHelper( + GraphSendRecvGradOpCUDAKernelLaunchHelper( ctx, *src_index, *dst_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -400,13 +406,14 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { using CUDA = paddle::platform::CUDADeviceContext; namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(send_recv, ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(send_recv_grad, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(graph_send_recv, + ops::GraphSendRecvOpCUDAKernel, + ops::GraphSendRecvOpCUDAKernel, + ops::GraphSendRecvOpCUDAKernel, + ops::GraphSendRecvOpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(graph_send_recv_grad, + ops::GraphSendRecvGradOpCUDAKernel, + ops::GraphSendRecvGradOpCUDAKernel, + ops::GraphSendRecvGradOpCUDAKernel, + ops::GraphSendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/send_recv_op.h b/paddle/fluid/operators/graph_send_recv_op.h similarity index 82% rename from paddle/fluid/operators/send_recv_op.h rename to paddle/fluid/operators/graph_send_recv_op.h index c5f19a1aefed07..1c7ea74be2ff4c 100644 --- a/paddle/fluid/operators/send_recv_op.h +++ b/paddle/fluid/operators/graph_send_recv_op.h @@ -24,7 +24,7 @@ namespace operators { using Tensor = framework::Tensor; template -struct SendRecvSumFunctor { +struct GraphSendRecvSumFunctor { void operator()(const bool& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = 
framework::EigenVector::Flatten(src_slice); @@ -34,7 +34,7 @@ struct SendRecvSumFunctor { }; template -struct SendRecvMinFunctor { +struct GraphSendRecvMinFunctor { void operator()(const bool& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); @@ -48,7 +48,7 @@ struct SendRecvMinFunctor { }; template -struct SendRecvMaxFunctor { +struct GraphSendRecvMaxFunctor { void operator()(const int& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); @@ -73,11 +73,11 @@ void elementwise_inner_operation(const Tensor& src, Tensor* dst, } template -void send_recv_cpu_for_loop(const int& input_size, const int& index_size, - const IndexT* s_index, const IndexT* d_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - int* dst_count = nullptr) { +void graph_send_recv_cpu_for_loop(const int& input_size, const int& index_size, + const IndexT* s_index, const IndexT* d_index, + const Tensor& src, Tensor* dst, + const std::string& pool_type, + int* dst_count = nullptr) { Functor functor; if (pool_type == "SUM") { for (int i = 0; i < index_size; ++i) { @@ -122,13 +122,11 @@ void send_recv_cpu_for_loop(const int& input_size, const int& index_size, } template -void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, - const IndexT* s_index, const IndexT* d_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - const int* dst_count = nullptr, - const Tensor* input = nullptr, - const Tensor* output = nullptr) { +void graph_send_recv_cpu_for_loop_grad( + const int& input_size, const int& index_size, const IndexT* s_index, + const IndexT* d_index, const Tensor& src, Tensor* dst, + const std::string& pool_type, const int* dst_count = nullptr, + const Tensor* input = nullptr, const Tensor* output = nullptr) { if (pool_type == "SUM") { Functor functor; for (int i = 0; i < index_size; ++i) { @@ -166,8 +164,8 @@ void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, } template -void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { +void GraphSendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, + const Tensor& src_index) { auto* X = ctx.Input("X"); auto* dst_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); @@ -187,27 +185,27 @@ void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, const IndexT* d_index = dst_index->data(); const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - send_recv_cpu_for_loop>( + graph_send_recv_cpu_for_loop>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MIN") { - send_recv_cpu_for_loop>( + graph_send_recv_cpu_for_loop>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MAX") { - send_recv_cpu_for_loop>( + graph_send_recv_cpu_for_loop>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { auto* dst_count = ctx.Output("Dst_count"); int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); memset(p_dst_count, 0, src_dims[0] * sizeof(int)); - send_recv_cpu_for_loop>( + graph_send_recv_cpu_for_loop>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, p_dst_count); } } template -void SendRecvGradOpKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { +void GraphSendRecvGradOpKernelLaunchHelper( + const 
framework::ExecutionContext& ctx, const Tensor& src_index) { auto* X = ctx.Input(framework::GradVarName("Out")); auto* dst_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); @@ -228,35 +226,36 @@ void SendRecvGradOpKernelLaunchHelper(const framework::ExecutionContext& ctx, const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - send_recv_cpu_for_loop_grad>( + graph_send_recv_cpu_for_loop_grad>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { auto* dst_count = ctx.Input("Dst_count"); const int* s_count = dst_count->data(); // Functor not used here. - send_recv_cpu_for_loop_grad>( + graph_send_recv_cpu_for_loop_grad>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); } else if (pool_type == "MIN" || pool_type == "MAX") { const auto* input = ctx.Input("X"); const auto* output = ctx.Input("Out"); // Functor not used here. - send_recv_cpu_for_loop_grad>( + graph_send_recv_cpu_for_loop_grad>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, input, output); } } template -class SendRecvOpKernel : public framework::OpKernel { +class GraphSendRecvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Src_index"); auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvOpKernelLaunchHelper(ctx, *src_index); + GraphSendRecvOpKernelLaunchHelper(ctx, *src_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvOpKernelLaunchHelper(ctx, *src_index); + GraphSendRecvOpKernelLaunchHelper(ctx, + *src_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index type, Expected int, int64, but " @@ -267,17 +266,18 @@ class SendRecvOpKernel : public framework::OpKernel { }; template -class SendRecvGradOpKernel : public framework::OpKernel { +class GraphSendRecvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Dst_index"); auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvGradOpKernelLaunchHelper(ctx, *src_index); + GraphSendRecvGradOpKernelLaunchHelper(ctx, + *src_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvGradOpKernelLaunchHelper(ctx, - *src_index); + GraphSendRecvGradOpKernelLaunchHelper( + ctx, *src_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index type, Expected int, int64, but " diff --git a/python/paddle/fluid/tests/unittests/test_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py similarity index 77% rename from python/paddle/fluid/tests/unittests/test_send_recv_op.py rename to python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py index 098d1a2d28cb5f..68b354775d13e6 100644 --- a/python/paddle/fluid/tests/unittests/test_send_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py @@ -21,10 +21,10 @@ from op_test import OpTest -class TestSendRecvMaxOp(OpTest): +class TestGraphSendRecvMaxOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "send_recv" + self.op_type = "graph_send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) src_index = index[:, 0] @@ -34,8 +34,8 @@ def setUp(self): 
self.attrs = {'pool_type': 'MAX'} - out, self.gradient = compute_send_recv_for_min_max(self.inputs, - self.attrs) + out, self.gradient = compute_graph_send_recv_for_min_max(self.inputs, + self.attrs) self.outputs = {'Out': out} def test_check_output(self): @@ -45,10 +45,10 @@ def test_check_grad(self): self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) -class TestSendRecvMinOp(OpTest): +class TestGraphSendRecvMinOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "send_recv" + self.op_type = "graph_send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) src_index = index[:, 0] @@ -58,8 +58,8 @@ def setUp(self): self.attrs = {'pool_type': 'MIN'} - out, self.gradient = compute_send_recv_for_min_max(self.inputs, - self.attrs) + out, self.gradient = compute_graph_send_recv_for_min_max(self.inputs, + self.attrs) self.outputs = {'Out': out} @@ -70,10 +70,10 @@ def test_check_grad(self): self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) -class TestSendRecvSumOp(OpTest): +class TestGraphSendRecvSumOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "send_recv" + self.op_type = "graph_send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) src_index = index[:, 0] @@ -83,7 +83,7 @@ def setUp(self): self.attrs = {'pool_type': 'SUM'} - out, _ = compute_send_recv_for_sum_mean(self.inputs, self.attrs) + out, _ = compute_graph_send_recv_for_sum_mean(self.inputs, self.attrs) self.outputs = {'Out': out} @@ -94,10 +94,10 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -class TestSendRecvMeanOp(OpTest): +class TestGraphSendRecvMeanOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "send_recv" + self.op_type = "graph_send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) src_index = index[:, 0] @@ -107,7 +107,8 @@ def setUp(self): self.attrs = {'pool_type': 'MEAN'} - out, dst_count = compute_send_recv_for_sum_mean(self.inputs, self.attrs) + out, dst_count = compute_graph_send_recv_for_sum_mean(self.inputs, + self.attrs) self.outputs = {'Out': out, 'Dst_count': dst_count} @@ -118,7 +119,7 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -def compute_send_recv_for_sum_mean(inputs, attributes): +def compute_graph_send_recv_for_sum_mean(inputs, attributes): x = inputs['X'] src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] @@ -148,7 +149,7 @@ def compute_send_recv_for_sum_mean(inputs, attributes): return results, count -def compute_send_recv_for_min_max(inputs, attributes): +def compute_graph_send_recv_for_min_max(inputs, attributes): x = inputs['X'] src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] @@ -193,7 +194,7 @@ def compute_send_recv_for_min_max(inputs, attributes): return results, gradient / results.size -class API_SendRecvOpTest(unittest.TestCase): +class API_GraphSendRecvOpTest(unittest.TestCase): def test_static(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): @@ -201,11 +202,14 @@ def test_static(self): src_index = paddle.static.data(name="src", shape=[4], dtype="int32") dst_index = paddle.static.data(name="dst", shape=[4], dtype="int32") - res_sum = paddle.incubate.send_recv(x, src_index, dst_index, "sum") - res_mean = paddle.incubate.send_recv(x, src_index, dst_index, - "mean") - res_max = paddle.incubate.send_recv(x, src_index, dst_index, 
"max") - res_min = paddle.incubate.send_recv(x, src_index, dst_index, "min") + res_sum = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "sum") + res_mean = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "max") + res_min = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "min") exe = paddle.static.Executor(paddle.CPUPlace()) data1 = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype='float32') @@ -240,11 +244,14 @@ def test_dygraph(self): np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") - res_sum = paddle.incubate.send_recv(x, src_index, dst_index, "sum") - res_mean = paddle.incubate.send_recv(x, src_index, dst_index, - "mean") - res_max = paddle.incubate.send_recv(x, src_index, dst_index, "max") - res_min = paddle.incubate.send_recv(x, src_index, dst_index, "min") + res_sum = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "sum") + res_mean = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "max") + res_min = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "min") np_sum = np.array( [[0, 2, 3], [2, 8, 10], [1, 4, 5]], dtype="float32") @@ -273,11 +280,14 @@ def test_int32_input(self): np.array([0, 1, 2, 0, 1]), dtype="int32") dst_index = paddle.to_tensor( np.array([1, 2, 1, 0, 1]), dtype="int32") - res_sum = paddle.incubate.send_recv(x, src_index, dst_index, "sum") - res_mean = paddle.incubate.send_recv(x, src_index, dst_index, - "mean") - res_max = paddle.incubate.send_recv(x, src_index, dst_index, "max") - res_min = paddle.incubate.send_recv(x, src_index, dst_index, "min") + res_sum = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "sum") + res_mean = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "max") + res_min = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "min") np_sum = np.array( [[0, 2, 3], [3, 12, 14], [1, 4, 5]], dtype="int32") diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index 6445ee74c0d065..e5215cf506413c 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -18,7 +18,7 @@ from ..fluid.layer_helper import LayerHelper # noqa: F401 from .operators import softmax_mask_fuse_upper_triangle # noqa: F401 from .operators import softmax_mask_fuse # noqa: F401 -from .operators import send_recv +from .operators import graph_send_recv from .tensor import segment_sum from .tensor import segment_mean from .tensor import segment_max @@ -31,7 +31,7 @@ 'ModelAverage', 'softmax_mask_fuse_upper_triangle', 'softmax_mask_fuse', - 'send_recv', + 'graph_send_recv', 'segment_sum', 'segment_mean', 'segment_max', diff --git a/python/paddle/incubate/operators/__init__.py b/python/paddle/incubate/operators/__init__.py index 90bcaddd79f5da..ecf73fb393cc17 100644 --- a/python/paddle/incubate/operators/__init__.py +++ b/python/paddle/incubate/operators/__init__.py @@ -15,4 +15,4 @@ from .softmax_mask_fuse_upper_triangle import softmax_mask_fuse_upper_triangle # noqa: F401 from .softmax_mask_fuse import softmax_mask_fuse # noqa: F401 from .resnet_unit import ResNetUnit #noqa: F401 -from .send_recv import send_recv #noqa: F401 +from .graph_send_recv 
import graph_send_recv #noqa: F401 diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/graph_send_recv.py similarity index 84% rename from python/paddle/incubate/operators/send_recv.py rename to python/paddle/incubate/operators/graph_send_recv.py index 7ef2a34b05c9ec..9b8f542658dd65 100644 --- a/python/paddle/incubate/operators/send_recv.py +++ b/python/paddle/incubate/operators/graph_send_recv.py @@ -18,7 +18,7 @@ from paddle.fluid import core -def send_recv(x, src_index, dst_index, pool_type="sum", name=None): +def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None): r""" Graph Learning Send_Recv combine operator. @@ -53,7 +53,7 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. - pool_type (str): The pooling type of send_recv, including `sum`, `mean`, `max`, `min`. + pool_type (str): The pooling type of graph_send_recv, including `sum`, `mean`, `max`, `min`. Default value is `sum`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -71,7 +71,7 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] - out = paddle.incubate.send_recv(x, src_index, dst_index, pool_type="sum") + out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum") # Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]] """ @@ -82,23 +82,23 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): % pool_type) if in_dygraph_mode(): - out, tmp = core.ops.send_recv(x, src_index, dst_index, 'pool_type', - pool_type.upper()) + out, tmp = core.ops.graph_send_recv(x, src_index, dst_index, + 'pool_type', pool_type.upper()) return out check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"), - "send_recv") + "graph_send_recv") check_variable_and_dtype(src_index, "Src_index", ("int32", "int64"), - "send_recv") + "graph_send_recv") check_variable_and_dtype(dst_index, "Dst_index", ("int32", "int64"), - "send_recv") + "graph_send_recv") - helper = LayerHelper("send_recv", **locals()) + helper = LayerHelper("graph_send_recv", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) dst_count = helper.create_variable_for_type_inference( dtype="int32", stop_gradient=True) helper.append_op( - type="send_recv", + type="graph_send_recv", inputs={"X": x, "Src_index": src_index, "Dst_index": dst_index},
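# A hedged sketch of the backward rule the sum/mean grad kernels in this series
# follow: gradient flows from out[dst_index[i]] back to x[src_index[i]]; for
# "mean" each contribution is divided by dst_count, the number of messages the
# destination row received in the forward pass. The helper name
# `numpy_graph_send_recv_grad` is invented for this illustration only.
import numpy as np

def numpy_graph_send_recv_grad(grad_out, src_index, dst_index,
                               pool_type="sum", dst_count=None):
    grad_x = np.zeros_like(grad_out)
    for s, d in zip(src_index, dst_index):
        g = grad_out[d]
        if pool_type == "mean":
            g = g / dst_count[d]
        grad_x[s] += g
    return grad_x

# For "min" and "max", the kernels additionally mask each contribution with an
# element-wise comparison of x[s] against out[d], so gradient only reaches the
# entries that produced the pooled value, as checked in the min/max unit tests.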