From f3b6ee4c5e9f19e3cdc875824fb38af749881fa7 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 28 Oct 2021 08:37:17 +0000 Subject: [PATCH 01/30] add cpu version, using set: sum, min, max --- .../fused/fused_gather_scatter_op.cc | 181 +++++++++++++++++ .../operators/fused/fused_gather_scatter_op.h | 184 ++++++++++++++++++ 2 files changed, 365 insertions(+) create mode 100644 paddle/fluid/operators/fused/fused_gather_scatter_op.cc create mode 100644 paddle/fluid/operators/fused/fused_gather_scatter_op.h diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc new file mode 100644 index 00000000000000..0eea9b90a4a94e --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc @@ -0,0 +1,181 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused/fused_gather_scatter_op.h" + +namespace paddle { +namespace operators { + +class FusedGatherScatterOP : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::InvalidArgument( + "Input(X) of FusedGatherScatterOp should not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Gather_index"), true, + platform::errors::InvalidArgument( + "Input(Gather_indx) of FusedGatherScatterOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Scatter_index"), true, + platform::errors::InvalidArgument( + "Input(Scatter_index) of FusedGatherScatterOp should " + "not be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput("Out"), true, + platform::errors::InvalidArgument( + "Output(Out) of FusedGatherScatterOp should not be null.")); + + auto gather_index_dims = ctx->GetInputDim("Gather_index"); + if (gather_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(gather_index_dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of gather_index should be 1 when it " + "is 2D, but we get %d", + gather_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + gather_index_dims.size(), 1, + platform::errors::InvalidArgument( + "The gather_index should be 1D, when it is not 2D, but we get %d", + gather_index_dims.size())); + } + + auto scatter_index_dims = ctx->GetInputDim("Scatter_index"); + if (scatter_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(scatter_index_dims[1], 1, + platform::errors::InvalidArgument( + "The last dim of scatter_index should be 1 when it " + "is 2D, but we get %d", + scatter_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + scatter_index_dims.size(), 1, + platform::errors::InvalidArgument("The scatter_index should be 1D, " + "when it is not 2D, but we get %d", + scatter_index_dims.size())); + } + + auto dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) 
const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.device_context()); + } +}; + +class FusedGatherScatterGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto in_dims = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), in_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.device_context()); + } +}; + +class FusedGatherScatterOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor with data type float32, " + "float64 or float16"); + AddInput("Gather_index", "The gather index tensor."); + AddInput("Scatter_index", "The scatter index tensor."); + AddOutput("Out", "Output tensor of fused_gather_scatter op."); + AddAttr( + "pool_type", + "(string, default 'SUM')" + "We use Gather_index to gather correspoinding place of X. " + "Then we need to use different pool type to scatter the result.") + .SetDefault("SUM") + .InEnum({"SUM", "MEAN", "MIN", "MAX"}); + // TODO(daisiming): Add a simple example here. + AddComment(R"DOC( +Fused Gather Scatter Operator. + +$Out = Scatter(Gather(X, Gather_index), Scatter_index, pool_type)$ + +This operator helps perform fused computation of gather operator and scatter operator, so as to +decrease intermediate GPU memory occupation of using gather op and scatter op successively. + +Example: + +pass +)DOC"); + } +}; + +template +class FusedGatherScatterGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fused_gather_scatter_grad"); + op->SetInput("Gather_index", this->Input("Gather_index")); + op->SetInput("Scatter_index", this->Input("Scatter_index")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR( + fused_gather_scatter, ops::FusedGatherScatterOP, + ops::FusedGatherScatterOpMaker, + ops::FusedGatherScatterGradOpMaker, + ops::FusedGatherScatterGradOpMaker); +REGISTER_OPERATOR(fused_gather_scatter_grad, ops::FusedGatherScatterGradOp); +REGISTER_OP_CPU_KERNEL(fused_gather_scatter, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel); +REGISTER_OP_CPU_KERNEL( + fused_gather_scatter_grad, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h 
b/paddle/fluid/operators/fused/fused_gather_scatter_op.h new file mode 100644 index 00000000000000..b782a478753338 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -0,0 +1,184 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct FusedGatherScatterSumFunctor { + void operator()(const int& first_flag, const Tensor& src_slice, + Tensor* dst_slice) { + auto eigen_src = framework::EigenVector::Flatten(src_slice); + auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); + eigen_dst += eigen_src; + } +}; + +template +struct FusedGatherScatterMinFunctor { + void operator()(const int& first_flag, const Tensor& src_slice, + Tensor* dst_slice) { + auto eigen_src = framework::EigenVector::Flatten(src_slice); + auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); + if (first_flag == 0) { + eigen_dst += eigen_src; + } else { + eigen_dst = eigen_dst.cwiseMin(eigen_src); + } + } +}; + +template +struct FusedGatherScatterMaxFunctor { + void operator()(const int& first_flag, const Tensor& src_slice, + Tensor* dst_slice) { + auto eigen_src = framework::EigenVector::Flatten(src_slice); + auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); + if (first_flag == 0) { + eigen_dst += eigen_src; + } else { + eigen_dst = eigen_dst.cwiseMax(eigen_src); + } + } +}; + +template +void elementwise_inner_operation(const Tensor& src, Tensor* dst, + const IndexT& src_index, + const IndexT& dst_index, + const bool& first_flag, Functor functor) { + auto src_slice = src.Slice(src_index, src_index + 1); + auto dst_slice = dst->Slice(dst_index, dst_index + 1); + + functor(first_flag, src_slice, &dst_slice); +} + +template +void gather_scatter_cpu_for_loop(const int index_size, const IndexT* g_index, + const IndexT* s_index, const Tensor& src, + Tensor* dst, const std::string& pool_type) { + Functor functor; + if (pool_type == "MIN" || pool_type == "MAX") { + std::set existed_dst; + for (int i = 0; i < index_size; ++i) { + IndexT src_ptr = g_index[i]; + IndexT dst_ptr = s_index[i]; + int nRet = std::count(existed_dst.begin(), existed_dst.end(), dst_ptr); + if (nRet == 0) { + elementwise_inner_operation(src, dst, src_ptr, + dst_ptr, 0, functor); + existed_dst.insert(dst_ptr); + } else { + elementwise_inner_operation(src, dst, src_ptr, + dst_ptr, 1, functor); + } + } + } else if (pool_type == "SUM" || pool_type == "MEAN") { + for (int i = 0; i < index_size; ++i) { + IndexT src_ptr = g_index[i]; + IndexT dst_ptr = s_index[i]; + elementwise_inner_operation(src, dst, src_ptr, + dst_ptr, 0, functor); + } + } +} + +template +class FusedGatherScatterOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) 
const override { + auto* X = ctx.Input("X"); + auto* gather_index = ctx.Input("Gather_index"); + auto* scatter_index = ctx.Input("Scatter_index"); + auto* Y = ctx.Output("Out"); + + int index_size = gather_index->dims()[0]; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + auto src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; + const size_t& memset_bytes = memset_size * sizeof(T); + memset(p_output, 0, memset_bytes); + + const IndexT* g_index = gather_index->data(); + const IndexT* s_index = scatter_index->data(); + + std::string pool_type = ctx.Attr("pool_type"); + if (pool_type == "SUM") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + } else if (pool_type == "MIN") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + } else if (pool_type == "MAX") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + } else if (pool_type == "MEAN") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + // TODO(daisiming): Add mean operation. + } + } +}; + +template +class FusedGatherScatterGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input(framework::GradVarName("Out")); + auto* gather_index = ctx.Input("Gather_index"); + auto* scatter_index = ctx.Input("Scatter_index"); + auto* Y = ctx.Output(framework::GradVarName("X")); + + int index_size = gather_index->dims()[0]; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + auto src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; + const size_t& memset_bytes = memset_size * sizeof(T); + memset(p_output, 0, memset_bytes); + + const IndexT* g_index = gather_index->data(); + const IndexT* s_index = scatter_index->data(); + + std::string pool_type = ctx.Attr("pool_type"); + if (pool_type == "SUM") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + } else if (pool_type == "MIN") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + } else if (pool_type == "MAX") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + } else if (pool_type == "MEAN") { + gather_scatter_cpu_for_loop>( + index_size, g_index, s_index, *X, Y, pool_type); + // TODO(daisiming): Add mean operation. 
+ } + } +}; + +} // namespace operators +} // namespace paddle From 470aa0fd23f4b26faba65a3faab67f97e2a4148f Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Fri, 29 Oct 2021 07:40:39 +0000 Subject: [PATCH 02/30] add cpu version: mean --- .../operators/fused/fused_gather_scatter_op.h | 34 +++++++++++++++++-- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index b782a478753338..563ae656fc9b8d 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -73,7 +73,7 @@ void elementwise_inner_operation(const Tensor& src, Tensor* dst, } template -void gather_scatter_cpu_for_loop(const int index_size, const IndexT* g_index, +void gather_scatter_cpu_for_loop(const int& index_size, const IndexT* g_index, const IndexT* s_index, const Tensor& src, Tensor* dst, const std::string& pool_type) { Functor functor; @@ -102,6 +102,16 @@ void gather_scatter_cpu_for_loop(const int index_size, const IndexT* g_index, } } +template +void cal_mean_cpu_for_loop(const int& size, Tensor* dst, int* count) { + for (int i = 0; i < size; ++i) { + if (count[i] == 0) continue; + auto dst_slice = dst->Slice(i, i + 1); + auto eigen_dst = framework::EigenVector::Flatten(dst_slice); + eigen_dst = eigen_dst / static_cast(count[i]); + } +} + template class FusedGatherScatterOpKernel : public framework::OpKernel { public: @@ -136,7 +146,16 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { } else if (pool_type == "MEAN") { gather_scatter_cpu_for_loop>( index_size, g_index, s_index, *X, Y, pool_type); - // TODO(daisiming): Add mean operation. + int count[src_dims[0]]; + memset(count, 0, src_dims[0] * sizeof(int)); + for (int i = 0; i < index_size; ++i) { + IndexT dst_ptr = s_index[i]; + count[dst_ptr] += 1; + } + for (int i = 0; i < src_dims[0]; ++i) { + VLOG(0) << count[i]; + } + cal_mean_cpu_for_loop(src_dims[0], Y, count); } } }; @@ -175,7 +194,16 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { } else if (pool_type == "MEAN") { gather_scatter_cpu_for_loop>( index_size, g_index, s_index, *X, Y, pool_type); - // TODO(daisiming): Add mean operation. 
+ int count[src_dims[0]]; + memset(count, 0, src_dims[0] * sizeof(int)); + for (int i = 0; i < index_size; ++i) { + IndexT dst_ptr = s_index[i]; + count[dst_ptr] += 1; + } + for (int i = 0; i < src_dims[0]; ++i) { + VLOG(0) << count[i]; + } + cal_mean_cpu_for_loop(src_dims[0], Y, count); } } }; From b31111c87b37b1df7c2a47cd9fdc1acd4d5c086b Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Fri, 29 Oct 2021 10:13:15 +0000 Subject: [PATCH 03/30] improve cpu code and fix dynamic memory allcation problem --- .../fused/fused_gather_scatter_op.cc | 32 +++-- .../operators/fused/fused_gather_scatter_op.h | 120 ++++++++---------- 2 files changed, 75 insertions(+), 77 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc index 0eea9b90a4a94e..789ee7e3f217ed 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc @@ -160,22 +160,28 @@ REGISTER_OPERATOR( ops::FusedGatherScatterGradOpMaker, ops::FusedGatherScatterGradOpMaker); REGISTER_OPERATOR(fused_gather_scatter_grad, ops::FusedGatherScatterGradOp); -REGISTER_OP_CPU_KERNEL(fused_gather_scatter, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel); +REGISTER_OP_CPU_KERNEL( + fused_gather_scatter, ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel); + REGISTER_OP_CPU_KERNEL( fused_gather_scatter_grad, ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel); + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel, + ops::FusedGatherScatterGradOpKernel); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index 563ae656fc9b8d..401657a4a5d5a4 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -25,7 +25,7 @@ using Tensor = framework::Tensor; template struct FusedGatherScatterSumFunctor { - void operator()(const int& first_flag, const Tensor& src_slice, + void operator()(const bool& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); @@ -35,11 +35,11 @@ struct FusedGatherScatterSumFunctor { template struct FusedGatherScatterMinFunctor { - void operator()(const int& first_flag, const Tensor& src_slice, + void operator()(const bool& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if 
(first_flag == 0) { + if (first_flag) { eigen_dst += eigen_src; } else { eigen_dst = eigen_dst.cwiseMin(eigen_src); @@ -53,7 +53,7 @@ struct FusedGatherScatterMaxFunctor { Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); auto eigen_dst = framework::EigenVector::Flatten(*dst_slice); - if (first_flag == 0) { + if (first_flag) { eigen_dst += eigen_src; } else { eigen_dst = eigen_dst.cwiseMax(eigen_src); @@ -73,42 +73,54 @@ void elementwise_inner_operation(const Tensor& src, Tensor* dst, } template -void gather_scatter_cpu_for_loop(const int& index_size, const IndexT* g_index, - const IndexT* s_index, const Tensor& src, - Tensor* dst, const std::string& pool_type) { +void gather_scatter_cpu_for_loop(const int& index_size, const int& input_size, + const IndexT* g_index, const IndexT* s_index, + const Tensor& src, Tensor* dst, + const std::string& pool_type) { Functor functor; if (pool_type == "MIN" || pool_type == "MAX") { std::set existed_dst; + bool in_set = false; for (int i = 0; i < index_size; ++i) { - IndexT src_ptr = g_index[i]; - IndexT dst_ptr = s_index[i]; - int nRet = std::count(existed_dst.begin(), existed_dst.end(), dst_ptr); - if (nRet == 0) { - elementwise_inner_operation(src, dst, src_ptr, - dst_ptr, 0, functor); - existed_dst.insert(dst_ptr); + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + in_set = existed_dst.find(dst_idx) != existed_dst.end(); + if (!in_set) { + elementwise_inner_operation(src, dst, src_idx, + dst_idx, true, functor); + existed_dst.emplace(dst_idx); } else { - elementwise_inner_operation(src, dst, src_ptr, - dst_ptr, 1, functor); + elementwise_inner_operation( + src, dst, src_idx, dst_idx, false, functor); } } - } else if (pool_type == "SUM" || pool_type == "MEAN") { + } else if (pool_type == "SUM") { for (int i = 0; i < index_size; ++i) { - IndexT src_ptr = g_index[i]; - IndexT dst_ptr = s_index[i]; - elementwise_inner_operation(src, dst, src_ptr, - dst_ptr, 0, functor); + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + elementwise_inner_operation(src, dst, src_idx, + dst_idx, false, functor); } - } -} - -template -void cal_mean_cpu_for_loop(const int& size, Tensor* dst, int* count) { - for (int i = 0; i < size; ++i) { - if (count[i] == 0) continue; - auto dst_slice = dst->Slice(i, i + 1); - auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(count[i]); + } else if (pool_type == "MEAN") { + for (int i = 0; i < index_size; ++i) { + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + elementwise_inner_operation(src, dst, src_idx, + dst_idx, false, functor); + } + int* count = new int[input_size]; + memset(count, 0, input_size * sizeof(int)); + for (int i = 0; i < index_size; ++i) { + IndexT dst_idx = s_index[i]; + count[dst_idx] += 1; + } + for (int i = 0; i < input_size; ++i) { + if (count[i] == 0) continue; + auto dst_slice = dst->Slice(i, i + 1); + auto eigen_dst = framework::EigenVector::Flatten(dst_slice); + eigen_dst = eigen_dst / static_cast(count[i]); + } + delete[] count; } } @@ -121,10 +133,10 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { auto* scatter_index = ctx.Input("Scatter_index"); auto* Y = ctx.Output("Out"); - int index_size = gather_index->dims()[0]; + const int& index_size = gather_index->dims()[0]; T* p_output = Y->mutable_data(ctx.GetPlace()); - auto src_dims = X->dims(); + const auto& src_dims = X->dims(); int64_t memset_size = 1; for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; 
const size_t& memset_bytes = memset_size * sizeof(T); @@ -136,26 +148,16 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { std::string pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MIN") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MAX") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); - int count[src_dims[0]]; - memset(count, 0, src_dims[0] * sizeof(int)); - for (int i = 0; i < index_size; ++i) { - IndexT dst_ptr = s_index[i]; - count[dst_ptr] += 1; - } - for (int i = 0; i < src_dims[0]; ++i) { - VLOG(0) << count[i]; - } - cal_mean_cpu_for_loop(src_dims[0], Y, count); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } } }; @@ -169,10 +171,10 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { auto* scatter_index = ctx.Input("Scatter_index"); auto* Y = ctx.Output(framework::GradVarName("X")); - int index_size = gather_index->dims()[0]; + const int& index_size = gather_index->dims()[0]; T* p_output = Y->mutable_data(ctx.GetPlace()); - auto src_dims = X->dims(); + const auto& src_dims = X->dims(); int64_t memset_size = 1; for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; const size_t& memset_bytes = memset_size * sizeof(T); @@ -181,29 +183,19 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { const IndexT* g_index = gather_index->data(); const IndexT* s_index = scatter_index->data(); - std::string pool_type = ctx.Attr("pool_type"); + const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MIN") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MAX") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { gather_scatter_cpu_for_loop>( - index_size, g_index, s_index, *X, Y, pool_type); - int count[src_dims[0]]; - memset(count, 0, src_dims[0] * sizeof(int)); - for (int i = 0; i < index_size; ++i) { - IndexT dst_ptr = s_index[i]; - count[dst_ptr] += 1; - } - for (int i = 0; i < src_dims[0]; ++i) { - VLOG(0) << count[i]; - } - cal_mean_cpu_for_loop(src_dims[0], Y, count); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } } }; From a54860c7c1c5e776e2f59135130b45c7730d1396 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 3 Nov 2021 12:50:22 +0000 Subject: [PATCH 04/30] fix arg error, add index judge, delete fp16 --- .../fused/fused_gather_scatter_op.cc | 37 +++++++++---------- .../operators/fused/fused_gather_scatter_op.h | 3 +- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc index 
789ee7e3f217ed..cd7459ac70f3ed 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc @@ -29,7 +29,7 @@ class FusedGatherScatterOP : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( ctx->HasInput("Gather_index"), true, platform::errors::InvalidArgument( - "Input(Gather_indx) of FusedGatherScatterOp should not be null.")); + "Input(Gather_index) of FusedGatherScatterOp should not be null.")); PADDLE_ENFORCE_EQ(ctx->HasInput("Scatter_index"), true, platform::errors::InvalidArgument( "Input(Scatter_index) of FusedGatherScatterOp should " @@ -43,14 +43,14 @@ class FusedGatherScatterOP : public framework::OperatorWithKernel { if (gather_index_dims.size() == 2) { PADDLE_ENFORCE_EQ(gather_index_dims[1], 1, platform::errors::InvalidArgument( - "The last dim of gather_index should be 1 when it " + "The last dim of Gather_index should be 1 when it " "is 2D, but we get %d", gather_index_dims[1])); } else { PADDLE_ENFORCE_EQ( gather_index_dims.size(), 1, platform::errors::InvalidArgument( - "The gather_index should be 1D, when it is not 2D, but we get %d", + "The Gather_index should be 1D, when it is not 2D, but we get %d", gather_index_dims.size())); } @@ -58,17 +58,19 @@ class FusedGatherScatterOP : public framework::OperatorWithKernel { if (scatter_index_dims.size() == 2) { PADDLE_ENFORCE_EQ(scatter_index_dims[1], 1, platform::errors::InvalidArgument( - "The last dim of scatter_index should be 1 when it " + "The last dim of Scatter_index should be 1 when it " "is 2D, but we get %d", scatter_index_dims[1])); } else { PADDLE_ENFORCE_EQ( scatter_index_dims.size(), 1, - platform::errors::InvalidArgument("The scatter_index should be 1D, " + platform::errors::InvalidArgument("The Scatter_index should be 1D, " "when it is not 2D, but we get %d", scatter_index_dims.size())); } + // TODO(daisiming): If the shape of scatter_index and gather_index should be + // same? 
auto dims = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", dims); } @@ -160,17 +162,15 @@ REGISTER_OPERATOR( ops::FusedGatherScatterGradOpMaker, ops::FusedGatherScatterGradOpMaker); REGISTER_OPERATOR(fused_gather_scatter_grad, ops::FusedGatherScatterGradOp); -REGISTER_OP_CPU_KERNEL( - fused_gather_scatter, ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel); +REGISTER_OP_CPU_KERNEL(fused_gather_scatter, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel, + ops::FusedGatherScatterOpKernel); REGISTER_OP_CPU_KERNEL( fused_gather_scatter_grad, @@ -181,7 +181,4 @@ REGISTER_OP_CPU_KERNEL( ops::FusedGatherScatterGradOpKernel, ops::FusedGatherScatterGradOpKernel, ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel); + ops::FusedGatherScatterGradOpKernel); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index 401657a4a5d5a4..8e901aac98961c 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -73,7 +73,7 @@ void elementwise_inner_operation(const Tensor& src, Tensor* dst, } template -void gather_scatter_cpu_for_loop(const int& index_size, const int& input_size, +void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, const IndexT* g_index, const IndexT* s_index, const Tensor& src, Tensor* dst, const std::string& pool_type) { @@ -134,6 +134,7 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { auto* Y = ctx.Output("Out"); const int& index_size = gather_index->dims()[0]; + if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); From 2f79165017f8202e16c6a660490b42ca317e1323 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 4 Nov 2021 09:10:18 +0000 Subject: [PATCH 05/30] fix bug in CudaAtomicMax and CudaAtomicMin --- paddle/fluid/platform/cuda_primitives.h | 28 +++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index 4708a99e8fc4ca..d443e78ed874f3 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -178,9 +178,17 @@ CUDA_ATOMIC_WRAPPER(Max, int64_t) { // Here, we check long long int must be int64_t. 
  static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
                "long long should be int64");
-  return CudaAtomicMax(
-      reinterpret_cast<unsigned long long int *>(address),  // NOLINT
-      static_cast<unsigned long long int>(val));            // NOLINT
+  long long int res = *address;  // NOLINT
+  while (val > res) {
+    long long int old = res;  // NOLINT
+    res = (long long int)atomicCAS((unsigned long long int *)address,  // NOLINT
+                                   (unsigned long long int)old,        // NOLINT
+                                   (unsigned long long int)val);       // NOLINT
+    if (res == old) {
+      break;
+    }
+  }
+  return res;
 }
 
 CUDA_ATOMIC_WRAPPER(Max, float) {
@@ -254,9 +262,17 @@ CUDA_ATOMIC_WRAPPER(Min, int64_t) {
   // Here, we check long long int must be int64_t.
   static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
                 "long long should be int64");
-  return CudaAtomicMin(
-      reinterpret_cast<unsigned long long int *>(address),  // NOLINT
-      static_cast<unsigned long long int>(val));            // NOLINT
+  long long int res = *address;  // NOLINT
+  while (val < res) {
+    long long int old = res;  // NOLINT
+    res = (long long int)atomicCAS((unsigned long long int *)address,  // NOLINT
+                                   (unsigned long long int)old,        // NOLINT
+                                   (unsigned long long int)val);       // NOLINT
+    if (res == old) {
+      break;
+    }
+  }
+  return res;
 }
 
 CUDA_ATOMIC_WRAPPER(Min, float) {

From 8316d6eb8fa2f25476dd570374f831f7ca516144 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Mon, 8 Nov 2021 07:42:07 +0000
Subject: [PATCH 06/30] add CUDA version

---
 .../fused/fused_gather_scatter_op.cu          | 349 ++++++++++++++++++
 .../operators/fused/fused_gather_scatter_op.h |   2 +-
 2 files changed, 350 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/operators/fused/fused_gather_scatter_op.cu

diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu
new file mode 100644
index 00000000000000..4f0514ebf47f46
--- /dev/null
+++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu
@@ -0,0 +1,349 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/fused/fused_gather_scatter_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct GatherScatterSumCUDAFunctor { + DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, + const IndexT& out_i) { + paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); + } +}; + +template +struct GatherScatterMaxCUDAFunctor { + DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, + const IndexT& out_i) { + paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); + } +}; + +template +struct GatherScatterMinCUDAFunctor { + DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, + const IndexT& out_i) { + paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); + } +}; + +template +__global__ void GatherScatterCUDAKernel(const T* params, + const IndexT* gather_indices, + const IndexT* scatter_indices, + T* output, size_t index_size, + size_t slice_size, Functor functor) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; + IndexT gather_i = gather_indices[indices_i]; + IndexT scatter_i = scatter_indices[indices_i]; + int64_t in_i = gather_i * slice_size + slice_i; + int64_t out_i = scatter_i * slice_size + slice_i; + functor(params, output, in_i, out_i); + } +} + +template +__global__ void InputResetCUDAKernel(T* output, size_t input_size, + size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { + if (*(output + i) == std::numeric_limits::min() || + *(output + i) == std::numeric_limits::max()) { + *(output + i) = 0; + } + } +} + +template +__global__ void ComputeCountCUDAKernel(int* count, + const IndexT* scatter_indices, + size_t index_size) { + CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { + IndexT scatter_i = scatter_indices[i]; + paddle::platform::CudaAtomicAdd(count + scatter_i, 1); + } +} + +template +__global__ void ManipulateMeanCUDAKernel(T* output, int* count, + size_t input_size, size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { + int64_t c_index = i / slice_size; + if (*(count + c_index) > 1) { + *(output + i) = *(output + i) / *(count + c_index); + } + } +} + +template +class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input("X"); + auto* gather_index = ctx.Input("Gather_index"); + auto* scatter_index = ctx.Input("Scatter_index"); + auto* Y = ctx.Output("Out"); + std::string pool_type = ctx.Attr("pool_type"); + + const int& index_size = gather_index->dims()[0]; + if (index_size == 0) return; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + const auto& src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); + if (pool_type == "SUM" || pool_type == "MEAN") { + cudaMemset(p_output, 0, memset_bytes); + } else if (pool_type == "MAX") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, + 
std::numeric_limits::min()); + } else if (pool_type == "MIN") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, + std::numeric_limits::max()); + } + + int64_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) { + slice_size *= src_dims[i]; + } + const T* p_src = X->data(); + const IndexT* g_index = gather_index->data(); + const IndexT* s_index = scatter_index->data(); + + int block = 512; + int64_t n = slice_size * index_size; + int64_t grid = (n + block - 1) / block; + int64_t input_size = src_dims[0]; + if (pool_type == "SUM") { + GatherScatterSumCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + } else if (pool_type == "MAX") { + GatherScatterMaxCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + + int64_t grid_max = (input_size * slice_size + block - 1) / block; + InputResetCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, input_size, slice_size); + } else if (pool_type == "MIN") { + GatherScatterMinCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + + int64_t grid_min = (input_size * slice_size + block - 1) / block; + InputResetCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, input_size, slice_size); + } else if (pool_type == "MEAN") { + GatherScatterSumCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + + Tensor count; + int* count_ptr = count.mutable_data({input_size}, ctx.GetPlace()); + cudaMemset(count_ptr, 0, input_size * sizeof(int)); + int64_t grid_count = (index_size + block - 1) / block; + ComputeCountCUDAKernel< + T, IndexT><<( + ctx.device_context()) + .stream()>>>(count_ptr, s_index, index_size); + + int64_t grid_mean = (input_size * slice_size + block - 1) / block; + ManipulateMeanCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, count_ptr, input_size, slice_size); + } + } +}; + +template +class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* X = ctx.Input(framework::GradVarName("Out")); + auto* gather_index = ctx.Input("Gather_index"); + auto* scatter_index = ctx.Input("Scatter_index"); + auto* Y = ctx.Output(framework::GradVarName("X")); + std::string pool_type = ctx.Attr("pool_type"); + + const int& index_size = gather_index->dims()[0]; + if (index_size == 0) return; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + const auto& src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); + if (pool_type == "SUM" || pool_type == "MEAN") { + cudaMemset(p_output, 0, memset_bytes); + } else if (pool_type == "MAX") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, + std::numeric_limits::min()); + } else if (pool_type == "MIN") { + thrust::device_ptr 
p_output_ptr(p_output); + thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, + std::numeric_limits::max()); + } + + int64_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) { + slice_size *= src_dims[i]; + } + const T* p_src = X->data(); + const IndexT* g_index = gather_index->data(); + const IndexT* s_index = scatter_index->data(); + + int block = 512; + int64_t n = slice_size * index_size; + int64_t grid = (n + block - 1) / block; + int64_t input_size = src_dims[0]; + if (pool_type == "SUM") { + GatherScatterSumCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + } else if (pool_type == "MAX") { + GatherScatterMaxCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + + int64_t grid_max = (input_size * slice_size + block - 1) / block; + InputResetCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, input_size, slice_size); + } else if (pool_type == "MIN") { + GatherScatterMinCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + + int64_t grid_min = (input_size * slice_size + block - 1) / block; + InputResetCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, input_size, slice_size); + } else if (pool_type == "MEAN") { + GatherScatterSumCUDAFunctor functor; + GatherScatterCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, functor); + + Tensor count; + int* count_ptr = count.mutable_data({input_size}, ctx.GetPlace()); + cudaMemset(count_ptr, 0, input_size * sizeof(int)); + int64_t grid_count = (index_size + block - 1) / block; + ComputeCountCUDAKernel< + T, IndexT><<( + ctx.device_context()) + .stream()>>>(count_ptr, s_index, index_size); + + int64_t grid_mean = (input_size * slice_size + block - 1) / block; + ManipulateMeanCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, count_ptr, input_size, slice_size); + } + } +}; + +} // namespace operators +} // namespace paddle + +using CUDA = paddle::platform::CUDADeviceContext; +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + fused_gather_scatter, ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel, + ops::FusedGatherScatterOpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL( + fused_gather_scatter_grad, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel, + ops::FusedGatherScatterGradOpCUDAKernel); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index 8e901aac98961c..8a8b9eaf9654fc 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ 
b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -115,7 +115,7 @@ void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, count[dst_idx] += 1; } for (int i = 0; i < input_size; ++i) { - if (count[i] == 0) continue; + if (count[i] <= 1) continue; auto dst_slice = dst->Slice(i, i + 1); auto eigen_dst = framework::EigenVector::Flatten(dst_slice); eigen_dst = eigen_dst / static_cast(count[i]); From 80accd81f522287605978890c6c138de1d799b5e Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 9 Nov 2021 06:13:28 +0000 Subject: [PATCH 07/30] fix grad_op bug for index --- .../fluid/operators/fused/fused_gather_scatter_op.cu | 4 ++-- paddle/fluid/operators/fused/fused_gather_scatter_op.h | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu index 4f0514ebf47f46..b82c4e7da835ee 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu @@ -215,8 +215,8 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* gather_index = ctx.Input("Gather_index"); - auto* scatter_index = ctx.Input("Scatter_index"); + auto* gather_index = ctx.Input("Scatter_index"); + auto* scatter_index = ctx.Input("Gather_index"); auto* Y = ctx.Output(framework::GradVarName("X")); std::string pool_type = ctx.Attr("pool_type"); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index 8a8b9eaf9654fc..e74a861d63e86b 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -80,11 +80,10 @@ void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, Functor functor; if (pool_type == "MIN" || pool_type == "MAX") { std::set existed_dst; - bool in_set = false; for (int i = 0; i < index_size; ++i) { IndexT src_idx = g_index[i]; IndexT dst_idx = s_index[i]; - in_set = existed_dst.find(dst_idx) != existed_dst.end(); + bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); if (!in_set) { elementwise_inner_operation(src, dst, src_idx, dst_idx, true, functor); @@ -146,7 +145,7 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { const IndexT* g_index = gather_index->data(); const IndexT* s_index = scatter_index->data(); - std::string pool_type = ctx.Attr("pool_type"); + const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { gather_scatter_cpu_for_loop>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); @@ -168,11 +167,12 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* gather_index = ctx.Input("Gather_index"); - auto* scatter_index = ctx.Input("Scatter_index"); + auto* gather_index = ctx.Input("Scatter_index"); + auto* scatter_index = ctx.Input("Gather_index"); auto* Y = ctx.Output(framework::GradVarName("X")); const int& index_size = gather_index->dims()[0]; + if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); From 868a50a16b82b3bfe26af0f262ea332c9acb76e9 Mon Sep 17 00:00:00 2001 From: DesmonDay 
<908660116@qq.com>
Date: Thu, 11 Nov 2021 04:56:07 +0000
Subject: [PATCH 08/30] add op test, add correct cpu grad op

---
 .../fused/fused_gather_scatter_op.cc          |  43 +++--
 .../operators/fused/fused_gather_scatter_op.h |  77 ++++++--
 .../unittests/test_fused_gather_scatter_op.py | 182 ++++++++++++++++++
 3 files changed, 273 insertions(+), 29 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py

diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc
index cd7459ac70f3ed..e55216e2e82a2d 100644
--- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc
+++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc
@@ -22,22 +22,13 @@ class FusedGatherScatterOP : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"), true,
-        platform::errors::InvalidArgument(
-            "Input(X) of FusedGatherScatterOp should not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("Gather_index"), true,
-        platform::errors::InvalidArgument(
-            "Input(Gather_index) of FusedGatherScatterOp should not be null."));
-    PADDLE_ENFORCE_EQ(ctx->HasInput("Scatter_index"), true,
-                      platform::errors::InvalidArgument(
-                          "Input(Scatter_index) of FusedGatherScatterOp should "
-                          "not be null."));
-    PADDLE_ENFORCE_EQ(
-        ctx->HasOutput("Out"), true,
-        platform::errors::InvalidArgument(
-            "Output(Out) of FusedGatherScatterOp should not be null."));
+    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGatherScatter");
+    OP_INOUT_CHECK(ctx->HasInput("Gather_index"), "Input", "Gather_index",
+                   "FusedGatherScatter");
+    OP_INOUT_CHECK(ctx->HasInput("Scatter_index"), "Input", "Scatter_index",
+                   "FusedGatherScatter");
+    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
+                   "FusedGatherScatter");
 
     auto gather_index_dims = ctx->GetInputDim("Gather_index");
     if (gather_index_dims.size() == 2) {
       PADDLE_ENFORCE_EQ(gather_index_dims[1], 1,
                         platform::errors::InvalidArgument(
@@ -73,6 +64,12 @@ class FusedGatherScatterOP : public framework::OperatorWithKernel {
   // same?
auto dims = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", dims); + + if (ctx->Attrs().Get("pool_type") == "MEAN") { + OP_INOUT_CHECK(ctx->HasOutput("Scatter_count"), "Output", "Scatter_count", + "FusedGatherScatter"); + ctx->SetOutputDim("Scatter_count", {dims[0]}); + } } protected: @@ -111,6 +108,9 @@ class FusedGatherScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Gather_index", "The gather index tensor."); AddInput("Scatter_index", "The scatter index tensor."); AddOutput("Out", "Output tensor of fused_gather_scatter op."); + AddOutput("Scatter_count", + "Count tensor of Scatter index, mainly for MEAN pool_type.") + .AsIntermediate(); AddAttr( "pool_type", "(string, default 'SUM')" @@ -144,6 +144,17 @@ class FusedGatherScatterGradOpMaker : public framework::SingleGradOpMaker { op->SetType("fused_gather_scatter_grad"); op->SetInput("Gather_index", this->Input("Gather_index")); op->SetInput("Scatter_index", this->Input("Scatter_index")); + + if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") { + op->SetInput("Scatter_count", this->Output("Scatter_count")); + } + + if (BOOST_GAT_CONST(std::string, this->GetAttr("pool_type")) == "MIN" || + BOOST_GAT_CONST(std::string, this->GetAttr("pool_type")) == "MAX") { + op->SetInput("X", this->Input("X")); + op->SetInput("Out", this->Output("Out")); + } + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); op->SetAttrMap(this->Attrs()); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index e74a861d63e86b..cac1309c06e33b 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -76,7 +76,8 @@ template void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, const IndexT* g_index, const IndexT* s_index, const Tensor& src, Tensor* dst, - const std::string& pool_type) { + const std::string& pool_type, + int* scatter_count = NULL) { Functor functor; if (pool_type == "MIN" || pool_type == "MAX") { std::set existed_dst; @@ -107,19 +108,59 @@ void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } - int* count = new int[input_size]; - memset(count, 0, input_size * sizeof(int)); for (int i = 0; i < index_size; ++i) { IndexT dst_idx = s_index[i]; - count[dst_idx] += 1; + *(scatter_count + dst_idx) += 1; } for (int i = 0; i < input_size; ++i) { - if (count[i] <= 1) continue; + if (*(scatter_count + i) == 0) continue; auto dst_slice = dst->Slice(i, i + 1); auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(count[i]); + eigen_dst = eigen_dst / static_cast(*(scatter_count + i)); + } + } +} + +template +void gather_scatter_cpu_for_loop_grad(const int& input_size, + const int& index_size, + const IndexT* g_index, + const IndexT* s_index, const Tensor& src, + Tensor* dst, const std::string& pool_type, + const int* scatter_count = NULL) { + Functor functor; + if (pool_type == "SUM") { + for (int i = 0; i < index_size; ++i) { + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + elementwise_inner_operation(src, dst, src_idx, + dst_idx, false, functor); + } + } else if (pool_type == "MEAN") { + for (int i = 0; i < index_size; ++i) { + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + auto src_slice = 
src.Slice(src_idx, src_idx + 1); + auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); + auto eigen_src = framework::EigenVector::Flatten(src_slice); + auto eigen_dst = framework::EigenVector::Flatten(dst_slice); + eigen_dst += (eigen_src / static_cast(scatter_count[src_idx])); + } + } else if (pool_type == "MIN" || pool_type == "MAX") { + std::set existed_dst; + for (int i = 0; i < index_size; ++i) { + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); + if (!in_set) { + elementwise_inner_operation(src, dst, src_idx, + dst_idx, true, functor); + existed_dst.emplace(dst_idx); + } else { + elementwise_inner_operation( + src, dst, src_idx, dst_idx, false, functor); + } } - delete[] count; } } @@ -156,8 +197,12 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { gather_scatter_cpu_for_loop>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { + auto* scatter_count = ctx.Output("Scatter_count"); + int* p_scatter_count = scatter_count->mutable_data(ctx.GetPlace()); + memset(p_scatter_count, 0, src_dims[0] * sizeof(int)); gather_scatter_cpu_for_loop>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, + p_scatter_count); } } }; @@ -186,17 +231,23 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MIN") { - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MAX") { - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { - gather_scatter_cpu_for_loop>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); + auto* scatter_count = ctx.Input("Scatter_count"); + const int* s_count = scatter_count->data(); + gather_scatter_cpu_for_loop_grad>( + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, s_count); } } }; diff --git a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py new file mode 100644 index 00000000000000..cbd5590d6750db --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py @@ -0,0 +1,182 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid as fluid +""" +class TestFusedGatherScatterMaxOp(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "fused_gather_scatter" + x = np.random.random((10, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)) + gather_index = index[:, 0] + scatter_index = index[:, 1] + + self.inputs = { + 'X': x, + 'Gather_index': gather_index, + 'Scatter_index': scatter_index + } + + self.attrs = {'pool_type': 'MAX'} + + out, scatter_count = compute_gather_scatter(self.inputs, self.attrs) + + self.outputs = {'Out': out, 'Scatter_count': scatter_count} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestFusedGatherScatterMinOp(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "fused_gather_scatter" + x = np.random.random((10, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)) + gather_index = index[:, 0] + scatter_index = index[:, 1] + + self.inputs = { + 'X': x, + 'Gather_index': gather_index, + 'Scatter_index': scatter_index + } + + self.attrs = {'pool_type': 'MIN'} + + out, scatter_count = compute_gather_scatter(self.inputs, self.attrs) + + self.outputs = {'Out': out, 'Scatter_count': scatter_count} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') +""" + + +class TestFusedGatherScatterSumOp(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "fused_gather_scatter" + x = np.random.random((10, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)) + gather_index = index[:, 0] + scatter_index = index[:, 1] + + self.inputs = { + 'X': x, + 'Gather_index': gather_index, + 'Scatter_index': scatter_index + } + + self.attrs = {'pool_type': 'SUM'} + + out, _ = compute_gather_scatter(self.inputs, self.attrs) + + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestFusedGatherScatterMeanOp(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "fused_gather_scatter" + x = np.random.random((10, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)) + gather_index = index[:, 0] + scatter_index = index[:, 1] + + self.inputs = { + 'X': x, + 'Gather_index': gather_index, + 'Scatter_index': scatter_index + } + + self.attrs = {'pool_type': 'MEAN'} + + out, scatter_count = compute_gather_scatter(self.inputs, self.attrs) + + self.outputs = {'Out': out, 'Scatter_count': scatter_count} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +def compute_gather_scatter(inputs, attributes): + x = inputs['X'] + gather_index = inputs['Gather_index'] + scatter_index = inputs['Scatter_index'] + + pool_type = attributes['pool_type'] + + gather_x = x[gather_index] + target_shape = list(x.shape) + if pool_type == 'SUM': + results = np.zeros(target_shape, dtype=x.dtype) + for index, s_id in enumerate(scatter_index): + results[s_id, :] += gather_x[index, :] + elif pool_type == 'MEAN': + results = np.zeros(target_shape, dtype=x.dtype) + count = np.zeros(target_shape[0], dtype=np.int32) + for index, s_id in enumerate(scatter_index): + results[s_id, :] += gather_x[index, :] + count[s_id] += 1 + results = results / count.reshape([-1, 1]) + results[np.isnan(results)] = 0 + elif pool_type == 'MAX': + results = 
np.zeros(target_shape, dtype=x.dtype) + first_set = set() + for index, s_id in enumerate(scatter_index): + if s_id not in first_set: + results[s_id, :] += gather_x[index, :] + first_set.add(s_id) + else: + results[s_id, :] = np.maximum(results[s_id, :], + gather_x[index, :]) + elif pool_type == 'MIN': + results = np.zeros(target_shape, dtype=x.dtype) + first_set = set() + for index, s_id in enumerate(scatter_index): + if s_id not in first_set: + results[s_id, :] += gather_x[index, :] + first_set.add(s_id) + else: + results[s_id, :] = np.minimum(results[s_id, :], + gather_x[index, :]) + else: + raise ValueError( + "Invalid pool_type, only SUM, MEAN, MAX, MIN supported!") + + count = np.zeros(target_shape[0], dtype=np.int32) + for index, s_id in enumerate(scatter_index): + count[s_id] += 1 + + return results, count From d2da1ee74f7fbfbda143f75129f0b00e7a5af54c Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 11 Nov 2021 12:10:53 +0000 Subject: [PATCH 09/30] Add correct CUDA Mean grad --- .../fused/fused_gather_scatter_op.cc | 4 +- .../fused/fused_gather_scatter_op.cu | 59 +++++++++++-------- 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc index e55216e2e82a2d..dc5b7a95265751 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cc @@ -149,8 +149,8 @@ class FusedGatherScatterGradOpMaker : public framework::SingleGradOpMaker { op->SetInput("Scatter_count", this->Output("Scatter_count")); } - if (BOOST_GAT_CONST(std::string, this->GetAttr("pool_type")) == "MIN" || - BOOST_GAT_CONST(std::string, this->GetAttr("pool_type")) == "MAX") { + if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" || + BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MAX") { op->SetInput("X", this->Input("X")); op->SetInput("Out", this->Output("Out")); } diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu index b82c4e7da835ee..21247599300065 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu @@ -67,6 +67,7 @@ __global__ void GatherScatterCUDAKernel(const T* params, } } +// For min and max template __global__ void InputResetCUDAKernel(T* output, size_t input_size, size_t slice_size) { @@ -78,6 +79,7 @@ __global__ void InputResetCUDAKernel(T* output, size_t input_size, } } +// Get scatter_count template __global__ void ComputeCountCUDAKernel(int* count, const IndexT* scatter_indices, @@ -88,6 +90,7 @@ __global__ void ComputeCountCUDAKernel(int* count, } } +// For forward mean template __global__ void ManipulateMeanCUDAKernel(T* output, int* count, size_t input_size, size_t slice_size) { @@ -99,6 +102,26 @@ __global__ void ManipulateMeanCUDAKernel(T* output, int* count, } } +// For backward mean +template +__global__ void ManipulateMeanGradCUDAKernel(const T* params, + const IndexT* gather_indices, + const IndexT* scatter_indices, + T* output, size_t index_size, + size_t slice_size, + const int* scatter_count) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; + IndexT gather_i = gather_indices[indices_i]; + IndexT scatter_i = scatter_indices[indices_i]; + int64_t in_i = gather_i * slice_size + slice_i; + int64_t out_i = 
scatter_i * slice_size + slice_i; + paddle::platform::CudaAtomicAdd(output + out_i, + *(params + in_i) / scatter_count[gather_i]); + } +} + template class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { public: @@ -190,22 +213,23 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { .stream()>>>(p_src, g_index, s_index, p_output, index_size, slice_size, functor); - Tensor count; - int* count_ptr = count.mutable_data({input_size}, ctx.GetPlace()); - cudaMemset(count_ptr, 0, input_size * sizeof(int)); + auto* scatter_count = ctx.Output("Scatter_count"); + int* p_scatter_count = scatter_count->mutable_data(ctx.GetPlace()); + cudaMemset(p_scatter_count, 0, input_size * sizeof(int)); int64_t grid_count = (index_size + block - 1) / block; ComputeCountCUDAKernel< T, IndexT><<( ctx.device_context()) - .stream()>>>(count_ptr, s_index, index_size); + .stream()>>>(p_scatter_count, s_index, index_size); int64_t grid_mean = (input_size * slice_size + block - 1) / block; ManipulateMeanCUDAKernel< T><<( ctx.device_context()) - .stream()>>>(p_output, count_ptr, input_size, slice_size); + .stream()>>>(p_output, p_scatter_count, input_size, + slice_size); } } }; @@ -293,30 +317,13 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { ctx.device_context()) .stream()>>>(p_output, input_size, slice_size); } else if (pool_type == "MEAN") { - GatherScatterSumCUDAFunctor functor; - GatherScatterCUDAKernel><<< + auto* scatter_count = ctx.Input("Scatter_count"); + const int* s_count = scatter_count->data(); + ManipulateMeanGradCUDAKernel<<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, - index_size, slice_size, functor); - - Tensor count; - int* count_ptr = count.mutable_data({input_size}, ctx.GetPlace()); - cudaMemset(count_ptr, 0, input_size * sizeof(int)); - int64_t grid_count = (index_size + block - 1) / block; - ComputeCountCUDAKernel< - T, IndexT><<( - ctx.device_context()) - .stream()>>>(count_ptr, s_index, index_size); - - int64_t grid_mean = (input_size * slice_size + block - 1) / block; - ManipulateMeanCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, count_ptr, input_size, slice_size); + index_size, slice_size, s_count); } } }; From 0235923e63c5c0ac4296ef153e90db12ff44dfb9 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Fri, 12 Nov 2021 09:59:35 +0000 Subject: [PATCH 10/30] [Add] Successful MEAN and SUM --- .../operators/fused/fused_gather_scatter_op.h | 85 +++++++++---------- .../unittests/test_fused_gather_scatter_op.py | 70 ++++++++++----- 2 files changed, 90 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index cac1309c06e33b..73ae5a168a30e7 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -79,22 +79,7 @@ void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, const std::string& pool_type, int* scatter_count = NULL) { Functor functor; - if (pool_type == "MIN" || pool_type == "MAX") { - std::set existed_dst; - for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; - bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); - if (!in_set) { - elementwise_inner_operation(src, dst, src_idx, - dst_idx, true, functor); - existed_dst.emplace(dst_idx); - } else { - elementwise_inner_operation( - src, dst, 
src_idx, dst_idx, false, functor); - } - } - } else if (pool_type == "SUM") { + if (pool_type == "SUM") { for (int i = 0; i < index_size; ++i) { IndexT src_idx = g_index[i]; IndexT dst_idx = s_index[i]; @@ -118,18 +103,32 @@ void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, auto eigen_dst = framework::EigenVector::Flatten(dst_slice); eigen_dst = eigen_dst / static_cast(*(scatter_count + i)); } + } else if (pool_type == "MIN" || pool_type == "MAX") { + std::set existed_dst; + for (int i = 0; i < index_size; ++i) { + IndexT src_idx = g_index[i]; + IndexT dst_idx = s_index[i]; + bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); + if (!in_set) { + elementwise_inner_operation(src, dst, src_idx, + dst_idx, true, functor); + existed_dst.emplace(dst_idx); + } else { + elementwise_inner_operation( + src, dst, src_idx, dst_idx, false, functor); + } + } } } template -void gather_scatter_cpu_for_loop_grad(const int& input_size, - const int& index_size, - const IndexT* g_index, - const IndexT* s_index, const Tensor& src, - Tensor* dst, const std::string& pool_type, - const int* scatter_count = NULL) { - Functor functor; +void gather_scatter_cpu_for_loop_grad( + const int& input_size, const int& index_size, const IndexT* g_index, + const IndexT* s_index, const Tensor& src, Tensor* dst, + const std::string& pool_type, const int* scatter_count = nullptr, + const Tensor* input = nullptr, const Tensor* output = nullptr) { if (pool_type == "SUM") { + Functor functor; for (int i = 0; i < index_size; ++i) { IndexT src_idx = g_index[i]; IndexT dst_idx = s_index[i]; @@ -147,19 +146,19 @@ void gather_scatter_cpu_for_loop_grad(const int& input_size, eigen_dst += (eigen_src / static_cast(scatter_count[src_idx])); } } else if (pool_type == "MIN" || pool_type == "MAX") { - std::set existed_dst; for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; - bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); - if (!in_set) { - elementwise_inner_operation(src, dst, src_idx, - dst_idx, true, functor); - existed_dst.emplace(dst_idx); - } else { - elementwise_inner_operation( - src, dst, src_idx, dst_idx, false, functor); - } + auto forward_src_idx = s_index[i]; + auto forward_dst_idx = g_index[i]; + auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); + auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); + auto eigen_input = framework::EigenVector::Flatten(input_slice); + auto eigen_output = framework::EigenVector::Flatten(output_slice); + + auto src_slice = src.Slice(forward_dst_idx, forward_dst_idx + 1); + auto dst_slice = dst->Slice(forward_src_idx, forward_src_idx + 1); + auto eigen_src = framework::EigenVector::Flatten(src_slice); + auto eigen_dst = framework::EigenVector::Flatten(dst_slice); + eigen_dst += eigen_src * (eigen_output == eigen_input); } } } @@ -234,20 +233,20 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); - } else if (pool_type == "MIN") { - gather_scatter_cpu_for_loop_grad>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); - } else if (pool_type == "MAX") { - gather_scatter_cpu_for_loop_grad>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { auto* scatter_count = ctx.Input("Scatter_count"); const int* s_count = scatter_count->data(); gather_scatter_cpu_for_loop_grad>( src_dims[0], 
index_size, g_index, s_index, *X, Y, pool_type, s_count); + } else if (pool_type == "MIN" || pool_type == "MAX") { + auto* input = ctx.Input("X"); + auto* output = ctx.Input("Out"); + // Functor not used here. + gather_scatter_cpu_for_loop_grad>( + src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, nullptr, + input, output); } } }; diff --git a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py index cbd5590d6750db..63c4161fb53d40 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py @@ -16,7 +16,8 @@ from op_test import OpTest import paddle import paddle.fluid as fluid -""" + + class TestFusedGatherScatterMaxOp(OpTest): def setUp(self): paddle.enable_static() @@ -34,9 +35,10 @@ def setUp(self): self.attrs = {'pool_type': 'MAX'} - out, scatter_count = compute_gather_scatter(self.inputs, self.attrs) + out, gradient = compute_gather_scatter_for_min_max(self.inputs, + self.attrs) - self.outputs = {'Out': out, 'Scatter_count': scatter_count} + self.outputs = {'Out': out} def test_check_output(self): self.check_output() @@ -62,16 +64,16 @@ def setUp(self): self.attrs = {'pool_type': 'MIN'} - out, scatter_count = compute_gather_scatter(self.inputs, self.attrs) + out, self.gradient = compute_gather_scatter_for_min_max(self.inputs, + self.attrs) - self.outputs = {'Out': out, 'Scatter_count': scatter_count} + self.outputs = {'Out': out} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Out') -""" + self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) class TestFusedGatherScatterSumOp(OpTest): @@ -91,7 +93,7 @@ def setUp(self): self.attrs = {'pool_type': 'SUM'} - out, _ = compute_gather_scatter(self.inputs, self.attrs) + out, _ = compute_gather_scatter_for_sum_mean(self.inputs, self.attrs) self.outputs = {'Out': out} @@ -119,7 +121,8 @@ def setUp(self): self.attrs = {'pool_type': 'MEAN'} - out, scatter_count = compute_gather_scatter(self.inputs, self.attrs) + out, scatter_count = compute_gather_scatter_for_sum_mean(self.inputs, + self.attrs) self.outputs = {'Out': out, 'Scatter_count': scatter_count} @@ -130,7 +133,7 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -def compute_gather_scatter(inputs, attributes): +def compute_gather_scatter_for_sum_mean(inputs, attributes): x = inputs['X'] gather_index = inputs['Gather_index'] scatter_index = inputs['Scatter_index'] @@ -139,20 +142,41 @@ def compute_gather_scatter(inputs, attributes): gather_x = x[gather_index] target_shape = list(x.shape) + results = np.zeros(target_shape, dtype=x.dtype) if pool_type == 'SUM': - results = np.zeros(target_shape, dtype=x.dtype) for index, s_id in enumerate(scatter_index): results[s_id, :] += gather_x[index, :] elif pool_type == 'MEAN': - results = np.zeros(target_shape, dtype=x.dtype) count = np.zeros(target_shape[0], dtype=np.int32) for index, s_id in enumerate(scatter_index): results[s_id, :] += gather_x[index, :] count[s_id] += 1 results = results / count.reshape([-1, 1]) results[np.isnan(results)] = 0 - elif pool_type == 'MAX': - results = np.zeros(target_shape, dtype=x.dtype) + else: + raise ValueError("Invalid pool_type, only SUM, MEAN supported!") + + count = np.zeros(target_shape[0], dtype=np.int32) + for index, s_id in enumerate(scatter_index): + count[s_id] += 1 + + return results, count + + +def compute_gather_scatter_for_min_max(inputs, 
attributes): + x = inputs['X'] + gather_index = inputs['Gather_index'] + scatter_index = inputs['Scatter_index'] + + pool_type = attributes['pool_type'] + + gather_x = x[gather_index] + target_shape = list(x.shape) + results = np.zeros(target_shape, dtype=x.dtype) + gradient = np.zeros_like(x) + + # Calculate forward output + if pool_type == "MAX": first_set = set() for index, s_id in enumerate(scatter_index): if s_id not in first_set: @@ -161,8 +185,7 @@ def compute_gather_scatter(inputs, attributes): else: results[s_id, :] = np.maximum(results[s_id, :], gather_x[index, :]) - elif pool_type == 'MIN': - results = np.zeros(target_shape, dtype=x.dtype) + elif pool_type == "MIN": first_set = set() for index, s_id in enumerate(scatter_index): if s_id not in first_set: @@ -172,11 +195,14 @@ def compute_gather_scatter(inputs, attributes): results[s_id, :] = np.minimum(results[s_id, :], gather_x[index, :]) else: - raise ValueError( - "Invalid pool_type, only SUM, MEAN, MAX, MIN supported!") + raise ValueError("Invalid pool_type, only MAX, MIN supported!") - count = np.zeros(target_shape[0], dtype=np.int32) - for index, s_id in enumerate(scatter_index): - count[s_id] += 1 + # Calculate backward gradient + index_size = len(gather_index) + for i in range(index_size): + forward_src_idx = scatter_index[i] + forward_dst_idx = gather_index[i] + gradient[forward_src_idx] += 1 * ( + x[forward_src_idx] == results[forward_dst_idx]) - return results, count + return results, gradient From 484a84bed837c8905fd4cd68fc625b515565adba Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 15 Nov 2021 03:43:00 +0000 Subject: [PATCH 11/30] [Add] Successful MIN and MAX in CPU --- .../operators/fused/fused_gather_scatter_op.h | 1 + .../unittests/test_fused_gather_scatter_op.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index 73ae5a168a30e7..0303030f701308 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -236,6 +236,7 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { } else if (pool_type == "MEAN") { auto* scatter_count = ctx.Input("Scatter_count"); const int* s_count = scatter_count->data(); + // Functor not used here. 
gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, s_count); diff --git a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py index 63c4161fb53d40..49c7a6894beaed 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py @@ -22,7 +22,7 @@ class TestFusedGatherScatterMaxOp(OpTest): def setUp(self): paddle.enable_static() self.op_type = "fused_gather_scatter" - x = np.random.random((10, 20)).astype("float64") + x = np.random.randint(0, 50, (10, 10)).astype("float64") index = np.random.randint(0, 10, (15, 2)) gather_index = index[:, 0] scatter_index = index[:, 1] @@ -35,16 +35,15 @@ def setUp(self): self.attrs = {'pool_type': 'MAX'} - out, gradient = compute_gather_scatter_for_min_max(self.inputs, - self.attrs) - + out, self.gradient = compute_gather_scatter_for_min_max(self.inputs, + self.attrs) self.outputs = {'Out': out} def test_check_output(self): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) class TestFusedGatherScatterMinOp(OpTest): @@ -200,9 +199,9 @@ def compute_gather_scatter_for_min_max(inputs, attributes): # Calculate backward gradient index_size = len(gather_index) for i in range(index_size): - forward_src_idx = scatter_index[i] - forward_dst_idx = gather_index[i] + forward_src_idx = gather_index[i] + forward_dst_idx = scatter_index[i] gradient[forward_src_idx] += 1 * ( x[forward_src_idx] == results[forward_dst_idx]) - return results, gradient + return results, gradient / results.size From a16e4129269b5859fc2b4622e1b86b2a025f31b4 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 15 Nov 2021 07:34:16 +0000 Subject: [PATCH 12/30] [Add] Successful MIN and MAX in CUDA --- .../fused/fused_gather_scatter_op.cu | 72 ++++++++----------- .../operators/fused/fused_gather_scatter_op.h | 4 +- .../unittests/test_fused_gather_scatter_op.py | 2 +- 3 files changed, 34 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu index 21247599300065..e0774158477296 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu @@ -122,6 +122,25 @@ __global__ void ManipulateMeanGradCUDAKernel(const T* params, } } +// For backward min and max +template +__global__ void ManipulateMinMaxGradCUDAKernel( + const T* params, const IndexT* gather_indices, + const IndexT* scatter_indices, T* output, size_t index_size, + size_t slice_size, const T* ptr_input, const T* ptr_output) { + CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { + int64_t indices_i = i / slice_size; + int64_t slice_i = i - indices_i * slice_size; + IndexT gather_i = gather_indices[indices_i]; + IndexT scatter_i = scatter_indices[indices_i]; + int64_t in_i = gather_i * slice_size + slice_i; + int64_t out_i = scatter_i * slice_size + slice_i; + paddle::platform::CudaAtomicAdd( + output + out_i, + *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); + } +} + template class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { public: @@ -254,17 +273,7 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { memset_size *= src_dims[i]; } const size_t& memset_bytes = memset_size * 
sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { - cudaMemset(p_output, 0, memset_bytes); - } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); - } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::max()); - } + cudaMemset(p_output, 0, memset_bytes); int64_t slice_size = 1; for (int i = 1; i < src_dims.size(); ++i) { @@ -286,36 +295,6 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, index_size, slice_size, functor); - } else if (pool_type == "MAX") { - GatherScatterMaxCUDAFunctor functor; - GatherScatterCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, - index_size, slice_size, functor); - - int64_t grid_max = (input_size * slice_size + block - 1) / block; - InputResetCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MIN") { - GatherScatterMinCUDAFunctor functor; - GatherScatterCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, - index_size, slice_size, functor); - - int64_t grid_min = (input_size * slice_size + block - 1) / block; - InputResetCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); } else if (pool_type == "MEAN") { auto* scatter_count = ctx.Input("Scatter_count"); const int* s_count = scatter_count->data(); @@ -324,6 +303,17 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, index_size, slice_size, s_count); + } else if (pool_type == "MAX" || pool_type == "MIN") { + auto* input = ctx.Input("X"); + auto* output = ctx.Input("Out"); + const T* ptr_input = input->data(); + const T* ptr_output = output->data(); + ManipulateMinMaxGradCUDAKernel<<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, g_index, s_index, p_output, + index_size, slice_size, ptr_input, + ptr_output); } } }; diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/fused/fused_gather_scatter_op.h index 0303030f701308..15d241eb5105e5 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.h @@ -241,8 +241,8 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { FusedGatherScatterSumFunctor>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, s_count); } else if (pool_type == "MIN" || pool_type == "MAX") { - auto* input = ctx.Input("X"); - auto* output = ctx.Input("Out"); + const auto* input = ctx.Input("X"); + const auto* output = ctx.Input("Out"); // Functor not used here. 
gather_scatter_cpu_for_loop_grad>( diff --git a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py index 49c7a6894beaed..f3b307296ec45a 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py @@ -22,7 +22,7 @@ class TestFusedGatherScatterMaxOp(OpTest): def setUp(self): paddle.enable_static() self.op_type = "fused_gather_scatter" - x = np.random.randint(0, 50, (10, 10)).astype("float64") + x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)) gather_index = index[:, 0] scatter_index = index[:, 1] From 071425f74f0d758793057ba606ea034eff34e68c Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 15 Nov 2021 09:28:20 +0000 Subject: [PATCH 13/30] fix windows dtype ci --- .../fluid/tests/unittests/test_fused_gather_scatter_op.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py index f3b307296ec45a..fc21dfacf79f15 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py @@ -23,7 +23,7 @@ def setUp(self): paddle.enable_static() self.op_type = "fused_gather_scatter" x = np.random.random((10, 20)).astype("float64") - index = np.random.randint(0, 10, (15, 2)) + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] scatter_index = index[:, 1] @@ -51,7 +51,7 @@ def setUp(self): paddle.enable_static() self.op_type = "fused_gather_scatter" x = np.random.random((10, 20)).astype("float64") - index = np.random.randint(0, 10, (15, 2)) + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] scatter_index = index[:, 1] @@ -80,7 +80,7 @@ def setUp(self): paddle.enable_static() self.op_type = "fused_gather_scatter" x = np.random.random((10, 20)).astype("float64") - index = np.random.randint(0, 10, (15, 2)) + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] scatter_index = index[:, 1] @@ -108,7 +108,7 @@ def setUp(self): paddle.enable_static() self.op_type = "fused_gather_scatter" x = np.random.random((10, 20)).astype("float64") - index = np.random.randint(0, 10, (15, 2)) + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] scatter_index = index[:, 1] From f9666b9b6d7da56d423bdeb39ebb32c664797903 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 15 Nov 2021 11:27:40 +0000 Subject: [PATCH 14/30] fix ROCM ci by adding HIP flag --- .../operators/fused/fused_gather_scatter_op.cu | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu index e0774158477296..2f2d885a4a9670 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu +++ b/paddle/fluid/operators/fused/fused_gather_scatter_op.cu @@ -162,7 +162,12 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { } const size_t& memset_bytes = memset_size * sizeof(T); if (pool_type == "SUM" || pool_type == "MEAN") { +#ifdef PADDLE_WITH_HIP + hipMemset(p_output, 0, memset_bytes); +#else cudaMemset(p_output, 0, memset_bytes); +#endif + } else if (pool_type == "MAX") { 
thrust::device_ptr p_output_ptr(p_output); thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, @@ -234,7 +239,13 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { auto* scatter_count = ctx.Output("Scatter_count"); int* p_scatter_count = scatter_count->mutable_data(ctx.GetPlace()); + +#ifdef PADDLE_WITH_HIP + hipMemset(p_scatter_count, 0, input_size * sizeof(int)); +#else cudaMemset(p_scatter_count, 0, input_size * sizeof(int)); +#endif + int64_t grid_count = (index_size + block - 1) / block; ComputeCountCUDAKernel< T, IndexT><< { memset_size *= src_dims[i]; } const size_t& memset_bytes = memset_size * sizeof(T); + +#ifdef PADDLE_WITH_HIP + hipMemset(p_output, 0, memset_bytes); +#else cudaMemset(p_output, 0, memset_bytes); +#endif int64_t slice_size = 1; for (int i = 1; i < src_dims.size(); ++i) { From 66e9af05c5e9c5454f9f445f292eea879a014191 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 16 Nov 2021 02:43:19 +0000 Subject: [PATCH 15/30] rename fused_gather_scatter to send_recv --- ...d_gather_scatter_op.cc => send_recv_op.cc} | 128 +++++++++--------- ...d_gather_scatter_op.cu => send_recv_op.cu} | 93 ++++++------- ...sed_gather_scatter_op.h => send_recv_op.h} | 35 +++-- ...her_scatter_op.py => test_send_recv_op.py} | 40 +++--- 4 files changed, 140 insertions(+), 156 deletions(-) rename paddle/fluid/operators/{fused/fused_gather_scatter_op.cc => send_recv_op.cc} (51%) rename paddle/fluid/operators/{fused/fused_gather_scatter_op.cu => send_recv_op.cu} (81%) rename paddle/fluid/operators/{fused/fused_gather_scatter_op.h => send_recv_op.h} (89%) rename python/paddle/fluid/tests/unittests/{test_fused_gather_scatter_op.py => test_send_recv_op.py} (87%) diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc b/paddle/fluid/operators/send_recv_op.cc similarity index 51% rename from paddle/fluid/operators/fused/fused_gather_scatter_op.cc rename to paddle/fluid/operators/send_recv_op.cc index dc5b7a95265751..06cbff6328c113 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -12,62 +12,60 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/fused/fused_gather_scatter_op.h" +#include "paddle/fluid/operators/send_recv_op.h" namespace paddle { namespace operators { -class FusedGatherScatterOP : public framework::OperatorWithKernel { +class SendRecvOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedGatherScatter"); - OP_INOUT_CHECK(ctx->HasInput("Gather_index"), "Input", "Gather_index", - "FusedGatherScatter"); - OP_INOUT_CHECK(ctx->HasInput("Scatter_index"), "Input", "Scatter_index", - "FusedGatherScatter"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", - "FusedGatherScatter"); - - auto gather_index_dims = ctx->GetInputDim("Gather_index"); - if (gather_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(gather_index_dims[1], 1, + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SendRecv"); + OP_INOUT_CHECK(ctx->HasInput("Src_index"), "Input", "Src_index", + "SendRecv"); + OP_INOUT_CHECK(ctx->HasInput("Dst_index"), "Input", "Dst_index", + "SendRecv"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SendRecv"); + + auto src_index_dims = ctx->GetInputDim("Src_index"); + if (src_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(src_index_dims[1], 1, platform::errors::InvalidArgument( - "The last dim of Gather_index should be 1 when it " + "The last dim of Src_index should be 1 when it " "is 2D, but we get %d", - gather_index_dims[1])); + src_index_dims[1])); } else { PADDLE_ENFORCE_EQ( - gather_index_dims.size(), 1, + src_index_dims.size(), 1, platform::errors::InvalidArgument( - "The Gather_index should be 1D, when it is not 2D, but we get %d", - gather_index_dims.size())); + "The Src_index should be 1D, when it is not 2D, but we get %d", + src_index_dims.size())); } - auto scatter_index_dims = ctx->GetInputDim("Scatter_index"); - if (scatter_index_dims.size() == 2) { - PADDLE_ENFORCE_EQ(scatter_index_dims[1], 1, + auto dst_index_dims = ctx->GetInputDim("Dst_index"); + if (dst_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(dst_index_dims[1], 1, platform::errors::InvalidArgument( - "The last dim of Scatter_index should be 1 when it " + "The last dim of Dst_index should be 1 when it " "is 2D, but we get %d", - scatter_index_dims[1])); + dst_index_dims[1])); } else { PADDLE_ENFORCE_EQ( - scatter_index_dims.size(), 1, - platform::errors::InvalidArgument("The Scatter_index should be 1D, " + dst_index_dims.size(), 1, + platform::errors::InvalidArgument("The Dst_index should be 1D, " "when it is not 2D, but we get %d", - scatter_index_dims.size())); + dst_index_dims.size())); } - // TODO(daisiming): If the shape of scatter_index and gather_index should be - // same? + // TODO(daisiming): If the shape of src_index and dst_index should be same? 
auto dims = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", dims); if (ctx->Attrs().Get("pool_type") == "MEAN") { OP_INOUT_CHECK(ctx->HasOutput("Scatter_count"), "Output", "Scatter_count", - "FusedGatherScatter"); + "SendRecv"); ctx->SetOutputDim("Scatter_count", {dims[0]}); } } @@ -81,7 +79,7 @@ class FusedGatherScatterOP : public framework::OperatorWithKernel { } }; -class FusedGatherScatterGradOp : public framework::OperatorWithKernel { +class SendRecvGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -99,28 +97,28 @@ class FusedGatherScatterGradOp : public framework::OperatorWithKernel { } }; -class FusedGatherScatterOpMaker : public framework::OpProtoAndCheckerMaker { +class SendRecvOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "The input tensor with data type float32, " "float64 or float16"); - AddInput("Gather_index", "The gather index tensor."); - AddInput("Scatter_index", "The scatter index tensor."); - AddOutput("Out", "Output tensor of fused_gather_scatter op."); + AddInput("Src_index", "The source index tensor."); + AddInput("Dst_index", "The destination index tensor."); + AddOutput("Out", "Output tensor of send_recv op."); AddOutput("Scatter_count", - "Count tensor of Scatter index, mainly for MEAN pool_type.") + "Count tensor of Dst_index, mainly for MEAN pool_type.") .AsIntermediate(); AddAttr( "pool_type", "(string, default 'SUM')" - "We use Gather_index to gather correspoinding place of X. " + "We use Src_index to gather correspoinding place of X. " "Then we need to use different pool type to scatter the result.") .SetDefault("SUM") .InEnum({"SUM", "MEAN", "MIN", "MAX"}); // TODO(daisiming): Add a simple example here. AddComment(R"DOC( -Fused Gather Scatter Operator. +SendRecv Operator. 
$Out = Scatter(Gather(X, Gather_index), Scatter_index, pool_type)$ @@ -135,15 +133,15 @@ pass }; template -class FusedGatherScatterGradOpMaker : public framework::SingleGradOpMaker { +class SendRecvGradOpMaker : public framework::SingleGradOpMaker { public: using framework::SingleGradOpMaker::SingleGradOpMaker; protected: void Apply(GradOpPtr op) const override { - op->SetType("fused_gather_scatter_grad"); - op->SetInput("Gather_index", this->Input("Gather_index")); - op->SetInput("Scatter_index", this->Input("Scatter_index")); + op->SetType("send_recv_grad"); + op->SetInput("Src_index", this->Input("Src_index")); + op->SetInput("Dst_index", this->Input("Dst_index")); if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") { op->SetInput("Scatter_count", this->Output("Scatter_count")); @@ -167,29 +165,25 @@ class FusedGatherScatterGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; -REGISTER_OPERATOR( - fused_gather_scatter, ops::FusedGatherScatterOP, - ops::FusedGatherScatterOpMaker, - ops::FusedGatherScatterGradOpMaker, - ops::FusedGatherScatterGradOpMaker); -REGISTER_OPERATOR(fused_gather_scatter_grad, ops::FusedGatherScatterGradOp); -REGISTER_OP_CPU_KERNEL(fused_gather_scatter, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel, - ops::FusedGatherScatterOpKernel); - -REGISTER_OP_CPU_KERNEL( - fused_gather_scatter_grad, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel, - ops::FusedGatherScatterGradOpKernel); +REGISTER_OPERATOR(send_recv, ops::SendRecvOP, ops::SendRecvOpMaker, + ops::SendRecvGradOpMaker, + ops::SendRecvGradOpMaker); +REGISTER_OPERATOR(send_recv_grad, ops::SendRecvGradOp); +REGISTER_OP_CPU_KERNEL(send_recv, ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel); + +REGISTER_OP_CPU_KERNEL(send_recv_grad, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu b/paddle/fluid/operators/send_recv_op.cu similarity index 81% rename from paddle/fluid/operators/fused/fused_gather_scatter_op.cu rename to paddle/fluid/operators/send_recv_op.cu index 2f2d885a4a9670..1014794c8bc3ec 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/fused/fused_gather_scatter_op.h" +#include "paddle/fluid/operators/send_recv_op.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/place.h" @@ -27,7 +27,7 @@ namespace operators { using Tensor = framework::Tensor; template -struct GatherScatterSumCUDAFunctor { +struct SendRecvSumCUDAFunctor { DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, const IndexT& out_i) { paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); @@ -35,7 +35,7 @@ struct GatherScatterSumCUDAFunctor { }; template -struct GatherScatterMaxCUDAFunctor { +struct SendRecvMaxCUDAFunctor { DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, const IndexT& out_i) { paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); @@ -43,7 +43,7 @@ struct GatherScatterMaxCUDAFunctor { }; template -struct GatherScatterMinCUDAFunctor { +struct SendRecvMinCUDAFunctor { DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, const IndexT& out_i) { paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); @@ -51,11 +51,11 @@ struct GatherScatterMinCUDAFunctor { }; template -__global__ void GatherScatterCUDAKernel(const T* params, - const IndexT* gather_indices, - const IndexT* scatter_indices, - T* output, size_t index_size, - size_t slice_size, Functor functor) { +__global__ void SendRecvCUDAKernel(const T* params, + const IndexT* gather_indices, + const IndexT* scatter_indices, T* output, + size_t index_size, size_t slice_size, + Functor functor) { CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { int64_t indices_i = i / slice_size; int64_t slice_i = i - indices_i * slice_size; @@ -142,12 +142,12 @@ __global__ void ManipulateMinMaxGradCUDAKernel( } template -class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { +class SendRecvOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input("X"); - auto* gather_index = ctx.Input("Gather_index"); - auto* scatter_index = ctx.Input("Scatter_index"); + auto* gather_index = ctx.Input("Src_index"); + auto* scatter_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); std::string pool_type = ctx.Attr("pool_type"); @@ -191,17 +191,15 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { int64_t grid = (n + block - 1) / block; int64_t input_size = src_dims[0]; if (pool_type == "SUM") { - GatherScatterSumCUDAFunctor functor; - GatherScatterCUDAKernel><<< + SendRecvSumCUDAFunctor functor; + SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, index_size, slice_size, functor); } else if (pool_type == "MAX") { - GatherScatterMaxCUDAFunctor functor; - GatherScatterCUDAKernel><<< + SendRecvMaxCUDAFunctor functor; + SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, @@ -214,9 +212,8 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { ctx.device_context()) .stream()>>>(p_output, input_size, slice_size); } else if (pool_type == "MIN") { - GatherScatterMinCUDAFunctor functor; - GatherScatterCUDAKernel><<< + SendRecvMinCUDAFunctor functor; + SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( 
ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, @@ -229,9 +226,8 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { ctx.device_context()) .stream()>>>(p_output, input_size, slice_size); } else if (pool_type == "MEAN") { - GatherScatterSumCUDAFunctor functor; - GatherScatterCUDAKernel><<< + SendRecvSumCUDAFunctor functor; + SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, @@ -265,12 +261,12 @@ class FusedGatherScatterOpCUDAKernel : public framework::OpKernel { }; template -class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { +class SendRecvGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* gather_index = ctx.Input("Scatter_index"); - auto* scatter_index = ctx.Input("Gather_index"); + auto* gather_index = ctx.Input("Dst_index"); + auto* scatter_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); std::string pool_type = ctx.Attr("pool_type"); @@ -304,9 +300,8 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { int64_t grid = (n + block - 1) / block; int64_t input_size = src_dims[0]; if (pool_type == "SUM") { - GatherScatterSumCUDAFunctor functor; - GatherScatterCUDAKernel><<< + SendRecvSumCUDAFunctor functor; + SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, g_index, s_index, p_output, @@ -340,23 +335,21 @@ class FusedGatherScatterGradOpCUDAKernel : public framework::OpKernel { using CUDA = paddle::platform::CUDADeviceContext; namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - fused_gather_scatter, ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel, - ops::FusedGatherScatterOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL( - fused_gather_scatter_grad, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel, - ops::FusedGatherScatterGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(send_recv, ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(send_recv_grad, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/fused/fused_gather_scatter_op.h b/paddle/fluid/operators/send_recv_op.h similarity index 89% rename from paddle/fluid/operators/fused/fused_gather_scatter_op.h rename to paddle/fluid/operators/send_recv_op.h index 15d241eb5105e5..b4c2c0fd003b1b 100644 --- a/paddle/fluid/operators/fused/fused_gather_scatter_op.h +++ b/paddle/fluid/operators/send_recv_op.h @@ -24,7 +24,7 @@ namespace operators { using Tensor = 
framework::Tensor; template -struct FusedGatherScatterSumFunctor { +struct SendRecvSumFunctor { void operator()(const bool& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); @@ -34,7 +34,7 @@ struct FusedGatherScatterSumFunctor { }; template -struct FusedGatherScatterMinFunctor { +struct SendRecvMinFunctor { void operator()(const bool& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); @@ -48,7 +48,7 @@ struct FusedGatherScatterMinFunctor { }; template -struct FusedGatherScatterMaxFunctor { +struct SendRecvMaxFunctor { void operator()(const int& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); @@ -164,12 +164,12 @@ void gather_scatter_cpu_for_loop_grad( } template -class FusedGatherScatterOpKernel : public framework::OpKernel { +class SendRecvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input("X"); - auto* gather_index = ctx.Input("Gather_index"); - auto* scatter_index = ctx.Input("Scatter_index"); + auto* gather_index = ctx.Input("Src_index"); + auto* scatter_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); const int& index_size = gather_index->dims()[0]; @@ -187,19 +187,19 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MIN") { - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MAX") { - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { auto* scatter_count = ctx.Output("Scatter_count"); int* p_scatter_count = scatter_count->mutable_data(ctx.GetPlace()); memset(p_scatter_count, 0, src_dims[0] * sizeof(int)); - gather_scatter_cpu_for_loop>( + gather_scatter_cpu_for_loop>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, p_scatter_count); } @@ -207,12 +207,12 @@ class FusedGatherScatterOpKernel : public framework::OpKernel { }; template -class FusedGatherScatterGradOpKernel : public framework::OpKernel { +class SendRecvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* gather_index = ctx.Input("Scatter_index"); - auto* scatter_index = ctx.Input("Gather_index"); + auto* gather_index = ctx.Input("Dst_index"); + auto* scatter_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); const int& index_size = gather_index->dims()[0]; @@ -230,22 +230,19 @@ class FusedGatherScatterGradOpKernel : public framework::OpKernel { const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - gather_scatter_cpu_for_loop_grad>( + gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { auto* scatter_count = ctx.Input("Scatter_count"); const int* s_count = scatter_count->data(); // Functor not used here. 
- gather_scatter_cpu_for_loop_grad>( + gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, s_count); } else if (pool_type == "MIN" || pool_type == "MAX") { const auto* input = ctx.Input("X"); const auto* output = ctx.Input("Out"); // Functor not used here. - gather_scatter_cpu_for_loop_grad>( + gather_scatter_cpu_for_loop_grad>( src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, nullptr, input, output); } diff --git a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py b/python/paddle/fluid/tests/unittests/test_send_recv_op.py similarity index 87% rename from python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py rename to python/paddle/fluid/tests/unittests/test_send_recv_op.py index fc21dfacf79f15..0ad1603c1e5d8a 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gather_scatter_op.py +++ b/python/paddle/fluid/tests/unittests/test_send_recv_op.py @@ -18,10 +18,10 @@ import paddle.fluid as fluid -class TestFusedGatherScatterMaxOp(OpTest): +class TestSendRecvMaxOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "fused_gather_scatter" + self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] @@ -29,8 +29,8 @@ def setUp(self): self.inputs = { 'X': x, - 'Gather_index': gather_index, - 'Scatter_index': scatter_index + 'Src_index': gather_index, + 'Dst_index': scatter_index } self.attrs = {'pool_type': 'MAX'} @@ -46,10 +46,10 @@ def test_check_grad(self): self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) -class TestFusedGatherScatterMinOp(OpTest): +class TestSendRecvMinOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "fused_gather_scatter" + self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] @@ -57,8 +57,8 @@ def setUp(self): self.inputs = { 'X': x, - 'Gather_index': gather_index, - 'Scatter_index': scatter_index + 'Src_index': gather_index, + 'Dst_index': scatter_index } self.attrs = {'pool_type': 'MIN'} @@ -75,10 +75,10 @@ def test_check_grad(self): self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) -class TestFusedGatherScatterSumOp(OpTest): +class TestSendRecvSumOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "fused_gather_scatter" + self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] @@ -86,8 +86,8 @@ def setUp(self): self.inputs = { 'X': x, - 'Gather_index': gather_index, - 'Scatter_index': scatter_index + 'Src_index': gather_index, + 'Dst_index': scatter_index } self.attrs = {'pool_type': 'SUM'} @@ -103,10 +103,10 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -class TestFusedGatherScatterMeanOp(OpTest): +class TestSendRecvMeanOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "fused_gather_scatter" + self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) gather_index = index[:, 0] @@ -114,8 +114,8 @@ def setUp(self): self.inputs = { 'X': x, - 'Gather_index': gather_index, - 'Scatter_index': scatter_index + 'Src_index': gather_index, + 'Dst_index': scatter_index } self.attrs = {'pool_type': 'MEAN'} @@ -134,8 +134,8 @@ def test_check_grad(self): def 
compute_gather_scatter_for_sum_mean(inputs, attributes): x = inputs['X'] - gather_index = inputs['Gather_index'] - scatter_index = inputs['Scatter_index'] + gather_index = inputs['Src_index'] + scatter_index = inputs['Dst_index'] pool_type = attributes['pool_type'] @@ -164,8 +164,8 @@ def compute_gather_scatter_for_sum_mean(inputs, attributes): def compute_gather_scatter_for_min_max(inputs, attributes): x = inputs['X'] - gather_index = inputs['Gather_index'] - scatter_index = inputs['Scatter_index'] + gather_index = inputs['Src_index'] + scatter_index = inputs['Dst_index'] pool_type = attributes['pool_type'] From aa3042d32893296144debb65405b266e1f98dada Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 16 Nov 2021 10:09:32 +0000 Subject: [PATCH 16/30] unify name as send and recv --- paddle/fluid/operators/send_recv_op.cc | 16 ++- paddle/fluid/operators/send_recv_op.cu | 106 ++++++++-------- paddle/fluid/operators/send_recv_op.h | 116 +++++++++--------- .../tests/unittests/test_send_recv_op.py | 87 ++++++------- 4 files changed, 151 insertions(+), 174 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/send_recv_op.cc index 06cbff6328c113..e19575833597f9 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -64,9 +64,9 @@ class SendRecvOP : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", dims); if (ctx->Attrs().Get("pool_type") == "MEAN") { - OP_INOUT_CHECK(ctx->HasOutput("Scatter_count"), "Output", "Scatter_count", + OP_INOUT_CHECK(ctx->HasOutput("Dst_count"), "Output", "Dst_count", "SendRecv"); - ctx->SetOutputDim("Scatter_count", {dims[0]}); + ctx->SetOutputDim("Dst_count", {dims[0]}); } } @@ -106,24 +106,22 @@ class SendRecvOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Src_index", "The source index tensor."); AddInput("Dst_index", "The destination index tensor."); AddOutput("Out", "Output tensor of send_recv op."); - AddOutput("Scatter_count", + AddOutput("Dst_count", "Count tensor of Dst_index, mainly for MEAN pool_type.") .AsIntermediate(); AddAttr( "pool_type", "(string, default 'SUM')" - "We use Src_index to gather correspoinding place of X. " - "Then we need to use different pool type to scatter the result.") + "Define different pool types to receive the result tensors") .SetDefault("SUM") .InEnum({"SUM", "MEAN", "MIN", "MAX"}); // TODO(daisiming): Add a simple example here. AddComment(R"DOC( SendRecv Operator. -$Out = Scatter(Gather(X, Gather_index), Scatter_index, pool_type)$ +$Out = Recv(Send(X, Src_index), Dst_index, pool_type)$ -This operator helps perform fused computation of gather operator and scatter operator, so as to -decrease intermediate GPU memory occupation of using gather op and scatter op successively. 
+This operator Example: @@ -144,7 +142,7 @@ class SendRecvGradOpMaker : public framework::SingleGradOpMaker { op->SetInput("Dst_index", this->Input("Dst_index")); if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") { - op->SetInput("Scatter_count", this->Output("Scatter_count")); + op->SetInput("Dst_count", this->Output("Dst_count")); } if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" || diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/send_recv_op.cu index 1014794c8bc3ec..1fcff7e568ce06 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -51,18 +51,17 @@ struct SendRecvMinCUDAFunctor { }; template -__global__ void SendRecvCUDAKernel(const T* params, - const IndexT* gather_indices, - const IndexT* scatter_indices, T* output, +__global__ void SendRecvCUDAKernel(const T* params, const IndexT* src_indices, + const IndexT* dst_indices, T* output, size_t index_size, size_t slice_size, Functor functor) { CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { int64_t indices_i = i / slice_size; int64_t slice_i = i - indices_i * slice_size; - IndexT gather_i = gather_indices[indices_i]; - IndexT scatter_i = scatter_indices[indices_i]; - int64_t in_i = gather_i * slice_size + slice_i; - int64_t out_i = scatter_i * slice_size + slice_i; + IndexT src_i = src_indices[indices_i]; + IndexT dst_i = dst_indices[indices_i]; + int64_t in_i = src_i * slice_size + slice_i; + int64_t out_i = dst_i * slice_size + slice_i; functor(params, output, in_i, out_i); } } @@ -79,14 +78,13 @@ __global__ void InputResetCUDAKernel(T* output, size_t input_size, } } -// Get scatter_count +// Get dst_count template -__global__ void ComputeCountCUDAKernel(int* count, - const IndexT* scatter_indices, +__global__ void ComputeCountCUDAKernel(int* count, const IndexT* dst_indices, size_t index_size) { CUDA_KERNEL_LOOP_TYPE(i, index_size, int64_t) { - IndexT scatter_i = scatter_indices[i]; - paddle::platform::CudaAtomicAdd(count + scatter_i, 1); + IndexT dst_i = dst_indices[i]; + paddle::platform::CudaAtomicAdd(count + dst_i, 1); } } @@ -104,37 +102,34 @@ __global__ void ManipulateMeanCUDAKernel(T* output, int* count, // For backward mean template -__global__ void ManipulateMeanGradCUDAKernel(const T* params, - const IndexT* gather_indices, - const IndexT* scatter_indices, - T* output, size_t index_size, - size_t slice_size, - const int* scatter_count) { +__global__ void ManipulateMeanGradCUDAKernel( + const T* params, const IndexT* src_indices, const IndexT* dst_indices, + T* output, size_t index_size, size_t slice_size, const int* dst_count) { CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { int64_t indices_i = i / slice_size; int64_t slice_i = i - indices_i * slice_size; - IndexT gather_i = gather_indices[indices_i]; - IndexT scatter_i = scatter_indices[indices_i]; - int64_t in_i = gather_i * slice_size + slice_i; - int64_t out_i = scatter_i * slice_size + slice_i; + IndexT src_i = src_indices[indices_i]; + IndexT dst_i = dst_indices[indices_i]; + int64_t in_i = src_i * slice_size + slice_i; + int64_t out_i = dst_i * slice_size + slice_i; paddle::platform::CudaAtomicAdd(output + out_i, - *(params + in_i) / scatter_count[gather_i]); + *(params + in_i) / dst_count[src_i]); } } // For backward min and max template __global__ void ManipulateMinMaxGradCUDAKernel( - const T* params, const IndexT* gather_indices, - const IndexT* scatter_indices, T* output, size_t index_size, - size_t slice_size, const 
T* ptr_input, const T* ptr_output) { + const T* params, const IndexT* src_indices, const IndexT* dst_indices, + T* output, size_t index_size, size_t slice_size, const T* ptr_input, + const T* ptr_output) { CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { int64_t indices_i = i / slice_size; int64_t slice_i = i - indices_i * slice_size; - IndexT gather_i = gather_indices[indices_i]; - IndexT scatter_i = scatter_indices[indices_i]; - int64_t in_i = gather_i * slice_size + slice_i; - int64_t out_i = scatter_i * slice_size + slice_i; + IndexT src_i = src_indices[indices_i]; + IndexT dst_i = dst_indices[indices_i]; + int64_t in_i = src_i * slice_size + slice_i; + int64_t out_i = dst_i * slice_size + slice_i; paddle::platform::CudaAtomicAdd( output + out_i, *(params + in_i) * (*(ptr_input + out_i) == *(ptr_output + in_i))); @@ -146,12 +141,12 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input("X"); - auto* gather_index = ctx.Input("Src_index"); - auto* scatter_index = ctx.Input("Dst_index"); + auto* src_index = ctx.Input("Src_index"); + auto* dst_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); std::string pool_type = ctx.Attr("pool_type"); - const int& index_size = gather_index->dims()[0]; + const int& index_size = src_index->dims()[0]; if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); @@ -183,8 +178,8 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { slice_size *= src_dims[i]; } const T* p_src = X->data(); - const IndexT* g_index = gather_index->data(); - const IndexT* s_index = scatter_index->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); int block = 512; int64_t n = slice_size * index_size; @@ -195,14 +190,14 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); } else if (pool_type == "MAX") { SendRecvMaxCUDAFunctor functor; SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); int64_t grid_max = (input_size * slice_size + block - 1) / block; @@ -216,7 +211,7 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); int64_t grid_min = (input_size * slice_size + block - 1) / block; @@ -230,16 +225,16 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); - auto* scatter_count = ctx.Output("Scatter_count"); - int* p_scatter_count = scatter_count->mutable_data(ctx.GetPlace()); + auto* dst_count = ctx.Output("Dst_count"); + int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); #ifdef PADDLE_WITH_HIP - hipMemset(p_scatter_count, 0, input_size * sizeof(int)); + hipMemset(p_dst_count, 0, input_size * sizeof(int)); #else - cudaMemset(p_scatter_count, 0, input_size * sizeof(int)); 
+ cudaMemset(p_dst_count, 0, input_size * sizeof(int)); #endif int64_t grid_count = (index_size + block - 1) / block; @@ -247,15 +242,14 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { T, IndexT><<( ctx.device_context()) - .stream()>>>(p_scatter_count, s_index, index_size); + .stream()>>>(p_dst_count, d_index, index_size); int64_t grid_mean = (input_size * slice_size + block - 1) / block; ManipulateMeanCUDAKernel< T><<( ctx.device_context()) - .stream()>>>(p_output, p_scatter_count, input_size, - slice_size); + .stream()>>>(p_output, p_dst_count, input_size, slice_size); } } }; @@ -265,12 +259,12 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* gather_index = ctx.Input("Dst_index"); - auto* scatter_index = ctx.Input("Src_index"); + auto* src_index = ctx.Input("Dst_index"); + auto* dst_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); std::string pool_type = ctx.Attr("pool_type"); - const int& index_size = gather_index->dims()[0]; + const int& index_size = src_index->dims()[0]; if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); @@ -292,8 +286,8 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { slice_size *= src_dims[i]; } const T* p_src = X->data(); - const IndexT* g_index = gather_index->data(); - const IndexT* s_index = scatter_index->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); int block = 512; int64_t n = slice_size * index_size; @@ -304,15 +298,15 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { SendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); } else if (pool_type == "MEAN") { - auto* scatter_count = ctx.Input("Scatter_count"); - const int* s_count = scatter_count->data(); + auto* dst_count = ctx.Input("Dst_count"); + const int* s_count = dst_count->data(); ManipulateMeanGradCUDAKernel<<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, s_count); } else if (pool_type == "MAX" || pool_type == "MIN") { auto* input = ctx.Input("X"); @@ -322,7 +316,7 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { ManipulateMinMaxGradCUDAKernel<<< grid, block, 0, reinterpret_cast( ctx.device_context()) - .stream()>>>(p_src, g_index, s_index, p_output, + .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, ptr_input, ptr_output); } diff --git a/paddle/fluid/operators/send_recv_op.h b/paddle/fluid/operators/send_recv_op.h index b4c2c0fd003b1b..9141953c70fd22 100644 --- a/paddle/fluid/operators/send_recv_op.h +++ b/paddle/fluid/operators/send_recv_op.h @@ -73,41 +73,41 @@ void elementwise_inner_operation(const Tensor& src, Tensor* dst, } template -void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, - const IndexT* g_index, const IndexT* s_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - int* scatter_count = NULL) { +void send_recv_cpu_for_loop(const int& input_size, const int& index_size, + const IndexT* s_index, const IndexT* d_index, + const Tensor& src, Tensor* dst, + const std::string& pool_type, + int* dst_count = NULL) { 
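// Editor's note (descriptive comment added for clarity; not in the original patch):
// send_recv_cpu_for_loop visits each index pair i once, reading row s_index[i] of
// src and combining it into row d_index[i] of dst:
//   - SUM accumulates source rows into destination rows;
//   - MEAN accumulates like SUM, counts hits per destination in dst_count, then
//     divides every visited destination row by its count;
//   - MIN/MAX copy the source row on the first visit to a destination and apply
//     the elementwise min/max functor on later visits.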
Functor functor; if (pool_type == "SUM") { for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; + IndexT src_idx = s_index[i]; + IndexT dst_idx = d_index[i]; elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } } else if (pool_type == "MEAN") { for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; + IndexT src_idx = s_index[i]; + IndexT dst_idx = d_index[i]; elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } for (int i = 0; i < index_size; ++i) { - IndexT dst_idx = s_index[i]; - *(scatter_count + dst_idx) += 1; + IndexT dst_idx = d_index[i]; + *(dst_count + dst_idx) += 1; } for (int i = 0; i < input_size; ++i) { - if (*(scatter_count + i) == 0) continue; + if (*(dst_count + i) == 0) continue; auto dst_slice = dst->Slice(i, i + 1); auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst = eigen_dst / static_cast(*(scatter_count + i)); + eigen_dst = eigen_dst / static_cast(*(dst_count + i)); } } else if (pool_type == "MIN" || pool_type == "MAX") { std::set existed_dst; for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; + IndexT src_idx = s_index[i]; + IndexT dst_idx = d_index[i]; bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); if (!in_set) { elementwise_inner_operation(src, dst, src_idx, @@ -122,33 +122,35 @@ void gather_scatter_cpu_for_loop(const int& input_size, const int& index_size, } template -void gather_scatter_cpu_for_loop_grad( - const int& input_size, const int& index_size, const IndexT* g_index, - const IndexT* s_index, const Tensor& src, Tensor* dst, - const std::string& pool_type, const int* scatter_count = nullptr, - const Tensor* input = nullptr, const Tensor* output = nullptr) { +void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, + const IndexT* s_index, const IndexT* d_index, + const Tensor& src, Tensor* dst, + const std::string& pool_type, + const int* dst_count = nullptr, + const Tensor* input = nullptr, + const Tensor* output = nullptr) { if (pool_type == "SUM") { Functor functor; for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; + IndexT src_idx = s_index[i]; + IndexT dst_idx = d_index[i]; elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } } else if (pool_type == "MEAN") { for (int i = 0; i < index_size; ++i) { - IndexT src_idx = g_index[i]; - IndexT dst_idx = s_index[i]; + IndexT src_idx = s_index[i]; + IndexT dst_idx = d_index[i]; auto src_slice = src.Slice(src_idx, src_idx + 1); auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); auto eigen_src = framework::EigenVector::Flatten(src_slice); auto eigen_dst = framework::EigenVector::Flatten(dst_slice); - eigen_dst += (eigen_src / static_cast(scatter_count[src_idx])); + eigen_dst += (eigen_src / static_cast(dst_count[src_idx])); } } else if (pool_type == "MIN" || pool_type == "MAX") { for (int i = 0; i < index_size; ++i) { - auto forward_src_idx = s_index[i]; - auto forward_dst_idx = g_index[i]; + auto forward_src_idx = d_index[i]; + auto forward_dst_idx = s_index[i]; auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); auto eigen_input = framework::EigenVector::Flatten(input_slice); @@ -168,11 +170,11 @@ class SendRecvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& 
ctx) const override { auto* X = ctx.Input("X"); - auto* gather_index = ctx.Input("Src_index"); - auto* scatter_index = ctx.Input("Dst_index"); + auto* src_index = ctx.Input("Src_index"); + auto* dst_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); - const int& index_size = gather_index->dims()[0]; + const int& index_size = src_index->dims()[0]; if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); @@ -182,26 +184,26 @@ class SendRecvOpKernel : public framework::OpKernel { const size_t& memset_bytes = memset_size * sizeof(T); memset(p_output, 0, memset_bytes); - const IndexT* g_index = gather_index->data(); - const IndexT* s_index = scatter_index->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - gather_scatter_cpu_for_loop>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MIN") { - gather_scatter_cpu_for_loop>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MAX") { - gather_scatter_cpu_for_loop>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { - auto* scatter_count = ctx.Output("Scatter_count"); - int* p_scatter_count = scatter_count->mutable_data(ctx.GetPlace()); - memset(p_scatter_count, 0, src_dims[0] * sizeof(int)); - gather_scatter_cpu_for_loop>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, - p_scatter_count); + auto* dst_count = ctx.Output("Dst_count"); + int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); + memset(p_dst_count, 0, src_dims[0] * sizeof(int)); + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, + p_dst_count); } } }; @@ -211,11 +213,11 @@ class SendRecvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* gather_index = ctx.Input("Dst_index"); - auto* scatter_index = ctx.Input("Src_index"); + auto* src_index = ctx.Input("Dst_index"); + auto* dst_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); - const int& index_size = gather_index->dims()[0]; + const int& index_size = src_index->dims()[0]; if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); @@ -225,25 +227,25 @@ class SendRecvGradOpKernel : public framework::OpKernel { const size_t& memset_bytes = memset_size * sizeof(T); memset(p_output, 0, memset_bytes); - const IndexT* g_index = gather_index->data(); - const IndexT* s_index = scatter_index->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - gather_scatter_cpu_for_loop_grad>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type); + send_recv_cpu_for_loop_grad>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { - auto* scatter_count = ctx.Input("Scatter_count"); - const int* s_count = scatter_count->data(); + auto* dst_count = ctx.Input("Dst_count"); + const int* s_count = 
dst_count->data(); // Functor not used here. - gather_scatter_cpu_for_loop_grad>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, s_count); + send_recv_cpu_for_loop_grad>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); } else if (pool_type == "MIN" || pool_type == "MAX") { const auto* input = ctx.Input("X"); const auto* output = ctx.Input("Out"); // Functor not used here. - gather_scatter_cpu_for_loop_grad>( - src_dims[0], index_size, g_index, s_index, *X, Y, pool_type, nullptr, + send_recv_cpu_for_loop_grad>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, input, output); } } diff --git a/python/paddle/fluid/tests/unittests/test_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_send_recv_op.py index 0ad1603c1e5d8a..aa02942f3736a5 100644 --- a/python/paddle/fluid/tests/unittests/test_send_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_send_recv_op.py @@ -24,19 +24,15 @@ def setUp(self): self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) - gather_index = index[:, 0] - scatter_index = index[:, 1] + src_index = index[:, 0] + dst_index = index[:, 1] - self.inputs = { - 'X': x, - 'Src_index': gather_index, - 'Dst_index': scatter_index - } + self.inputs = {'X': x, 'Src_index': src_index, 'Dst_index': dst_index} self.attrs = {'pool_type': 'MAX'} - out, self.gradient = compute_gather_scatter_for_min_max(self.inputs, - self.attrs) + out, self.gradient = compute_send_recv_for_min_max(self.inputs, + self.attrs) self.outputs = {'Out': out} def test_check_output(self): @@ -52,19 +48,15 @@ def setUp(self): self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) - gather_index = index[:, 0] - scatter_index = index[:, 1] + src_index = index[:, 0] + dst_index = index[:, 1] - self.inputs = { - 'X': x, - 'Src_index': gather_index, - 'Dst_index': scatter_index - } + self.inputs = {'X': x, 'Src_index': src_index, 'Dst_index': dst_index} self.attrs = {'pool_type': 'MIN'} - out, self.gradient = compute_gather_scatter_for_min_max(self.inputs, - self.attrs) + out, self.gradient = compute_send_recv_for_min_max(self.inputs, + self.attrs) self.outputs = {'Out': out} @@ -81,18 +73,14 @@ def setUp(self): self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) - gather_index = index[:, 0] - scatter_index = index[:, 1] + src_index = index[:, 0] + dst_index = index[:, 1] - self.inputs = { - 'X': x, - 'Src_index': gather_index, - 'Dst_index': scatter_index - } + self.inputs = {'X': x, 'Src_index': src_index, 'Dst_index': dst_index} self.attrs = {'pool_type': 'SUM'} - out, _ = compute_gather_scatter_for_sum_mean(self.inputs, self.attrs) + out, _ = compute_send_recv_for_sum_mean(self.inputs, self.attrs) self.outputs = {'Out': out} @@ -109,21 +97,16 @@ def setUp(self): self.op_type = "send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) - gather_index = index[:, 0] - scatter_index = index[:, 1] + src_index = index[:, 0] + dst_index = index[:, 1] - self.inputs = { - 'X': x, - 'Src_index': gather_index, - 'Dst_index': scatter_index - } + self.inputs = {'X': x, 'Src_index': src_index, 'Dst_index': dst_index} self.attrs = {'pool_type': 'MEAN'} - out, scatter_count = compute_gather_scatter_for_sum_mean(self.inputs, - self.attrs) + out, dst_count 
= compute_send_recv_for_sum_mean(self.inputs, self.attrs) - self.outputs = {'Out': out, 'Scatter_count': scatter_count} + self.outputs = {'Out': out, 'Dst_count': dst_count} def test_check_output(self): self.check_output() @@ -132,22 +115,22 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -def compute_gather_scatter_for_sum_mean(inputs, attributes): +def compute_send_recv_for_sum_mean(inputs, attributes): x = inputs['X'] - gather_index = inputs['Src_index'] - scatter_index = inputs['Dst_index'] + src_index = inputs['Src_index'] + dst_index = inputs['Dst_index'] pool_type = attributes['pool_type'] - gather_x = x[gather_index] + gather_x = x[src_index] target_shape = list(x.shape) results = np.zeros(target_shape, dtype=x.dtype) if pool_type == 'SUM': - for index, s_id in enumerate(scatter_index): + for index, s_id in enumerate(dst_index): results[s_id, :] += gather_x[index, :] elif pool_type == 'MEAN': count = np.zeros(target_shape[0], dtype=np.int32) - for index, s_id in enumerate(scatter_index): + for index, s_id in enumerate(dst_index): results[s_id, :] += gather_x[index, :] count[s_id] += 1 results = results / count.reshape([-1, 1]) @@ -156,20 +139,20 @@ def compute_gather_scatter_for_sum_mean(inputs, attributes): raise ValueError("Invalid pool_type, only SUM, MEAN supported!") count = np.zeros(target_shape[0], dtype=np.int32) - for index, s_id in enumerate(scatter_index): + for index, s_id in enumerate(dst_index): count[s_id] += 1 return results, count -def compute_gather_scatter_for_min_max(inputs, attributes): +def compute_send_recv_for_min_max(inputs, attributes): x = inputs['X'] - gather_index = inputs['Src_index'] - scatter_index = inputs['Dst_index'] + src_index = inputs['Src_index'] + dst_index = inputs['Dst_index'] pool_type = attributes['pool_type'] - gather_x = x[gather_index] + gather_x = x[src_index] target_shape = list(x.shape) results = np.zeros(target_shape, dtype=x.dtype) gradient = np.zeros_like(x) @@ -177,7 +160,7 @@ def compute_gather_scatter_for_min_max(inputs, attributes): # Calculate forward output if pool_type == "MAX": first_set = set() - for index, s_id in enumerate(scatter_index): + for index, s_id in enumerate(dst_index): if s_id not in first_set: results[s_id, :] += gather_x[index, :] first_set.add(s_id) @@ -186,7 +169,7 @@ def compute_gather_scatter_for_min_max(inputs, attributes): gather_x[index, :]) elif pool_type == "MIN": first_set = set() - for index, s_id in enumerate(scatter_index): + for index, s_id in enumerate(dst_index): if s_id not in first_set: results[s_id, :] += gather_x[index, :] first_set.add(s_id) @@ -197,10 +180,10 @@ def compute_gather_scatter_for_min_max(inputs, attributes): raise ValueError("Invalid pool_type, only MAX, MIN supported!") # Calculate backward gradient - index_size = len(gather_index) + index_size = len(src_index) for i in range(index_size): - forward_src_idx = gather_index[i] - forward_dst_idx = scatter_index[i] + forward_src_idx = src_index[i] + forward_dst_idx = dst_index[i] gradient[forward_src_idx] += 1 * ( x[forward_src_idx] == results[forward_dst_idx]) From 05c4acf0731e44d8bd75bb385e4deb63efaed997 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 16 Nov 2021 11:43:53 +0000 Subject: [PATCH 17/30] change zero index return time --- paddle/fluid/operators/send_recv_op.cc | 13 +++++-------- paddle/fluid/operators/send_recv_op.cu | 3 ++- paddle/fluid/operators/send_recv_op.h | 3 ++- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cc 
b/paddle/fluid/operators/send_recv_op.cc index e19575833597f9..a4bc51170de504 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -109,10 +109,10 @@ class SendRecvOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Dst_count", "Count tensor of Dst_index, mainly for MEAN pool_type.") .AsIntermediate(); - AddAttr( - "pool_type", - "(string, default 'SUM')" - "Define different pool types to receive the result tensors") + AddAttr("pool_type", + "(string, default 'SUM')" + "Define different pool types to receive the result " + "tensors of Dst_index.") .SetDefault("SUM") .InEnum({"SUM", "MEAN", "MIN", "MAX"}); // TODO(daisiming): Add a simple example here. @@ -121,11 +121,8 @@ SendRecv Operator. $Out = Recv(Send(X, Src_index), Dst_index, pool_type)$ -This operator +This operator is mainly used in Graph domain. We use this operator to perform message passing process. We can gather feature according to Src_index, and then use different pool types to define how to receive the result tensors of Dst_index. -Example: - -pass )DOC"); } }; diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/send_recv_op.cu index 1fcff7e568ce06..9cdf8acdbe3501 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -147,7 +147,6 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { std::string pool_type = ctx.Attr("pool_type"); const int& index_size = src_index->dims()[0]; - if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); @@ -173,6 +172,8 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { std::numeric_limits::max()); } + if (index_size == 0) return; + int64_t slice_size = 1; for (int i = 1; i < src_dims.size(); ++i) { slice_size *= src_dims[i]; diff --git a/paddle/fluid/operators/send_recv_op.h b/paddle/fluid/operators/send_recv_op.h index 9141953c70fd22..64dc474b60e0b7 100644 --- a/paddle/fluid/operators/send_recv_op.h +++ b/paddle/fluid/operators/send_recv_op.h @@ -175,7 +175,6 @@ class SendRecvOpKernel : public framework::OpKernel { auto* Y = ctx.Output("Out"); const int& index_size = src_index->dims()[0]; - if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); @@ -184,6 +183,8 @@ class SendRecvOpKernel : public framework::OpKernel { const size_t& memset_bytes = memset_size * sizeof(T); memset(p_output, 0, memset_bytes); + if (index_size == 0) return; + const IndexT* s_index = src_index->data(); const IndexT* d_index = dst_index->data(); From c4408b53e714545a4511d9150c2a96bb24033a57 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Tue, 16 Nov 2021 13:52:21 +0000 Subject: [PATCH 18/30] add send_recv incubate api --- paddle/fluid/operators/send_recv_op.cc | 2 +- python/paddle/incubate/__init__.py | 2 + python/paddle/incubate/operators/__init__.py | 1 + python/paddle/incubate/operators/send_recv.py | 87 +++++++++++++++++++ 4 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 python/paddle/incubate/operators/send_recv.py diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/send_recv_op.cc index a4bc51170de504..24bd47a6bd8265 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -121,7 +121,7 @@ SendRecv Operator. $Out = Recv(Send(X, Src_index), Dst_index, pool_type)$ -This operator is mainly used in Graph domain. 
We use this operator to perform message passing process. We can gather feature according to Src_index, and then use different pool types to define how to receive the result tensors of Dst_index. +This operator is mainly used in Graph domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` to gather the corresponding positions, and then scatter the corresponding output tensor in different pooling types, like sum, mean, max, or min. )DOC"); } diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index f44e38347e5383..6445ee74c0d065 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -18,6 +18,7 @@ from ..fluid.layer_helper import LayerHelper # noqa: F401 from .operators import softmax_mask_fuse_upper_triangle # noqa: F401 from .operators import softmax_mask_fuse # noqa: F401 +from .operators import send_recv from .tensor import segment_sum from .tensor import segment_mean from .tensor import segment_max @@ -30,6 +31,7 @@ 'ModelAverage', 'softmax_mask_fuse_upper_triangle', 'softmax_mask_fuse', + 'send_recv', 'segment_sum', 'segment_mean', 'segment_max', diff --git a/python/paddle/incubate/operators/__init__.py b/python/paddle/incubate/operators/__init__.py index 9a6710d0950974..90bcaddd79f5da 100644 --- a/python/paddle/incubate/operators/__init__.py +++ b/python/paddle/incubate/operators/__init__.py @@ -15,3 +15,4 @@ from .softmax_mask_fuse_upper_triangle import softmax_mask_fuse_upper_triangle # noqa: F401 from .softmax_mask_fuse import softmax_mask_fuse # noqa: F401 from .resnet_unit import ResNetUnit #noqa: F401 +from .send_recv import send_recv #noqa: F401 diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/send_recv.py new file mode 100644 index 00000000000000..2f145780e02a5d --- /dev/null +++ b/python/paddle/incubate/operators/send_recv.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle.fluid import core + + +def send_recv(x, src_index, dst_index, pool_type, name=None): + r""" + + Send Recv Operator. + + This operator is mainly used in Graph domain, and the main purpose is to reduce intermediate memory + consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` + to gather the corresponding positions, and then scatter the corresponding output tensor in different + pooling types, like sum, mean, max, or min. + + Args: + x (Tensor): The input tensor, and the available data type is float32, float64, int32, int64. + src_index (Tensor): An 1-d tensor, and the available data type is int32, int64. + dst_index (Tensor): An 1-d tensor, and should have the same shape as `src_index`. 
+ The available data type is int32, int64. + pool_type (str): The pooling type of send_recv, including `sum`, `mean`, `max`, `min`. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): The output tensor, should have the same shape as input tensor `x`. + + Examples: + + .. code-block:: python + + import paddle + import numpy as np + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") + indexes = paddle.to_tensor(np.array([[0, 1], [1, 2], [2, 1], [0, 0]]), dtype="int32") + src_index = indexes[:, 0] + dst_index = indexes[:, 1] + out = paddle.incubate.send_recv(x, src_index, dst_index, pool_type="sum") + # Outputs: [[0., 2., 3.], [2., 8., 10.], [2., 6., 7.]] + + """ + + if pool_type not in ["sum", "mean", "max", "min"]: + raise ValueError( + "pool_type should be `sum`, `mean`, `max` or `min`, but received %s" + % pool_type) + + if in_dygraph_mode(): + out, tmp = core.ops.send_recv(x, src_index, dst_index, 'pool_type', + pool_type.upper()) + return out + + check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"), + "send_recv") + check_variable_and_dtype(src_index, "Src_index", ("int32", "int64"), + "send_recv") + check_variable_and_dtype(dst_index, "Dst_index", ("int32", "int64"), + "send_recv") + + helper = LayerHelper("send_recv", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + dst_count = helper.create_variable_for_type_inference( + dtype="int32", stop_gradient=True) + helper.append_op( + type="send_recv", + inputs={"X": x, + "Src_index": src_index, + "Dst_index": dst_index}, + outputs={"Out": out, + "Dst_count": dst_count}, + attrs={"pool_type": pool_type.upper()}) + return out From 8e435f52053804408bd7cd3406da2ac62efc7961 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 03:45:53 +0000 Subject: [PATCH 19/30] fix index data type, add unittest case for API --- paddle/fluid/operators/send_recv_op.cc | 33 +- paddle/fluid/operators/send_recv_op.cu | 379 +++++++++--------- paddle/fluid/operators/send_recv_op.h | 170 ++++---- .../tests/unittests/test_send_recv_op.py | 81 +++- 4 files changed, 396 insertions(+), 267 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/send_recv_op.cc index 24bd47a6bd8265..64bb1fb13beaa6 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -121,7 +121,11 @@ SendRecv Operator. $Out = Recv(Send(X, Src_index), Dst_index, pool_type)$ -This operator is mainly used in Graph domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` to gather the corresponding positions, and then scatter the corresponding output tensor in different pooling types, like sum, mean, max, or min. +This operator is mainly used in Graph domain, and the main purpose is to reduce +intermediate memory consumption in the process of message passing. +Take `x` as the input tensor, we first use `src_index` to gather corresponding +positions, and then scatter the corresponding output tensor in different pooling +types, like sum, mean, max, or min. 
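Editor's sketch of a worked example (added for clarity, mirroring the Python docstring example above; not part of the patch): with

    X = [[0, 2, 3], [1, 4, 5], [2, 6, 7]]
    Src_index = [0, 1, 2, 0]
    Dst_index = [1, 2, 1, 0]
    pool_type = "SUM"

row X[Src_index[i]] is sent to output row Dst_index[i], and rows received at the same destination are summed:

    Out[0] = X[0]        = [0, 2, 3]
    Out[1] = X[0] + X[2] = [2, 8, 10]
    Out[2] = X[1]        = [1, 4, 5]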
)DOC"); } @@ -164,21 +168,12 @@ REGISTER_OPERATOR(send_recv, ops::SendRecvOP, ops::SendRecvOpMaker, ops::SendRecvGradOpMaker, ops::SendRecvGradOpMaker); REGISTER_OPERATOR(send_recv_grad, ops::SendRecvGradOp); -REGISTER_OP_CPU_KERNEL(send_recv, ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel); - -REGISTER_OP_CPU_KERNEL(send_recv_grad, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel); +REGISTER_OP_CPU_KERNEL(send_recv, ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel, + ops::SendRecvOpKernel); + +REGISTER_OP_CPU_KERNEL(send_recv_grad, ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel, + ops::SendRecvGradOpKernel); diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/send_recv_op.cu index 9cdf8acdbe3501..15dcda7e1cb077 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -137,189 +137,218 @@ __global__ void ManipulateMinMaxGradCUDAKernel( } template -class SendRecvOpCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* src_index = ctx.Input("Src_index"); - auto* dst_index = ctx.Input("Dst_index"); - auto* Y = ctx.Output("Out"); - std::string pool_type = ctx.Attr("pool_type"); +void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx) { + auto* X = ctx.Input("X"); + auto* src_index = ctx.Input("Src_index"); + auto* dst_index = ctx.Input("Dst_index"); + auto* Y = ctx.Output("Out"); + std::string pool_type = ctx.Attr("pool_type"); + + const int& index_size = src_index->dims()[0]; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + const auto& src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); + if (pool_type == "SUM" || pool_type == "MEAN") { +#ifdef PADDLE_WITH_HIP + hipMemset(p_output, 0, memset_bytes); +#else + cudaMemset(p_output, 0, memset_bytes); +#endif + } else if (pool_type == "MAX") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, + std::numeric_limits::min()); + } else if (pool_type == "MIN") { + thrust::device_ptr p_output_ptr(p_output); + thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, + std::numeric_limits::max()); + } - const int& index_size = src_index->dims()[0]; + if (index_size == 0) return; + + int64_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) { + slice_size *= src_dims[i]; + } + const T* p_src = X->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); + + int block = 512; + int64_t n = slice_size * index_size; + int64_t grid = (n + block - 1) / block; + int64_t input_size = src_dims[0]; + if (pool_type == "SUM") { + SendRecvSumCUDAFunctor functor; + SendRecvCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, functor); + } else if (pool_type == "MAX") { + SendRecvMaxCUDAFunctor functor; + SendRecvCUDAKernel><<< + grid, 
block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, functor); + + int64_t grid_max = (input_size * slice_size + block - 1) / block; + InputResetCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, input_size, slice_size); + } else if (pool_type == "MIN") { + SendRecvMinCUDAFunctor functor; + SendRecvCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, functor); + + int64_t grid_min = (input_size * slice_size + block - 1) / block; + InputResetCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, input_size, slice_size); + } else if (pool_type == "MEAN") { + SendRecvSumCUDAFunctor functor; + SendRecvCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, functor); + + auto* dst_count = ctx.Output("Dst_count"); + int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - if (pool_type == "SUM" || pool_type == "MEAN") { #ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); + hipMemset(p_dst_count, 0, input_size * sizeof(int)); #else - cudaMemset(p_output, 0, memset_bytes); + cudaMemset(p_dst_count, 0, input_size * sizeof(int)); #endif - } else if (pool_type == "MAX") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::min()); - } else if (pool_type == "MIN") { - thrust::device_ptr p_output_ptr(p_output); - thrust::fill(thrust::device, p_output_ptr, p_output_ptr + memset_size, - std::numeric_limits::max()); - } - - if (index_size == 0) return; + int64_t grid_count = (index_size + block - 1) / block; + ComputeCountCUDAKernel< + T, IndexT><<( + ctx.device_context()) + .stream()>>>(p_dst_count, d_index, index_size); + + int64_t grid_mean = (input_size * slice_size + block - 1) / block; + ManipulateMeanCUDAKernel< + T><<( + ctx.device_context()) + .stream()>>>(p_output, p_dst_count, input_size, slice_size); + } +} - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index->data(); - const IndexT* d_index = dst_index->data(); - - int block = 512; - int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - SendRecvSumCUDAFunctor functor; - SendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MAX") { - SendRecvMaxCUDAFunctor functor; - SendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_max = (input_size * slice_size + block - 1) / block; - InputResetCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MIN") { - SendRecvMinCUDAFunctor functor; - SendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - 
.stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - int64_t grid_min = (input_size * slice_size + block - 1) / block; - InputResetCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, input_size, slice_size); - } else if (pool_type == "MEAN") { - SendRecvSumCUDAFunctor functor; - SendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); +template +void SendRecvGradOpCUDAKernelLaunchHelper( + const framework::ExecutionContext& ctx) { + auto* X = ctx.Input(framework::GradVarName("Out")); + auto* src_index = ctx.Input("Dst_index"); + auto* dst_index = ctx.Input("Src_index"); + auto* Y = ctx.Output(framework::GradVarName("X")); + std::string pool_type = ctx.Attr("pool_type"); + + const int& index_size = src_index->dims()[0]; + if (index_size == 0) return; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + const auto& src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + const size_t& memset_bytes = memset_size * sizeof(T); #ifdef PADDLE_WITH_HIP - hipMemset(p_dst_count, 0, input_size * sizeof(int)); + hipMemset(p_output, 0, memset_bytes); #else - cudaMemset(p_dst_count, 0, input_size * sizeof(int)); + cudaMemset(p_output, 0, memset_bytes); #endif - int64_t grid_count = (index_size + block - 1) / block; - ComputeCountCUDAKernel< - T, IndexT><<( - ctx.device_context()) - .stream()>>>(p_dst_count, d_index, index_size); - - int64_t grid_mean = (input_size * slice_size + block - 1) / block; - ManipulateMeanCUDAKernel< - T><<( - ctx.device_context()) - .stream()>>>(p_output, p_dst_count, input_size, slice_size); + int64_t slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) { + slice_size *= src_dims[i]; + } + const T* p_src = X->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); + + int block = 512; + int64_t n = slice_size * index_size; + int64_t grid = (n + block - 1) / block; + int64_t input_size = src_dims[0]; + + if (pool_type == "SUM") { + SendRecvSumCUDAFunctor functor; + SendRecvCUDAKernel><<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, functor); + } else if (pool_type == "MEAN") { + auto* dst_count = ctx.Input("Dst_count"); + const int* s_count = dst_count->data(); + ManipulateMeanGradCUDAKernel<<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, s_count); + } else if (pool_type == "MAX" || pool_type == "MIN") { + auto* input = ctx.Input("X"); + auto* output = ctx.Input("Out"); + const T* ptr_input = input->data(); + const T* ptr_output = output->data(); + ManipulateMinMaxGradCUDAKernel<<< + grid, block, 0, reinterpret_cast( + ctx.device_context()) + .stream()>>>(p_src, s_index, d_index, p_output, + index_size, slice_size, ptr_input, + ptr_output); + } +} + +template +class SendRecvOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* src_index = ctx.Input("Src_index"); + auto index_type = src_index->type(); + + if (index_type == framework::proto::VarType::INT32) { + SendRecvOpCUDAKernelLaunchHelper(ctx); + } else if (index_type == 
framework::proto::VarType::INT64) { + SendRecvOpCUDAKernelLaunchHelper(ctx); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported Src_index or Dst_index type, Expected int, int64, but " + "got %s.", + index_type)); } } }; -template +template class SendRecvGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input(framework::GradVarName("Out")); auto* src_index = ctx.Input("Dst_index"); - auto* dst_index = ctx.Input("Src_index"); - auto* Y = ctx.Output(framework::GradVarName("X")); - std::string pool_type = ctx.Attr("pool_type"); - - const int& index_size = src_index->dims()[0]; - if (index_size == 0) return; - - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; - } - const size_t& memset_bytes = memset_size * sizeof(T); - -#ifdef PADDLE_WITH_HIP - hipMemset(p_output, 0, memset_bytes); -#else - cudaMemset(p_output, 0, memset_bytes); -#endif - - int64_t slice_size = 1; - for (int i = 1; i < src_dims.size(); ++i) { - slice_size *= src_dims[i]; - } - const T* p_src = X->data(); - const IndexT* s_index = src_index->data(); - const IndexT* d_index = dst_index->data(); - - int block = 512; - int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; - int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { - SendRecvSumCUDAFunctor functor; - SendRecvCUDAKernel><<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, functor); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - ManipulateMeanGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, s_count); - } else if (pool_type == "MAX" || pool_type == "MIN") { - auto* input = ctx.Input("X"); - auto* output = ctx.Input("Out"); - const T* ptr_input = input->data(); - const T* ptr_output = output->data(); - ManipulateMinMaxGradCUDAKernel<<< - grid, block, 0, reinterpret_cast( - ctx.device_context()) - .stream()>>>(p_src, s_index, d_index, p_output, - index_size, slice_size, ptr_input, - ptr_output); + auto index_type = src_index->type(); + if (index_type == framework::proto::VarType::INT32) { + SendRecvGradOpCUDAKernelLaunchHelper(ctx); + } else if (index_type == framework::proto::VarType::INT64) { + SendRecvGradOpCUDAKernelLaunchHelper(ctx); } } }; @@ -330,21 +359,13 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { using CUDA = paddle::platform::CUDADeviceContext; namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(send_recv, ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(send_recv, ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel, + ops::SendRecvOpCUDAKernel); REGISTER_OP_CUDA_KERNEL(send_recv_grad, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel); + 
ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel, + ops::SendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/send_recv_op.h b/paddle/fluid/operators/send_recv_op.h index 64dc474b60e0b7..151b10f26eead0 100644 --- a/paddle/fluid/operators/send_recv_op.h +++ b/paddle/fluid/operators/send_recv_op.h @@ -166,88 +166,122 @@ void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, } template -class SendRecvOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input("X"); - auto* src_index = ctx.Input("Src_index"); - auto* dst_index = ctx.Input("Dst_index"); - auto* Y = ctx.Output("Out"); +void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { + auto* X = ctx.Input("X"); + auto* src_index = ctx.Input("Src_index"); + auto* dst_index = ctx.Input("Dst_index"); + auto* Y = ctx.Output("Out"); + + const int& index_size = src_index->dims()[0]; + + T* p_output = Y->mutable_data(ctx.GetPlace()); + const auto& src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; + const size_t& memset_bytes = memset_size * sizeof(T); + memset(p_output, 0, memset_bytes); + + if (index_size == 0) return; + + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); + const std::string& pool_type = ctx.Attr("pool_type"); + if (pool_type == "SUM") { + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); + } else if (pool_type == "MIN") { + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); + } else if (pool_type == "MAX") { + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); + } else if (pool_type == "MEAN") { + auto* dst_count = ctx.Output("Dst_count"); + int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); + memset(p_dst_count, 0, src_dims[0] * sizeof(int)); + send_recv_cpu_for_loop>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, + p_dst_count); + } +} - const int& index_size = src_index->dims()[0]; +template +void SendRecvGradOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { + auto* X = ctx.Input(framework::GradVarName("Out")); + auto* src_index = ctx.Input("Dst_index"); + auto* dst_index = ctx.Input("Src_index"); + auto* Y = ctx.Output(framework::GradVarName("X")); + + const int& index_size = src_index->dims()[0]; - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); + T* p_output = Y->mutable_data(ctx.GetPlace()); + const auto& src_dims = X->dims(); + int64_t memset_size = 1; + for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; + const size_t& memset_bytes = memset_size * sizeof(T); + memset(p_output, 0, memset_bytes); - if (index_size == 0) return; + if (index_size == 0) return; - const IndexT* s_index = src_index->data(); - const IndexT* d_index = dst_index->data(); + const IndexT* s_index = src_index->data(); + const IndexT* d_index = dst_index->data(); - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MIN") { 
- send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MAX") { - send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Output("Dst_count"); - int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); - memset(p_dst_count, 0, src_dims[0] * sizeof(int)); - send_recv_cpu_for_loop>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, - p_dst_count); + const std::string& pool_type = ctx.Attr("pool_type"); + if (pool_type == "SUM") { + send_recv_cpu_for_loop_grad>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); + } else if (pool_type == "MEAN") { + auto* dst_count = ctx.Input("Dst_count"); + const int* s_count = dst_count->data(); + // Functor not used here. + send_recv_cpu_for_loop_grad>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); + } else if (pool_type == "MIN" || pool_type == "MAX") { + const auto* input = ctx.Input("X"); + const auto* output = ctx.Input("Out"); + // Functor not used here. + send_recv_cpu_for_loop_grad>( + src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, + input, output); + } +} + +template +class SendRecvOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* src_index = ctx.Input("Src_index"); + auto index_type = src_index->type(); + + if (index_type == framework::proto::VarType::INT32) { + SendRecvOpKernelLaunchHelper(ctx); + } else if (index_type == framework::proto::VarType::INT64) { + SendRecvOpKernelLaunchHelper(ctx); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported Src_index or Dst_index type, Expected int, int64, but " + "got %s.", + index_type)); } } }; -template +template class SendRecvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* X = ctx.Input(framework::GradVarName("Out")); auto* src_index = ctx.Input("Dst_index"); - auto* dst_index = ctx.Input("Src_index"); - auto* Y = ctx.Output(framework::GradVarName("X")); - - const int& index_size = src_index->dims()[0]; - if (index_size == 0) return; + auto index_type = src_index->type(); - T* p_output = Y->mutable_data(ctx.GetPlace()); - const auto& src_dims = X->dims(); - int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; - const size_t& memset_bytes = memset_size * sizeof(T); - memset(p_output, 0, memset_bytes); - - const IndexT* s_index = src_index->data(); - const IndexT* d_index = dst_index->data(); - - const std::string& pool_type = ctx.Attr("pool_type"); - if (pool_type == "SUM") { - send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); - } else if (pool_type == "MEAN") { - auto* dst_count = ctx.Input("Dst_count"); - const int* s_count = dst_count->data(); - // Functor not used here. - send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); - } else if (pool_type == "MIN" || pool_type == "MAX") { - const auto* input = ctx.Input("X"); - const auto* output = ctx.Input("Out"); - // Functor not used here. 
- send_recv_cpu_for_loop_grad>( - src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, - input, output); + if (index_type == framework::proto::VarType::INT32) { + SendRecvGradOpKernelLaunchHelper(ctx); + } else if (index_type == framework::proto::VarType::INT64) { + SendRecvGradOpKernelLaunchHelper(ctx); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported Src_index or Dst_index type, Expected int, int64, but " + "got %s.", + index_type)); } } }; diff --git a/python/paddle/fluid/tests/unittests/test_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_send_recv_op.py index aa02942f3736a5..2f87357e2b26b8 100644 --- a/python/paddle/fluid/tests/unittests/test_send_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_send_recv_op.py @@ -12,11 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import unittest + import numpy as np -from op_test import OpTest import paddle import paddle.fluid as fluid +from op_test import OpTest + class TestSendRecvMaxOp(OpTest): def setUp(self): @@ -188,3 +191,79 @@ def compute_send_recv_for_min_max(inputs, attributes): x[forward_src_idx] == results[forward_dst_idx]) return results, gradient / results.size + + +class API_SendRecvOpTest(unittest.TestCase): + def test_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name="x", shape=[3, 3], dtype="float32") + src_index = paddle.static.data(name="src", shape=[4], dtype="int32") + dst_index = paddle.static.data(name="dst", shape=[4], dtype="int32") + + res_sum = paddle.incubate.send_recv(x, src_index, dst_index, "sum") + res_mean = paddle.incubate.send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.send_recv(x, src_index, dst_index, "max") + res_min = paddle.incubate.send_recv(x, src_index, dst_index, "min") + + exe = paddle.static.Executor(paddle.CPUPlace()) + data1 = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype='float32') + data2 = np.array([0, 1, 2, 0], dtype="int32") + data3 = np.array([1, 2, 1, 0], dtype="int32") + + np_sum = np.array( + [[0, 2, 3], [2, 8, 10], [1, 4, 5]], dtype="float32") + np_mean = np.array( + [[0, 2, 3], [1, 4, 5], [1, 4, 5]], dtype="float32") + np_max = np.array( + [[0, 2, 3], [2, 6, 7], [1, 4, 5]], dtype="float32") + np_min = np.array( + [[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="float32") + + ret = exe.run(feed={'x': data1, + 'src': data2, + 'dst': data3}, + fetch_list=[res_sum, res_mean, res_max, res_min]) + + for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret): + self.assertTrue( + np.allclose( + np_res, ret_res, atol=1e-6), + "two value is\ + {}\n{}, check diff!".format(np_res, ret_res)) + + def test_dygraph(self): + device = paddle.CPUPlace() + with paddle.fluid.dygraph.guard(device): + x = paddle.to_tensor( + np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") + res_sum = paddle.incubate.send_recv(x, src_index, dst_index, "sum") + res_mean = paddle.incubate.send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.send_recv(x, src_index, dst_index, "max") + res_min = paddle.incubate.send_recv(x, src_index, dst_index, "min") + + np_sum = np.array( + [[0, 2, 3], [2, 8, 10], [1, 4, 5]], dtype="float32") + np_mean = np.array( + [[0, 2, 3], [1, 4, 5], [1, 4, 5]], dtype="float32") + np_max = np.array( 
+ [[0, 2, 3], [2, 6, 7], [1, 4, 5]], dtype="float32") + np_min = np.array( + [[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="float32") + + ret = [res_sum, res_mean, res_max, res_min] + + for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret): + self.assertTrue( + np.allclose( + np_res, ret_res, atol=1e-6), + "two value is\ + {}\n{}, check diff!".format(np_res, ret_res)) + + +if __name__ == '__main__': + unittest.main() From e7a3c0d32d245aa7b3b841bf55166b24d4c613a6 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 04:03:06 +0000 Subject: [PATCH 20/30] delete redundant input tensor --- paddle/fluid/operators/send_recv_op.cu | 26 ++++++++++++++------------ paddle/fluid/operators/send_recv_op.h | 25 +++++++++++++------------ 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/send_recv_op.cu index 15dcda7e1cb077..1a8fff782b05a2 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -137,14 +137,14 @@ __global__ void ManipulateMinMaxGradCUDAKernel( } template -void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx) { +void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, + const Tensor& src_index) { auto* X = ctx.Input("X"); - auto* src_index = ctx.Input("Src_index"); auto* dst_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); std::string pool_type = ctx.Attr("pool_type"); - const int& index_size = src_index->dims()[0]; + const int& index_size = src_index.dims()[0]; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); @@ -176,7 +176,7 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx) { slice_size *= src_dims[i]; } const T* p_src = X->data(); - const IndexT* s_index = src_index->data(); + const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index->data(); int block = 512; @@ -253,14 +253,13 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx) { template void SendRecvGradOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx) { + const framework::ExecutionContext& ctx, const Tensor& src_index) { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* src_index = ctx.Input("Dst_index"); auto* dst_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); std::string pool_type = ctx.Attr("pool_type"); - const int& index_size = src_index->dims()[0]; + const int& index_size = src_index.dims()[0]; if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); @@ -282,7 +281,7 @@ void SendRecvGradOpCUDAKernelLaunchHelper( slice_size *= src_dims[i]; } const T* p_src = X->data(); - const IndexT* s_index = src_index->data(); + const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index->data(); int block = 512; @@ -327,9 +326,10 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvOpCUDAKernelLaunchHelper(ctx); + SendRecvOpCUDAKernelLaunchHelper(ctx, *src_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvOpCUDAKernelLaunchHelper(ctx); + SendRecvOpCUDAKernelLaunchHelper(ctx, + *src_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index type, Expected int, int64, but " @@ -346,9 +346,11 @@ class SendRecvGradOpCUDAKernel : public 
framework::OpKernel { auto* src_index = ctx.Input("Dst_index"); auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvGradOpCUDAKernelLaunchHelper(ctx); + SendRecvGradOpCUDAKernelLaunchHelper(ctx, + *src_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvGradOpCUDAKernelLaunchHelper(ctx); + SendRecvGradOpCUDAKernelLaunchHelper( + ctx, *src_index); } } }; diff --git a/paddle/fluid/operators/send_recv_op.h b/paddle/fluid/operators/send_recv_op.h index 151b10f26eead0..1e47a944799cee 100644 --- a/paddle/fluid/operators/send_recv_op.h +++ b/paddle/fluid/operators/send_recv_op.h @@ -166,13 +166,13 @@ void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, } template -void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { +void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, + const Tensor& src_index) { auto* X = ctx.Input("X"); - auto* src_index = ctx.Input("Src_index"); auto* dst_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); - const int& index_size = src_index->dims()[0]; + const int& index_size = src_index.dims()[0]; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); @@ -183,7 +183,7 @@ void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { if (index_size == 0) return; - const IndexT* s_index = src_index->data(); + const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index->data(); const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { @@ -206,13 +206,13 @@ void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { } template -void SendRecvGradOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { +void SendRecvGradOpKernelLaunchHelper(const framework::ExecutionContext& ctx, + const Tensor& src_index) { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* src_index = ctx.Input("Dst_index"); auto* dst_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); - const int& index_size = src_index->dims()[0]; + const int& index_size = src_index.dims()[0]; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); @@ -223,7 +223,7 @@ void SendRecvGradOpKernelLaunchHelper(const framework::ExecutionContext& ctx) { if (index_size == 0) return; - const IndexT* s_index = src_index->data(); + const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index->data(); const std::string& pool_type = ctx.Attr("pool_type"); @@ -254,9 +254,9 @@ class SendRecvOpKernel : public framework::OpKernel { auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvOpKernelLaunchHelper(ctx); + SendRecvOpKernelLaunchHelper(ctx, *src_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvOpKernelLaunchHelper(ctx); + SendRecvOpKernelLaunchHelper(ctx, *src_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index type, Expected int, int64, but " @@ -274,9 +274,10 @@ class SendRecvGradOpKernel : public framework::OpKernel { auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvGradOpKernelLaunchHelper(ctx); + SendRecvGradOpKernelLaunchHelper(ctx, *src_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvGradOpKernelLaunchHelper(ctx); + SendRecvGradOpKernelLaunchHelper(ctx, + *src_index); } else { 
PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index type, Expected int, int64, but " From 42708f752d4442923c3775bd1852ee1157055d50 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 06:04:47 +0000 Subject: [PATCH 21/30] fix en example and docs, add default value in pool_type --- python/paddle/incubate/operators/send_recv.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/send_recv.py index 2f145780e02a5d..56fc2a8982ce96 100644 --- a/python/paddle/incubate/operators/send_recv.py +++ b/python/paddle/incubate/operators/send_recv.py @@ -18,14 +18,14 @@ from paddle.fluid import core -def send_recv(x, src_index, dst_index, pool_type, name=None): +def send_recv(x, src_index, dst_index, pool_type="sum", name=None): r""" Send Recv Operator. This operator is mainly used in Graph domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` - to gather the corresponding positions, and then scatter the corresponding output tensor in different + to gather the corresponding positions, and then use `dst_index` to scatter the corresponding output tensor in different pooling types, like sum, mean, max, or min. Args: @@ -34,6 +34,7 @@ def send_recv(x, src_index, dst_index, pool_type, name=None): dst_index (Tensor): An 1-d tensor, and should have the same shape as `src_index`. The available data type is int32, int64. pool_type (str): The pooling type of send_recv, including `sum`, `mean`, `max`, `min`. + Default value is `sum`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -51,7 +52,7 @@ def send_recv(x, src_index, dst_index, pool_type, name=None): src_index = indexes[:, 0] dst_index = indexes[:, 1] out = paddle.incubate.send_recv(x, src_index, dst_index, pool_type="sum") - # Outputs: [[0., 2., 3.], [2., 8., 10.], [2., 6., 7.]] + # Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]] """ From a1adbf1aed03bcf432d9c02d42dcc1270ec72f3f Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 08:07:19 +0000 Subject: [PATCH 22/30] add shape judge and max grid judge --- paddle/fluid/operators/send_recv_op.cc | 12 +- paddle/fluid/operators/send_recv_op.cu | 108 ++++++++++++------ python/paddle/incubate/operators/send_recv.py | 28 ++++- 3 files changed, 106 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/send_recv_op.cc index 64bb1fb13beaa6..e9eb36ac0b95ea 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -59,7 +59,11 @@ class SendRecvOP : public framework::OperatorWithKernel { dst_index_dims.size())); } - // TODO(daisiming): If the shape of src_index and dst_index should be same? + PADDLE_ENFORCE_EQ( + src_index_dims[0], dst_index_dims[0], + platform::errors::InvalidArgument( + "Src_index and Dst_index should have the same shape.")); + auto dims = ctx->GetInputDim("X"); ctx->SetOutputDim("Out", dims); @@ -117,14 +121,14 @@ class SendRecvOpMaker : public framework::OpProtoAndCheckerMaker { .InEnum({"SUM", "MEAN", "MIN", "MAX"}); // TODO(daisiming): Add a simple example here. AddComment(R"DOC( -SendRecv Operator. +Graph Learning Send_Recv combine operator. 
$Out = Recv(Send(X, Src_index), Dst_index, pool_type)$ -This operator is mainly used in Graph domain, and the main purpose is to reduce +This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` to gather corresponding -positions, and then scatter the corresponding output tensor in different pooling +positions, and then use `dst_index` to scatter the corresponding output tensor in different pooling types, like sum, mean, max, or min. )DOC"); diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/send_recv_op.cu index 1a8fff782b05a2..410cc06e6e9223 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -66,13 +66,23 @@ __global__ void SendRecvCUDAKernel(const T* params, const IndexT* src_indices, } } -// For min and max +// For max template -__global__ void InputResetCUDAKernel(T* output, size_t input_size, - size_t slice_size) { +__global__ void InputResetMaxCUDAKernel(T* output, size_t input_size, + size_t slice_size) { CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { - if (*(output + i) == std::numeric_limits::min() || - *(output + i) == std::numeric_limits::max()) { + if (*(output + i) == std::numeric_limits::min()) { + *(output + i) = 0; + } + } +} + +// For min +template +__global__ void InputResetMinCUDAKernel(T* output, size_t input_size, + size_t slice_size) { + CUDA_KERNEL_LOOP_TYPE(i, input_size * slice_size, int64_t) { + if (*(output + i) == std::numeric_limits::max()) { *(output + i) = 0; } } @@ -138,9 +148,9 @@ __global__ void ManipulateMinMaxGradCUDAKernel( template void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { + const Tensor& src_index, + const Tensor& dst_index) { auto* X = ctx.Input("X"); - auto* dst_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); std::string pool_type = ctx.Attr("pool_type"); @@ -177,11 +187,18 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, } const T* p_src = X->data(); const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); + const IndexT* d_index = dst_index.data(); - int block = 512; +#ifdef PADDLE_WITH_HIP + int block = 256; +#else + int block = 1024; +#endif int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; + const auto& dev_ctx = ctx.cuda_device_context(); + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t grid_tmp = (n + block - 1) / block; + int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; if (pool_type == "SUM") { SendRecvSumCUDAFunctor functor; @@ -198,8 +215,10 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); - int64_t grid_max = (input_size * slice_size + block - 1) / block; - InputResetCUDAKernel< + int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; + int64_t grid_max = + grid_max_tmp < max_grid_dimx ? 
grid_max_tmp : max_grid_dimx; + InputResetMaxCUDAKernel< T><<( ctx.device_context()) @@ -212,8 +231,10 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); - int64_t grid_min = (input_size * slice_size + block - 1) / block; - InputResetCUDAKernel< + int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; + int64_t grid_min = + grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx; + InputResetMinCUDAKernel< T><<( ctx.device_context()) @@ -242,7 +263,9 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, ctx.device_context()) .stream()>>>(p_dst_count, d_index, index_size); - int64_t grid_mean = (input_size * slice_size + block - 1) / block; + int64_t grid_mean_tmp = (input_size * slice_size + block - 1) / block; + int64_t grid_mean = + grid_mean_tmp < max_grid_dimx ? grid_mean_tmp : max_grid_dimx; ManipulateMeanCUDAKernel< T><<( @@ -253,14 +276,13 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, template void SendRecvGradOpCUDAKernelLaunchHelper( - const framework::ExecutionContext& ctx, const Tensor& src_index) { + const framework::ExecutionContext& ctx, const Tensor& src_index, + const Tensor& dst_index) { auto* X = ctx.Input(framework::GradVarName("Out")); - auto* dst_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); std::string pool_type = ctx.Attr("pool_type"); const int& index_size = src_index.dims()[0]; - if (index_size == 0) return; T* p_output = Y->mutable_data(ctx.GetPlace()); const auto& src_dims = X->dims(); @@ -276,19 +298,27 @@ void SendRecvGradOpCUDAKernelLaunchHelper( cudaMemset(p_output, 0, memset_bytes); #endif + if (index_size == 0) return; + int64_t slice_size = 1; for (int i = 1; i < src_dims.size(); ++i) { slice_size *= src_dims[i]; } const T* p_src = X->data(); const IndexT* s_index = src_index.data(); - const IndexT* d_index = dst_index->data(); + const IndexT* d_index = dst_index.data(); - int block = 512; +#ifdef PADDLE_WITH_HIP + int block = 256; +#else + int block = 1024; +#endif int64_t n = slice_size * index_size; - int64_t grid = (n + block - 1) / block; + const auto& dev_ctx = ctx.cuda_device_context(); + int64_t max_grid_dimx = dev_ctx.GetCUDAMaxGridDimSize().x; + int64_t grid_tmp = (n + block - 1) / block; + int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; - if (pool_type == "SUM") { SendRecvSumCUDAFunctor functor; SendRecvCUDAKernel><<< @@ -323,18 +353,21 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Src_index"); - auto index_type = src_index->type(); - - if (index_type == framework::proto::VarType::INT32) { - SendRecvOpCUDAKernelLaunchHelper(ctx, *src_index); - } else if (index_type == framework::proto::VarType::INT64) { - SendRecvOpCUDAKernelLaunchHelper(ctx, - *src_index); + auto* dst_index = ctx.Input("Dst_index"); + auto src_index_type = src_index->type(); + auto dst_index_type = dst_index->type(); + + if (src_index_type == framework::proto::VarType::INT32) { + SendRecvOpCUDAKernelLaunchHelper(ctx, *src_index, + *dst_index); + } else if (src_index_type == framework::proto::VarType::INT64) { + SendRecvOpCUDAKernelLaunchHelper( + ctx, *src_index, *dst_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported Src_index or Dst_index type, Expected int, int64, but " + "Unsupported Src_index or Dst_index dtype, expected int, int64, but " "got %s.", - index_type)); + src_index_type)); } } }; @@ -344,13 +377,20 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Dst_index"); + auto* dst_index = ctx.Input("Src_index"); auto index_type = src_index->type(); + if (index_type == framework::proto::VarType::INT32) { - SendRecvGradOpCUDAKernelLaunchHelper(ctx, - *src_index); + SendRecvGradOpCUDAKernelLaunchHelper( + ctx, *src_index, *dst_index); } else if (index_type == framework::proto::VarType::INT64) { SendRecvGradOpCUDAKernelLaunchHelper( - ctx, *src_index); + ctx, *src_index, *dst_index); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported Src_index or Dst_index dtype, expected int, int64, but " + "got %s.", + src_index_type)); } } }; diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/send_recv.py index 56fc2a8982ce96..52a0ea23955253 100644 --- a/python/paddle/incubate/operators/send_recv.py +++ b/python/paddle/incubate/operators/send_recv.py @@ -21,13 +21,33 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): r""" - Send Recv Operator. + Graph Learning Send_Recv combine operator. - This operator is mainly used in Graph domain, and the main purpose is to reduce intermediate memory + This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` to gather the corresponding positions, and then use `dst_index` to scatter the corresponding output tensor in different pooling types, like sum, mean, max, or min. + .. code-block:: text + + Given: + + X = [[0, 2, 3], + [1, 4, 5], + [2, 6, 7]] + + src_index = [0, 1, 2, 0] + + dst_index = [1, 2, 1, 0] + + pool_type = "sum" + + Then: + + Out = [[0, 2, 3], + [2, 8, 10], + [1, 4, 5]] + Args: x (Tensor): The input tensor, and the available data type is float32, float64, int32, int64. src_index (Tensor): An 1-d tensor, and the available data type is int32, int64. @@ -39,14 +59,14 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): For more information, please refer to :ref:`api_guide_Name`. 
Returns: - out (Tensor): The output tensor, should have the same shape as input tensor `x`. + out (Tensor): The output tensor, should have the same shape and same dtype as input tensor `x`. Examples: .. code-block:: python - import paddle import numpy as np + import paddle x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") indexes = paddle.to_tensor(np.array([[0, 1], [1, 2], [2, 1], [0, 0]]), dtype="int32") src_index = indexes[:, 0] From ca1a3c141d21d5d50892a90c96eb61e510dadf88 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 08:29:59 +0000 Subject: [PATCH 23/30] fix comment --- paddle/fluid/operators/send_recv_op.cc | 1 - python/paddle/incubate/operators/send_recv.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/send_recv_op.cc index e9eb36ac0b95ea..5f51cba578a2a3 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -119,7 +119,6 @@ class SendRecvOpMaker : public framework::OpProtoAndCheckerMaker { "tensors of Dst_index.") .SetDefault("SUM") .InEnum({"SUM", "MEAN", "MIN", "MAX"}); - // TODO(daisiming): Add a simple example here. AddComment(R"DOC( Graph Learning Send_Recv combine operator. diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/send_recv.py index 52a0ea23955253..0ee8e2ab2b83c9 100644 --- a/python/paddle/incubate/operators/send_recv.py +++ b/python/paddle/incubate/operators/send_recv.py @@ -50,8 +50,8 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): Args: x (Tensor): The input tensor, and the available data type is float32, float64, int32, int64. - src_index (Tensor): An 1-d tensor, and the available data type is int32, int64. - dst_index (Tensor): An 1-d tensor, and should have the same shape as `src_index`. + src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. + dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. pool_type (str): The pooling type of send_recv, including `sum`, `mean`, `max`, `min`. Default value is `sum`. 
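# A minimal NumPy sketch of the pooling semantics documented above, matching the
# worked example in the docstring (sum/mean/max/min reductions of gathered rows).
# The helper name `numpy_send_recv` is made up for this illustration only and is
# not an API introduced by these patches.
import numpy as np

def numpy_send_recv(x, src_index, dst_index, pool_type="sum"):
    # Gather rows of `x` with `src_index` and reduce them into the rows given by
    # `dst_index`; output rows that receive no message stay zero.
    out = np.zeros_like(x)
    count = np.zeros(x.shape[0], dtype=np.int64)
    first = np.ones(x.shape[0], dtype=bool)
    for s, d in zip(src_index, dst_index):
        if pool_type in ("sum", "mean"):
            out[d] += x[s]
            count[d] += 1
        elif pool_type == "max":
            out[d] = x[s] if first[d] else np.maximum(out[d], x[s])
            first[d] = False
        elif pool_type == "min":
            out[d] = x[s] if first[d] else np.minimum(out[d], x[s])
            first[d] = False
    if pool_type == "mean":
        nonzero = count > 0
        out[nonzero] = out[nonzero] / count[nonzero][:, None]
    return out

# With the docstring example (x = [[0, 2, 3], [1, 4, 5], [2, 6, 7]],
# src_index = [0, 1, 2, 0], dst_index = [1, 2, 1, 0]) and pool_type="sum",
# this yields [[0, 2, 3], [2, 8, 10], [1, 4, 5]], the output shown above.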
@@ -67,6 +67,7 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): import numpy as np import paddle + x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") indexes = paddle.to_tensor(np.array([[0, 1], [1, 2], [2, 1], [0, 0]]), dtype="int32") src_index = indexes[:, 0] From f1c9c22a7c47caeac4b0c39666292f9d3842c924 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 08:35:22 +0000 Subject: [PATCH 24/30] fix index type bug --- paddle/fluid/operators/send_recv_op.cu | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/send_recv_op.cu index 410cc06e6e9223..c9725740d5737e 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/send_recv_op.cu @@ -354,20 +354,19 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Src_index"); auto* dst_index = ctx.Input("Dst_index"); - auto src_index_type = src_index->type(); - auto dst_index_type = dst_index->type(); + auto index_type = src_index->type(); - if (src_index_type == framework::proto::VarType::INT32) { + if (index_type == framework::proto::VarType::INT32) { SendRecvOpCUDAKernelLaunchHelper(ctx, *src_index, *dst_index); - } else if (src_index_type == framework::proto::VarType::INT64) { + } else if (index_type == framework::proto::VarType::INT64) { SendRecvOpCUDAKernelLaunchHelper( ctx, *src_index, *dst_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index dtype, expected int, int64, but " "got %s.", - src_index_type)); + index_type)); } } }; @@ -390,7 +389,7 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index dtype, expected int, int64, but " "got %s.", - src_index_type)); + index_type)); } } }; From 6d7d4fb81808033d4fe76b11723d6f8d48dc96e1 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 08:51:21 +0000 Subject: [PATCH 25/30] add const & --- paddle/fluid/operators/send_recv_op.h | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.h b/paddle/fluid/operators/send_recv_op.h index 1e47a944799cee..c5f19a1aefed07 100644 --- a/paddle/fluid/operators/send_recv_op.h +++ b/paddle/fluid/operators/send_recv_op.h @@ -77,19 +77,19 @@ void send_recv_cpu_for_loop(const int& input_size, const int& index_size, const IndexT* s_index, const IndexT* d_index, const Tensor& src, Tensor* dst, const std::string& pool_type, - int* dst_count = NULL) { + int* dst_count = nullptr) { Functor functor; if (pool_type == "SUM") { for (int i = 0; i < index_size; ++i) { - IndexT src_idx = s_index[i]; - IndexT dst_idx = d_index[i]; + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } } else if (pool_type == "MEAN") { for (int i = 0; i < index_size; ++i) { - IndexT src_idx = s_index[i]; - IndexT dst_idx = d_index[i]; + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } @@ -106,8 +106,8 @@ void send_recv_cpu_for_loop(const int& input_size, const int& index_size, } else if (pool_type == "MIN" || pool_type == "MAX") { std::set existed_dst; for (int i = 
0; i < index_size; ++i) { - IndexT src_idx = s_index[i]; - IndexT dst_idx = d_index[i]; + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; bool in_set = existed_dst.find(dst_idx) != existed_dst.end(); if (!in_set) { elementwise_inner_operation(src, dst, src_idx, @@ -132,15 +132,15 @@ void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, if (pool_type == "SUM") { Functor functor; for (int i = 0; i < index_size; ++i) { - IndexT src_idx = s_index[i]; - IndexT dst_idx = d_index[i]; + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; elementwise_inner_operation(src, dst, src_idx, dst_idx, false, functor); } } else if (pool_type == "MEAN") { for (int i = 0; i < index_size; ++i) { - IndexT src_idx = s_index[i]; - IndexT dst_idx = d_index[i]; + const IndexT& src_idx = s_index[i]; + const IndexT& dst_idx = d_index[i]; auto src_slice = src.Slice(src_idx, src_idx + 1); auto dst_slice = dst->Slice(dst_idx, dst_idx + 1); auto eigen_src = framework::EigenVector::Flatten(src_slice); @@ -149,8 +149,8 @@ void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, } } else if (pool_type == "MIN" || pool_type == "MAX") { for (int i = 0; i < index_size; ++i) { - auto forward_src_idx = d_index[i]; - auto forward_dst_idx = s_index[i]; + const IndexT& forward_src_idx = d_index[i]; + const IndexT& forward_dst_idx = s_index[i]; auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); auto eigen_input = framework::EigenVector::Flatten(input_slice); From ed0c10e1dc67b08309dccc0a18c480f28212fc78 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 17 Nov 2021 13:55:46 +0000 Subject: [PATCH 26/30] fix en docs --- python/paddle/incubate/operators/send_recv.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/send_recv.py index 0ee8e2ab2b83c9..2ee5594021d8e5 100644 --- a/python/paddle/incubate/operators/send_recv.py +++ b/python/paddle/incubate/operators/send_recv.py @@ -25,8 +25,8 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` - to gather the corresponding positions, and then use `dst_index` to scatter the corresponding output tensor in different - pooling types, like sum, mean, max, or min. + to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor + in different pooling types, like sum, mean, max, or min. .. code-block:: text From 6c59c790993f6dd5ee119374f0cb3bf673e43688 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 18 Nov 2021 02:26:47 +0000 Subject: [PATCH 27/30] delete numpy in examples --- python/paddle/incubate/operators/send_recv.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/send_recv.py index 2ee5594021d8e5..7ef2a34b05c9ec 100644 --- a/python/paddle/incubate/operators/send_recv.py +++ b/python/paddle/incubate/operators/send_recv.py @@ -65,11 +65,10 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): .. 
code-block:: python - import numpy as np import paddle - x = paddle.to_tensor(np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") - indexes = paddle.to_tensor(np.array([[0, 1], [1, 2], [2, 1], [0, 0]]), dtype="int32") + x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] out = paddle.incubate.send_recv(x, src_index, dst_index, pool_type="sum") From 52a48a13f1564104b6a8eb7701f8deedb78e12de Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 18 Nov 2021 02:43:19 +0000 Subject: [PATCH 28/30] add unittest for int input --- .../tests/unittests/test_send_recv_op.py | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_send_recv_op.py index 2f87357e2b26b8..098d1a2d28cb5f 100644 --- a/python/paddle/fluid/tests/unittests/test_send_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_send_recv_op.py @@ -264,6 +264,36 @@ def test_dygraph(self): "two value is\ {}\n{}, check diff!".format(np_res, ret_res)) + def test_int32_input(self): + device = paddle.CPUPlace() + with paddle.fluid.dygraph.guard(device): + x = paddle.to_tensor( + np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), dtype="int32") + src_index = paddle.to_tensor( + np.array([0, 1, 2, 0, 1]), dtype="int32") + dst_index = paddle.to_tensor( + np.array([1, 2, 1, 0, 1]), dtype="int32") + res_sum = paddle.incubate.send_recv(x, src_index, dst_index, "sum") + res_mean = paddle.incubate.send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.send_recv(x, src_index, dst_index, "max") + res_min = paddle.incubate.send_recv(x, src_index, dst_index, "min") + + np_sum = np.array( + [[0, 2, 3], [3, 12, 14], [1, 4, 5]], dtype="int32") + np_mean = np.array([[0, 2, 3], [1, 4, 4], [1, 4, 5]], dtype="int32") + np_max = np.array([[0, 2, 3], [2, 6, 6], [1, 4, 5]], dtype="int32") + np_min = np.array([[0, 2, 3], [0, 2, 3], [1, 4, 5]], dtype="int32") + + ret = [res_sum, res_mean, res_max, res_min] + + for np_res, ret_res in zip([np_sum, np_mean, np_max, np_min], ret): + self.assertTrue( + np.allclose( + np_res, ret_res, atol=1e-6), + "two value is\ + {}\n{}, check diff!".format(np_res, ret_res)) + if __name__ == '__main__': unittest.main() From 4bb5669d360906e816ed1384388713035757d586 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 18 Nov 2021 02:52:05 +0000 Subject: [PATCH 29/30] fix send_recv comment --- paddle/fluid/operators/send_recv_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/send_recv_op.cc index 5f51cba578a2a3..151fd1eedaa64c 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/send_recv_op.cc @@ -126,9 +126,9 @@ Graph Learning Send_Recv combine operator. This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. -Take `x` as the input tensor, we first use `src_index` to gather corresponding -positions, and then use `dst_index` to scatter the corresponding output tensor in different pooling -types, like sum, mean, max, or min. 
+Take `x` as the input tensor, we first use `src_index` to gather corresponding data, +and then use `dst_index` to update the corresponding position of output tensor in different +pooling types, like sum, mean, max, or min. )DOC"); } From f0137ecb0e1539dfd18b776fa4cacbe10274c604 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 18 Nov 2021 07:31:16 +0000 Subject: [PATCH 30/30] change send_recv to graph_send_recv --- ...{send_recv_op.cc => graph_send_recv_op.cc} | 55 ++++++------ ...{send_recv_op.cu => graph_send_recv_op.cu} | 85 ++++++++++--------- .../{send_recv_op.h => graph_send_recv_op.h} | 66 +++++++------- ..._recv_op.py => test_graph_send_recv_op.py} | 74 +++++++++------- python/paddle/incubate/__init__.py | 4 +- python/paddle/incubate/operators/__init__.py | 2 +- .../{send_recv.py => graph_send_recv.py} | 20 ++--- 7 files changed, 162 insertions(+), 144 deletions(-) rename paddle/fluid/operators/{send_recv_op.cc => graph_send_recv_op.cc} (77%) rename paddle/fluid/operators/{send_recv_op.cu => graph_send_recv_op.cu} (84%) rename paddle/fluid/operators/{send_recv_op.h => graph_send_recv_op.h} (82%) rename python/paddle/fluid/tests/unittests/{test_send_recv_op.py => test_graph_send_recv_op.py} (77%) rename python/paddle/incubate/operators/{send_recv.py => graph_send_recv.py} (84%) diff --git a/paddle/fluid/operators/send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc similarity index 77% rename from paddle/fluid/operators/send_recv_op.cc rename to paddle/fluid/operators/graph_send_recv_op.cc index 151fd1eedaa64c..6af8388d9eba4e 100644 --- a/paddle/fluid/operators/send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -12,22 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/send_recv_op.h" +#include "paddle/fluid/operators/graph_send_recv_op.h" namespace paddle { namespace operators { -class SendRecvOP : public framework::OperatorWithKernel { +class GraphSendRecvOP : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SendRecv"); + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GraphSendRecv"); OP_INOUT_CHECK(ctx->HasInput("Src_index"), "Input", "Src_index", - "SendRecv"); + "GraphSendRecv"); OP_INOUT_CHECK(ctx->HasInput("Dst_index"), "Input", "Dst_index", - "SendRecv"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SendRecv"); + "GraphSendRecv"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "GraphSendRecv"); auto src_index_dims = ctx->GetInputDim("Src_index"); if (src_index_dims.size() == 2) { @@ -69,7 +69,7 @@ class SendRecvOP : public framework::OperatorWithKernel { if (ctx->Attrs().Get("pool_type") == "MEAN") { OP_INOUT_CHECK(ctx->HasOutput("Dst_count"), "Output", "Dst_count", - "SendRecv"); + "GraphSendRecv"); ctx->SetOutputDim("Dst_count", {dims[0]}); } } @@ -83,7 +83,7 @@ class SendRecvOP : public framework::OperatorWithKernel { } }; -class SendRecvGradOp : public framework::OperatorWithKernel { +class GraphSendRecvGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -101,15 +101,14 @@ class SendRecvGradOp : public framework::OperatorWithKernel { } }; -class SendRecvOpMaker : public framework::OpProtoAndCheckerMaker { +class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "The input tensor with data type float32, " - "float64 or float16"); + "The input tensor with data type float32, float64, int32, int64."); AddInput("Src_index", "The source index tensor."); AddInput("Dst_index", "The destination index tensor."); - AddOutput("Out", "Output tensor of send_recv op."); + AddOutput("Out", "Output tensor of graph_send_recv op."); AddOutput("Dst_count", "Count tensor of Dst_index, mainly for MEAN pool_type.") .AsIntermediate(); @@ -135,13 +134,13 @@ pooling types, like sum, mean, max, or min. 
}; template -class SendRecvGradOpMaker : public framework::SingleGradOpMaker { +class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker { public: using framework::SingleGradOpMaker::SingleGradOpMaker; protected: void Apply(GradOpPtr op) const override { - op->SetType("send_recv_grad"); + op->SetType("graph_send_recv_grad"); op->SetInput("Src_index", this->Input("Src_index")); op->SetInput("Dst_index", this->Input("Dst_index")); @@ -167,16 +166,18 @@ class SendRecvGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; -REGISTER_OPERATOR(send_recv, ops::SendRecvOP, ops::SendRecvOpMaker, - ops::SendRecvGradOpMaker, - ops::SendRecvGradOpMaker); -REGISTER_OPERATOR(send_recv_grad, ops::SendRecvGradOp); -REGISTER_OP_CPU_KERNEL(send_recv, ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel, - ops::SendRecvOpKernel); - -REGISTER_OP_CPU_KERNEL(send_recv_grad, ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel, - ops::SendRecvGradOpKernel); +REGISTER_OPERATOR(graph_send_recv, ops::GraphSendRecvOP, + ops::GraphSendRecvOpMaker, + ops::GraphSendRecvGradOpMaker, + ops::GraphSendRecvGradOpMaker); +REGISTER_OPERATOR(graph_send_recv_grad, ops::GraphSendRecvGradOp); +REGISTER_OP_CPU_KERNEL(graph_send_recv, ops::GraphSendRecvOpKernel, + ops::GraphSendRecvOpKernel, + ops::GraphSendRecvOpKernel, + ops::GraphSendRecvOpKernel); + +REGISTER_OP_CPU_KERNEL(graph_send_recv_grad, + ops::GraphSendRecvGradOpKernel, + ops::GraphSendRecvGradOpKernel, + ops::GraphSendRecvGradOpKernel, + ops::GraphSendRecvGradOpKernel); diff --git a/paddle/fluid/operators/send_recv_op.cu b/paddle/fluid/operators/graph_send_recv_op.cu similarity index 84% rename from paddle/fluid/operators/send_recv_op.cu rename to paddle/fluid/operators/graph_send_recv_op.cu index c9725740d5737e..d9f56ec4dc0388 100644 --- a/paddle/fluid/operators/send_recv_op.cu +++ b/paddle/fluid/operators/graph_send_recv_op.cu @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/send_recv_op.h" +#include "paddle/fluid/operators/graph_send_recv_op.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/place.h" @@ -27,7 +27,7 @@ namespace operators { using Tensor = framework::Tensor; template -struct SendRecvSumCUDAFunctor { +struct GraphSendRecvSumCUDAFunctor { DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, const IndexT& out_i) { paddle::platform::CudaAtomicAdd(output + out_i, *(params + in_i)); @@ -35,7 +35,7 @@ struct SendRecvSumCUDAFunctor { }; template -struct SendRecvMaxCUDAFunctor { +struct GraphSendRecvMaxCUDAFunctor { DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, const IndexT& out_i) { paddle::platform::CudaAtomicMax(output + out_i, *(params + in_i)); @@ -43,7 +43,7 @@ struct SendRecvMaxCUDAFunctor { }; template -struct SendRecvMinCUDAFunctor { +struct GraphSendRecvMinCUDAFunctor { DEVICE inline void operator()(const T* params, T* output, const IndexT& in_i, const IndexT& out_i) { paddle::platform::CudaAtomicMin(output + out_i, *(params + in_i)); @@ -51,10 +51,11 @@ struct SendRecvMinCUDAFunctor { }; template -__global__ void SendRecvCUDAKernel(const T* params, const IndexT* src_indices, - const IndexT* dst_indices, T* output, - size_t index_size, size_t slice_size, - Functor functor) { +__global__ void GraphSendRecvCUDAKernel(const T* params, + const IndexT* src_indices, + const IndexT* dst_indices, T* output, + size_t index_size, size_t slice_size, + Functor functor) { CUDA_KERNEL_LOOP_TYPE(i, index_size * slice_size, int64_t) { int64_t indices_i = i / slice_size; int64_t slice_i = i - indices_i * slice_size; @@ -147,9 +148,9 @@ __global__ void ManipulateMinMaxGradCUDAKernel( } template -void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index, - const Tensor& dst_index) { +void GraphSendRecvOpCUDAKernelLaunchHelper( + const framework::ExecutionContext& ctx, const Tensor& src_index, + const Tensor& dst_index) { auto* X = ctx.Input("X"); auto* Y = ctx.Output("Out"); std::string pool_type = ctx.Attr("pool_type"); @@ -201,15 +202,17 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; if (pool_type == "SUM") { - SendRecvSumCUDAFunctor functor; - SendRecvCUDAKernel><<< + GraphSendRecvSumCUDAFunctor functor; + GraphSendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, s_index, d_index, p_output, index_size, slice_size, functor); } else if (pool_type == "MAX") { - SendRecvMaxCUDAFunctor functor; - SendRecvCUDAKernel><<< + GraphSendRecvMaxCUDAFunctor functor; + GraphSendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, s_index, d_index, p_output, @@ -224,8 +227,9 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, ctx.device_context()) .stream()>>>(p_output, input_size, slice_size); } else if (pool_type == "MIN") { - SendRecvMinCUDAFunctor functor; - SendRecvCUDAKernel><<< + GraphSendRecvMinCUDAFunctor functor; + GraphSendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, s_index, d_index, p_output, @@ -240,8 +244,9 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, ctx.device_context()) .stream()>>>(p_output, input_size, slice_size); } else if (pool_type == "MEAN") { - SendRecvSumCUDAFunctor functor; - SendRecvCUDAKernel><<< + GraphSendRecvSumCUDAFunctor functor; + GraphSendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, s_index, d_index, p_output, @@ -275,7 +280,7 @@ void SendRecvOpCUDAKernelLaunchHelper(const framework::ExecutionContext& ctx, } template -void SendRecvGradOpCUDAKernelLaunchHelper( +void GraphSendRecvGradOpCUDAKernelLaunchHelper( const framework::ExecutionContext& ctx, const Tensor& src_index, const Tensor& dst_index) { auto* X = ctx.Input(framework::GradVarName("Out")); @@ -320,8 +325,9 @@ void SendRecvGradOpCUDAKernelLaunchHelper( int64_t grid = grid_tmp < max_grid_dimx ? 
grid_tmp : max_grid_dimx; int64_t input_size = src_dims[0]; if (pool_type == "SUM") { - SendRecvSumCUDAFunctor functor; - SendRecvCUDAKernel><<< + GraphSendRecvSumCUDAFunctor functor; + GraphSendRecvCUDAKernel><<< grid, block, 0, reinterpret_cast( ctx.device_context()) .stream()>>>(p_src, s_index, d_index, p_output, @@ -349,7 +355,7 @@ void SendRecvGradOpCUDAKernelLaunchHelper( } template -class SendRecvOpCUDAKernel : public framework::OpKernel { +class GraphSendRecvOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Src_index"); @@ -357,10 +363,10 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvOpCUDAKernelLaunchHelper(ctx, *src_index, - *dst_index); + GraphSendRecvOpCUDAKernelLaunchHelper( + ctx, *src_index, *dst_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvOpCUDAKernelLaunchHelper( + GraphSendRecvOpCUDAKernelLaunchHelper( ctx, *src_index, *dst_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -372,7 +378,7 @@ class SendRecvOpCUDAKernel : public framework::OpKernel { }; template -class SendRecvGradOpCUDAKernel : public framework::OpKernel { +class GraphSendRecvGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Dst_index"); @@ -380,10 +386,10 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvGradOpCUDAKernelLaunchHelper( + GraphSendRecvGradOpCUDAKernelLaunchHelper( ctx, *src_index, *dst_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvGradOpCUDAKernelLaunchHelper( + GraphSendRecvGradOpCUDAKernelLaunchHelper( ctx, *src_index, *dst_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -400,13 +406,14 @@ class SendRecvGradOpCUDAKernel : public framework::OpKernel { using CUDA = paddle::platform::CUDADeviceContext; namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(send_recv, ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel, - ops::SendRecvOpCUDAKernel); - -REGISTER_OP_CUDA_KERNEL(send_recv_grad, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel, - ops::SendRecvGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(graph_send_recv, + ops::GraphSendRecvOpCUDAKernel, + ops::GraphSendRecvOpCUDAKernel, + ops::GraphSendRecvOpCUDAKernel, + ops::GraphSendRecvOpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(graph_send_recv_grad, + ops::GraphSendRecvGradOpCUDAKernel, + ops::GraphSendRecvGradOpCUDAKernel, + ops::GraphSendRecvGradOpCUDAKernel, + ops::GraphSendRecvGradOpCUDAKernel); diff --git a/paddle/fluid/operators/send_recv_op.h b/paddle/fluid/operators/graph_send_recv_op.h similarity index 82% rename from paddle/fluid/operators/send_recv_op.h rename to paddle/fluid/operators/graph_send_recv_op.h index c5f19a1aefed07..1c7ea74be2ff4c 100644 --- a/paddle/fluid/operators/send_recv_op.h +++ b/paddle/fluid/operators/graph_send_recv_op.h @@ -24,7 +24,7 @@ namespace operators { using Tensor = framework::Tensor; template -struct SendRecvSumFunctor { +struct GraphSendRecvSumFunctor { void operator()(const bool& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = 
framework::EigenVector::Flatten(src_slice); @@ -34,7 +34,7 @@ struct SendRecvSumFunctor { }; template -struct SendRecvMinFunctor { +struct GraphSendRecvMinFunctor { void operator()(const bool& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); @@ -48,7 +48,7 @@ struct SendRecvMinFunctor { }; template -struct SendRecvMaxFunctor { +struct GraphSendRecvMaxFunctor { void operator()(const int& first_flag, const Tensor& src_slice, Tensor* dst_slice) { auto eigen_src = framework::EigenVector::Flatten(src_slice); @@ -73,11 +73,11 @@ void elementwise_inner_operation(const Tensor& src, Tensor* dst, } template -void send_recv_cpu_for_loop(const int& input_size, const int& index_size, - const IndexT* s_index, const IndexT* d_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - int* dst_count = nullptr) { +void graph_send_recv_cpu_for_loop(const int& input_size, const int& index_size, + const IndexT* s_index, const IndexT* d_index, + const Tensor& src, Tensor* dst, + const std::string& pool_type, + int* dst_count = nullptr) { Functor functor; if (pool_type == "SUM") { for (int i = 0; i < index_size; ++i) { @@ -122,13 +122,11 @@ void send_recv_cpu_for_loop(const int& input_size, const int& index_size, } template -void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, - const IndexT* s_index, const IndexT* d_index, - const Tensor& src, Tensor* dst, - const std::string& pool_type, - const int* dst_count = nullptr, - const Tensor* input = nullptr, - const Tensor* output = nullptr) { +void graph_send_recv_cpu_for_loop_grad( + const int& input_size, const int& index_size, const IndexT* s_index, + const IndexT* d_index, const Tensor& src, Tensor* dst, + const std::string& pool_type, const int* dst_count = nullptr, + const Tensor* input = nullptr, const Tensor* output = nullptr) { if (pool_type == "SUM") { Functor functor; for (int i = 0; i < index_size; ++i) { @@ -166,8 +164,8 @@ void send_recv_cpu_for_loop_grad(const int& input_size, const int& index_size, } template -void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { +void GraphSendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, + const Tensor& src_index) { auto* X = ctx.Input("X"); auto* dst_index = ctx.Input("Dst_index"); auto* Y = ctx.Output("Out"); @@ -187,27 +185,27 @@ void SendRecvOpKernelLaunchHelper(const framework::ExecutionContext& ctx, const IndexT* d_index = dst_index->data(); const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - send_recv_cpu_for_loop>( + graph_send_recv_cpu_for_loop>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MIN") { - send_recv_cpu_for_loop>( + graph_send_recv_cpu_for_loop>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MAX") { - send_recv_cpu_for_loop>( + graph_send_recv_cpu_for_loop>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { auto* dst_count = ctx.Output("Dst_count"); int* p_dst_count = dst_count->mutable_data(ctx.GetPlace()); memset(p_dst_count, 0, src_dims[0] * sizeof(int)); - send_recv_cpu_for_loop>( + graph_send_recv_cpu_for_loop>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, p_dst_count); } } template -void SendRecvGradOpKernelLaunchHelper(const framework::ExecutionContext& ctx, - const Tensor& src_index) { +void GraphSendRecvGradOpKernelLaunchHelper( + const 
framework::ExecutionContext& ctx, const Tensor& src_index) { auto* X = ctx.Input(framework::GradVarName("Out")); auto* dst_index = ctx.Input("Src_index"); auto* Y = ctx.Output(framework::GradVarName("X")); @@ -228,35 +226,36 @@ void SendRecvGradOpKernelLaunchHelper(const framework::ExecutionContext& ctx, const std::string& pool_type = ctx.Attr("pool_type"); if (pool_type == "SUM") { - send_recv_cpu_for_loop_grad>( + graph_send_recv_cpu_for_loop_grad>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type); } else if (pool_type == "MEAN") { auto* dst_count = ctx.Input("Dst_count"); const int* s_count = dst_count->data(); // Functor not used here. - send_recv_cpu_for_loop_grad>( + graph_send_recv_cpu_for_loop_grad>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, s_count); } else if (pool_type == "MIN" || pool_type == "MAX") { const auto* input = ctx.Input("X"); const auto* output = ctx.Input("Out"); // Functor not used here. - send_recv_cpu_for_loop_grad>( + graph_send_recv_cpu_for_loop_grad>( src_dims[0], index_size, s_index, d_index, *X, Y, pool_type, nullptr, input, output); } } template -class SendRecvOpKernel : public framework::OpKernel { +class GraphSendRecvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Src_index"); auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvOpKernelLaunchHelper(ctx, *src_index); + GraphSendRecvOpKernelLaunchHelper(ctx, *src_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvOpKernelLaunchHelper(ctx, *src_index); + GraphSendRecvOpKernelLaunchHelper(ctx, + *src_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index type, Expected int, int64, but " @@ -267,17 +266,18 @@ class SendRecvOpKernel : public framework::OpKernel { }; template -class SendRecvGradOpKernel : public framework::OpKernel { +class GraphSendRecvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* src_index = ctx.Input("Dst_index"); auto index_type = src_index->type(); if (index_type == framework::proto::VarType::INT32) { - SendRecvGradOpKernelLaunchHelper(ctx, *src_index); + GraphSendRecvGradOpKernelLaunchHelper(ctx, + *src_index); } else if (index_type == framework::proto::VarType::INT64) { - SendRecvGradOpKernelLaunchHelper(ctx, - *src_index); + GraphSendRecvGradOpKernelLaunchHelper( + ctx, *src_index); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Unsupported Src_index or Dst_index type, Expected int, int64, but " diff --git a/python/paddle/fluid/tests/unittests/test_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py similarity index 77% rename from python/paddle/fluid/tests/unittests/test_send_recv_op.py rename to python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py index 098d1a2d28cb5f..68b354775d13e6 100644 --- a/python/paddle/fluid/tests/unittests/test_send_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py @@ -21,10 +21,10 @@ from op_test import OpTest -class TestSendRecvMaxOp(OpTest): +class TestGraphSendRecvMaxOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "send_recv" + self.op_type = "graph_send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) src_index = index[:, 0] @@ -34,8 +34,8 @@ def setUp(self): 
self.attrs = {'pool_type': 'MAX'} - out, self.gradient = compute_send_recv_for_min_max(self.inputs, - self.attrs) + out, self.gradient = compute_graph_send_recv_for_min_max(self.inputs, + self.attrs) self.outputs = {'Out': out} def test_check_output(self): @@ -45,10 +45,10 @@ def test_check_grad(self): self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) -class TestSendRecvMinOp(OpTest): +class TestGraphSendRecvMinOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "send_recv" + self.op_type = "graph_send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) src_index = index[:, 0] @@ -58,8 +58,8 @@ def setUp(self): self.attrs = {'pool_type': 'MIN'} - out, self.gradient = compute_send_recv_for_min_max(self.inputs, - self.attrs) + out, self.gradient = compute_graph_send_recv_for_min_max(self.inputs, + self.attrs) self.outputs = {'Out': out} @@ -70,10 +70,10 @@ def test_check_grad(self): self.check_grad(['X'], 'Out', user_defined_grads=[self.gradient]) -class TestSendRecvSumOp(OpTest): +class TestGraphSendRecvSumOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "send_recv" + self.op_type = "graph_send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) src_index = index[:, 0] @@ -83,7 +83,7 @@ def setUp(self): self.attrs = {'pool_type': 'SUM'} - out, _ = compute_send_recv_for_sum_mean(self.inputs, self.attrs) + out, _ = compute_graph_send_recv_for_sum_mean(self.inputs, self.attrs) self.outputs = {'Out': out} @@ -94,10 +94,10 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -class TestSendRecvMeanOp(OpTest): +class TestGraphSendRecvMeanOp(OpTest): def setUp(self): paddle.enable_static() - self.op_type = "send_recv" + self.op_type = "graph_send_recv" x = np.random.random((10, 20)).astype("float64") index = np.random.randint(0, 10, (15, 2)).astype(np.int64) src_index = index[:, 0] @@ -107,7 +107,8 @@ def setUp(self): self.attrs = {'pool_type': 'MEAN'} - out, dst_count = compute_send_recv_for_sum_mean(self.inputs, self.attrs) + out, dst_count = compute_graph_send_recv_for_sum_mean(self.inputs, + self.attrs) self.outputs = {'Out': out, 'Dst_count': dst_count} @@ -118,7 +119,7 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -def compute_send_recv_for_sum_mean(inputs, attributes): +def compute_graph_send_recv_for_sum_mean(inputs, attributes): x = inputs['X'] src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] @@ -148,7 +149,7 @@ def compute_send_recv_for_sum_mean(inputs, attributes): return results, count -def compute_send_recv_for_min_max(inputs, attributes): +def compute_graph_send_recv_for_min_max(inputs, attributes): x = inputs['X'] src_index = inputs['Src_index'] dst_index = inputs['Dst_index'] @@ -193,7 +194,7 @@ def compute_send_recv_for_min_max(inputs, attributes): return results, gradient / results.size -class API_SendRecvOpTest(unittest.TestCase): +class API_GraphSendRecvOpTest(unittest.TestCase): def test_static(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): @@ -201,11 +202,14 @@ def test_static(self): src_index = paddle.static.data(name="src", shape=[4], dtype="int32") dst_index = paddle.static.data(name="dst", shape=[4], dtype="int32") - res_sum = paddle.incubate.send_recv(x, src_index, dst_index, "sum") - res_mean = paddle.incubate.send_recv(x, src_index, dst_index, - "mean") - res_max = paddle.incubate.send_recv(x, src_index, dst_index, 
"max") - res_min = paddle.incubate.send_recv(x, src_index, dst_index, "min") + res_sum = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "sum") + res_mean = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "max") + res_min = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "min") exe = paddle.static.Executor(paddle.CPUPlace()) data1 = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype='float32') @@ -240,11 +244,14 @@ def test_dygraph(self): np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]]), dtype="float32") src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") - res_sum = paddle.incubate.send_recv(x, src_index, dst_index, "sum") - res_mean = paddle.incubate.send_recv(x, src_index, dst_index, - "mean") - res_max = paddle.incubate.send_recv(x, src_index, dst_index, "max") - res_min = paddle.incubate.send_recv(x, src_index, dst_index, "min") + res_sum = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "sum") + res_mean = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "max") + res_min = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "min") np_sum = np.array( [[0, 2, 3], [2, 8, 10], [1, 4, 5]], dtype="float32") @@ -273,11 +280,14 @@ def test_int32_input(self): np.array([0, 1, 2, 0, 1]), dtype="int32") dst_index = paddle.to_tensor( np.array([1, 2, 1, 0, 1]), dtype="int32") - res_sum = paddle.incubate.send_recv(x, src_index, dst_index, "sum") - res_mean = paddle.incubate.send_recv(x, src_index, dst_index, - "mean") - res_max = paddle.incubate.send_recv(x, src_index, dst_index, "max") - res_min = paddle.incubate.send_recv(x, src_index, dst_index, "min") + res_sum = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "sum") + res_mean = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "mean") + res_max = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "max") + res_min = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "min") np_sum = np.array( [[0, 2, 3], [3, 12, 14], [1, 4, 5]], dtype="int32") diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index 6445ee74c0d065..e5215cf506413c 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -18,7 +18,7 @@ from ..fluid.layer_helper import LayerHelper # noqa: F401 from .operators import softmax_mask_fuse_upper_triangle # noqa: F401 from .operators import softmax_mask_fuse # noqa: F401 -from .operators import send_recv +from .operators import graph_send_recv from .tensor import segment_sum from .tensor import segment_mean from .tensor import segment_max @@ -31,7 +31,7 @@ 'ModelAverage', 'softmax_mask_fuse_upper_triangle', 'softmax_mask_fuse', - 'send_recv', + 'graph_send_recv', 'segment_sum', 'segment_mean', 'segment_max', diff --git a/python/paddle/incubate/operators/__init__.py b/python/paddle/incubate/operators/__init__.py index 90bcaddd79f5da..ecf73fb393cc17 100644 --- a/python/paddle/incubate/operators/__init__.py +++ b/python/paddle/incubate/operators/__init__.py @@ -15,4 +15,4 @@ from .softmax_mask_fuse_upper_triangle import softmax_mask_fuse_upper_triangle # noqa: F401 from .softmax_mask_fuse import softmax_mask_fuse # noqa: F401 from .resnet_unit import ResNetUnit #noqa: F401 -from .send_recv import send_recv #noqa: F401 +from .graph_send_recv 
import graph_send_recv #noqa: F401 diff --git a/python/paddle/incubate/operators/send_recv.py b/python/paddle/incubate/operators/graph_send_recv.py similarity index 84% rename from python/paddle/incubate/operators/send_recv.py rename to python/paddle/incubate/operators/graph_send_recv.py index 7ef2a34b05c9ec..9b8f542658dd65 100644 --- a/python/paddle/incubate/operators/send_recv.py +++ b/python/paddle/incubate/operators/graph_send_recv.py @@ -18,7 +18,7 @@ from paddle.fluid import core -def send_recv(x, src_index, dst_index, pool_type="sum", name=None): +def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None): r""" Graph Learning Send_Recv combine operator. @@ -53,7 +53,7 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. - pool_type (str): The pooling type of send_recv, including `sum`, `mean`, `max`, `min`. + pool_type (str): The pooling type of graph_send_recv, including `sum`, `mean`, `max`, `min`. Default value is `sum`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -71,7 +71,7 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] - out = paddle.incubate.send_recv(x, src_index, dst_index, pool_type="sum") + out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum") # Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]] """ @@ -82,23 +82,23 @@ def send_recv(x, src_index, dst_index, pool_type="sum", name=None): % pool_type) if in_dygraph_mode(): - out, tmp = core.ops.send_recv(x, src_index, dst_index, 'pool_type', - pool_type.upper()) + out, tmp = core.ops.graph_send_recv(x, src_index, dst_index, + 'pool_type', pool_type.upper()) return out check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"), - "send_recv") + "graph_send_recv") check_variable_and_dtype(src_index, "Src_index", ("int32", "int64"), - "send_recv") + "graph_send_recv") check_variable_and_dtype(dst_index, "Dst_index", ("int32", "int64"), - "send_recv") + "graph_send_recv") - helper = LayerHelper("send_recv", **locals()) + helper = LayerHelper("graph_send_recv", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) dst_count = helper.create_variable_for_type_inference( dtype="int32", stop_gradient=True) helper.append_op( - type="send_recv", + type="graph_send_recv", inputs={"X": x, "Src_index": src_index, "Dst_index": dst_index},
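# A hedged sketch of the backward rule the sum/mean grad kernels in this series
# follow: gradient flows from out[dst_index[i]] back to x[src_index[i]]; for
# "mean" each contribution is divided by dst_count, the number of messages the
# destination row received in the forward pass. The helper name
# `numpy_graph_send_recv_grad` is invented for this illustration only.
import numpy as np

def numpy_graph_send_recv_grad(grad_out, src_index, dst_index,
                               pool_type="sum", dst_count=None):
    grad_x = np.zeros_like(grad_out)
    for s, d in zip(src_index, dst_index):
        g = grad_out[d]
        if pool_type == "mean":
            g = g / dst_count[d]
        grad_x[s] += g
    return grad_x

# For "min" and "max", the kernels additionally mask each contribution with an
# element-wise comparison of x[s] against out[d], so gradient only reaches the
# entries that produced the pooled value, as checked in the min/max unit tests.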