From f25d4c5185541e64052118571e40625b4e186d0a Mon Sep 17 00:00:00 2001 From: Paddle CI_MAC Date: Thu, 2 Sep 2021 14:35:55 +0800 Subject: [PATCH] mirgate_34332 --- .../fluid/operators/mkldnn/slice_mkldnn_op.cc | 175 ++++ .../fluid/operators/mkldnn/split_mkldnn_op.cc | 132 +++ paddle/fluid/operators/slice_op.cc | 190 ++-- paddle/fluid/operators/split_op.cc | 29 +- paddle/fluid/platform/mkldnn_reuse.h | 917 +++++++++++------- .../unittests/mkldnn/test_slice_mkldnn_op.py | 199 ++++ .../unittests/test_static_save_load_bf16.py | 137 +++ 7 files changed, 1320 insertions(+), 459 deletions(-) create mode 100644 paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc create mode 100644 paddle/fluid/operators/mkldnn/split_mkldnn_op.cc create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_slice_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py diff --git a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc new file mode 100644 index 0000000000..e16c41829b --- /dev/null +++ b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc @@ -0,0 +1,175 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class SliceMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto* x = ctx.Input("Input"); + auto* out = ctx.Output("Out"); + + auto x_vec_dims = framework::vectorize(x->dims()); + auto out_vec_dims = framework::vectorize(out->dims()); + + auto axes_int = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); + + std::vector axes(ctx.Attr>("axes").begin(), + ctx.Attr>("axes").end()); + std::vector starts(ctx.Attr>("starts").begin(), + ctx.Attr>("starts").end()); + std::vector ends(ctx.Attr>("ends").begin(), + ctx.Attr>("ends").end()); + + auto decrease_axis = ctx.Attr>("decrease_axis"); + + std::vector offsets(x_vec_dims.size(), 0); + std::vector slice_dims(x_vec_dims); + + for (size_t i = 0; i < axes.size(); ++i) { + starts[i] = starts[i] < 0 ? x_vec_dims[axes[i]] + starts[i] : starts[i]; + ends[i] = ends[i] < 0 ? 
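+          // e.g. with x_vec_dims[axes[i]] == 5, starts[i] == -2 becomes 3 in
+          // the line above, and ends[i] == -1 becomes 4 below, so the slice
+          // spans a single element along that axis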
x_vec_dims[axes[i]] + ends[i] + : std::min(ends[i], x_vec_dims[axes[i]]); + offsets[axes[i]] = starts[i]; + slice_dims[axes[i]] = ends[i] - starts[i]; + } + + mkldnn::memory::data_type x_type = framework::ToMKLDNNDataType(x->type()); + auto key = platform::CreateKey(dev_ctx, x_vec_dims, axes, starts, ends, + x->format(), x_type); + + platform::ReorderMKLDNNHandler reorder_handler( + x_vec_dims, x->type(), x_type, dev_ctx, onednn_engine, key); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->format(), platform::to_void_cast(x->data())); + auto slice_mem_p = reorder_handler.AcquireSubmemory(slice_dims, offsets, + reorder_src_memory_p); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + out, slice_dims, 0, x->format(), ctx.GetPlace()); + + auto reorder_p = + reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat( + reorder_dst_memory_p->get_desc().reshape(out_vec_dims))); + } +}; + +template +class SliceGradMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("Input")); + + auto dx_vec_dims = framework::vectorize(dx->dims()); + auto dout_vec_dims = framework::vectorize(dout->dims()); + + auto axes_int = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); + + std::vector axes(ctx.Attr>("axes").begin(), + ctx.Attr>("axes").end()); + std::vector starts(ctx.Attr>("starts").begin(), + ctx.Attr>("starts").end()); + std::vector ends(ctx.Attr>("ends").begin(), + ctx.Attr>("ends").end()); + + auto decrease_axis = ctx.Attr>("decrease_axis"); + + std::vector offsets(dx_vec_dims.size(), 0); + std::vector slice_dims(dx_vec_dims); + + for (size_t i = 0; i < axes.size(); ++i) { + starts[i] = starts[i] < 0 ? dx_vec_dims[axes[i]] + starts[i] : starts[i]; + ends[i] = ends[i] < 0 ? 
dx_vec_dims[axes[i]] + ends[i] + : std::min(ends[i], dx_vec_dims[axes[i]]); + offsets[axes[i]] = starts[i]; + slice_dims[axes[i]] = ends[i] - starts[i]; + } + + mkldnn::memory::data_type dout_type = + framework::ToMKLDNNDataType(dout->type()); + mkldnn::memory::desc md(dout_vec_dims, platform::MKLDNNGetDataType(), + dout->format()); + mkldnn::memory::format_tag reorder_format_tag = + platform::GetMKLDNNFormat(md.reshape(slice_dims)); + + auto key = platform::CreateKey(dev_ctx, dout_vec_dims, axes, starts, ends, + reorder_format_tag, dout_type); + + platform::ReorderMKLDNNHandler reorder_handler( + slice_dims, dout->type(), dout_type, dev_ctx, onednn_engine, key); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + reorder_format_tag, platform::to_void_cast(dout->data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + dx, dx_vec_dims, 0, reorder_format_tag, ctx.GetPlace()); + memset(dx->data(), 0, reorder_dst_memory_p->get_desc().get_size()); + + auto slice_mem_p = reorder_handler.AcquireSubmemory(slice_dims, offsets, + reorder_dst_memory_p); + + auto reorder_p = + reorder_handler.AcquireReorder(slice_mem_p, reorder_src_memory_p); + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *slice_mem_p); + astream.wait(); + + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(reorder_format_tag); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(slice, MKLDNN, paddle::platform::CPUPlace, + ops::SliceMKLDNNKernel, + ops::SliceMKLDNNKernel); + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(slice_grad, MKLDNN, paddle::platform::CPUPlace, + ops::SliceGradMKLDNNKernel, + ops::SliceGradMKLDNNKernel); \ No newline at end of file diff --git a/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc new file mode 100644 index 0000000000..8a58d9f26f --- /dev/null +++ b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +static inline std::vector> CalculateOutsDims( + const framework::DDim& in_dims, const size_t num, + const std::vector& sections, const size_t axis, + const int outs_number) { + std::vector> outs_dims(outs_number, + framework::vectorize(in_dims)); + + if (num > 0) { + PADDLE_ENFORCE_EQ(in_dims[axis] % num, 0, + platform::errors::InvalidArgument( + "The input's size along the split dimension " + "must be evenly divisible by Attr(num_or_sections). 
" + "But received Attr(num_or_sections) " + "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", + num, in_dims, axis)); + + const size_t out_axis_dim = in_dims[axis] / num; + + for (auto& out_dim : outs_dims) out_dim[axis] = out_axis_dim; + } else { + for (size_t i = 0; i < outs_dims.size(); ++i) + outs_dims[i][axis] = sections[i]; + } + return outs_dims; +} + +template +class SplitMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx); + } + + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + const auto* x = ctx.Input("X"); + auto outs = ctx.MultiOutput("Out"); + + int num = ctx.Attr("num"); + auto sections = ctx.Attr>("sections"); + int axis = ctx.Attr("axis"); + auto outs_number = outs.size(); + const auto x_dims = x->dims(); + + bool need_resize = false; + if (ctx.HasInput("AxisTensor")) { + auto* axis_tensor = ctx.Input("AxisTensor"); + axis = GetDataFromTensor(axis_tensor)[0]; + need_resize = true; + } + + auto sections_tensor_list = ctx.MultiInput("SectionsTensorList"); + if (sections_tensor_list.size() > 0) { + sections = GetDataFromTensorList(sections_tensor_list); + need_resize = true; + } + + if (need_resize) { + const auto outs_dims = + CalculateOutsDims(x->dims(), num, sections, axis, outs_number); + for (size_t i = 0; i < outs.size(); ++i) { + outs[i]->Resize(framework::make_ddim(outs_dims[i])); + } + } + + auto x_vec_dims = framework::vectorize(x_dims); + + mkldnn::memory::data_type x_type = framework::ToMKLDNNDataType(x->type()); + auto key = platform::CreateKey(dev_ctx, x_vec_dims, axis, num, sections, + x->format(), x_type); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + std::vector offset(x_vec_dims.size(), 0); + + platform::ReorderMKLDNNHandler reorder_handler( + x_vec_dims, x->type(), x_type, dev_ctx, onednn_engine, key); + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->format(), platform::to_void_cast(x->data())); + + for (size_t i = 0; i < outs_number; ++i) { + auto out_vec_dims = framework::vectorize(outs[i]->dims()); + auto slice_mem_p = reorder_handler.AcquireSubmemory( + out_vec_dims, offset, reorder_src_memory_p, i); + + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + outs[i], out_vec_dims, i, x->format(), ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p, i); + + reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); + + offset[axis] += num > 0 ? 
x->dims()[axis] / num : sections[i]; + + outs[i]->set_layout(framework::DataLayout::kMKLDNN); + outs[i]->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } + astream.wait(); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(split, MKLDNN, paddle::platform::CPUPlace, + ops::SplitMKLDNNKernel, + ops::SplitMKLDNNKernel); diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 0a41424cfa..ac50ccea9e 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -28,13 +28,10 @@ class SliceOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::InvalidArgument( - "Input (Input) of slice op should not be null.")); + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "slice"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "slice"); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output (Out) of slice op should not be null.")); + // Case 1: Special treatment when input is a tensor array. auto x_var_type = ctx->GetInputsVarType("Input")[0]; auto axes = ctx->Attrs().Get>("axes"); if (x_var_type == framework::proto::VarType::LOD_TENSOR_ARRAY) { @@ -57,6 +54,8 @@ class SliceOp : public framework::OperatorWithKernel { return; } } + + // Case 2: input is a tensor. auto in_dims = ctx->GetInputDim("Input"); PADDLE_ENFORCE_LT(in_dims.size(), 7, platform::errors::InvalidArgument( @@ -65,103 +64,56 @@ class SliceOp : public framework::OperatorWithKernel { auto starts = ctx->Attrs().Get>("starts"); auto ends = ctx->Attrs().Get>("ends"); - auto infer_flags = ctx->Attrs().Get>("infer_flags"); auto decrease_axis = ctx->Attrs().Get>("decrease_axis"); - - auto starts_size = starts.size(); - auto ends_size = ends.size(); + auto infer_flags = ctx->Attrs().Get>("infer_flags"); if (infer_flags.empty()) { // Initialize infer_flags with 1. // To be compatible with other op tests in which infer_flags is not set. infer_flags = std::vector(axes.size(), 1); } + // 2.1 Check attrs. 
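+  // starts/ends sizes normally come from the attributes, but when the
+  // StartsTensorList / EndsTensorList inputs are provided they take
+  // precedence and the sizes are read from those inputs instead.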
+ auto starts_size = starts.size(); + auto ends_size = ends.size(); + if (ctx->HasInputs("StartsTensorList")) { - auto StartsTensorList = ctx->Inputs("StartsTensorList"); - PADDLE_ENFORCE_GT(StartsTensorList.size(), 0, + starts_size = ctx->Inputs("StartsTensorList").size(); + PADDLE_ENFORCE_GT(starts_size, 0, platform::errors::InvalidArgument( "StartsTensorList size can't be zero")); - starts_size = StartsTensorList.size(); } if (ctx->HasInputs("EndsTensorList")) { - auto EndsTensorList = ctx->Inputs("EndsTensorList"); - PADDLE_ENFORCE_GT(EndsTensorList.size(), 0, - platform::errors::InvalidArgument( - "EndsTensorList size can't be zero")); - ends_size = EndsTensorList.size(); + ends_size = ctx->Inputs("EndsTensorList").size(); + PADDLE_ENFORCE_GT(ends_size, 0, platform::errors::InvalidArgument( + "EndsTensorList size can't be zero")); } - if (ctx->HasInput("StartsTensor") == false) { + if (!ctx->HasInput("StartsTensor")) { PADDLE_ENFORCE_EQ( starts_size, axes.size(), platform::errors::InvalidArgument( "The size of starts must be equal to the size of axes.")); } - if (ctx->HasInput("EndsTensor") == false) { + if (!ctx->HasInput("EndsTensor")) { PADDLE_ENFORCE_EQ( ends_size, axes.size(), platform::errors::InvalidArgument( "The size of ends must be equal to the size of axes.")); } - int dim_value, start, end; - for (size_t i = 0; i < axes.size(); ++i) { - PADDLE_ENFORCE_LT(static_cast(axes[i]), in_dims.size(), - platform::errors::InvalidArgument( - "The index of dimension in axes must be less " - "than the size of input shape.")); - if (infer_flags[i] == -1) { - out_dims[axes[i]] = -1; - } else { - // infer out_dim shape - dim_value = out_dims[axes[i]]; - if (dim_value > 0) { - start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i]; - end = ends[i] < 0 ? 
(ends[i] + dim_value) : ends[i]; - start = std::max(start, 0); - end = std::max(end, 0); - end = std::min(end, dim_value); - - PADDLE_ENFORCE_LE(start, dim_value, - platform::errors::InvalidArgument( - "start should be less than or equal to the " - "dimension value, but received " - "start = %d, shape[%d] = %d.", - starts[i], axes[i], out_dims[axes[i]])); - PADDLE_ENFORCE_GT(end, start, - platform::errors::InvalidArgument( - "end should greater than start, but received " - "end = %d, start = %d.", - ends[i], starts[i])); - out_dims[axes[i]] = end - start; - } - } - } - // generate new shape - if (decrease_axis.size() > 0) { - std::vector new_out_shape; - for (size_t i = 0; i < decrease_axis.size(); ++i) { - if (ctx->IsRuntime() && infer_flags[i] != -1) { - PADDLE_ENFORCE_EQ( - out_dims[decrease_axis[i]], 1, - platform::errors::InvalidArgument("decrease dim should be 1")); - } - out_dims[decrease_axis[i]] = 0; - } - - for (int i = 0; i < out_dims.size(); ++i) { - if (out_dims[i] != 0) { - new_out_shape.push_back(out_dims[i]); - } - } - if (new_out_shape.size() == 0) { - new_out_shape.push_back(1); - } + CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, nullptr, + &infer_flags); - out_dims = framework::make_ddim(new_out_shape); + auto slice_dims = + GetSliceDims(in_dims, axes, starts, ends, nullptr, &infer_flags); + if (ctx->IsRuntime()) { + out_dims = GetDecreasedDims(slice_dims, decrease_axis, &infer_flags); + } else { + out_dims = GetDecreasedDims(slice_dims, decrease_axis, nullptr); } + ctx->SetOutputDim("Out", out_dims); - if (axes[0] != 0) { + if (axes.size() > 0 && axes[0] != 0) { ctx->ShareLoD("Input", /*->*/ "Out"); } } @@ -180,11 +132,32 @@ class SliceOp : public framework::OperatorWithKernel { if (platform::is_cuda_pinned_place(in_tensor.place())) { return framework::OpKernelType(in_tensor.type(), ctx.device_context()); } + +#ifdef PADDLE_WITH_MKLDNN + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "Input"); + + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // OneDNN uses blocking format, which cannot be always supported with + // reorders, because if blocked dimension is not divisible by 8 or + // 16(depending on which blocking format is used) submemory cannot be + // created, so in that scenario a fallback is needed + auto tmp_md = dnnl::memory::desc( + framework::vectorize(ctx.Input("Input")->dims()), + dnnl::memory::data_type::f32, ctx.Input("Input")->format()); + if (tmp_md.data.format_desc.blocking.inner_nblks == 0) + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(in_tensor.type(), in_tensor.place()); } return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace()); } + framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { @@ -263,6 +236,14 @@ class SliceOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault({}); AddAttr>("decrease_axis", "(list) decrease_axis") .SetDefault({}); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "bfloat16"}); AddComment(R"DOC( Slice Operator. 
@@ -325,12 +306,32 @@ class SliceOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context()); + auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // OneDNN uses blocking format, which cannot be always supported with + // reorders, because if blocked dimension is not divisible by 8 or + // 16(depending on which blocking format is used) submemory cannot be + // created, so in that scenario a fallback is needed + auto tmp_md = dnnl::memory::desc( + framework::vectorize( + ctx.Input(framework::GradVarName("Out"))->dims()), + dnnl::memory::data_type::f32, + ctx.Input(framework::GradVarName("Out"))->format()); + if (tmp_md.data.format_desc.blocking.inner_nblks == 0) + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } + framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const Tensor &tensor, const framework::OpKernelType &expected_kernel_type) const override { @@ -436,9 +437,9 @@ REGISTER_OP_CPU_KERNEL( ops::SliceKernel, ops::SliceKernel, ops::SliceKernel, + paddle::platform::complex>, ops::SliceKernel); + paddle::platform::complex>); REGISTER_OP_CPU_KERNEL( slice_grad, ops::SliceGradKernel, @@ -446,6 +447,31 @@ REGISTER_OP_CPU_KERNEL( ops::SliceGradKernel, ops::SliceGradKernel, ops::SliceGradKernel, + paddle::platform::complex>, ops::SliceGradKernel); + paddle::platform::complex>); + +REGISTER_OP_CUDA_KERNEL( + slice, ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel, + ops::SliceKernel>, + ops::SliceKernel>); + +REGISTER_OP_CUDA_KERNEL( + slice_grad, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel, + ops::SliceGradKernel>, + ops::SliceGradKernel>); diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index 0151778075..5bd699e08a 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -73,8 +73,25 @@ class SplitOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // OneDNN uses blocking format, which cannot be always supported with + // reorders, because if blocked dimension is not divisible by 8 or + // 16(depending on which blocking format is used) submemory cannot be + // created, so in that scenario a fallback is needed + auto tmp_md = dnnl::memory::desc( + framework::vectorize(ctx.Input("X")->dims()), + dnnl::memory::data_type::f32, ctx.Input("X")->format()); + if (tmp_md.data.format_desc.blocking.inner_nblks == 0) + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } 
+#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( @@ -136,6 +153,14 @@ Example: "(int, default 0) " "The axis which the input will be split on.") .SetDefault(0); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "bfloat16"}); } }; diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 54efa55cc4..370d9b3925 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -35,7 +35,213 @@ using user_function = std::function(const float*)>; using memory = mkldnn::memory; template + typename TBackward = mkldnn_dummy_primitive, + typename TBackward_params = mkldnn_dummy_primitive> +class MKLDNNHandlerNoCachingT { + public: + MKLDNNHandlerNoCachingT(mkldnn::engine engine, platform::Place cpu_place) + : engine_(engine), place_(cpu_place), fwd_pd_(nullptr), bwd_pd_(nullptr) { + platform::MKLDNNDeviceContext::tls().log_lib_version(); + } + + std::shared_ptr AcquireForwardPrimitive() { + return std::make_shared(*fwd_pd_); + } + + std::shared_ptr AcquireBackwardPrimitive() { + return std::make_shared(*bwd_pd_); + } + + std::shared_ptr AcquireBackwardWeightsPrimitive() { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, platform::errors::Unavailable("BWD_PD should be set when " + "getting BWD prim .")); + return std::make_shared(*bwd_w_pd_); + } + + std::shared_ptr AcquireSrcMemory( + const framework::Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(fwd_pd_->src_desc(), + to_void_cast(input_data)); + } + + template + std::shared_ptr AcquireDstMemory(framework::Tensor* output) { + T_out* ptr = + output->mutable_data(place_, fwd_pd_->dst_desc().get_size()); + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), ptr); + } + + template + std::shared_ptr AcquireDstMemory(void) { + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc()); + } + + template + std::shared_ptr AcquireDstMemory( + const framework::Tensor* output) { + const T_out* output_data = output->data(); + return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(), + to_void_cast(output_data)); + } + + std::shared_ptr AcquireDiffDstMemory( + const framework::Tensor* diffdst) { + const T* ptr = diffdst->data(); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_dst_desc(), + to_void_cast(ptr)); + } + + std::shared_ptr AcquireDiffSrcMemory( + framework::Tensor* diffsrc) { + T* ptr = + diffsrc->mutable_data(place_, bwd_pd_->diff_src_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr); + } + + // Buffer of given Tensor is used for oneDNN computation + std::shared_ptr AcquireDiffWeightsMemory( + framework::Tensor* diff_weights) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + T* ptr = diff_weights->mutable_data( + place_, bwd_w_pd_->diff_weights_desc().get_size()); + return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(), + ptr); + } + + // Buffer is allocated by oneDNN to store computation results + std::shared_ptr AcquireDiffWeightsMemory(void) { + PADDLE_ENFORCE_NOT_NULL( + bwd_w_pd_, + platform::errors::Unavailable( + "BWD_W_PD should be set when getting BWD grad of weights.")); + return 
this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc());
+  }
+
+ protected:
+  // If your primitive descriptor requires attributes, pass them as a
+  // first argument and parameters to descriptor constructor in the following
+  // arguments. Otherwise, all arguments will be forwarded to descriptor
+  // constructor, including the first one.
+  template <typename Arg, typename... Args>
+  void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) {
+    CreateForwardPrimitiveDescriptor(first_arg, std::forward<Args>(args)...);
+  }
+
+  // Using sfinae to specialise variadic function. Workaround for not having
+  // if constexpr in C++ 11.
+  template <typename First, typename... Args>
+  typename std::enable_if<std::is_same<typename std::decay<First>::type,
+                                       dnnl::primitive_attr>::value>::type
+  CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) {
+    auto fwd_desc = typename TForward::desc(std::forward<Args>(args)...);
+    fwd_pd_ = std::make_shared<typename TForward::primitive_desc>(
+        fwd_desc, first, engine_);
+  }
+
+  template <typename First, typename... Args>
+  typename std::enable_if<!std::is_same<typename std::decay<First>::type,
+                                        dnnl::primitive_attr>::value>::type
+  CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) {
+    auto fwd_desc = typename TForward::desc(std::forward<First>(first),
+                                            std::forward<Args>(args)...);
+    fwd_pd_ =
+        std::make_shared<typename TForward::primitive_desc>(fwd_desc, engine_);
+  }
+
+  template <typename... Args>
+  void AcquireBackwardPrimitiveDescriptor(Args&&... args) {
+    // fwd_pd_ is set during grad by calling
+    // AcquireForwardPrimitiveDescriptor
+    PADDLE_ENFORCE_NOT_NULL(fwd_pd_,
+                            platform::errors::Unavailable(
+                                "Get MKLDNN Forward primitive %s failed."));
+    auto bwd_desc = typename TBackward::desc(std::forward<Args>(args)...);
+    bwd_pd_ = std::make_shared<typename TBackward::primitive_desc>(
+        bwd_desc, engine_, *fwd_pd_);
+  }
+
+  template <typename... Args>
+  void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) {
+    // fwd_pd_ is set during grad by calling
+    // AcquireForwardPrimitiveDescriptor
+    PADDLE_ENFORCE_NOT_NULL(fwd_pd_,
+                            platform::errors::Unavailable(
+                                "Get MKLDNN Forward primitive %s failed."));
+    auto bwd_desc =
+        typename TBackward_params::desc(std::forward<Args>(args)...);
+    bwd_w_pd_ = std::make_shared<typename TBackward_params::primitive_desc>(
+        bwd_desc, engine_, *fwd_pd_);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
+      mkldnn::memory::desc md, void* ptr) {
+    return std::make_shared<mkldnn::memory>(md, engine_, ptr);
+  }
+
+  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
+      mkldnn::memory::desc md) {
+    return std::make_shared<mkldnn::memory>(md, engine_);
+  }
+
+  void AcquireReorder(const std::shared_ptr<mkldnn::memory>& user_memory_p,
+                      const std::shared_ptr<mkldnn::memory>& target_memory_p) {
+    auto reorder_p =
+        std::make_shared<mkldnn::reorder>(*user_memory_p, *target_memory_p);
+
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+
+    platform::RecordEvent record_reorder("int_reorder",
+                                         platform::EventRole::kUniqueOp);
+    reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
+                                 {MKLDNN_ARG_TO, *target_memory_p}});
+    astream.wait();
+  }
+
+  template <typename F = T>
+  std::shared_ptr<mkldnn::memory> AcquireMemoryWithReorder(
+      const mkldnn::memory::desc& user_md,
+      const mkldnn::memory::desc& target_md, void* ptr,
+      const std::string& suffix, bool is_persistent = false,
+      std::function<std::shared_ptr<F>(const F*)> custom_reorder_func = {}) {
+    std::shared_ptr<mkldnn::memory> target_memory_p;
+    if (custom_reorder_func) {
+      auto reordered_data =
+          custom_reorder_func(reinterpret_cast<const F*>(ptr));
+      ptr = reinterpret_cast<void*>(reordered_data.get());
+    }
+    auto user_memory_p = std::make_shared<dnnl::memory>(user_md, engine_, ptr);
+    if (user_md != target_md) {
+      target_memory_p = std::make_shared<dnnl::memory>(target_md, engine_);
+      auto reorder_p =
+          std::make_shared<dnnl::reorder>(*user_memory_p, *target_memory_p);
+
+      auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+      platform::RecordEvent record_reorder("int_reorder",
+                                           platform::EventRole::kUniqueOp);
+      reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p},
+                                   {MKLDNN_ARG_TO, *target_memory_p}});
+      astream.wait();
+    } else {
+      target_memory_p = user_memory_p;
+    }
+    return target_memory_p;
+  }
+
+  mkldnn::engine engine_;
+  platform::Place place_;
+  std::shared_ptr<typename TForward::primitive_desc> fwd_pd_;
+  std::shared_ptr<typename TBackward::primitive_desc> bwd_pd_;
+  std::shared_ptr<typename TBackward_params::primitive_desc> bwd_w_pd_;
+};
+
+template <typename T, typename TForward,
+          typename TBackward = mkldnn_dummy_primitive,
+          typename TBackward_params = mkldnn_dummy_primitive>
 class MKLDNNHandlerT {
  public:
   MKLDNNHandlerT(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
@@ -72,6 +278,21 @@ class MKLDNNHandlerT {
     return backward_p;
   }
 
+  std::shared_ptr<TBackward_params> AcquireBackwardWeightsPrimitive() {
+    const std::string key_p = key_ + "@bwd_w_p";
+    auto backward_p =
+        std::static_pointer_cast<TBackward_params>(dev_ctx_.GetBlob(key_p));
+    if (backward_p == nullptr) {
+      PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_, platform::errors::Unavailable(
+                                             "BWD_PD should be set when "
+                                             "getting BWD prim with key: %s.",
+                                             key_p));
+      backward_p = std::make_shared<TBackward_params>(*bwd_w_pd_);
+      dev_ctx_.SetBlob(key_p, backward_p);
+    }
+    return backward_p;
+  }
+
   std::shared_ptr<mkldnn::memory> AcquireSrcMemory(
       const framework::Tensor* input) {
     const T* input_data = input->data<T>();
@@ -116,23 +337,55 @@ class MKLDNNHandlerT {
                                             "@diff_src_mem_p");
   }
 
+  // Buffer of given Tensor is used for oneDNN computation
+  std::shared_ptr<mkldnn::memory> AcquireDiffWeightsMemory(
+      framework::Tensor* diff_weights) {
+    PADDLE_ENFORCE_NOT_NULL(
+        bwd_w_pd_,
+        platform::errors::Unavailable(
+            "BWD_W_PD should be set when getting BWD grad of weights."));
+    T* ptr = diff_weights->mutable_data<T>(
+        place_, bwd_w_pd_->diff_weights_desc().get_size());
+    return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(),
+                                            ptr, "@diff_wei_mem_p");
+  }
+
+  // Buffer is allocated by oneDNN to store computation results
+  std::shared_ptr<mkldnn::memory> AcquireDiffWeightsMemory(void) {
+    PADDLE_ENFORCE_NOT_NULL(
+        bwd_w_pd_,
+        platform::errors::Unavailable(
+            "BWD_W_PD should be set when getting BWD grad of weights."));
+    return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(),
+                                            "@diff_wei_mem_p");
+  }
+
  protected:
   bool isCached() {
-    const std::string key_pd = key_common_ + "@fwd_pd";
+    const std::string key_pd = key_ + "@fwd_pd";
     fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
         dev_ctx_.GetBlob(key_pd));
 
-    const std::string key_p = key_ + "@fwd_p";
-    return (dev_ctx_.GetBlob(key_p) != nullptr);
+    return (fwd_pd_ != nullptr);
   }
 
   bool isBwdCached() {
-    const std::string key_pd = key_common_ + "@bwd_pd";
+    const std::string key_pd = key_ + "@bwd_pd";
     bwd_pd_ = std::static_pointer_cast<typename TBackward::primitive_desc>(
         dev_ctx_.GetBlob(key_pd));
 
-    const std::string key_p = key_ + "@bwd_p";
-    return (dev_ctx_.GetBlob(key_p) != nullptr);
+    if (bwd_pd_ == nullptr) {
+      return false;
+    } else {
+      // When BWD is cached then still we need to Get FWD PD
+      const std::string key_fpd = key_ + "@fwd_pd";
+      fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
+          dev_ctx_.GetBlob(key_fpd));
+      PADDLE_ENFORCE_NOT_NULL(
+          fwd_pd_, platform::errors::Unavailable(
+                       "Error: FWD PD should be set when BWD PD is cached."));
+      return true;
+    }
   }
 
   // If your primitive descriptor requires attributes, pass them as a
   // first argument and paramters to descriptor constructor in the following
   // arguments. Otherwise, all arguments will be forwarded to descriptor
   // constructor, including the first one.
   template <typename Arg, typename... Args>
   void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&...
args) { - // Forward PD has to be passed to Grad op that - // may be executed by diffrent thread, hence - // for that one we use key that does not contain TID - const std::string key_pd = key_common_ + "@fwd_pd"; + // This is used when we can recreate FWD PD in BWD so + // we do not need to pass FWD to BWD + const std::string key_pd = key_ + "@fwd_pd"; fwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_pd)); if (fwd_pd_ == nullptr) { - static std::mutex acquire_barrier; - std::lock_guard block_threads_until_finish_this_job( - acquire_barrier); - fwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_pd)); - if (fwd_pd_ == nullptr) { - CreateForwardPrimitiveDescriptor(first_arg, - std::forward(args)...); - dev_ctx_.SetBlob(key_pd, fwd_pd_); - } + CreateForwardPrimitiveDescriptor(first_arg, std::forward(args)...); + dev_ctx_.SetBlob(key_pd, fwd_pd_); } } @@ -184,12 +428,12 @@ class MKLDNNHandlerT { template void AcquireBackwardPrimitiveDescriptor(Args&&... args) { - const std::string key_fwd_pd = key_common_ + "@fwd_pd"; - fwd_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_fwd_pd)); + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor PADDLE_ENFORCE_NOT_NULL( - fwd_pd_, platform::errors::Unavailable( - "Get MKLDNN Forward primitive %s failed.", key_fwd_pd)); + fwd_pd_, + platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.", + key_ + "@fwd_pd")); const std::string key_pd = key_ + "@bwd_pd"; bwd_pd_ = std::static_pointer_cast( dev_ctx_.GetBlob(key_pd)); @@ -201,6 +445,27 @@ class MKLDNNHandlerT { } } + template + void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) { + // fwd_pd_ is set during grad by calling + // AcquireForwardPrimitiveDescriptor + PADDLE_ENFORCE_NOT_NULL( + fwd_pd_, + platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.", + key_ + "@fwd_pd")); + const std::string key_pd = key_ + "@bwd_w_pd"; + bwd_w_pd_ = + std::static_pointer_cast( + dev_ctx_.GetBlob(key_pd)); + if (bwd_w_pd_ == nullptr) { + auto bwd_desc = + typename TBackward_params::desc(std::forward(args)...); + bwd_w_pd_ = std::make_shared( + bwd_desc, engine_, *fwd_pd_); + dev_ctx_.SetBlob(key_pd, bwd_w_pd_); + } + } + std::shared_ptr AcquireMemoryFromPrimitive( const std::string& suffix) { return std::static_pointer_cast( @@ -328,6 +593,7 @@ class MKLDNNHandlerT { std::string key_; std::shared_ptr fwd_pd_; std::shared_ptr bwd_pd_; + std::shared_ptr bwd_w_pd_; }; // TODO(grygielski) this class will be deleted later. @@ -528,73 +794,70 @@ class MKLDNNHandler { }; template -class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { +class BinaryMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { public: BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis, - const MKLDNNDeviceContext& dev_ctx, const mkldnn::engine engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, Tensor* z, - float scale_x, float scale_y, float scale_z, - const std::string& uniq_name) - : platform::MKLDNNHandlerT( - dev_ctx, engine, cpu_place, - platform::CreateKey( - dev_ctx, framework::vectorize(x->dims()), uniq_name, - (algo == dnnl::algorithm::binary_mul ? "M" : ""))) { - // bradcasting combined with in-place may require + float scale_x, float scale_y, float scale_z) + : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { + PADDLE_ENFORCE_EQ( + x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "Wrong layout set for X tensor. 
Expected: %d (kMKLDNN), Actual: %d", + DataLayout::kMKLDNN, x->layout())); + PADDLE_ENFORCE_NE(x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for X tensor : %d (undef)", + static_cast(x->format()))); + + PADDLE_ENFORCE_EQ( + y->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "Wrong layout set for Y tensor. Expected: %d (kMKLDNN), Actual: %d", + DataLayout::kMKLDNN, y->layout())); + PADDLE_ENFORCE_NE(y->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for Y tensor : %d (undef)", + static_cast(y->format()))); + + const auto src_x_tz = framework::vectorize(x->dims()); + const auto src_y_tz = framework::vectorize(y->dims()); + // if output tensor(z) is nullptr then we are computing into oneDNN + // managed buffer auto rankdiff = x->dims().size() - y->dims().size(); - if (rankdiff > 0) { - auto suffix = std::to_string(rankdiff); - this->key_ += suffix; - this->key_common_ += suffix; + const auto dst_tz = (z == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) + : framework::vectorize(z->dims()); + + auto src0_md = dnnl::memory::desc( + src_x_tz, platform::MKLDNNGetDataType(), x->format()); + auto src1_md = dnnl::memory::desc( + src_y_tz, platform::MKLDNNGetDataType(), y->format()); + if (rankdiff > 0) { // Second input is of smaller rank than first + std::vector dims1_ex(rankdiff, 1); + dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)), + src_y_tz.begin(), src_y_tz.end()); + src1_md = src1_md.reshape(dims1_ex); + } else if (rankdiff < 0) { // First input is of smaller than second + std::vector dims0_ex(-rankdiff, 1); + dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)), + src_x_tz.begin(), src_x_tz.end()); + src0_md = src0_md.reshape(dims0_ex); } + const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), + MKLDNNMemoryFormat::any); - if (!this->isCached()) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - PADDLE_ENFORCE_NE( - x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for X tensor.")); - - PADDLE_ENFORCE_EQ( - y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for Y tensor.")); - PADDLE_ENFORCE_NE( - y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for Y tensor.")); - - const auto src_x_tz = framework::vectorize(x->dims()); - const auto src_y_tz = framework::vectorize(y->dims()); - // if output tensor(z) is nullptr then we are computing into oneDNN - // managed buffer - const auto dst_tz = - (z == nullptr) ? src_x_tz : framework::vectorize(z->dims()); - - const auto src0_md = dnnl::memory::desc( - src_x_tz, platform::MKLDNNGetDataType(), x->format()); - auto src1_md = dnnl::memory::desc( - src_y_tz, platform::MKLDNNGetDataType(), y->format()); - if (rankdiff > 0) { - std::vector dims1_ex(rankdiff, 1); - dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? 
rankdiff : axis)), - src_y_tz.begin(), src_y_tz.end()); - src1_md = src1_md.reshape(dims1_ex); - } - const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), - MKLDNNMemoryFormat::any); - - auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); - this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, - src1_md, dst_md); - } + auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z); + this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, + dst_md); } std::shared_ptr AcquireSecondSrcMemory( const framework::Tensor* input) { const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->src1_desc(), to_void_cast(input_data), "@src1_mem_p"); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(), + to_void_cast(input_data)); } private: @@ -632,153 +895,156 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { template class BroadcastDataMKLDNNHandler - : public platform::MKLDNNHandlerT { + : public platform::MKLDNNHandlerNoCachingT { public: BroadcastDataMKLDNNHandler(const dnnl::algorithm algo, - const MKLDNNDeviceContext& dev_ctx, const mkldnn::engine engine, - platform::Place cpu_place, const Tensor* x, - const Tensor* y, float scale_x, float scale_y, - const std::string& uniq_name) - : platform::MKLDNNHandlerT( - dev_ctx, engine, cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), - uniq_name)) { - if (!this->isCached()) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - PADDLE_ENFORCE_NE( - x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for X tensor.")); - - PADDLE_ENFORCE_EQ( - y->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for Y tensor.")); - PADDLE_ENFORCE_NE( - y->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for Y tensor.")); - - auto src1_tz = framework::vectorize(y->dims()); - const auto src0_tz = framework::vectorize(x->dims()); - - // GetExpectedKernelType checks if smaller vector is a subvector with all - // the dims in correct order on the rightmost part of the bigger vector, - // i.e. 
a correct vector for broadcasting: - // x = 5, 7, 3, 2, 4, 8 - // y = 4, 8 - src1_tz.reserve(src0_tz.size()); - - for (size_t i = src1_tz.size(); i < src0_tz.size(); ++i) { - src1_tz.insert(src1_tz.begin(), 1L); - } + platform::Place cpu_place, const Tensor* out, + const Tensor* x, float scale_x, float scale_y, + const std::vector& input_dims) + : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { + PADDLE_ENFORCE_EQ( + x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for X tensor.")); + PADDLE_ENFORCE_NE( + x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for X tensor.")); + + const auto src0_tz = framework::vectorize(out->dims()); + + const auto src0_md = dnnl::memory::desc( + src0_tz, platform::MKLDNNGetDataType(), out->format()); + const auto src1_md = dnnl::memory::desc( + input_dims, platform::MKLDNNGetDataType(), out->format()); - const auto src0_md = dnnl::memory::desc( - src0_tz, platform::MKLDNNGetDataType(), x->format()); - const auto src1_md = dnnl::memory::desc( - src1_tz, platform::MKLDNNGetDataType(), x->format()); - - dnnl::primitive_attr attributes; - attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); - attributes.set_scales(DNNL_ARG_SRC_1, 0, {scale_y}); + dnnl::primitive_attr attributes; + attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); + attributes.set_scales(DNNL_ARG_SRC_1, 0, {scale_y}); - this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, - src1_md, src0_md); - } + this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, src1_md, + src0_md); } - std::shared_ptr AcquireSrcMemory(framework::Tensor* input) { - T* input_data = input->data(); - memset(input_data, 0, this->fwd_pd_->src_desc().get_size()); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->src_desc(), to_void_cast(input_data), "@src0_mem_p"); - } - - std::shared_ptr AcquireSecondSrcMemory( - const framework::Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive( - this->fwd_pd_->src1_desc(), to_void_cast(input_data), "@src1_mem_p"); + template + std::shared_ptr AcquireDstMemory(framework::Tensor* output) { + T_out* ptr = output->mutable_data( + this->place_, this->fwd_pd_->dst_desc().get_size()); + ; + memset(ptr, 0, this->fwd_pd_->dst_desc().get_size()); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr); } }; template class ReductionMKLDNNHandler - : public platform::MKLDNNHandlerT { + : public platform::MKLDNNHandlerNoCachingT { public: ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p, - const float eps, const MKLDNNDeviceContext& dev_ctx, - const mkldnn::engine engine, platform::Place cpu_place, - const Tensor* x, const Tensor* y, - const std::string& uniq_name, - std::vector output_dims) - : platform::MKLDNNHandlerT( - dev_ctx, engine, cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), - uniq_name, - (std::to_string(static_cast(algo))))) { - if (!this->isCached()) { - PADDLE_ENFORCE_EQ( - x->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for X tensor.")); - PADDLE_ENFORCE_NE( - x->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument("Wrong format set for X tensor.")); - - const auto src_tz = framework::vectorize(x->dims()); - - const auto src_md = dnnl::memory::desc( - src_tz, platform::MKLDNNGetDataType(), x->format()); - const auto dst_md = memory::desc( - output_dims, platform::MKLDNNGetDataType(), 
x->format());
-
-      this->AcquireForwardPrimitiveDescriptor(algo, src_md, dst_md, p, eps);
-    }
+                         const float eps, const mkldnn::engine engine,
+                         platform::Place cpu_place, const Tensor* x,
+                         const Tensor* y, std::vector<int64_t> y_tz)
+      : platform::MKLDNNHandlerNoCachingT<T, dnnl::reduction>(engine,
+                                                              cpu_place) {
+    PADDLE_ENFORCE_EQ(
+        x->layout(), DataLayout::kMKLDNN,
+        platform::errors::InvalidArgument("Wrong layout set for X tensor."));
+    PADDLE_ENFORCE_NE(
+        x->format(), MKLDNNMemoryFormat::undef,
+        platform::errors::InvalidArgument("Wrong format set for X tensor."));
+
+    const auto x_tz = framework::vectorize(x->dims());
+
+    const auto x_md =
+        dnnl::memory::desc(x_tz, platform::MKLDNNGetDataType<T>(), x->format());
+    const auto y_md =
+        memory::desc(y_tz, platform::MKLDNNGetDataType<T>(), x->format());
+
+    this->AcquireForwardPrimitiveDescriptor(algo, x_md, y_md, p, eps);
+  }
 };
 
 template <typename T>
 class ActivationMKLDNNHandler
-    : public MKLDNNHandlerT<T, mkldnn::eltwise_forward,
-                            mkldnn::eltwise_backward> {
+    : public MKLDNNHandlerNoCachingT<T, mkldnn::eltwise_forward,
+                                     mkldnn::eltwise_backward> {
  public:
-  ActivationMKLDNNHandler(const std::vector<int64_t>& dims,
-                          mkldnn::algorithm algorithm, float alpha, float beta,
-                          const MKLDNNMemoryFormat fmt,
-                          const platform::MKLDNNDeviceContext& dev_ctx,
-                          platform::Place cpu_place,
-                          const std::string& unique_name, bool is_inplaced)
-
-      : platform::MKLDNNHandlerT<T, mkldnn::eltwise_forward,
-                                 mkldnn::eltwise_backward>(
-            dev_ctx, dev_ctx.GetEngine(), cpu_place,
-            is_inplaced
-                ? platform::CreateKey(dev_ctx, dims, "a", algorithm,
-                                      unique_name)
-                : platform::CreateKey(dev_ctx, dims, "a", unique_name)) {
-    auto md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType<T>(), fmt);
+  ActivationMKLDNNHandler(mkldnn::algorithm algorithm,
+                          const framework::ExecutionContext& ctx,
+                          const mkldnn::engine engine, Place cpu_place,
+                          const framework::Tensor* in_x)
+      : platform::MKLDNNHandlerNoCachingT<T, mkldnn::eltwise_forward,
+                                          mkldnn::eltwise_backward>(engine,
+                                                                    cpu_place) {
+    float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 0;
+    float beta = ctx.HasAttr("beta") ? ctx.Attr<float>("beta") : 0;
+    // eltwise_linear means we are in scale op
+    if (algorithm == mkldnn::algorithm::eltwise_linear) {
+      bool bias_after_scale = ctx.Attr<bool>("bias_after_scale");
+      auto* scale_tensor = ctx.Input<Tensor>("ScaleTensor");
+      alpha = (scale_tensor == nullptr) ? ctx.Attr<float>("scale")
                                        : (float)*(scale_tensor->data<float>());
+      beta = ctx.Attr<float>("bias");
+      // if bias_after_scale == true
+      //    out = scale*X + bias
+      // else
+      //    out = scale*(X + bias) = scale*X + scale*bias
+      if (!bias_after_scale) beta *= alpha;
+    } else {
+      // paddle uses beta but mkldnn uses alpha for swish
+      if (algorithm == mkldnn::algorithm::eltwise_swish) {
+        std::swap(alpha, beta);
+      } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) {
+        alpha = ctx.Attr<float>("threshold");
+      }
+    }
+
+    PADDLE_ENFORCE(in_x->dims().size() >= 1 && in_x->dims().size() <= 6,
+                   platform::errors::Unimplemented(
+                       "Input dimension size can be 1, 2, 3, 4, "
+                       "5, or 6, but now the dimension size is %d.",
+                       in_x->dims().size()));
+
+    auto src_tz = framework::vectorize(in_x->dims());
+    auto src_fmt = src_tz.size() == 2 ?
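+    // 2-D inputs fall back to the plain `nc` tag; for higher ranks the
+    // format tag recorded on the input tensor is reused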
MKLDNNMemoryFormat::nc : in_x->format(); + auto md = + mkldnn::memory::desc(src_tz, platform::MKLDNNGetDataType(), src_fmt); this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, algorithm, md, alpha, beta); } - ActivationMKLDNNHandler(const std::vector& dims, - mkldnn::algorithm algorithm, float alpha, float beta, - const MKLDNNMemoryFormat fmt, - const MKLDNNMemoryFormat diff_fmt, - const platform::MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, - const std::string& unique_name) + ActivationMKLDNNHandler(mkldnn::algorithm algorithm, + const framework::ExecutionContext& ctx, + const mkldnn::engine engine, Place cpu_place, + const framework::Tensor* in_x, const Tensor* out_grad) + : platform::MKLDNNHandlerNoCachingT(engine, + cpu_place) { + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + + // paddle uses beta but mkldnn uses alpha for swish + if (algorithm == mkldnn::algorithm::eltwise_swish) { + std::swap(alpha, beta); + } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { + alpha = ctx.Attr("threshold"); + } + + auto diff_dst_tz = framework::vectorize(out_grad->dims()); + + auto src_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : in_x->format(); + auto diff_fmt = + diff_dst_tz.size() == 2 ? MKLDNNMemoryFormat::nc : out_grad->format(); - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, dims, "a", unique_name)) { + auto dims = framework::vectorize(in_x->dims()); auto diff_dst_md = platform::MKLDNNMemDesc( dims, platform::MKLDNNGetDataType(), diff_fmt); - auto src_md = - platform::MKLDNNMemDesc(dims, platform::MKLDNNGetDataType(), fmt); + auto src_md = platform::MKLDNNMemDesc( + dims, platform::MKLDNNGetDataType(), src_fmt); + this->AcquireForwardPrimitiveDescriptor(mkldnn::prop_kind::forward_training, + algorithm, src_md, alpha, beta); this->AcquireBackwardPrimitiveDescriptor(algorithm, diff_dst_md, src_md, alpha, beta); } @@ -787,218 +1053,124 @@ class ActivationMKLDNNHandler const framework::Tensor* input) { const T* input_data = input->data(); return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(), - to_void_cast(input_data), - "@bwd-src_mem_p"); + to_void_cast(input_data)); } }; -template -class LRNMKLDNNHandler - : public MKLDNNHandlerT { +class ReorderMKLDNNHandler : public MKLDNNHandler { public: - LRNMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, - const platform::MKLDNNDeviceContext& dev_ctx, - const mkldnn::engine mkldnn_engine, - platform::Place cpu_place, const Tensor* input, - const std::string& unique_name) - - : platform::MKLDNNHandlerT( - dev_ctx, mkldnn_engine, cpu_place, - platform::CreateKey(dev_ctx, framework::vectorize(input->dims()), - unique_name)) { - if (!this->isCached()) { - const int n = ctx.Attr("n"); - // MKL-DNN implements LRN in a caffe way: - // http://caffe.berkeleyvision.org/tutorial/layers/lrn.html - // Where sum of squares is divided by size of normalization window - // this is not the case for PaddlePaddle LRN. 
- // Hence we need to compensate for this diffrence by - // multipliing alpha by size of window(n) - const float alpha = ctx.Attr("alpha") * static_cast(n); - const float beta = ctx.Attr("beta"); - const float k = ctx.Attr("k"); - bool is_test = ctx.Attr("is_test"); - - auto dims = paddle::framework::vectorize(input->dims()); - - auto src_md = mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), - input->format()); - - this->AcquireForwardPrimitiveDescriptor( - is_test ? mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training, - mkldnn::algorithm::lrn_across_channels, src_md, n, alpha, beta, k); - } - } - - LRNMKLDNNHandler(const std::vector& dims, const int n, - const float alpha, const float beta, const float k, - const MKLDNNMemoryFormat fmt, - const MKLDNNMemoryFormat diff_fmt, - const platform::MKLDNNDeviceContext& dev_ctx, - platform::Place cpu_place, const std::string& unique_name) - - : platform::MKLDNNHandlerT( - dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(dev_ctx, dims, unique_name)) { - auto src_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), fmt); - auto diff_md = - mkldnn::memory::desc(dims, platform::MKLDNNGetDataType(), diff_fmt); - - this->AcquireBackwardPrimitiveDescriptor( - mkldnn::algorithm::lrn_across_channels, src_md, diff_md, n, alpha, beta, - k); - } - - std::shared_ptr AcquireWorkspaceMemory( - framework::Tensor* workspace) { - T* ptr = workspace->mutable_data( - this->place_, this->fwd_pd_->workspace_desc().get_size()); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), - ptr, "@wrk_mem_p"); - } - - std::shared_ptr AcquireBackwardWorkspaceMemory( - const framework::Tensor* workspace) { - const T* workspace_data = workspace->data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->workspace_desc(), - to_void_cast(workspace_data), - "@bwd-wrk_mem_p"); - } -}; + ReorderMKLDNNHandler(std::vector& dims, // NOLINT + framework::proto::VarType::Type vtype, + mkldnn::memory::data_type dtype, + const platform::MKLDNNDeviceContext& dev_ctx, + mkldnn::engine engine, const std::string& base_key) + : platform::MKLDNNHandler(dev_ctx, engine, base_key), + dims_(dims), + vtype_(vtype), + vtype_dst_(vtype), + dtype_(dtype), + dtype_dst_(dtype) {} -template -class TransposeMKLDNNHandler : public MKLDNNHandler { - public: - TransposeMKLDNNHandler(std::vector& dims, // NOLINT - std::vector& axis, // NOLINT - const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) + ReorderMKLDNNHandler(std::vector& dims, // NOLINT + framework::proto::VarType::Type vtype, + mkldnn::memory::data_type dtype, + framework::proto::VarType::Type vtype_dst, + mkldnn::memory::data_type dtype_dst, + const platform::MKLDNNDeviceContext& dev_ctx, + mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key), dims_(dims), - axis_(axis), - logical_axis_(dims.size(), 0) {} + vtype_(vtype), + vtype_dst_(vtype_dst), + dtype_(dtype), + dtype_dst_(dtype_dst) {} std::shared_ptr AcquireSrcMemory( const MKLDNNMemoryFormat& fmt, void* ptr) { - auto local_key = key_ + "@user_src_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - // Make memory descriptor using input format, unless it - // cannot be trusted (nchw) then make up memory fmt manually - for (size_t i = 0; i < logical_axis_.size(); ++i) { - logical_axis_[i] = i; - } + return this->AcquireMemory(dims_, dtype_, fmt, ptr, 
"@user_src_mem_p"); + } - auto src_md = fmt != MKLDNNMemoryFormat::nchw - ? platform::MKLDNNMemDesc( - dims_, platform::MKLDNNGetDataType(), fmt) - : Axis2MemoryDesc(dims_, logical_axis_); - mem_p = std::make_shared(src_md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); + std::shared_ptr AcquireSubmemory( + const std::vector& dims, const std::vector& offset, + const std::shared_ptr& mem_p, int submemory_number = 0) { + std::string local_key = key_; + local_key.append("@submem") + .append(std::to_string(submemory_number)) + .append("_p"); + + auto sub_mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + if (sub_mem_p == nullptr) { + auto sub_md = mem_p->get_desc().submemory_desc(dims, {offset}); + sub_mem_p = std::make_shared(sub_md, engine_, + mem_p->get_data_handle()); + dev_ctx_.SetBlob(local_key, sub_mem_p); } else { - mem_p->set_data_handle(ptr); + sub_mem_p->set_data_handle(mem_p->get_data_handle()); } - return mem_p; + return sub_mem_p; } - std::shared_ptr AcquireDstMemory(framework::Tensor* output, - platform::Place place) { + std::shared_ptr AcquireDstMemory( + framework::Tensor* output, const MKLDNNMemoryFormat& fmt, + platform::Place place) { auto local_key = key_ + "@user_dst_mem_p"; auto mem_p = std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); if (mem_p == nullptr) { - auto dst_md = Axis2MemoryDesc(dims_, axis_); - - auto dst_data = output->mutable_data(place, dst_md.get_size()); + auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_dst_, fmt); + auto dst_data = + output->mutable_data(place, vtype_dst_, dst_md.get_size()); mem_p = std::make_shared(dst_md, engine_, dst_data); dev_ctx_.SetBlob(local_key, mem_p); } else { - auto dst_data = output->mutable_data(place); + // Even if memory object exists , we may be using it for diffrent tensor + auto dst_data = + output->mutable_data(place, vtype_dst_, mem_p->get_desc().get_size()); mem_p->set_data_handle(dst_data); } return mem_p; } - std::shared_ptr AcquireTranspose( - std::shared_ptr dst_memory_p, - std::shared_ptr src_memory_p) { - auto prim_key = key_ + "@transpose_p"; - auto transpose_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (transpose_p == nullptr) { - transpose_p = - std::make_shared(*(src_memory_p), *(dst_memory_p)); - dev_ctx_.SetBlob(prim_key, transpose_p); - } - return transpose_p; - } - - protected: - mkldnn::memory::desc Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT - std::vector& axis // NOLINT - ) { - size_t ndims = axis.size(); - - std::vector strides(ndims); - unsigned int total_stride = 1; - for (int i = ndims - 1; i >= 0; --i) { - strides[axis[i]] = total_stride; - total_stride *= nchw_tz[axis[i]]; - } - mkldnn::memory::desc mem_d(nchw_tz, platform::MKLDNNGetDataType(), - strides); - - return mem_d; - } - - private: - std::vector dims_; - std::vector axis_; - std::vector logical_axis_; -}; - -class ReorderMKLDNNHandler : public MKLDNNHandler { - public: - ReorderMKLDNNHandler(std::vector& dims, // NOLINT - framework::proto::VarType::Type vtype, - mkldnn::memory::data_type dtype, - const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - dims_(dims), - vtype_(vtype), - dtype_(dtype) {} - - std::shared_ptr AcquireSrcMemory( - const MKLDNNMemoryFormat& fmt, void* ptr) { - return this->AcquireMemory(dims_, dtype_, fmt, ptr, "@user_src_mem_p"); - } - std::shared_ptr AcquireDstMemory( - framework::Tensor* output, const MKLDNNMemoryFormat& fmt, + framework::Tensor* 
   std::shared_ptr<mkldnn::memory> AcquireDstMemory(
-      framework::Tensor* output, const MKLDNNMemoryFormat& fmt,
+      framework::Tensor* output, const std::vector<int64_t>& dims,
+      const int memory_number, const MKLDNNMemoryFormat& fmt,
       platform::Place place) {
-    auto local_key = key_ + "@user_dst_mem_p";
+    auto local_key =
+        key_ + "@user_dst_mem" + std::to_string(memory_number) + "_p";
     auto mem_p =
         std::static_pointer_cast<mkldnn::memory>(dev_ctx_.GetBlob(local_key));
     if (mem_p == nullptr) {
-      auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_, fmt);
-      auto dst_data = output->mutable_data(place, vtype_, dst_md.get_size());
+      auto dst_md = platform::MKLDNNMemDesc(dims, dtype_dst_, fmt);
+      auto dst_data =
+          output->mutable_data(place, vtype_dst_, dst_md.get_size());
       mem_p = std::make_shared<mkldnn::memory>(dst_md, engine_, dst_data);
       dev_ctx_.SetBlob(local_key, mem_p);
     } else {
       // Even if the memory object exists, we may be using it for a different tensor
       auto dst_data =
-          output->mutable_data(place, vtype_, mem_p->get_desc().get_size());
+          output->mutable_data(place, vtype_dst_, mem_p->get_desc().get_size());
       mem_p->set_data_handle(dst_data);
     }
     return mem_p;
   }
 
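+  // Reorder primitive keyed by `reorder_number`, cached alongside the
+  // numbered destination memory above; executing the reorder performs the
+  // actual copy (including any layout or data-type conversion) from the
+  // source (sub-)memory into the output buffer.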
+  std::shared_ptr<mkldnn::reorder> AcquireReorder(
+      std::shared_ptr<mkldnn::memory> dst_memory_p,
+      std::shared_ptr<mkldnn::memory> src_memory_p, int reorder_number) {
+    auto prim_key = key_ + "@reorder" + std::to_string(reorder_number) + "_p";
+    auto reorder_p =
+        std::static_pointer_cast<mkldnn::reorder>(dev_ctx_.GetBlob(prim_key));
+    if (reorder_p == nullptr) {
+      reorder_p =
+          std::make_shared<mkldnn::reorder>(*(src_memory_p), *(dst_memory_p));
+      dev_ctx_.SetBlob(prim_key, reorder_p);
+    }
+    return reorder_p;
+  }
+
   std::shared_ptr<mkldnn::reorder> AcquireReorder(
       std::shared_ptr<mkldnn::memory> dst_memory_p,
       std::shared_ptr<mkldnn::memory> src_memory_p) {
@@ -1015,8 +1187,8 @@ class ReorderMKLDNNHandler : public MKLDNNHandler {
 
  private:
   std::vector<int64_t> dims_;
-  framework::proto::VarType::Type vtype_;
-  mkldnn::memory::data_type dtype_;
+  framework::proto::VarType::Type vtype_, vtype_dst_;
+  mkldnn::memory::data_type dtype_, dtype_dst_;
 };
 
 template <typename T>
@@ -1239,7 +1411,7 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
 
   std::shared_ptr<typename forward_t::primitive_desc>
   AcquireConvolutionPrimitiveDescriptor(
       const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
-      boost::optional<const mkldnn::memory::desc&> bias,
+      paddle::optional<const mkldnn::memory::desc&> bias,
       const mkldnn::memory::desc& dst, const std::vector<int>& strides,
       const std::vector<int>& dilations, const std::vector<int>& paddings,
       const mkldnn::engine& engine,
@@ -1339,11 +1511,6 @@ using ConvMKLDNNHandler =
     ConvMKLDNNTemplateHandler<mkldnn::convolution_forward,
                               mkldnn::convolution_backward_data,
                               mkldnn::convolution_backward_weights>;
 
-using ConvTransposeMKLDNNHandler =
-    ConvMKLDNNTemplateHandler<mkldnn::deconvolution_forward,
-                              mkldnn::deconvolution_backward_data,
-                              mkldnn::deconvolution_backward_weights>;
-
 template <typename T>
 static std::shared_ptr<mkldnn::memory> SetDstMemory(
     const framework::ExecutionContext& ctx, framework::Tensor* output,
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_slice_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_slice_mkldnn_op.py
new file mode 100644
index 0000000000..caebcffd0e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_slice_mkldnn_op.py
@@ -0,0 +1,199 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import paddle
+
+
+@OpTestTool.skip_if(core.is_compiled_with_cuda(),
+                    "CUDA required dygraph so oneDNN UT must be skipped")
+class TestSliceOneDNNOp(OpTest):
+    def setUp(self):
+        self.op_type = "slice"
+        self.config()
+        self.set_inputs()
+        self.outputs = {'Out': self.out}
+        self.attrs = {
+            'axes': self.axes,
+            'starts': self.starts,
+            'ends': self.ends,
+            'infer_flags': self.infer_flags,
+            'use_mkldnn': True
+        }
+        self.set_attrs()
+
+    def set_inputs(self):
+        self.inputs = {'Input': self.input}
+
+    def set_attrs(self):
+        pass
+
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [1, 0, 2]
+        self.ends = [3, 3, 4]
+        self.axes = [0, 1, 2]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1:3, 0:3, 2:4, :]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Input'], 'Out')
+
+
+class TestSliceOneDNNOp1(TestSliceOneDNNOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 2]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-3:3, 0:100, 2:-1, :]
+
+
+class TestSliceOneDNNOp2(TestSliceOneDNNOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-3:3, 0:100, :, 2:-1]
+
+
+class TestSliceDecrease1AxisOneDNNOp(TestSliceOneDNNOp):
+    def set_attrs(self):
+        self.attrs['decrease_axis'] = self.decrease_axis
+
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [1, 0, 2]
+        self.ends = [2, 3, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1, 0:3, 2:4, :]
+
+
+class TestSliceDecrease2AxesOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [1, 0, 2]
+        self.ends = [2, 1, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0, 1]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1, 0, 2:4, :]
+
+
+class TestSliceDecrease3AxesOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
+        self.starts = [-1, 0, 2]
+        self.ends = [1000000, 1, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0, 1]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-1, 0, 2:4, :]
+
+
+class TestSliceDecrease4AxesOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 7]).astype("float32")
+        self.starts = [0, 1, 2, 3]
+        self.ends = [1, 2, 3, 4]
+        self.axes = [0, 1, 2, 3]
+        self.decrease_axis = [0, 1, 2, 3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[0, 1, 2, 3:4]
+
+
+class TestSlice5DOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6, 7]).astype("float32")
+        self.starts = [-1]
+        self.ends = [1000000]
+        self.axes = [4]
+        self.decrease_axis = [4]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[:, :, :, :, -1]
+
+
+class TestSlice3DOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
+    def config(self):
+        self.input = np.random.random([5, 4, 5]).astype("float32")
+        self.starts = [-1]
+        self.ends = [1000000]
+        self.axes = [2]
+        self.decrease_axis = [2]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[:, :, -1]
+
+
+# BF16 TESTS
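+# Each FP32 case above is mirrored as a BF16 case: the factory below derives
+# a *_BF16 class from the given parent, feeds the input as bf16 values packed
+# into uint16, and checks gradients against the hand-computed references from
+# calculate_grads() instead of numeric differentiation.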
+def create_bf16_test_class(parent):
+    @OpTestTool.skip_if_not_cpu_bf16()
+    class TestSliceBF16OneDNNOp(parent):
+        def set_inputs(self):
+            self.dtype = np.uint16
+            self.inputs = {'Input': convert_float_to_uint16(self.input)}
+
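+        # dOut is taken to be the forward output; the reference dX is zero
+        # everywhere except the sliced region, which receives dOut (the same
+        # mapping the slice grad kernel performs).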
+        def calculate_grads(self):
+            self.dout = self.out
+            self.dx = np.zeros(shape=self.input.shape)
+
+            begin = [None] * self.input.ndim
+            end = [None] * self.input.ndim
+
+            for i in range(len(self.axes)):
+                begin[self.axes[i]] = self.starts[i]
+                end[self.axes[i]] = self.ends[i]
+            self.dx[begin[0]:end[0], begin[1]:end[1], begin[2]:end[2], begin[3]:
+                    end[3]] = self.dout
+
+        def test_check_output(self):
+            self.check_output_with_place(core.CPUPlace())
+
+        def test_check_grad(self):
+            self.calculate_grads()
+            self.check_grad_with_place(
+                core.CPUPlace(), ["Input"],
+                "Out",
+                user_defined_grads=[self.dx],
+                user_defined_grad_outputs=[convert_float_to_uint16(self.dout)])
+
+    cls_name = "{0}_{1}".format(parent.__name__, "BF16")
+    TestSliceBF16OneDNNOp.__name__ = cls_name
+    globals()[cls_name] = TestSliceBF16OneDNNOp
+
+
+create_bf16_test_class(TestSliceOneDNNOp)
+create_bf16_test_class(TestSliceOneDNNOp1)
+create_bf16_test_class(TestSliceDecrease1AxisOneDNNOp)
+create_bf16_test_class(TestSliceDecrease2AxesOneDNNOp)
+create_bf16_test_class(TestSliceDecrease3AxesOneDNNOp)
+create_bf16_test_class(TestSliceDecrease4AxesOneDNNOp)
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py b/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py
new file mode 100644
index 0000000000..bc8c3cc5b2
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_static_save_load_bf16.py
@@ -0,0 +1,137 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+import paddle.fluid.framework as framework
+from paddle.fluid.optimizer import SGDOptimizer
+from paddle.fluid.tests.unittests.test_imperative_base import new_program_scope
+from paddle.fluid.tests.unittests.test_static_save_load import PtbModel
+import numpy as np
+
+
+@unittest.skipIf(not core.supports_bfloat16(),
+                 "place does not support BF16 evaluation")
+class TestSaveLoadBF16(unittest.TestCase):
+    def set_place(self):
+        return fluid.CPUPlace()
+
+    def test_ptb_rnn_cpu_bfloat16(self):
+        seed = 90
+        hidden_size = 10
+        vocab_size = 500
+        num_layers = 1
+        num_steps = 3
+        init_scale = 0.1
+        batch_size = 4
+        batch_num = 100
+
+        with new_program_scope():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+            ptb_model = PtbModel(
+                "ptb_model",
+                hidden_size=hidden_size,
+                vocab_size=vocab_size,
+                num_layers=num_layers,
+                num_steps=num_steps,
+                init_scale=init_scale)
+
+            place = self.set_place()
+            exe = fluid.Executor(place)
+            sgd = SGDOptimizer(learning_rate=1e-3)
+            x = fluid.layers.data(
+                name="x", shape=[-1, num_steps], dtype='int64')
+            y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
+            init_hidden = fluid.layers.data(
+                name="init_hidden", shape=[1], dtype='float32')
+            init_cell = fluid.layers.data(
+                name="init_cell", shape=[1], dtype='float32')
+
+            static_loss, static_last_hidden, static_last_cell = ptb_model(
+                x, y, init_hidden, init_cell)
+
+            sgd = paddle.static.amp.bf16.decorate_bf16(
+                sgd,
+                amp_lists=paddle.static.amp.bf16.AutoMixedPrecisionListsBF16(
+                    custom_fp32_list={'transpose2', 'concat'}),
+                use_bf16_guard=False,
+                use_pure_bf16=True)
+
+            sgd.minimize(static_loss, framework.default_startup_program())
+            out = exe.run(framework.default_startup_program())
+
+            for i in range(batch_num):
+                x_data = np.arange(12).reshape(4, 3).astype('int64')
+                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
+                x_data = x_data.reshape((-1, num_steps, 1))
+                y_data = y_data.reshape((-1, 1))
+                # TODO: investigate initializing the model with "float32"
+                # instead of "uint16", as it was before the slice_op PR
+                # (data types in the model graph differ from the runtime
+                # data types because of that)
+                init_hidden_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='uint16')
+                init_cell_data = np.zeros(
+                    (num_layers, batch_size, hidden_size), dtype='uint16')
+
+                fetch_list = [static_loss, static_last_hidden, static_last_cell]
+                out = exe.run(fluid.default_main_program(),
+                              feed={
+                                  "x": x_data,
+                                  "y": y_data,
+                                  "init_hidden": init_hidden_data,
+                                  "init_cell": init_cell_data
+                              },
+                              fetch_list=fetch_list)
+
+            # get value before save
+            main_program = framework.default_main_program()
+            base_map = {}
+            for var in main_program.list_vars():
+                if isinstance(var, framework.Parameter) or var.persistable:
+                    t = np.array(fluid.global_scope().find_var(var.name)
+                                 .get_tensor())
+                    # make sure all the parameter and optimizer vars have been updated
+                    self.assertTrue(np.sum(np.abs(t)) != 0)
+                    base_map[var.name] = t
+
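+            # Round trip: save all persistable vars, zero them in the scope,
+            # reload from disk, and compare against the snapshot in base_map.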
+            fluid.save(main_program, "./test_1")
+
+            # set var to zero
+            for var in main_program.list_vars():
+                if isinstance(var, framework.Parameter) or var.persistable:
+                    ten = fluid.global_scope().find_var(var.name).get_tensor()
+                    ten.set(np.zeros_like(np.array(ten)), place)
+
+                    new_t = np.array(fluid.global_scope().find_var(var.name)
+                                     .get_tensor())
+                    # make sure all the parameter and optimizer vars have been
+                    # set to zero
+                    self.assertTrue(np.sum(np.abs(new_t)) == 0)
+
+            fluid.load(main_program, "./test_1.pdparams", exe)
+
+            for var in main_program.list_vars():
+                if isinstance(var, framework.Parameter) or var.persistable:
+                    new_t = np.array(fluid.global_scope().find_var(var.name)
+                                     .get_tensor())
+                    base_t = base_map[var.name]
+                    self.assertTrue(np.array_equal(new_t, base_t))
+
+
+if __name__ == '__main__':
+    paddle.enable_static()
+    unittest.main()
-- 
Gitee