From 77dbabb0f824df80ecc091ebb6efbdeccf0a34d2 Mon Sep 17 00:00:00 2001 From: PaddlePaddle-Gardener Date: Thu, 13 Jan 2022 14:24:39 +0800 Subject: [PATCH] mirgate_38871 --- paddle/fluid/eager/api/utils/tensor_utils.cc | 62 ++++ paddle/fluid/eager/backward.cc | 232 ++++++++++++ paddle/fluid/eager/eager_tensor.h | 8 +- paddle/fluid/eager/grad_node_info.cc | 270 ++++++++++++++ paddle/fluid/eager/grad_node_info.h | 231 ++++++++++++ .../grad_node_info_test.cc | 161 +++++++++ .../eager/tests/task_tests/backward_test.cc | 332 ++++++++++++++++++ .../cross_batch_accumulation_test.cc | 88 +++++ .../fluid/eager/tests/task_tests/hook_test.cc | 218 ++++++++++++ paddle/fluid/pybind/eager_method.cc | 44 +++ .../tests/unittests/test_egr_python_api.py | 27 ++ .../unittests/test_imperative_auto_prune.py | 15 +- 12 files changed, 1681 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index e69de29bb2..115c9144df 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -0,0 +1,62 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/utils.h" + +#include "paddle/pten/api/all.h" + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/variable.h" + +namespace egr { +namespace egr_utils_api { + +bool IsLeafTensor(const egr::EagerTensor& target) { + std::shared_ptr grad_node = EagerUtils::grad_node(target); + if (std::dynamic_pointer_cast(grad_node)) { + return true; + } + + return false; +} + +egr::EagerTensor CreateTensorWithValue(const pten::DDim& ddim, + const paddle::platform::Place& place, + const pten::DataType& dtype, + const pten::DataLayout& layout, + float value, bool is_leaf) { + paddle::experimental::Tensor tensor = paddle::experimental::full( + paddle::framework::vectorize(ddim), paddle::experimental::Scalar(value), + dtype, pten::TransToPtenBackend(place), layout); + + egr::EagerTensor out = egr::EagerTensor(); + out.set_tensor(std::make_shared(tensor)); + auto meta = EagerUtils::autograd_meta(&out); + if (is_leaf) { + auto accumulation_node = std::make_shared(); + meta->SetGradNode(accumulation_node); + meta->SetStopGradient(false); + } + + return out; +} + +} // namespace egr_utils_api +} // namespace egr diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index e69de29bb2..01cb1b81e3 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -0,0 +1,232 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/eager/backward.h" +#include + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/grad_tensor_holder.h" +#include "paddle/fluid/eager/utils.h" + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/errors.h" + +#include "glog/logging.h" + +namespace egr { + +std::unordered_map getInDegreeMap( + const std::queue& init_queue) { + // Calculate in_degree for each node + // We can completely remove this pass, if in_degree were set during forward + // pass + std::unordered_map node_in_degree_map; + + // Copy nodes + std::queue queue = init_queue; + std::unordered_set visited; + + // Visit each node exactly once in any order + while (!queue.empty()) { + GradNodeBase* node = queue.front(); + queue.pop(); + + if (visited.count(node)) { + continue; + } + visited.insert(node); + + // Find and append next nodes + const std::vector>& edges = node->GetEdges(); + for (const auto& edge_list : edges) { + for (const Edge& edge : edge_list) { + GradNodeBase* next_node = edge.GetMutableGradNode().get(); + + // Next node could be nullptr if it is leaf tensor with no + // AccumulationNode attached + // Or it could also originated from dispensable inputs + if (!next_node) continue; + + // Update in_degree + if (!node_in_degree_map.count(next_node)) + node_in_degree_map[next_node] = 0; + node_in_degree_map[next_node]++; + queue.push(next_node); + } + } + } + + return node_in_degree_map; +} + +void RunBackwardHooks( + const std::vector>& grad_tensors, 
+ egr::GradNodeBase* grad_node) { + grad_node->ApplyGradientHooks(grad_tensors); + VLOG(6) << "Apply Reduce Hooks for node"; + grad_node->ApplyReduceHooks(); +} + +void RunBackward(const std::vector& tensors, + const std::vector& grad_tensors, + bool retain_graph) { + VLOG(6) << "Start Backward"; + // *Gradient Hook should happen at node-level + // *Inplace version check should perform at node-level + // *Cross-batch accumulation happens at forward pass + + /* --- Initialization --- */ + // 1. Init queue with starting nodes + // 2. Prepare initial input buffers + std::queue queue; + std::unordered_map> + node_input_buffers_dict; + for (size_t i = 0; i < tensors.size(); i++) { + const egr::EagerTensor& tensor = tensors[i]; + + AutogradMeta* auto_grad_meta = EagerUtils::unsafe_autograd_meta(tensor); + // Get grad input info from target tensors + auto input_info = auto_grad_meta->OutRankInfo(); + + VLOG(2) << "Out Rank of Tensor is slot: " << input_info.first + << ", rank: " << input_info.second; + // Get target GradNodeBase from target tensors + GradNodeBase* grad_node = auto_grad_meta->GetMutableGradNode().get(); + + // Prepare GradTensorHolder + if (!node_input_buffers_dict.count(grad_node)) { + VLOG(6) << "Create Value for grad input tensor " << i; + node_input_buffers_dict[grad_node] = + std::make_unique(grad_node->InputMeta()); + } + + if (grad_tensors.size() > 0) { + PADDLE_ENFORCE( + grad_tensors.size() == tensors.size(), + paddle::platform::errors::Fatal( + "Detected size mismatch between tensors and grad_tensors" + "grad_tensors should either have " + "size = 0 or same size as tensors")); + // Feed given tensor if it's provided + VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor"; + node_input_buffers_dict[grad_node]->add( + input_info.first, input_info.second, grad_tensors[i]); + + } else { + VLOG(6) << "Fill grad input tensor " << i << " with 1.0"; + // Initialize tensor with 1.0 + // Forward Tensor "tensor" is passed to indicate 
tensortype, datatype and + // dims + // GradTensorHolder will initialize another tensor with same tensortype, + // datatype and dims but filled with 1.0 + node_input_buffers_dict[grad_node]->add( + input_info.first, input_info.second, tensor, true /*fill_one=true*/); + } + + // Prepare queue + queue.push(grad_node); + } + + VLOG(6) << "Update In degree Map for backward"; + // 3. Compute in_degree for each node + std::unordered_map node_in_degree_map = + getInDegreeMap(queue); + + /* --- Topological Visit --- */ + // 1. Pop queue + // 2. Run node + // |- node(grads) + // |- Prepare for next node + // 3. Update queue + VLOG(6) << "Run Backward"; + while (!queue.empty()) { + GradNodeBase* node = queue.front(); + queue.pop(); + + // Run node: This is where Hook happens + PADDLE_ENFORCE( + node_input_buffers_dict.count(node), + paddle::platform::errors::Fatal( + "Unable to find next node in the InputBuufer" + "Trying to run Node without configuring its GradTensorHolder")); + + std::unique_ptr node_input_buffer = + std::move(node_input_buffers_dict[node]); + VLOG(6) << "Run Backward Kernel with input_buffer"; + + RunBackwardHooks(node_input_buffer->Buffers(), node); + // TODO(jiabin): Support post hook here and make hook run in seperate + // operator + // Run Pre Backward Node and get outputs + std::vector> grad_output_tensors = + (*node)(node_input_buffer->Buffers()); + // TODO(jiabin): Should we erase it or find a more efficient way. 
+ node_input_buffers_dict.erase(node); + + // Prepare GradTensorHolder for next node + const std::vector>& edges = node->GetEdges(); + + PADDLE_ENFORCE(edges.size() == grad_output_tensors.size() || edges.empty(), + paddle::platform::errors::Fatal( + "Number of edges should be either empty ( for leaf node " + ") or the same as number of output grad tensors, but we " + "got edges size is: %d, grad_output size is: %d", + edges.size(), grad_output_tensors.size())); + + for (size_t i = 0; i < edges.size(); i++) { + for (size_t j = 0; j < edges[i].size(); j++) { + const Edge& edge = edges[i][j]; + auto edge_rank = edge.GetEdgeRankInfo(); + // Since we make edge has as same rank as bwd outputs, we indexing them + // with + // the same rank(i, j) + VLOG(6) << "Get Edge with slot: " << i << ", rank: " << j; + egr::EagerTensor& grad_output_tensor = grad_output_tensors[i][j]; + if (!grad_output_tensor.defined() || + !grad_output_tensor.initialized()) { + VLOG(6) << "We get grad_output_tensor with slot: " << i + << ", rank: " << j << " as uninitialized or undefined tensor"; + } + GradNodeBase* next_node = edge.GetMutableGradNode().get(); + + // Next node could be nullptr if it is leaf tensor with no + // AccumulationNode attached + // Or it could also originated from dispensable inputs + if (!next_node) continue; + + if (!node_input_buffers_dict.count(next_node)) { + node_input_buffers_dict[next_node] = + std::make_unique(next_node->InputMeta()); + } + VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first + << ", rank: " << edge_rank.second; + node_input_buffers_dict[next_node]->add( + edge_rank.first, edge_rank.second, grad_output_tensor); + + // Update queue + node_in_degree_map[next_node]--; + PADDLE_ENFORCE(node_in_degree_map[next_node] >= 0, + paddle::platform::errors::Fatal( + "Detected in-degree value smaller than zero." 
+ "Node's in-degree cannot be negative")); + if (node_in_degree_map[next_node] == 0) { + queue.emplace(std::move(next_node)); + } + } + } + } +} + +} // namespace egr diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index 72fe5732e9..80faad9080 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -195,7 +195,6 @@ class EagerTensor final { } tensor_->copy_(*(src.tensor_.get()), blocking); } - /* Part 6: Operator overloading */ EagerTensor& operator=(const EagerTensor& x) & { tensor_ = x.tensor_; @@ -238,7 +237,7 @@ class EagerTensor final { // Contruct framework::Tensor from egr::EagerTensor auto tensor_dense = std::dynamic_pointer_cast(tensor_->impl()); - if (tensor_dense) { + if (tensor_dense && tensor_dense.get()) { paddle::experimental::SharesStorage(tensor_dense.get(), framework_tensor); } else { @@ -292,11 +291,10 @@ class EagerTensor final { template void SetImplWithLegacyTensor() { const auto& framework_tensor = var_.Get(); - if (this->initialized()) { + if (defined()) { VLOG(8) << "Sync Var to initialized tensor for: " << name(); paddle::experimental::ReMakePtenDenseTensor( - framework_tensor, - static_cast(this->impl().get())); + framework_tensor, static_cast(impl().get())); } else { VLOG(8) << "Sync Var to uninitialized tensor for: " << name(); this->set_impl(std::move( diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index e69de29bb2..49bd416d46 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -0,0 +1,270 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/dense_tensor.h" + +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/errors.h" + +#include "glog/logging.h" + +/** + * Implementation of GradNodeBase, Edge and InputBuffer. +**/ +namespace egr { + +GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { + bwd_in_meta_.resize(bwd_in_slot_num); + bwd_out_meta_.resize(bwd_out_slot_num); + // adj_edges has the same num as backward outputs + adj_edges_.resize(bwd_out_slot_num); +} + +void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { + PADDLE_ENFORCE_LT( + slot_id, adj_edges_.size(), + paddle::platform::errors::InvalidArgument( + "Given slot id is out of range of adj_edges outter size, " + "adj_edges is designed to has the same size of grad " + "inputs's slot num.")); + for (const auto& meta : *metas) { + // adj_edges has as same rank as fwd inputs, and record it's output rank + // from + // its pre-ops + if (meta && !meta->StopGradient()) { + auto node = meta->GetMutableGradNode(); + if (node) { + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); + } else { + meta->SetGradNode(std::make_shared()); + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); + } + } + } +} + +void 
GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { + PADDLE_ENFORCE_LT( + slot_id, adj_edges_.size(), + paddle::platform::errors::InvalidArgument( + "Given slot id is out of range of adj_edges outter size, " + "adj_edges is designed to has the same size of grad " + "inputs's slot num.")); + if (meta && !meta->StopGradient()) { + VLOG(6) << "Add Edges for slot: " << slot_id; + auto node = meta->GetMutableGradNode(); + if (node) { + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); + } else { + meta->SetGradNode(std::make_shared()); + adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), + meta->OutRankInfo()); + } + } +} + +const std::vector& GradNodeBase::InputMeta() const { + return bwd_in_meta_; +} + +const std::vector& GradNodeBase::OutputMeta() const { + return bwd_out_meta_; +} + +void GradNodeBase::SetGradInMeta(const std::vector& fwd_out, + size_t slot_rank) { + size_t slot_size = fwd_out.size(); + PADDLE_ENFORCE_LE( + slot_rank, (bwd_in_meta_.size() - 1), + paddle::platform::errors::InvalidArgument( + "Slot Rank should less equal than bwd_in_meta_ size, since " + "bwd_in_meta_ is designed to hold as same num as backward " + "inputs.")); + auto& meta = bwd_in_meta_.at(slot_rank); + PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, + paddle::platform::errors::PreconditionNotMet( + "Bwd_in_meta should only be init once, addition " + "initialization for it is forbidden. If you got this " + "error, it indicates bugs in framework.")); + // Init stop gradient vector before use to avoid push back + meta.Init(slot_size); + for (size_t i = 0; i < slot_size; i++) { + PADDLE_ENFORCE_NOT_NULL(fwd_out[i], + paddle::platform::errors::PreconditionNotMet( + "Bwd_in_meta should only be called while " + "autograd_meta is not null. 
If you got this " + "error, it indicates bugs in framework.")); + if (fwd_out[i]->StopGradient()) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. + meta.SetStopGradient(i, fwd_out[i]->StopGradient()); + } + } +} + +void GradNodeBase::SetGradInMeta(const AutogradMeta& fwd_out, + size_t slot_rank) { + PADDLE_ENFORCE_LE( + slot_rank, (bwd_in_meta_.size() - 1), + paddle::platform::errors::InvalidArgument( + "Slot Rank should less equal than bwd_in_meta_ size, since " + "bwd_in_meta_ is designed to hold as same num as backward " + "inputs.")); + auto& meta = bwd_in_meta_.at(slot_rank); + PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, + paddle::platform::errors::PreconditionNotMet( + "Bwd_in_meta should only be init once, Additional " + "initialization for it is forbidden. If you got this " + "error, it indicates bugs in framework.")); + // Init stop gradient vector before use to avoid push back + VLOG(7) << "Init bwd_in_meta_ with slot rank: " << slot_rank; + meta.Init(1); + meta.SetStopGradient(0, fwd_out.StopGradient()); +} + +void GradNodeBase::SetGradOutMeta(const std::vector& fwd_in, + size_t slot_rank) { + size_t slot_size = fwd_in.size(); + PADDLE_ENFORCE_LE( + slot_rank, (bwd_out_meta_.size() - 1), + paddle::platform::errors::InvalidArgument( + "Slot Rank should less equal than bwd_out_meta_ size, " + "since bwd_out_meta_ is designed to hold as same num as " + "backward outputs.")); + auto& meta = bwd_out_meta_.at(slot_rank); + PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, + paddle::platform::errors::PreconditionNotMet( + "Bwd_out_meta should only be init once. Additional " + "initialization for it is forbidden. 
If you got this " + "error, it indicates bugs in framework.")); + // Init stop gradient vector before use to avoid push back + meta.Init(slot_size); + for (size_t i = 0; i < slot_size; i++) { + if (!fwd_in[i]) { + meta.SetStopGradient(i, true); + continue; + } + if (fwd_in[i]->StopGradient()) { + // Set Stop Gradient only when its true or non-initialized autograd_meta, + // since all default value is false. + meta.SetStopGradient(i, fwd_in[i]->StopGradient()); + } + } +} + +void GradNodeBase::SetGradOutMeta(const AutogradMeta& fwd_in, + size_t slot_rank) { + PADDLE_ENFORCE_LE( + (slot_rank + 1), bwd_out_meta_.size(), + paddle::platform::errors::InvalidArgument( + "Slot Rank should less equal than bwd_out_meta_ size, " + "since bwd_out_meta_ is designed to hold as same num as " + "backward outputs.")); + auto& meta = bwd_out_meta_.at(slot_rank); + PADDLE_ENFORCE_EQ(meta.IsInitialized(), false, + paddle::platform::errors::PreconditionNotMet( + "Bwd_out_meta should only be init once. Additional " + "initialization for it is forbidden. 
If you got this " + "error, it indicates bugs in framework.")); + // Init stop gradient vector before use to avoid push back + meta.Init(1); + meta.SetStopGradient(0, fwd_in.StopGradient()); +} + +void GradNodeBase::SetDefaultGradInOutMeta() { + PADDLE_ENFORCE((bwd_out_meta_.size() == 1) && (bwd_in_meta_.size() == 1), + paddle::platform::errors::PreconditionNotMet( + "We can only support 1 input and 1 output in default grad " + "meta setter, other size of inputs and outputs should " + "create with Setter and Getters")); + // Default stop_gradient is false and slot id is 0, slot size is 1; + bwd_out_meta_[0].Init(1); + bwd_in_meta_[0].Init(1); +} + +const std::vector>& GradNodeBase::GetEdges() const { + return adj_edges_; +} + +void GradNodeBase::RegisterGradientHook( + size_t slot_id, size_t rank, + const std::function& hook) { + gradient_hooks_.emplace_back(std::make_tuple(slot_id, rank, hook)); +} + +void GradNodeBase::RegisterReduceHook(const std::function& hook) { + reduce_hooks_.emplace_back(hook); +} + +std::vector> GradNodeBase::ApplyGradientHooks( + const std::vector>& tensors) { + std::vector> outs(tensors.size()); + for (auto& tuple : gradient_hooks_) { + size_t slot_id = std::get<0>(tuple); + size_t rank = std::get<1>(tuple); + std::function& hook = + std::get<2>(tuple); + + PADDLE_ENFORCE(slot_id < tensors.size(), + paddle::platform::errors::Fatal( + "Slot_id from registered hook should be smaller than " + "slot size of grad_tensors")); + + PADDLE_ENFORCE(rank < tensors[slot_id].size(), + paddle::platform::errors::Fatal( + "rank of slot %d from registered hook should be smaller " + "than rank size of grad_tensors", + slot_id)); + + std::vector& slot_out = outs[slot_id]; + slot_out.resize(tensors[slot_id].size()); + egr::EagerTensor& out = slot_out[rank]; + if (!out.defined() || !out.initialized()) { + VLOG(8) << "Run Hook for tensor: " << tensors[slot_id][rank].name(); + out = hook(tensors[slot_id][rank]); + } else { + // TODO(jiabin): Why this? 
+ out = hook(out); + } + } + + for (size_t i = 0; i < outs.size(); i++) { + if (outs[i].empty() && (!tensors[i].empty())) { + outs[i].resize(tensors[i].size()); + } + // TODO(Jiabin): Optimize this if we only add hook slot by slot + for (size_t j = 0; j < outs[i].size(); j++) { + if (!outs[i][j].defined() || !outs[i][j].initialized()) { + outs[i][j] = tensors[i][j]; + } + } + } + + return outs; +} + +void GradNodeBase::ApplyReduceHooks() { + for (auto& hook : reduce_hooks_) { + hook(); + } +} +} // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index e69de29bb2..f15c50ef75 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -0,0 +1,231 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/pten/api/all.h" +#include "paddle/pten/include/core.h" + +namespace egr { +/** + * GradNodeBase is base class of all grad node, which is what should be used by + * eager execution, we define most of backward autograd members here, and for + * each Operator, they should hold their onw forward Inputs as TensorWrapper. + * + * The GradNodeBase will be held in autograd_meta, and it is also a member of + * Edge, which indicates the edge of backward graph. 
+ * + * TODO:(yangzhanlue) GradNodeBase will also in charge of get the correct input + * from GradOpDescMaker to GradNodeBase. + * + * NOTE:GradNodeBase has a method named run, this method should be overrided by + * the + * specific derived class, it will prepare backward inputs and double backward's + * depends. Then, it will call C++ API of backward kernel functions to finish + * backward computation. + * + * NOTE:GradNodeBase holds its own inputs and Outputs + * + * Edge is defined to descripe depend of backward, an Edge is what linked + * between two + * node, it should contain a Node and rank of this Node (this is used to + * indicate which + * input of grad this edge belong). + * */ +class Edge; +class AutogradMeta; + +/** + * GradSlotMeta is used to Record Forward Tensor info to backward, since paddle + * has lots of operators + * whose backward logic is depends on if it has some specific inputs or outputs. + * So, we need a meta info + * to record it's needs. + * **/ +class GradSlotMeta { + public: + GradSlotMeta() = default; + void Init(size_t size) { + size_ = static_cast(size); + stop_gradient_.resize(size, false); + } + + bool IsInitialized() const { return size_ != -1; } + bool IsStopGradient(size_t rank) const { return stop_gradient_[rank]; } + int Size() const { return size_; } + void SetStopGradient(size_t rank, bool stop_gradient = true) { + stop_gradient_.at(rank) = stop_gradient; + } + + private: + int size_{-1}; + std::vector stop_gradient_{false}; +}; + +class GradNodeBase { + public: + GradNodeBase() = default; + GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num); + // TODO(jiabin): Should we have other constructor here? + virtual ~GradNodeBase() = default; + + /** + * operator() designed to contian the real backward execution logic, it should + * be + * overrided by derived class defined for each operator. 
It accepts a vector + * of + * Tensor which contains grads input of current operator + * + * Note: why we need backward inputs and outputs construct as vector of vector + * of egr::EagerTensor? + * Since all of paddle op composite in form of {"Slot name ", vector}, + * so, vector of vector + * is better choice to fit this format. + * **/ + virtual std::vector> operator()( + const std::vector>& grads) = 0; + + /** + * AddEdges is designed to set input tensors' backward Node as current + * node's Edges. + * This method should be call in forward code and for double backward depends + * computation. + * + * This one is called slot by slot + * **/ + void AddEdges(std::vector* metas, size_t slot_id); + void AddEdges(AutogradMeta* meta, size_t slot_id); + + /** + * GetEdges is designed to get all edges of current node**/ + const std::vector>& GetEdges() const; + + /** + * Get Input Meta of current Grad node**/ + const std::vector& InputMeta() const; + /** + * Get Output Meta of current Grad node**/ + const std::vector& OutputMeta() const; + /** + * Set bwd ins and outs info with forward vars + * **/ + + void SetGradInMeta(const std::vector& fwd_out, + size_t slot_rank); + void SetGradInMeta(const AutogradMeta& fwd_out, size_t slot_rank); + + void SetGradOutMeta(const std::vector& fwd_in, + size_t slot_rank); + void SetGradOutMeta(const AutogradMeta& fwd_in, size_t slot_rank); + + /** + * Default setters for Grad in/out meta this should be used for same special + * Node which will not create by user + * **/ + void SetDefaultGradInOutMeta(); + /** + * Register GradientHook or ReduceHook + * **/ + void RegisterGradientHook( + size_t slot_id, size_t rank, + const std::function& hook); + void RegisterReduceHook(const std::function& hook); + + /** + * Apply GradientHook or ReduceHook + * **/ + inline bool GradientHooksRegistered() { return gradient_hooks_.size() != 0; } + inline bool ReduceHooksRegistered() { return reduce_hooks_.size() != 0; } + + std::vector> 
ApplyGradientHooks( + const std::vector>& tensors); + void ApplyReduceHooks(); + + private: + // TODO(jiabin): Use SmallVector instead after merge PR from develop + + // Edges recorded the backward related node info, which indicate all edges + // linked + // by this Grad Node. + // Why we need vector>: Edges is as same rank as bwd output. + std::vector> adj_edges_; + + // bwd_out_meta_ is used to record Grad output info for backward + std::vector bwd_out_meta_; + + // bwd_in_meta_ used to record Grad input info for backward + std::vector bwd_in_meta_; + // Gradient Hooks + // Customer may register a list of hooks which will be called in order during + // backward + // Each entry consists one pair of + std::vector>> + gradient_hooks_; + std::vector> reduce_hooks_; +}; + +class Edge { + public: + // Default constructor for Edges in order to construct it for AutogradMeta + Edge() : in_slot_id_(0), in_rank_(0), grad_node_(nullptr) {} + + // In real use cases we should create Edge from grad node and input rank which + // indicate which edge it is. + // Since we have slot design in operators we will have to locate an edge with + // slot + // and rank. 
+ Edge(const std::shared_ptr& grad_node, size_t in_slot_id, + size_t in_rank) + : in_slot_id_(in_slot_id), in_rank_(in_rank), grad_node_(grad_node) {} + + Edge(const std::shared_ptr& grad_node, + const std::pair& rank_info) + : in_slot_id_(rank_info.first), + in_rank_(rank_info.second), + grad_node_(grad_node) {} + + GradNodeBase* GetGradNode() const { return grad_node_.get(); } + + std::shared_ptr GetMutableGradNode() const { + return grad_node_; + } + + std::pair GetEdgeRankInfo() const { + return std::make_pair(in_slot_id_, in_rank_); + } + + void SetEdgeRankInfo(size_t slot_id, size_t in_rank) { + in_slot_id_ = slot_id; + in_rank_ = in_rank; + } + + void SetEdgeRankInfo( + const std::pair& edge_rank) { + in_slot_id_ = edge_rank.first; + in_rank_ = edge_rank.second; + } + + // Currently we use grad_node_ to identify if a edge is initialized. + bool IsInitialized() const { return grad_node_.get(); } + + private: + size_t in_slot_id_; + size_t in_rank_; + std::shared_ptr grad_node_; +}; + +} // namespace egr diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index e69de29bb2..a89fb019d5 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -0,0 +1,161 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" +#include "paddle/pten/api/lib/utils/allocator.h" + +TEST(GradNodeInfo, GradSlotMeta) { + auto grad_slot = egr::GradSlotMeta(); + CHECK(grad_slot.IsInitialized() == false); + VLOG(6) << "Init GradSlotMeta"; + grad_slot.Init(2); + CHECK(grad_slot.IsInitialized() == true); + VLOG(6) << "Set SetStopGradient"; + grad_slot.SetStopGradient(0); + CHECK(grad_slot.IsStopGradient(0) == true); + CHECK_EQ(grad_slot.Size(), 2); +} + +TEST(GradNodeInfo, GradNodeBase) { + VLOG(6) << "Construct Grad Node"; + auto grad_test_node0 = std::make_shared( + /* val */ 5.0, /* in_num */ 2, /* out_num */ 2); + auto grad_test_node1 = std::make_shared(); + std::vector> grads; + pten::DenseTensorMeta meta = pten::DenseTensorMeta( + pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 1})); + std::shared_ptr dt = std::make_shared( + std::make_shared( + paddle::platform::CPUPlace()), + meta); + auto* dt_ptr = dt->mutable_data(); + dt_ptr[0] = 5.0f; + egr::EagerTensor et1(dt); + grads = {{et1}}; + VLOG(6) << "Test Grad Node Call"; + auto res = (*grad_test_node0)(grads); + CHECK_EQ(std::dynamic_pointer_cast(res[0][0].impl()) + ->data()[0], + 6.0f); + VLOG(6) << "Test Add Edges"; + egr::Edge edge0(grad_test_node1, 1, 2); + auto auto_grad0 = std::make_shared(edge0); + auto_grad0->SetStopGradient(false); + egr::Edge edge1(grad_test_node1, 3, 4); + auto auto_grad1 = std::make_shared(edge1); + auto_grad1->SetStopGradient(false); + grad_test_node0->AddEdges(auto_grad0.get(), 0); + CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().first, + size_t(1)); + 
CHECK_EQ(grad_test_node0->GetEdges()[0][0].GetEdgeRankInfo().second, + size_t(2)); + std::vector metas = {auto_grad1.get()}; + grad_test_node0->AddEdges(&metas, 1); + CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().first, + size_t(3)); + CHECK_EQ(grad_test_node0->GetEdges()[1][0].GetEdgeRankInfo().second, + size_t(4)); + + VLOG(6) << "Test Set Meta and Get Meta"; + auto_grad1->SetStopGradient(true); + grad_test_node0->SetGradInMeta(metas, 0); + grad_test_node0->SetGradInMeta(*auto_grad1.get(), 1); + grad_test_node0->SetGradOutMeta(metas, 0); + grad_test_node0->SetGradOutMeta(*auto_grad1.get(), 1); + CHECK_EQ(grad_test_node0->InputMeta()[0].Size(), 1); + CHECK_EQ(grad_test_node0->InputMeta()[1].Size(), 1); + CHECK(grad_test_node0->OutputMeta()[0].IsStopGradient(0)); + CHECK(grad_test_node0->OutputMeta()[1].IsStopGradient(0)); + + VLOG(6) << "Test Default Set Meta and Get Meta"; + auto grad_test_node2 = std::make_shared( + /* val */ 5.0, /* in_num */ 1, /* out_num */ 1); + grad_test_node2->SetDefaultGradInOutMeta(); + CHECK(grad_test_node2->OutputMeta()[0].IsInitialized()); + CHECK(grad_test_node2->OutputMeta()[0].IsStopGradient(0) == false); + CHECK_EQ(grad_test_node2->OutputMeta()[0].Size(), 1); + + VLOG(6) << "Test Gradient Hook"; + auto gradient_hook = [](const egr::EagerTensor& et) -> egr::EagerTensor { + egr::EagerTensor res; + pten::DenseTensorMeta meta = pten::DenseTensorMeta( + pten::DataType::FLOAT32, paddle::framework::make_ddim({1, 1})); + std::shared_ptr dt = std::make_shared( + std::make_shared( + paddle::platform::CPUPlace()), + meta); + auto* dt_ptr = dt->mutable_data(); + dt_ptr[0] = 6.0f; + auto* et_ptr = + std::dynamic_pointer_cast(et.impl())->data(); + dt_ptr[0] += et_ptr[0]; + res.set_impl(dt); + VLOG(6) << "Running Gradient Hook"; + return res; + }; + grad_test_node0->RegisterGradientHook(0, 0, gradient_hook); + // 5 + 6 + auto grad_hook_res = grad_test_node0->ApplyGradientHooks(grads); + CHECK_EQ( + 
std::dynamic_pointer_cast(grad_hook_res[0][0].impl()) + ->data()[0], + 11.0); + + VLOG(6) << "Test Reduce Hook"; + auto reduce_hook = [&](void) -> void { + auto* et_ptr = std::dynamic_pointer_cast(et1.impl()) + ->mutable_data(); + et_ptr[0] = 100.0; + VLOG(6) << "Running Reduce Hook"; + }; + grad_test_node0->RegisterReduceHook(reduce_hook); + grad_test_node0->ApplyReduceHooks(); + CHECK_EQ(std::dynamic_pointer_cast(et1.impl()) + ->data()[0], + 100.0); +} + +TEST(GradNodeInfo, Edge) { + auto grad_test_node0 = std::make_shared(5, 2, 2); + VLOG(6) << "Test Construct Edge"; + egr::Edge edge0 = egr::Edge(); + CHECK(edge0.IsInitialized() == false); + egr::Edge edge1 = egr::Edge(grad_test_node0, size_t(0), size_t(0)); + CHECK(edge1.IsInitialized() == true); + egr::Edge edge2 = + egr::Edge(grad_test_node0, std::make_pair(size_t(1), size_t(0))); + VLOG(6) << "Test Set Edge's Grad Node"; + auto* grad_node = edge1.GetGradNode(); + CHECK_EQ(grad_node->InputMeta().size(), size_t(2)); + auto mt_grad_node = edge1.GetMutableGradNode(); + auto auto_grad1 = std::make_shared(); + std::vector metas = {auto_grad1.get()}; + // Uninitialized AutogradMeta indicates + mt_grad_node->SetGradInMeta(metas, 0); + CHECK(grad_node->InputMeta()[0].IsStopGradient(0) == true); + VLOG(6) << "Test Get/Set Edge Rank Info"; + CHECK_EQ(edge2.GetEdgeRankInfo().first, size_t(1)); + CHECK_EQ(edge2.GetEdgeRankInfo().second, size_t(0)); + edge2.SetEdgeRankInfo(2, 3); + CHECK_EQ(edge2.GetEdgeRankInfo().first, size_t(2)); + CHECK_EQ(edge2.GetEdgeRankInfo().second, size_t(3)); + edge2.SetEdgeRankInfo(std::make_pair(size_t(4), size_t(5))); + CHECK_EQ(edge2.GetEdgeRankInfo().first, size_t(4)); + CHECK_EQ(edge2.GetEdgeRankInfo().second, size_t(5)); +} diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index e69de29bb2..3737fd95ad 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ 
b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -0,0 +1,332 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/tests/test_utils.h" + +#include "paddle/fluid/eager/api/all.h" + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/tensor_meta.h" + +namespace egr { + +TEST(Backward, SingleNodeEmptyGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + egr::EagerTensor target_tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + + egr::EagerTensor leaf_tensor; + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + node0_ptr->SetDefaultGradInOutMeta(); + 
AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&target_tensor); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = std::make_shared(); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + + egr_utils_api::RetainGradForTensor(leaf_tensor); + + // Connect Node0 -> AccumulationNode via Edge + auto meta = egr::AutogradMeta(); + meta.SetStopGradient(false); + meta.SetSingleOutRankWithSlot(0, 0); + meta.SetGradNode(acc_node_ptr); + std::vector res = {&meta}; + node0_ptr->AddEdges(&res, 0); + } + std::vector outs = {target_tensor}; + // Run Backward + RunBackward(outs, {}); + + // Check Output Value + eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); +} + +TEST(Backward, SingleNodeCustomGrad) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + std::vector target_tensors; + paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + std::vector grad_tensors; + // Create Grad Tensor + egr::EagerTensor grad_tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor)); + + egr::EagerTensor leaf_tensor; + { + // Create Scale Node + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta + 
node0_ptr->SetDefaultGradInOutMeta(); + + // Connect Tensor and Node via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = std::make_shared(); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + + egr_utils_api::RetainGradForTensor(leaf_tensor); + + // Connect Node0 -> AccumulationNode via Edge + auto meta = egr::AutogradMeta(); + meta.SetStopGradient(false); + meta.SetSingleOutRankWithSlot(0, 0); + meta.SetGradNode(acc_node_ptr); + std::vector res = {&meta}; + node0_ptr->AddEdges(&res, 0); + } + + // Run Backward + RunBackward(target_tensors, grad_tensors); + + // Check Output Value + eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); +} + +/* +Node1 + | +Node0 + | + inp0 +*/ +TEST(Backward, LinearNodes) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + std::vector target_tensors; + paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + + egr::EagerTensor leaf_tensor; + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta for node0 + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + + // Set grad in/out meta for node1 + 
node1_ptr->SetDefaultGradInOutMeta(); + + // Connect Input Tensor and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + + // Connect Node0 -> Node1 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node1_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = std::make_shared(); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + + egr_utils_api::RetainGradForTensor(leaf_tensor); + + // Connect Node1 -> AccumulationNode via Edge + auto meta1 = egr::AutogradMeta(); + meta1.SetStopGradient(false); + meta1.SetSingleOutRankWithSlot(0, 0); + meta1.SetGradNode(acc_node_ptr); + std::vector res1 = {&meta1}; + node1_ptr->AddEdges(&res1, 0); + } + + // Use Empty Grad Tensor + RunBackward(target_tensors, {}); + + // Check Output Value + eager_test::CompareGradTensorWithValue(leaf_tensor, 50.0); +} + +/* + Node2 + | | +Node0 Node1 + | | + inp0 inp1 +*/ +TEST(Backward, WithAccumulation) { + // Prepare Device Contexts + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + std::vector target_tensors; + egr::EagerTensor tensor0 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + egr::EagerTensor tensor1 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 1.0 
/*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor0)); + target_tensors.emplace_back(std::move(tensor1)); + + // Create Grad Tensor + std::vector grad_tensors; + egr::EagerTensor grad_tensor0 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 5.0 /*value*/, false /*is_leaf*/); + egr::EagerTensor grad_tensor1 = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 10.0 /*value*/, false /*is_leaf*/); + grad_tensors.emplace_back(std::move(grad_tensor0)); + grad_tensors.emplace_back(std::move(grad_tensor1)); + + egr::EagerTensor leaf_tensor; + { + // Create Node0 + auto node0_ptr = std::make_shared(1, 1); + node0_ptr->SetAttributes_scale(5.0 /*scale*/); + node0_ptr->SetDefaultGradInOutMeta(); + + // Create Node1 + auto node1_ptr = std::make_shared(1, 1); + node1_ptr->SetAttributes_scale(10.0 /*scale*/); + node1_ptr->SetDefaultGradInOutMeta(); + // Create Node2 + auto node2_ptr = std::make_shared(1, 1); + node2_ptr->SetAttributes_scale(20.0 /*scale*/); + node2_ptr->SetDefaultGradInOutMeta(); + // Connect Inp0 and Node0 via AutoGradMeta + AutogradMeta* auto_grad_meta0 = + EagerUtils::autograd_meta(&(target_tensors[0])); + auto_grad_meta0->SetGradNode( + std::dynamic_pointer_cast(node0_ptr)); + auto_grad_meta0->SetSingleOutRankWithSlot(0, 0); + + // Connect Inp1 and Node1 via AutoGradMeta + AutogradMeta* auto_grad_meta1 = + EagerUtils::autograd_meta(&(target_tensors[1])); + auto_grad_meta1->SetGradNode( + std::dynamic_pointer_cast(node1_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + + // Connect Node0 -> Node2 via Edge + auto meta0 = egr::AutogradMeta(); + meta0.SetStopGradient(false); + meta0.SetSingleOutRankWithSlot(0, 0); + meta0.SetGradNode(node2_ptr); + std::vector res0 = {&meta0}; + node0_ptr->AddEdges(&res0, 0); + + // Connect Node1 -> Node2 via Edge + auto meta1 = 
egr::AutogradMeta(); + meta1.SetStopGradient(false); + meta1.SetSingleOutRankWithSlot(0, 0); + meta1.SetGradNode(node2_ptr); + std::vector res1 = {&meta1}; + node1_ptr->AddEdges(&res1, 0); + + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = std::make_shared(); + + AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); + auto_grad_meta2->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); + + egr_utils_api::RetainGradForTensor(leaf_tensor); + + // Connect Node2 -> AccumulationNode via Edge + auto meta2 = egr::AutogradMeta(); + meta2.SetStopGradient(false); + meta2.SetSingleOutRankWithSlot(0, 0); + meta2.SetGradNode(acc_node_ptr); + std::vector res2 = {&meta2}; + node2_ptr->AddEdges(&res2, 0); + } + + RunBackward(target_tensors, grad_tensors); + + eager_test::CompareGradTensorWithValue(leaf_tensor, 2500.0); +} + +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index e69de29bb2..7f180fa107 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/grad_node_info.h" + +#include "paddle/fluid/eager/api/all.h" + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/tensor_meta.h" + +#include "paddle/fluid/eager/tests/test_utils.h" + +namespace egr { + +TEST(CrossBatchAccumulation, SingleScaleNode) { + eager_test::InitEnv(paddle::platform::CPUPlace()); + + std::vector target_tensors; + paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); + + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + egr::EagerTensor& target_tensor = target_tensors[0]; + + egr::EagerTensor leaf_tensor = egr::EagerTensor(); + { + auto scale_node_ptr = std::make_shared(1, 1); + scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); + + scale_node_ptr->SetDefaultGradInOutMeta(); + + auto acc_node_ptr = std::make_shared(); + + AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&target_tensor); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(scale_node_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 + + auto meta = AutogradMeta(); + meta.SetSingleOutRankWithSlot(0, 0); + meta.SetStopGradient(false); + meta.SetGradNode(acc_node_ptr); + std::vector res = {&meta}; + scale_node_ptr->AddEdges(&res, 0); + + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + auto_grad_meta1->SetGradNode( + 
std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + egr_utils_api::RetainGradForTensor(leaf_tensor); + } + + RunBackward(target_tensors, {}); + + eager_test::CompareGradTensorWithValue(target_tensor, 1.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 5.0); + + RunBackward(target_tensors, {}); + + eager_test::CompareGradTensorWithValue(target_tensor, 1.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 10.0); +} + +} // namespace egr diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index e69de29bb2..0f8039dade 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -0,0 +1,218 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "glog/logging.h" +#include "gtest/gtest.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/grad_node_info.h" + +#include "paddle/fluid/eager/api/all.h" + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/core/tensor_meta.h" + +#include "paddle/fluid/eager/tests/test_utils.h" + +namespace egr { + +egr::EagerTensor hook_function(const egr::EagerTensor& t) { + auto t_dense = std::dynamic_pointer_cast(t.impl()); + + auto ret_meta = pten::DenseTensorMeta(t_dense->dtype(), t_dense->dims(), + t_dense->layout()); + auto place = t_dense->place(); + size_t bytes_size = + paddle::framework::product(t_dense->dims()) * SizeOf(t_dense->dtype()); + auto ret_dense = std::make_shared( + pten::make_intrusive( + paddle::memory::Alloc(place, bytes_size)), + std::move(ret_meta)); + + float* t_ptr = t_dense->mutable_data(); + float* ret_ptr = ret_dense->mutable_data(); + for (int i = 0; i < ret_dense->numel(); i++) { + ret_ptr[i] = t_ptr[i] + 3.0; + } + + auto ret_impl = std::dynamic_pointer_cast(ret_dense); + egr::EagerTensor ret = egr::EagerTensor(); + ret.set_impl(ret_impl); + + return ret; +} + +TEST(RetainGrad, HookBeforeRetainGrad) { + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + std::vector target_tensors; + paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 32}); + + // Create Target Tensor + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + egr::EagerTensor& target_tensor = target_tensors[0]; + + // Create ScaleNode + auto scale_node_ptr = std::make_shared(1, 1); + 
scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); + + // Set grad in/out meta for node0 + scale_node_ptr->SetDefaultGradInOutMeta(); + + // Create AccumulationNode + auto acc_node_ptr = std::make_shared(); + + // Connect Input Tensor and ScaleNode via AutoGradMeta + // Apply RetainGrad + { + // ScaleNode Hook: +3 + std::function hook = + &hook_function; + + auto auto_grad_meta = std::make_shared(); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(scale_node_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + target_tensor.set_autograd_meta( + std::dynamic_pointer_cast( + auto_grad_meta)); + + egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RetainGradForTensor( + target_tensor); // result: 1.0 + 3.0 = 4.0 + } + + // Connect ScaleNode -> AccumulationNode via Edge + { + auto meta = AutogradMeta(); + meta.SetStopGradient(false); + meta.SetSingleOutRankWithSlot(0, 0); + meta.SetGradNode(acc_node_ptr); + std::vector res = {&meta}; + scale_node_ptr->AddEdges(&res, 0); + } + + // Retain Grad for leaf tensor1 + egr::EagerTensor leaf_tensor = egr::EagerTensor(); + { + // AccumulationNode Hook: +3 + std::function hook = + &hook_function; + + auto auto_grad_meta = std::make_shared(); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + leaf_tensor.set_autograd_meta( + std::dynamic_pointer_cast( + auto_grad_meta)); + + egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RetainGradForTensor( + leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 + } + + RunBackward(target_tensors, {}); + + eager_test::CompareGradTensorWithValue(target_tensor, 4.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); +} + +TEST(RetainGrad, HookAfterRetainGrad) { + eager_test::InitEnv(paddle::platform::CPUPlace()); + + // Prepare Inputs + std::vector target_tensors; + paddle::framework::DDim ddim = paddle::framework::make_ddim({4, 16, 16, 
32}); + + // Create Target Tensor + egr::EagerTensor tensor = egr_utils_api::CreateTensorWithValue( + ddim, paddle::platform::CPUPlace(), pten::DataType::FLOAT32, + pten::DataLayout::NCHW, 1.0 /*value*/, false /*is_leaf*/); + target_tensors.emplace_back(std::move(tensor)); + egr::EagerTensor& target_tensor = target_tensors[0]; + + // Create ScaleNode + auto scale_node_ptr = std::make_shared(1, 1); + scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); + // Set grad in/out meta for node0 + scale_node_ptr->SetDefaultGradInOutMeta(); + // Create AccumulationNode + auto acc_node_ptr = std::make_shared(); + + // Connect Input Tensor and ScaleNode via AutoGradMeta + // Apply RetainGrad + { + // ScaleNode Hook: +3 + std::function hook = + &hook_function; + + auto auto_grad_meta = std::make_shared(); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(scale_node_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + target_tensor.set_autograd_meta( + std::dynamic_pointer_cast( + auto_grad_meta)); + + egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 + egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); + } + + // Connect ScaleNode -> AccumulationNode via Edge + { + auto meta = AutogradMeta(); + meta.SetStopGradient(false); + meta.SetSingleOutRankWithSlot(0, 0); + meta.SetGradNode(acc_node_ptr); + std::vector res = {&meta}; + scale_node_ptr->AddEdges(&res, 0); + } + + // Retain Grad for leaf tensor1 + egr::EagerTensor leaf_tensor = egr::EagerTensor(); + { + // AccumulationNode Hook: +3 + std::function hook = + &hook_function; + + auto auto_grad_meta = std::make_shared(); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(acc_node_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + leaf_tensor.set_autograd_meta( + std::dynamic_pointer_cast( + auto_grad_meta)); + + egr_utils_api::RetainGradForTensor( + leaf_tensor); // RetainGrad for leaf tensor gets + // postponed, result: 4.0*5.0 + 3.0 = + // 23.0 + 
egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); + } + + RunBackward(target_tensors, {}); + eager_test::CompareGradTensorWithValue(target_tensor, 1.0); + eager_test::CompareGradTensorWithValue(leaf_tensor, 23.0); +} +} // namespace egr diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index c56fe5be4d..a0067f9c64 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -234,6 +234,44 @@ static PyObject* eager_tensor__zero_grads(EagerTensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* eager_tensor__share_buffer_to(EagerTensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_SYNC_TRY + egr::EagerTensor* src_ptr = + &(reinterpret_cast(PyTuple_GET_ITEM(args, 0)) + ->eager_tensor); + PADDLE_ENFORCE_EQ(self->eager_tensor.initialized(), true, + platform::errors::InvalidArgument( + "Tensor %s has not been initialized! please initialize " + "src tensor before share_buffer_with to other.", + self->eager_tensor.name())); + src_ptr->set_impl(self->eager_tensor.impl()); + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_tensor__is_shared_buffer_with(EagerTensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_SYNC_TRY + egr::EagerTensor src_tensor = + CastPyArg2EagerTensor(PyTuple_GET_ITEM(args, 0), 0); + PADDLE_ENFORCE_EQ(src_tensor.initialized(), true, + platform::errors::InvalidArgument( + "Tensor %s has not been initialized! 
please initialize " + "src tensor before share_buffer_with to other.", + src_tensor.name())); + bool res = false; + if (!self->eager_tensor.defined() || !src_tensor.defined()) { + return ToPyObject(res); + } + res = (self->eager_tensor.impl().get() == src_tensor.impl().get()); + return ToPyObject(res); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* eager_tensor_method_detach(EagerTensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_SYNC_TRY @@ -278,6 +316,12 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, {"_zero_grads", (PyCFunction)(void (*)(void))eager_tensor__zero_grads, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_is_shared_buffer_to", + (PyCFunction)(void (*)(void))eager_tensor__share_buffer_to, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_share_buffer_with", + (PyCFunction)(void (*)(void))eager_tensor__is_shared_buffer_with, + METH_VARARGS | METH_KEYWORDS, NULL}, {"detach", (PyCFunction)(void (*)(void))eager_tensor_method_detach, METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index e4576fe2ea..3ab7981cdb 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -645,6 +645,33 @@ class EagerTensorPropertiesTestCase(unittest.TestCase): self.assertTrue(tensor3.stop_gradient, True) self.assertTrue(tensor3.place.is_cpu_place()) + def test_share_buffer_to(): + arr = np.ones([4, 16, 16, 32]).astype('float32') + arr1 = np.zeros([4, 16]).astype('float32') + arr2 = np.ones([4, 16, 16, 32]).astype('float32') + np.ones( + [4, 16, 16, 32]).astype('float32') + tensor = None + tensor2 = None + tensor = paddle.to_tensor(arr, core.VarDesc.VarType.FP32, + core.CPUPlace()) + tensor3 = core.eager.EagerTensor() + if core.is_compiled_with_cuda(): + tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32, + 
core.CUDAPlace(0)) + else: + tensor2 = paddle.to_tensor(arr2, core.VarDesc.VarType.FP32, + core.CPUPlace()) + self.assertTrue(np.array_equal(tensor.numpy(), arr)) + self.assertTrue(np.array_equal(tensor2.numpy(), arr2)) + tensor2._share_buffer_to(tensor) + self.assertTrue(np.array_equal(tensor.numpy(), arr2)) + self.assertTrue(np.array_equal(tensor2.numpy(), arr2)) + self.assertTrue(tensor._is_shared_buffer_with(tensor2)) + self.assertTrue(tensor2._is_shared_buffer_with(tensor)) + tensor._share_buffer_to(tensor3) + self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) + self.assertTrue(tensor3._is_shared_buffer_with(tensor)) + def test_properties(self): print("Test_properties") with _test_eager_guard(): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py index b82a058ae4..d2e1a4fbb1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py @@ -15,6 +15,7 @@ import unittest import paddle.fluid as fluid import numpy as np +from paddle.fluid.framework import _test_eager_guard class AutoPruneLayer0(fluid.Layer): @@ -145,7 +146,7 @@ class MyLayer2(fluid.Layer): class TestImperativeAutoPrune(unittest.TestCase): - def test_auto_prune(self): + def func_auto_prune(self): with fluid.dygraph.guard(): case1 = AutoPruneLayer0(input_size=5) value1 = np.arange(25).reshape(5, 5).astype("float32") @@ -157,7 +158,12 @@ class TestImperativeAutoPrune(unittest.TestCase): self.assertTrue(case1.linear2.weight._grad_ivar() is not None) self.assertTrue(case1.linear1.weight._grad_ivar() is not None) - def test_auto_prune2(self): + def test_auto_prune(self): + with _test_eager_guard(): + self.func_auto_prune() + self.func_auto_prune() + + def func_auto_prune2(self): with fluid.dygraph.guard(): case2 = AutoPruneLayer1(input_size=5) value1 = np.arange(25).reshape(5, 5).astype("float32") @@ -170,6 +176,11 @@ 
class TestImperativeAutoPrune(unittest.TestCase): self.assertTrue(case2.linear2.weight._grad_ivar() is None) self.assertTrue(case2.linear1.weight._grad_ivar() is not None) + def test_auto_prune2(self): + with _test_eager_guard(): + self.func_auto_prune2() + self.func_auto_prune2() + def test_auto_prune3(self): with fluid.dygraph.guard(): case3 = AutoPruneLayer3(input_size=784) -- Gitee