From ff60688dc59b329cef0564b34de8a7e06834cf23 Mon Sep 17 00:00:00 2001 From: PaddlePaddle-Gardener Date: Fri, 14 Jan 2022 14:21:03 +0800 Subject: [PATCH] mirgate_38853 --- paddle/fluid/framework/operator.h | 4 +- paddle/fluid/framework/tensor.cc | 8 - paddle/fluid/framework/tensor.h | 8 - paddle/fluid/framework/tensor_util.cc | 3 +- paddle/fluid/framework/tensor_util.h | 42 +- .../inference/api/details/zero_copy_tensor.cc | 17 +- paddle/fluid/inference/lite/tensor_utils.cc | 5 +- .../memory/allocation/aligned_allocator.cc | 15 +- .../memory/allocation/aligned_allocator.h | 4 +- paddle/fluid/memory/allocation/allocator.cc | 9 +- paddle/fluid/memory/allocation/allocator.h | 110 ++- .../memory/allocation/allocator_facade.cc | 795 +++++++++++++++--- .../memory/allocation/allocator_facade.h | 30 +- .../auto_growth_best_fit_allocator.cc | 15 +- .../auto_growth_best_fit_allocator.h | 11 +- .../auto_growth_best_fit_allocator_test.cc | 8 +- .../fluid/memory/allocation/base_ptr_test.cu | 118 +++ .../memory/allocation/best_fit_allocator.cc | 6 +- .../memory/allocation/best_fit_allocator.h | 8 +- .../memory/allocation/buffered_allocator.cc | 7 +- .../memory/allocation/buffered_allocator.h | 4 +- .../allocation/buffered_allocator_test.cc | 6 +- .../fluid/memory/allocation/cpu_allocator.cc | 4 +- .../fluid/memory/allocation/cpu_allocator.h | 4 +- .../fluid/memory/allocation/cuda_allocator.cc | 14 +- .../fluid/memory/allocation/cuda_allocator.h | 4 +- .../cuda_device_context_allocator.h | 34 +- .../allocation/cuda_virtual_mem_allocator.cc | 228 +++++ .../allocation/cuda_virtual_mem_allocator.h | 62 ++ .../memory/allocation/locked_allocator.cc | 4 +- .../memory/allocation/locked_allocator.h | 4 +- .../allocation/naive_best_fit_allocator.cc | 228 +++-- .../allocation/naive_best_fit_allocator.h | 4 +- .../fluid/memory/allocation/npu_allocator.cc | 6 +- .../fluid/memory/allocation/npu_allocator.h | 4 +- .../memory/allocation/npu_pinned_allocator.cc | 31 +- .../memory/allocation/npu_pinned_allocator.h | 10 +- .../memory/allocation/pinned_allocator.cc | 12 +- .../memory/allocation/pinned_allocator.h | 4 +- .../memory/allocation/retry_allocator.cc | 4 +- .../fluid/memory/allocation/retry_allocator.h | 4 +- .../memory/allocation/retry_allocator_test.cc | 4 +- .../allocation/stream_safe_cuda_allocator.cc | 201 +++++ .../allocation/stream_safe_cuda_allocator.h | 81 ++ .../allocation/test_aligned_allocator.cc | 4 +- .../allocation/thread_local_allocator.h | 6 +- ...l_memory_auto_growth_best_fit_allocator.cc | 256 ++++++ ...al_memory_auto_growth_best_fit_allocator.h | 84 ++ paddle/fluid/memory/malloc.h | 22 +- .../fluid/operators/math/concat_and_split.cu | 38 +- .../device/mlu/device_context_allocator.h | 162 ++++ .../fluid/platform/device/npu/npu_op_runner.h | 3 +- paddle/fluid/pybind/eager_functions.cc | 174 ++++ paddle/pten/api/lib/utils/CMakeLists.txt | 2 +- paddle/pten/api/lib/utils/allocator.h | 49 ++ paddle/pten/api/lib/utils/storage.cc | 40 + paddle/pten/api/lib/utils/tensor_utils.cc | 521 ++++++++++++ paddle/pten/core/allocator.h | 153 ++++ paddle/pten/core/candidate/allocator.h | 107 +++ paddle/pten/core/dense_tensor.h | 2 + paddle/pten/core/storage.h | 1 + paddle/pten/tests/core/allocator.h | 96 +++ paddle/pten/tests/core/test_allocator.cc | 95 +++ tools/check_file_diff_approvals.sh | 19 +- 64 files changed, 3574 insertions(+), 444 deletions(-) create mode 100644 paddle/pten/api/lib/utils/allocator.h create mode 100644 paddle/pten/api/lib/utils/storage.cc create mode 100644 
paddle/pten/api/lib/utils/tensor_utils.cc create mode 100644 paddle/pten/core/candidate/allocator.h diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a46c83a2b..09e4abc77f 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -410,8 +410,8 @@ class ExecutionContext { auto tmp_allocation_ptr = memory::Alloc(dev_ctx, product(dim) * sizeof(T)); auto& deleter = tmp_allocation_ptr.get_deleter(); auto* allocation_ptr = tmp_allocation_ptr.release(); - auto shared_allocation = std::shared_ptr( - allocation_ptr, deleter); + auto shared_allocation = + std::shared_ptr(allocation_ptr, deleter); PADDLE_ENFORCE_GE( allocation_ptr->size(), framework::product(dim) * sizeof(T), diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index f11b37825d..6aa10a0580 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -17,14 +17,6 @@ limitations under the License. */ DECLARE_bool(use_stream_safe_cuda_allocator); -namespace paddle { -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index e86009e9aa..fcdb837bc8 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -32,14 +32,6 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" -namespace paddle { -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 7fd125834a..5fd5812200 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -151,8 +151,7 @@ void TensorCopyImpl(const TENSOR& src, const platform::Place& dst_place, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 73829898be..11858e4166 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -30,6 +30,11 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif #include "paddle/fluid/platform/device_context.h" +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/device_context.h" +#endif + +#include "paddle/pten/core/dense_tensor.h" namespace paddle { namespace framework { @@ -72,6 +77,8 @@ class Tensor; void TensorCopy(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, Tensor* dst); +void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place, + const platform::DeviceContext& ctx, pten::DenseTensor* dst); // NOTE(zcd): If the src.place() and dst_place are two different GPU, // the copy operation is carried out on the dst_place's stream. This is @@ -82,6 +89,8 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, // not completed. 
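The note above means a cross-device TensorCopy is asynchronous with respect to the caller. A minimal caller-side sketch (not part of this patch; the places, size and pool lookup are illustrative) of how the destination context can be synchronized before the copied data is consumed:

// Hedged usage sketch: a cross-GPU TensorCopy runs on the dst place's stream,
// so wait on that context before reading the result.
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"

void CopyAcrossDevices(const paddle::framework::Tensor& src,
                       paddle::framework::Tensor* dst) {
  paddle::platform::CUDAPlace dst_place(1);
  auto* dst_ctx =
      paddle::platform::DeviceContextPool::Instance().Get(dst_place);
  // The copy is only enqueued here; it may still be running on return.
  paddle::framework::TensorCopy(src, dst_place, *dst_ctx, dst);
  dst_ctx->Wait();  // make sure the copy has finished before using *dst
}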
void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst); +void TensorCopy(const pten::DenseTensor& src, const platform::Place& dst_place, + pten::DenseTensor* dst); void TensorCopySync(const Tensor& src, const platform::Place& dst_place, Tensor* dst); @@ -174,8 +183,7 @@ void TensorFromArray(const T* src, const size_t& array_size, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); @@ -232,13 +240,20 @@ void TensorFromVector(const std::vector& src, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_MLU + if (platform::is_mlu_place(dst_place)) { + memory::Copy( + BOOST_GET_CONST(platform::MLUPlace, dst_place), dst_ptr, src_place, + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif } // The fully specialized function should be inline to avoid @@ -295,8 +310,7 @@ inline void TensorFromVector(const std::vector& src, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation* allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation* allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent( allocation, reinterpret_cast(ctx).stream()); @@ -371,6 +385,14 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, size, nullptr); } #endif +#ifdef PADDLE_WITH_MLU + else if (platform::is_mlu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif } template <> @@ -412,6 +434,14 @@ inline void TensorToVector(const Tensor& src, BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, size, nullptr); } +#endif +#ifdef PADDLE_WITH_MLU + else if (platform::is_mlu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::MLUPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } #endif for (unsigned int i = 0; i < src.numel(); i++) { (*dst)[i] = static_cast(array[i]); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index bb537f0c65..2f2f4c0ead 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -223,9 +223,10 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, auto t_place = tensor->place(); paddle::framework::Tensor out; - auto mem_allocation = std::make_shared( - static_cast(data), ele_num * sizeof(T), - paddle::platform::CPUPlace()); + auto mem_allocation = + std::make_shared( + static_cast(data), ele_num * sizeof(T), + paddle::platform::CPUPlace()); out.ResetHolder(mem_allocation); if (paddle::platform::is_cpu_place(t_place)) { @@ -239,6 +240,14 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); #else 
std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); +#endif + } else if (paddle::platform::is_ipu_place(t_place)) { +#ifdef PADDLE_WITH_IPU + std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); +#else + PADDLE_THROW(paddle::platform::errors::Unavailable( + "Can not create tensor with IPU place because paddle is not compiled " + "with IPU.")); #endif } else if (place_ == PlaceType::kGPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -290,7 +299,7 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, paddle::memory::Copy(paddle::platform::CPUPlace(), static_cast(data), npu_place, t_data, ele_num * sizeof(T), dev_ctx->stream()); - aclrtSynchronizeStream(dev_ctx->stream()); + paddle::platform::NPUStreamSync(dev_ctx->stream()); #else PADDLE_THROW(paddle::platform::errors::Unavailable( "Can not create tensor with NPU place because paddle is not compiled " diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index b1e0eb5ef1..0d5cd29a0c 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -257,9 +257,8 @@ void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) { size_t memory_size = GetLiteTensorNumel(*src) * framework::SizeOfType(GetNativePrecisionType(src->precision())); - std::shared_ptr holder( - new memory::allocation::Allocation(src_raw_data, memory_size, - GetNativePlace(src->target()))); + std::shared_ptr holder(new pten::Allocation( + src_raw_data, memory_size, GetNativePlace(src->target()))); dst->Resize(paddle::framework::make_ddim(src->shape())); SetLoD(dst->mutable_lod(), src->lod()); dst->ResetHolderWithType(holder, GetNativePrecisionType(src->precision())); diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc index f0b7f1a4b0..258cff32b4 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -23,15 +23,16 @@ namespace allocation { // For memory address alignment class AlignedAllocation : public Allocation { public: - AlignedAllocation(AllocationPtr underlying_allocation, size_t offset) + AlignedAllocation(DecoratedAllocationPtr underlying_allocation, size_t offset) : Allocation( reinterpret_cast(underlying_allocation->ptr()) + offset, + underlying_allocation->base_ptr(), underlying_allocation->size() - offset, underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)) {} private: - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; }; AlignedAllocator::AlignedAllocator( @@ -51,13 +52,17 @@ bool AlignedAllocator::IsAllocThreadSafe() const { return underlying_allocator_->IsAllocThreadSafe(); } -Allocation* AlignedAllocator::AllocateImpl(size_t size) { +pten::Allocation* AlignedAllocator::AllocateImpl(size_t size) { auto raw_allocation = underlying_allocator_->Allocate(size + alignment_); size_t offset = AlignedPtrOffset(raw_allocation->ptr(), alignment_); - return new AlignedAllocation(std::move(raw_allocation), offset); + auto* p = new AlignedAllocation( + static_unique_ptr_cast(std::move(raw_allocation)), offset); + return p; } -void AlignedAllocator::FreeImpl(Allocation* allocation) { delete allocation; } +void AlignedAllocator::FreeImpl(pten::Allocation* allocation) { + delete allocation; +} } // namespace allocation } // namespace memory diff --git 
a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h index 6fef5cae8d..ffd5ad0fae 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.h +++ b/paddle/fluid/memory/allocation/aligned_allocator.h @@ -30,9 +30,9 @@ class AlignedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - Allocation* AllocateImpl(size_t size) override; + pten::Allocation* AllocateImpl(size_t size) override; - void FreeImpl(Allocation* allocation) override; + void FreeImpl(pten::Allocation* allocation) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc index 4998f3dbb9..0ef6f5cbab 100644 --- a/paddle/fluid/memory/allocation/allocator.cc +++ b/paddle/fluid/memory/allocation/allocator.cc @@ -18,11 +18,10 @@ namespace paddle { namespace memory { namespace allocation { -bool Allocator::IsAllocThreadSafe() const { return false; } - -void Allocator::FreeImpl(Allocation* allocation) { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); +void Allocator::FreeImpl(pten::Allocation* allocation) { + static_cast(allocation) + ->TopDecoratedAllocator() + ->Free(allocation); } } // namespace allocation diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index b11c657b96..3f04d47516 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -22,6 +22,9 @@ #include "paddle/fluid/framework/inlined_vector.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/core/allocator.h" + +DECLARE_string(allocator_strategy); namespace paddle { namespace memory { @@ -78,40 +81,26 @@ class Allocator; * e.g., something what is done in AlignedAllocator, etc. * In this case, we should declare a derived class of Allocation, which * contains an underlying Allocation allocated by the underlying allocator. - * Therefore, `decorated_allocators_` of the new Allocation object would + * Therefore, `decorated_allocators_` of the new Allocation object + * would * be a new chain, differing from the underlying Allocation object. */ -class Allocation { +class Allocation : public pten::Allocation { public: - inline Allocation(void* ptr, size_t size, platform::Place place) - : ptr_(ptr), size_(size), place_(place) {} - - Allocation(const Allocation& o) = delete; - Allocation& operator=(const Allocation& o) = delete; - Allocation(Allocation&& o) = delete; - Allocation& operator=(Allocation&& o) = delete; - - // Returns the holding pointer. - // NOTE: For performance consideration, it is better not to make this method - // as a virtual method. If we want to implement a `defragmentation` later, - // we might need to make `ptr_` field as a protected field, and add a virtual - // method like `defragmentation` to change `ptr_`. - inline void* ptr() const { return ptr_; } - - // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the - // last valid element. - // - // NOTE: Some allocator might alloc more memory than request. The size - // could larger than its request. For example, - // the AlignedAllocator will always allocate memory as size + kAlignment. - // The raw pointer might not aligned, so an offset might be added to raw - // the pointer. The size of this allocation will be - // `size + kAlignemnt - offset`. 
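The `size + alignment - offset` bookkeeping described in the comment above is what the AlignedAllocator changes earlier in this patch implement. A standalone sketch of that arithmetic (illustrative only; it mirrors AlignedPtrOffset further down in allocator.h):

// Sketch of the alignment arithmetic AlignedAllocator relies on; the values
// mentioned in the comments are only an example.
#include <cstddef>
#include <cstdint>

size_t AlignedOffsetExample(const void* raw_ptr, size_t alignment) {
  auto addr = reinterpret_cast<std::uintptr_t>(raw_ptr);
  size_t diff = addr % alignment;
  size_t offset = (diff == 0) ? 0 : alignment - diff;
  // For a request of `size` bytes the underlying allocator is asked for
  // `size + alignment` bytes, so the visible buffer of
  // `size + alignment - offset` bytes is always at least `size` bytes long
  // and starts at a multiple of `alignment`.
  return offset;
}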
- inline size_t size() const { return size_; } - - inline const platform::Place& place() const { return place_; } - - virtual ~Allocation() {} + Allocation(void* ptr, size_t size, platform::Place place) + : pten::Allocation(ptr, size, place), base_ptr_(ptr) {} + Allocation(void* ptr, void* base_ptr, size_t size, + const platform::Place& place) + : pten::Allocation(ptr, size, place), base_ptr_(base_ptr) {} + + void* base_ptr() const { + PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth", + paddle::platform::errors::Unimplemented( + "base_ptr() is only implemented for auto_growth " + "strategy, not support %s strategy", + FLAGS_allocator_strategy)); + return base_ptr_; + } private: inline void RegisterDecoratedAllocator(Allocator* allocator) { @@ -125,9 +114,7 @@ class Allocation { } private: - void* ptr_; - size_t size_; - platform::Place place_; + void* base_ptr_; // the point that directly requested from system /** * NOTE(zjl): Since decorated_allocators_ is usually a small vector. @@ -147,53 +134,42 @@ class Allocation { friend class Allocator; }; +using AllocationPtr = pten::Allocator::AllocationPtr; +using DecoratedAllocationPtr = + std::unique_ptr; + // Base interface class of memory Allocator. -class Allocator { +class Allocator : public pten::Allocator { public: - virtual ~Allocator() {} - - class AllocationDeleter { - public: - inline void operator()(Allocation* allocation) const { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); - } - }; - - using AllocationPtr = std::unique_ptr; + static void AllocationDeleter(pten::Allocation* allocation) { + Allocator* allocator = + static_cast(allocation)->TopDecoratedAllocator(); + allocator->Free(allocation); + } // Allocate an allocation. // size may be 0, but it would be too complex if we handle size == 0 // in each Allocator. So we handle size == 0 inside AllocatorFacade // in our design. - inline AllocationPtr Allocate(size_t size) { + AllocationPtr Allocate(size_t size) override { auto ptr = AllocateImpl(size); - ptr->RegisterDecoratedAllocator(this); - return AllocationPtr(ptr); + static_cast(ptr)->RegisterDecoratedAllocator(this); + return AllocationPtr(ptr, AllocationDeleter); } - // This function should not be called outside Allocator class - inline void Free(Allocation* allocation) { - allocation->PopDecoratedAllocator(); + void Free(pten::Allocation* allocation) { + static_cast(allocation)->PopDecoratedAllocator(); FreeImpl(allocation); } - inline uint64_t Release(const platform::Place& place) { - return ReleaseImpl(place); - } - - // True if the `Allocate` is thread safe. - virtual bool IsAllocThreadSafe() const; + uint64_t Release(const platform::Place& place) { return ReleaseImpl(place); } protected: - virtual Allocation* AllocateImpl(size_t size) = 0; - virtual void FreeImpl(Allocation* allocation); + virtual pten::Allocation* AllocateImpl(size_t size) = 0; + virtual void FreeImpl(pten::Allocation* allocation); virtual uint64_t ReleaseImpl(const platform::Place& place) { return 0; } }; -using AllocationDeleter = Allocator::AllocationDeleter; -using AllocationPtr = Allocator::AllocationPtr; - inline size_t AlignedSize(size_t size, size_t alignment) { auto remaining = size % alignment; return remaining == 0 ? size : size + alignment - remaining; @@ -205,6 +181,14 @@ inline size_t AlignedPtrOffset(const void* ptr, size_t alignment) { return diff == 0 ? 
0 : alignment - diff; } +template +decltype(auto) static_unique_ptr_cast(std::unique_ptr&& p) { + static_assert(std::is_base_of::value, + "Derived type must derive from Base."); + auto d = static_cast(p.release()); + return std::unique_ptr(d, p.get_deleter()); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 281902f3a2..474b4fe3d4 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -15,31 +15,51 @@ #include "paddle/fluid/memory/allocation/allocator_facade.h" #include "gflags/gflags.h" +#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" #include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" -#ifdef PADDLE_WITH_ASCEND_CL -#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" -#endif -#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include #include "paddle/fluid/memory/allocation/cuda_allocator.h" #include "paddle/fluid/memory/allocation/pinned_allocator.h" +#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" #include "paddle/fluid/memory/allocation/thread_local_allocator.h" -#include "paddle/fluid/platform/gpu_info.h" -#endif +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/device_context.h" + #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_graph.h" +#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h" +#endif + +#if CUDA_VERSION >= 10020 +#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h" +#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h" +#include "paddle/fluid/platform/dynload/cuda_driver.h" +#endif #endif + #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#endif + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif + +#ifdef PADDLE_WITH_IPU +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#endif + +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/mlu_info.h" #endif -#include "paddle/fluid/platform/npu_info.h" PADDLE_DEFINE_EXPORTED_int64( gpu_allocator_retry_time, 10000, @@ -51,6 +71,15 @@ PADDLE_DEFINE_EXPORTED_bool( "Whether to use system allocator to allocate CPU and GPU memory. " "Only used for unittests."); +PADDLE_DEFINE_EXPORTED_bool(use_virtual_memory_auto_growth, false, + "Use VirtualMemoryAutoGrowthBestFitAllocator."); + +// NOTE(Ruibiao): This FLAGS is just to be compatibled with +// the old single-stream CUDA allocator. It will be removed +// after StreamSafeCudaAllocator has been fully tested. 
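The static_unique_ptr_cast helper added to allocator.h above is how the decorating allocators in this patch recover the fluid Allocation interface (and its base_ptr()) from the generic pten AllocationPtr while keeping the custom deleter. A hedged sketch of the pattern; the Decorate() wrapper itself is illustrative, not part of the patch:

// Hedged sketch of the static_unique_ptr_cast pattern used by the decorating
// allocators (AlignedAllocator, AutoGrowthBestFitAllocator, CUDAGraphAllocator).
#include <utility>
#include "paddle/fluid/memory/allocation/allocator.h"

namespace alloc = paddle::memory::allocation;

alloc::DecoratedAllocationPtr Decorate(alloc::Allocator* underlying,
                                       size_t size) {
  // Allocate() hands back a pten::Allocation smart pointer with a custom
  // deleter; the cast keeps that deleter but exposes base_ptr() on the
  // derived fluid Allocation type.
  auto generic = underlying->Allocate(size);
  return alloc::static_unique_ptr_cast<alloc::Allocation>(std::move(generic));
}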
+PADDLE_DEFINE_EXPORTED_bool(use_stream_safe_cuda_allocator, false, + "Enable StreamSafeCUDAAllocator"); + DECLARE_string(allocator_strategy); namespace paddle { @@ -65,16 +94,16 @@ class CUDAGraphAllocator class PrivateAllocation : public Allocation { public: PrivateAllocation(CUDAGraphAllocator* allocator, - AllocationPtr underlying_allocation) - : Allocation(underlying_allocation->ptr(), - underlying_allocation->size(), - underlying_allocation->place()), + DecoratedAllocationPtr underlying_allocation) + : Allocation( + underlying_allocation->ptr(), underlying_allocation->base_ptr(), + underlying_allocation->size(), underlying_allocation->place()), allocator_(allocator->shared_from_this()), underlying_allocation_(std::move(underlying_allocation)) {} private: std::shared_ptr allocator_; - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; }; explicit CUDAGraphAllocator(const std::shared_ptr& allocator) @@ -87,12 +116,14 @@ class CUDAGraphAllocator } protected: - Allocation* AllocateImpl(size_t size) { + pten::Allocation* AllocateImpl(size_t size) { VLOG(10) << "Allocate " << size << " for CUDA Graph"; - return new PrivateAllocation(this, underlying_allocator_->Allocate(size)); + return new PrivateAllocation(this, + static_unique_ptr_cast( + underlying_allocator_->Allocate(size))); } - void FreeImpl(Allocation* allocation) { + void FreeImpl(pten::Allocation* allocation) { VLOG(10) << "delete for CUDA Graph"; delete allocation; } @@ -106,46 +137,85 @@ class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + using CUDAAllocatorMap = + std::map>>; +#endif + explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { strategy_ = GetAllocatorStrategy(); switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCPUAllocator(); -#ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { - InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); } #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, false, + paddle::platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is only implemented for auto_growth " + "strategy, not support naive_best_fit strategy")); + + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); #endif +#ifdef PADDLE_WITH_XPU + for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { + InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); + } +#endif #ifdef PADDLE_WITH_ASCEND_CL for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); } InitNaiveBestFitNPUPinnedAllocator(); +#endif +#ifdef PADDLE_WITH_MLU + for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { + InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); + } #endif break; } case AllocatorStrategy::kAutoGrowth: { InitNaiveBestFitCPUAllocator(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + allow_free_idle_chunk_ = allow_free_idle_chunk; + if (FLAGS_use_stream_safe_cuda_allocator) { + 
for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); + ++dev_id) { + InitStreamSafeCUDAAllocator(platform::CUDAPlace(dev_id), nullptr); + } + } else { + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); + ++dev_id) { + InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), + allow_free_idle_chunk_); + } + } + InitNaiveBestFitCUDAPinnedAllocator(); +#endif #ifdef PADDLE_WITH_XPU for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { - InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), - allow_free_idle_chunk); +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); + } +#endif +#ifdef PADDLE_WITH_MLU + for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { + InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); } - InitNaiveBestFitCUDAPinnedAllocator(); #endif break; } @@ -157,12 +227,27 @@ class AllocatorFacadePrivate { InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id)); } #endif +#ifdef PADDLE_WITH_IPU + for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) { + InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); - ++dev_id) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, false, + paddle::platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is only implemented for auto_growth " + "strategy, not support thread_local strategy")); + + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { InitThreadLocalCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); +#endif +#ifdef PADDLE_WITH_MLU + for (int dev_id = 0; dev_id < platform::GetMLUDeviceCount(); ++dev_id) { + InitNaiveBestFitMLUAllocator(platform::MLUPlace(dev_id)); + } #endif break; } @@ -182,27 +267,9 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(); } - inline const AllocatorMap& GetAllocatorMap() { -#ifdef PADDLE_WITH_CUDA - if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { - auto id = platform::CUDAGraph::CapturingID(); - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE( - iter, cuda_graph_allocator_map_.end(), - platform::errors::PermissionDenied( - "No memory pool is prepared for CUDA Graph capturing.")); - return iter->second->allocators_; - } else { - return allocators_; - } -#else - return allocators_; -#endif - } - inline const std::shared_ptr& GetAllocator( const platform::Place& place, size_t size) { - VLOG(4) << "GetAllocator" + VLOG(6) << "GetAllocator" << " " << place << " " << size; const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? 
system_allocators_ @@ -215,25 +282,142 @@ class AllocatorFacadePrivate { return iter->second; } - private: - void InitSystemAllocators() { - if (!system_allocators_.empty()) return; - system_allocators_[platform::CPUPlace()] = std::make_shared(); -#ifdef PADDLE_WITH_XPU - int device_count = platform::GetXPUDeviceCount(); - for (int i = 0; i < device_count; ++i) { - platform::XPUPlace p(i); - system_allocators_[p] = std::make_shared(p); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + bool HasCUDAAllocator(const platform::CUDAPlace& place, + const gpuStream_t& stream) { + auto it = cuda_allocators_.find(place); + if (it == cuda_allocators_.end()) { + return false; + } + const std::map>& allocator_map = + it->second; + return allocator_map.find(stream) != allocator_map.end(); + } + + const std::shared_ptr& GetAllocator( + const platform::CUDAPlace& place, const gpuStream_t& stream, + bool create_if_not_found = false) { + { // shared_lock_guard + std::shared_lock lock_guard( + cuda_allocator_mutex_); + if (LIKELY(HasCUDAAllocator(place, stream))) { + return cuda_allocators_[place][stream]; + } else { + PADDLE_ENFORCE_NE(create_if_not_found, false, + platform::errors::NotFound( + "No allocator found for stream %s in place %s " + "with create_if_not_found = false", + stream, place)); + } } + + { // unique_lock_guard + std::unique_lock lock_guard( + cuda_allocator_mutex_); + InitStreamSafeCUDAAllocator(place, stream); + return cuda_allocators_[place][stream]; + } + } + + gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + return static_cast(pool.Get(place))->stream(); + } + + void RecordStream(std::shared_ptr allocation, + const gpuStream_t& stream) { + if (allocation->size() == 0) { + return; + } + + StreamSafeCUDAAllocation* stream_safe_cuda_allocation = + dynamic_cast(allocation.get()); + PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, + platform::errors::InvalidArgument( + "Failed to dynamic cast %p from Allocation* to " + "StreamSafeCUDAAllocation*", + allocation.get())); + stream_safe_cuda_allocation->RecordStream(stream); + } + + const gpuStream_t& GetStream( + const std::shared_ptr& allocation) const { + const StreamSafeCUDAAllocation* stream_safe_cuda_allocation = + dynamic_cast(allocation.get()); + PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, + platform::errors::InvalidArgument( + "Failed to dynamic cast %p from Allocation* to " + "StreamSafeCUDAAllocation*", + allocation.get())); + return stream_safe_cuda_allocation->GetOwningStream(); + } + +#ifdef PADDLE_WITH_CUDA + void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { + PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when the " + "FLAGS_allocator_strategy=\"auto_growth\", but got " + "FLAGS_allocator_strategy=\"%s\"", + FLAGS_allocator_strategy)); + auto& allocator = cuda_graph_allocator_map_[id]; + PADDLE_ENFORCE_EQ( + allocator.get(), nullptr, + platform::errors::InvalidArgument( + "The memory pool of the CUDA Graph with ID %d have been prepared.", + id)); + allocator.reset( + new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + for (auto& item : allocator->allocators_) { + auto& old_allocator = item.second; + old_allocator = CUDAGraphAllocator::Create(old_allocator); + } + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; + } + + void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { + auto iter = 
cuda_graph_allocator_map_.find(id); + PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), + platform::errors::InvalidArgument( + "Cannot find CUDA Graph with ID = %d", id)); + cuda_graph_allocator_map_.erase(iter); + VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; + } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - system_allocators_[platform::CUDAPinnedPlace()] = - std::make_shared(); - int device_count = platform::GetCUDADeviceCount(); - for (int i = 0; i < device_count; ++i) { - platform::CUDAPlace p(i); - system_allocators_[p] = std::make_shared(p); +#endif + + private: + class ZeroSizeAllocator : public Allocator { + public: + explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} + bool IsAllocThreadSafe() const override { return true; } + + protected: + pten::Allocation* AllocateImpl(size_t size) override { + return new Allocation(nullptr, 0, place_); + } + void FreeImpl(pten::Allocation* allocation) override { delete allocation; } + + private: + platform::Place place_; + }; + + const AllocatorMap& GetAllocatorMap() { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) { + auto id = platform::CUDAGraph::CapturingID(); + auto iter = cuda_graph_allocator_map_.find(id); + PADDLE_ENFORCE_NE( + iter, cuda_graph_allocator_map_.end(), + platform::errors::PermissionDenied( + "No memory pool is prepared for CUDA Graph capturing.")); + VLOG(10) << "Choose CUDA Graph memory pool to allocate memory"; + return iter->second->allocators_; + } else { + return allocators_; } +#else + return allocators_; #endif } @@ -248,16 +432,138 @@ class AllocatorFacadePrivate { std::make_shared(platform::CUDAPinnedPlace()); } + void InitStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { + PADDLE_ENFORCE_EQ( + strategy_, AllocatorStrategy::kAutoGrowth, + platform::errors::Unimplemented( + "Only support auto-growth strategey for StreamSafeCUDAAllocator, " + "the allocator strategy %d is unsupported for multi-stream", + static_cast(strategy_))); + if (LIKELY(!HasCUDAAllocator(p, stream))) { + VLOG(8) << "Init CUDA allocator for stream " << stream << " in place " + << p; + InitAutoGrowthCUDAAllocator(p, stream); + WrapStreamSafeCUDAAllocator(p, stream); + WrapCUDARetryAllocator(p, stream, FLAGS_gpu_allocator_retry_time); + } + } + void InitNaiveBestFitCUDAAllocator(platform::CUDAPlace p) { allocators_[p] = std::make_shared(p); } - void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) { - allocators_[p] = std::make_shared(p); + void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { +#if defined(PADDLE_WITH_HIP) + auto cuda_allocator = std::make_shared(p); + cuda_allocators_[p][stream] = std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), 0, allow_free_idle_chunk_); +#endif + +#if defined(PADDLE_WITH_CUDA) +#if CUDA_VERSION >= 10020 + CUdevice device; + int val; + try { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId())); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cuDeviceGetAttribute( + &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, + device)); + } catch (...) 
{ + val = 0; + } + + if (val > 0 && FLAGS_use_virtual_memory_auto_growth) { + auto cuda_allocator = std::make_shared(p); + cuda_allocators_[p][stream] = + std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), p); + } else { + auto cuda_allocator = std::make_shared(p); + cuda_allocators_[p][stream] = + std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), + allow_free_idle_chunk_); + } +#else + auto cuda_allocator = std::make_shared(p); + auto alignment = platform::GpuMinChunkSize(); + bool need_addr_align = true; + // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda + // API in that case may got cuda error(3), i.e., + // cudaErrorInitializationError. And, the CUDAAllocator is only initialized + // but not really used. + // Here, the try-catch block is added to handle the case that + // GetDeviceProperties() may failed in the multiple process(for example, in + // dataloader with num_worker > 0) + try { + const auto& prop = platform::GetDeviceProperties(p.GetDeviceId()); + need_addr_align = prop.textureAlignment < alignment; + VLOG(4) << "GetDeviceProperties ok, textureAlignment: " + << prop.textureAlignment + << ", set need_addr_align=" << need_addr_align; + } catch (...) { + need_addr_align = true; + VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true"; + } + // The address returned is aligned already, + // ref: + // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295 + std::shared_ptr underlying_allocator{nullptr}; + if (need_addr_align) { + VLOG(10) << "use AlignedAllocator with alignment: " << alignment; + underlying_allocator = + std::make_shared(underlying_allocator, alignment); + } else { + VLOG(10) << "not use AlignedAllocator with alignment: " << alignment; + underlying_allocator = cuda_allocator; + } + + cuda_allocators_[p][stream] = std::make_shared( + underlying_allocator, alignment, 0, allow_free_idle_chunk_); +#endif +#endif } + // NOTE(Ruibiao): Old single-stream version, will be removed later void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, bool allow_free_idle_chunk) { +#if defined(PADDLE_WITH_HIP) + auto cuda_allocator = std::make_shared(p); + allocators_[p] = std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); +#endif + +#if defined(PADDLE_WITH_CUDA) +#if CUDA_VERSION >= 10020 + CUdevice device; + int val; + try { + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cuDeviceGet(&device, p.GetDeviceId())); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cuDeviceGetAttribute( + &val, CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, + device)); + } catch (...) 
{ + val = 0; + } + + if (val > 0 && FLAGS_use_virtual_memory_auto_growth) { + auto cuda_allocator = std::make_shared(p); + allocators_[p] = + std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), p); + } else { + auto cuda_allocator = std::make_shared(p); + allocators_[p] = std::make_shared( + cuda_allocator, platform::GpuMinChunkSize(), allow_free_idle_chunk); + } + +#else auto cuda_allocator = std::make_shared(p); auto alignment = platform::GpuMinChunkSize(); bool need_addr_align = true; @@ -292,6 +598,39 @@ class AllocatorFacadePrivate { } allocators_[p] = std::make_shared( underlying_allocator, alignment, 0, allow_free_idle_chunk); +#endif +#endif + } + + void InitThreadLocalCUDAAllocator(platform::CUDAPlace p) { + allocators_[p] = std::make_shared(p); + } + + void WrapStreamSafeCUDAAllocator(platform::CUDAPlace p, gpuStream_t stream) { + const std::shared_ptr& underlying_allocator = + cuda_allocators_[p][stream]; + cuda_allocators_[p][stream] = std::make_shared( + underlying_allocator, p, stream); + } + + void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream, + size_t retry_time) { + PADDLE_ENFORCE_GT( + retry_time, 0, + platform::errors::InvalidArgument( + "Retry time should be larger than 0, but got %d", retry_time)); + std::shared_ptr allocator = cuda_allocators_[p][stream]; + allocator = std::make_shared(allocator, retry_time); + } + + static void CheckCUDAAllocThreadSafe(const CUDAAllocatorMap& allocators) { + for (auto& place_pair : allocators) { + for (auto& stream_pair : place_pair.second) { + PADDLE_ENFORCE_EQ(stream_pair.second->IsAllocThreadSafe(), true, + platform::errors::InvalidArgument( + "Public allocators must be thread safe")); + } + } } #endif @@ -301,6 +640,18 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_IPU + void InitNaiveBestFitIPUAllocator(platform::IPUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + +#ifdef PADDLE_WITH_MLU + void InitNaiveBestFitMLUAllocator(platform::MLUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + #ifdef PADDLE_WITH_ASCEND_CL void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { allocators_[p] = std::make_shared(p); @@ -310,32 +661,49 @@ class AllocatorFacadePrivate { allocators_[platform::NPUPinnedPlace()] = std::make_shared(); } - #endif - class ZeroSizeAllocator : public Allocator { - public: - explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} - - bool IsAllocThreadSafe() const override { return true; } - - protected: - Allocation* AllocateImpl(size_t size) override { - return new Allocation(nullptr, 0, place_); + void InitSystemAllocators() { + if (!system_allocators_.empty()) return; + system_allocators_[platform::CPUPlace()] = std::make_shared(); +#ifdef PADDLE_WITH_XPU + int device_count = platform::GetXPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::XPUPlace p(i); + system_allocators_[p] = std::make_shared(p); } - - void FreeImpl(Allocation* allocation) override { delete allocation; } - - private: - platform::Place place_; - }; +#endif +#ifdef PADDLE_WITH_IPU + int device_count = platform::GetIPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::IPUPlace p(i); + system_allocators_[p] = std::make_shared(p); + } +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + system_allocators_[platform::CUDAPinnedPlace()] = + std::make_shared(); + int device_count = platform::GetGPUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::CUDAPlace p(i); + 
system_allocators_[p] = std::make_shared(p); + } +#endif +#ifdef PADDLE_WITH_MLU + int device_count = platform::GetMLUDeviceCount(); + for (int i = 0; i < device_count; ++i) { + platform::MLUPlace p(i); + system_allocators_[p] = std::make_shared(p); + } +#endif + } void InitZeroSizeAllocators() { if (!zero_size_allocators_.empty()) return; std::vector places; places.emplace_back(platform::CPUPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - int device_count = platform::GetCUDADeviceCount(); + int device_count = platform::GetGPUDeviceCount(); for (int dev_id = 0; dev_id < device_count; ++dev_id) { places.emplace_back(platform::CUDAPlace(dev_id)); } @@ -353,6 +721,18 @@ class AllocatorFacadePrivate { places.emplace_back(platform::NPUPlace(dev_id)); } #endif +#ifdef PADDLE_WITH_IPU + int device_count = platform::GetIPUDeviceCount(); + for (int dev_id = 0; dev_id < device_count; ++dev_id) { + places.emplace_back(platform::IPUPlace(dev_id)); + } +#endif +#ifdef PADDLE_WITH_MLU + int device_count = platform::GetMLUDeviceCount(); + for (int dev_id = 0; dev_id < device_count; ++dev_id) { + places.emplace_back(platform::MLUPlace(dev_id)); + } +#endif for (auto& p : places) { zero_size_allocators_[p] = std::make_shared(p); @@ -371,8 +751,14 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(allocators_); CheckAllocThreadSafe(zero_size_allocators_); CheckAllocThreadSafe(system_allocators_); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_use_stream_safe_cuda_allocator) { + CheckCUDAAllocThreadSafe(cuda_allocators_); + } +#endif } + // NOTE(Ruibiao): Old single-stream version, will be removed later void WrapCUDARetryAllocator(size_t retry_time) { PADDLE_ENFORCE_GT( retry_time, 0, @@ -385,53 +771,21 @@ class AllocatorFacadePrivate { } } -#ifdef PADDLE_WITH_CUDA - - public: - void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { - PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, - platform::errors::InvalidArgument( - "CUDA Graph is only supported when the " - "FLAGS_allocator_strategy=\"auto_growth\", but got " - "FLAGS_allocator_strategy=\"%s\"", - FLAGS_allocator_strategy)); - auto& allocator = cuda_graph_allocator_map_[id]; - PADDLE_ENFORCE_EQ( - allocator.get(), nullptr, - platform::errors::InvalidArgument( - "The memory pool of the CUDA Graph with ID %d have been prepared.", - id)); - allocator.reset( - new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); - for (auto& item : allocator->allocators_) { - auto& old_allocator = item.second; - old_allocator = CUDAGraphAllocator::Create(old_allocator); - } - VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; - } - - void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { - auto iter = cuda_graph_allocator_map_.find(id); - PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), - platform::errors::InvalidArgument( - "Cannot find CUDA Graph with ID = %d", id)); - cuda_graph_allocator_map_.erase(iter); - VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; - } -#endif - - private: - AllocatorMap allocators_; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // a standalone CUDA allocator to support multi-stream GC in new executor + CUDAAllocatorMap cuda_allocators_; + std::shared_timed_mutex cuda_allocator_mutex_; #ifdef PADDLE_WITH_CUDA std::unordered_map> cuda_graph_allocator_map_; +#endif #endif AllocatorStrategy strategy_; - + AllocatorMap allocators_; static AllocatorMap zero_size_allocators_; static AllocatorMap system_allocators_; + bool 
allow_free_idle_chunk_; }; - AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::zero_size_allocators_; AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_; @@ -447,24 +801,207 @@ AllocatorFacade& AllocatorFacade::Instance() { return instance; } -std::shared_ptr AllocatorFacade::AllocShared( +const std::shared_ptr& AllocatorFacade::GetAllocator( + const platform::Place& place) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && + FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_->GetAllocator(place, + /* A non-zero num to choose allocator_ */ 1); + } +#endif + + platform::CUDAPlace cuda_place = + BOOST_GET_CONST(platform::CUDAPlace, place); + return m_->GetAllocator(cuda_place, m_->GetDefaultStream(cuda_place)); + } +#endif + + return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); +} + +std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size) { - return std::shared_ptr(Alloc(place, size)); + return std::shared_ptr(Alloc(place, size)); } AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && + size > 0 && FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_->GetAllocator(place, size)->Allocate(size); + } +#endif + + platform::CUDAPlace cuda_place = + BOOST_GET_CONST(platform::CUDAPlace, place); + return Alloc(cuda_place, size, m_->GetDefaultStream(cuda_place)); + } +#endif + return m_->GetAllocator(place, size)->Allocate(size); } uint64_t AllocatorFacade::Release(const platform::Place& place) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) && + FLAGS_use_system_allocator == false) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + return m_ + ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + ->Release(place); + } +#endif + + platform::CUDAPlace cuda_place = + BOOST_GET_CONST(platform::CUDAPlace, place); + return Release(cuda_place, m_->GetDefaultStream(cuda_place)); + } +#endif return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) ->Release(place); } -const std::shared_ptr& AllocatorFacade::GetAllocator( - const platform::Place& place) { - return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); +std::shared_ptr AllocatorFacade::AllocShared( + const platform::Place& place, size_t size, const platform::Stream& stream) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'AllocaShared' function. 
To enable it, you can enter" + "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " + "terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + gpuStream_t s = reinterpret_cast(stream.id()); + return std::shared_ptr(Alloc(place, size, s)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); +#endif +} + +bool AllocatorFacade::InSameStream( + const std::shared_ptr& allocation, + const platform::Stream& stream) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'InSameStream' function. To enable it, you can enter" + "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " + "terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + gpuStream_t s = reinterpret_cast(stream.id()); + return s == GetStream(allocation); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet("Not compiled with GPU.")); +#endif +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'Alloc' function. To enable it, you can enter" + "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " + "terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + + platform::CUDAPlace p = BOOST_GET_CONST(platform::CUDAPlace, place); + if (LIKELY(size > 0 && FLAGS_use_system_allocator == false)) { + return m_->GetAllocator(p, stream, /* create_if_not_found = */ true) + ->Allocate(size); + } else { + return m_->GetAllocator(p, size)->Allocate(size); + } +} + +uint64_t AllocatorFacade::Release(const platform::CUDAPlace& place, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "multi-stream 'Release' function. To enable it, you can enter" + "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " + "terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + + return m_->GetAllocator(place, stream)->Release(place); +} + +void AllocatorFacade::RecordStream(std::shared_ptr allocation, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "'RecordStream' function. 
To enable it, you can enter" + "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " + "terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + + m_->RecordStream(allocation, stream); +} + +const gpuStream_t& AllocatorFacade::GetStream( + const std::shared_ptr& allocation) const { + PADDLE_ENFORCE_EQ( + FLAGS_use_stream_safe_cuda_allocator, true, + platform::errors::Unimplemented( + "StreamSafeCUDAAllocator is disabled, you should not call this " + "'GetStream' function. To enable it, you can enter" + "'export FLAGS_use_stream_safe_cuda_allocator=true' in the " + "terminal.")); + +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + PADDLE_THROW(platform::errors::Unavailable( + "Not allow to use StreamSafeCUDAAllocator with CUDAGraphAllocator")); + } +#endif + + return m_->GetStream(allocation); } #ifdef PADDLE_WITH_CUDA @@ -476,7 +1013,7 @@ void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { return m_->RemoveMemoryPoolOfCUDAGraph(id); } #endif - +#endif } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 8d889ec38e..76e2f0b5a9 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -19,13 +19,15 @@ #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #endif #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/stream.h" namespace paddle { namespace memory { namespace allocation { + #ifdef PADDLE_WITH_ASCEND_CL using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; #endif @@ -40,22 +42,40 @@ using NPUPinnedAllocator = paddle::memory::allocation::NPUPinnedAllocator; class AllocatorFacadePrivate; class AllocatorFacade { public: - ~AllocatorFacade(); + using Allocation = pten::Allocation; AllocatorFacade(const AllocatorFacade& o) = delete; const AllocatorFacade& operator=(const AllocatorFacade& o) = delete; + ~AllocatorFacade(); static AllocatorFacade& Instance(); + const std::shared_ptr& GetAllocator(const platform::Place& place); + // Allocate a shared allocation. std::shared_ptr AllocShared(const platform::Place& place, size_t size); - // Allocate a unique allocation. AllocationPtr Alloc(const platform::Place& place, size_t size); - // Release unused memory pool. uint64_t Release(const platform::Place& place); - const std::shared_ptr& GetAllocator(const platform::Place& place); + + std::shared_ptr AllocShared(const platform::Place& place, + size_t size, + const platform::Stream& stream); + + bool InSameStream(const std::shared_ptr& allocation, + const platform::Stream& stream); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // TODO(zhiqiu): change gpuStream_t to platform::Stream if needed. 
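The multi-stream entry points declared just below can be exercised roughly as follows once FLAGS_use_stream_safe_cuda_allocator is enabled. A hedged sketch; the two stream handles and the kernel launch are assumed to exist on the caller's side:

// Hedged usage sketch for the multi-stream interface declared below;
// requires FLAGS_use_stream_safe_cuda_allocator=true.
#include <memory>
#include "paddle/fluid/memory/allocation/allocator_facade.h"

void MultiStreamSketch(const paddle::platform::CUDAPlace& place,
                       gpuStream_t alloc_stream, gpuStream_t work_stream) {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  // Allocate from the per-stream pool owned by alloc_stream.
  std::shared_ptr<pten::Allocation> buf =
      facade.Alloc(place, /*size=*/1 << 20, alloc_stream);
  // ... launch a kernel on work_stream that reads/writes buf->ptr() ...
  // Record the extra stream so the block is not recycled before the pending
  // work on work_stream has finished.
  facade.RecordStream(buf, work_stream);
  facade.Release(place, alloc_stream);  // drops only idle cached memory
}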
+ AllocationPtr Alloc(const platform::Place& place, size_t size, + const gpuStream_t& stream); + uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream); + void RecordStream(std::shared_ptr allocation, + const gpuStream_t& stream); + const gpuStream_t& GetStream( + const std::shared_ptr& allocation) const; +#endif #ifdef PADDLE_WITH_CUDA void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index 9f34f5198a..ad62af8480 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -45,7 +45,8 @@ AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), allow_free_idle_chunk_(allow_free_idle_chunk) {} -Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { +pten::Allocation *AutoGrowthBestFitAllocator::AllocateImpl( + size_t unaligned_size) { size_t size = AlignedSize(unaligned_size, alignment_); VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; @@ -78,11 +79,13 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { size_t realloc_size = std::max(size, chunk_size_); try { - chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size)); + chunks_.emplace_back(static_unique_ptr_cast( + underlying_allocator_->Allocate(realloc_size))); } catch (BadAlloc &ex) { if (FLAGS_free_when_no_cache_hit) throw ex; FreeIdleChunks(); - chunks_.emplace_back(underlying_allocator_->Allocate(realloc_size)); + chunks_.emplace_back(static_unique_ptr_cast( + underlying_allocator_->Allocate(realloc_size))); } auto *chunk = &(*chunks_.rbegin()); @@ -100,11 +103,13 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { VLOG(2) << "Not found and reallocate " << realloc_size << "(" << static_cast(p) << "), and remaining " << remaining_size; } + VLOG(10) << "Alloc " << block_it->size_ << " bytes, ptr = " << block_it->ptr_; return new BlockAllocation(block_it); } -void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { - VLOG(10) << "Free " << allocation->size() << " bytes"; +void AutoGrowthBestFitAllocator::FreeImpl(pten::Allocation *allocation) { + VLOG(10) << "Free " << allocation->size() + << " bytes, ptr = " << allocation->ptr(); std::lock_guard guard(spinlock_); auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index d1fa6cce01..94aff93ec5 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -36,9 +36,9 @@ class AutoGrowthBestFitAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override; + pten::Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + void FreeImpl(pten::Allocation *allocation) override; // Release the memory block which is not used in pool. 
uint64_t ReleaseImpl(const platform::Place &place) override { @@ -64,16 +64,17 @@ class AutoGrowthBestFitAllocator : public Allocator { }; struct Chunk { - explicit Chunk(AllocationPtr allocation) + explicit Chunk(DecoratedAllocationPtr allocation) : allocation_(std::move(allocation)) {} - AllocationPtr allocation_; + DecoratedAllocationPtr allocation_; List blocks_; }; struct BlockAllocation : public Allocation { explicit BlockAllocation(const List::iterator &it) - : Allocation(it->ptr_, it->size_, it->chunk_->allocation_->place()), + : Allocation(it->ptr_, it->chunk_->allocation_->base_ptr(), it->size_, + it->chunk_->allocation_->place()), block_it_(it) {} List::iterator block_it_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 926af8292d..5942fbe730 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -28,12 +28,12 @@ namespace allocation { class RecordedAllocator : public Allocator { protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { allocated_size_ += size; return new Allocation(malloc(size), size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) { + void FreeImpl(pten::Allocation *allocation) { allocated_size_ -= allocation->size(); free(allocation->ptr()); delete allocation; @@ -79,7 +79,7 @@ class LimitedResourceAllocator : public Allocator { size_t AllocatedSize() const { return allocated_size_; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { if (allocated_size_ + size > capacity_) { throw BadAlloc("", __FILE__, __LINE__); } @@ -88,7 +88,7 @@ class LimitedResourceAllocator : public Allocator { return new Allocation(malloc(size), size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) { + void FreeImpl(pten::Allocation *allocation) { allocated_size_ -= allocation->size(); free(allocation->ptr()); delete allocation; diff --git a/paddle/fluid/memory/allocation/base_ptr_test.cu b/paddle/fluid/memory/allocation/base_ptr_test.cu index e69de29bb2..5edabfcb9f 100644 --- a/paddle/fluid/memory/allocation/base_ptr_test.cu +++ b/paddle/fluid/memory/allocation/base_ptr_test.cu @@ -0,0 +1,118 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
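The Chunk and BlockAllocation changes above thread a base_ptr through every decorated allocation: a block carved out of a chunk keeps both its own pointer and the device pointer of the chunk it lives in, which is what base_ptr_test.cu (next) checks against platform::GetGpuBasePtr. A simplified, self-contained sketch of the idea, not the Paddle classes themselves:

// Simplified sketch, not the Paddle classes: a sub-allocation remembers the
// base pointer of the underlying chunk in addition to its own offset pointer.
#include <cstddef>

struct ChunkAllocation {   // stands in for the chunk obtained from the device
  void* base;              // pointer returned by the underlying allocator
  size_t size;
};

struct BlockView {         // stands in for BlockAllocation
  void* ptr;               // start of this block inside the chunk
  void* base_ptr;          // base of the whole chunk, kept for APIs that need
                           // the original device pointer
  size_t size;
};

inline BlockView CarveBlock(const ChunkAllocation& chunk, size_t offset,
                            size_t size) {
  return {static_cast<char*>(chunk.base) + offset, chunk.base, size};
}

Whatever the decorator stack looks like (aligned, auto-growth, retry), base_ptr() is meant to keep pointing at the address the device allocator actually handed out.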
+ +#include +#include "gtest/gtest.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class CUDAAllocatoionBasePtrTest : public ::testing::Test { + public: + void SetUp() override { + place_ = platform::CUDAPlace(); + alloc_times_ = 100; + batch_size_ = 10; + max_alloc_size_ = platform::GpuMaxAllocSize() / alloc_times_; + random_engine_ = std::default_random_engine(time(NULL)); + dis_ = std::uniform_int_distribution(0, max_alloc_size_); + } + + void OneByOneAllocTest() { + for (size_t i = 0; i < alloc_times_; ++i) { + size_t size = dis_(random_engine_); + AllocationPtr allocation = Alloc(place_, size); + + void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + } + + Release(place_); + } + + void BatchByBatchAllocTest() { + std::vector allocations; + allocations.reserve(batch_size_); + size_t batch_num = alloc_times_ / batch_size_; + + for (size_t i = 0; i < batch_num; ++i) { + for (size_t j = 0; j < batch_size_; ++j) { + size_t size = dis_(random_engine_); + AllocationPtr allocation = Alloc(place_, size); + + void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + + allocations.emplace_back(std::move(allocation)); + } + allocations.clear(); + } + + Release(place_); + } + + void ContinuousAllocTest() { + std::vector allocations; + allocations.reserve(alloc_times_); + + for (size_t i = 0; i < alloc_times_; ++i) { + size_t size = dis_(random_engine_); + AllocationPtr allocation = Alloc(place_, size); + + void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + + allocations.emplace_back(std::move(allocation)); + } + + allocations.clear(); + Release(place_); + } + + void ZeroSizeAllocTest() { + AllocationPtr allocation = Alloc(place_, 0); + void* base_ptr = static_cast(allocation.get())->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + } + + private: + platform::CUDAPlace place_; + size_t max_alloc_size_; + size_t alloc_times_; + size_t batch_size_; + std::default_random_engine random_engine_; + std::uniform_int_distribution dis_; +}; + +TEST_F(CUDAAllocatoionBasePtrTest, base_ptr_test) { + OneByOneAllocTest(); + BatchByBatchAllocTest(); + ContinuousAllocTest(); + ZeroSizeAllocTest(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index 0955b52126..3cba70bd5b 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -33,7 +33,7 @@ static int HighestBitPos(size_t N) { } } -BestFitAllocator::BestFitAllocator(Allocation* allocation) +BestFitAllocator::BestFitAllocator(pten::Allocation* allocation) : allocation_(allocation) { details::Chunk chunk; chunk.size_ = allocation_->size(); @@ -115,7 +115,7 @@ size_t BestFitAllocator::NumFreeChunks() const { } return num; } -void BestFitAllocator::FreeImpl(Allocation* allocation) { +void BestFitAllocator::FreeImpl(pten::Allocation* allocation) { auto* 
bf_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL( bf_allocation, @@ -150,7 +150,7 @@ void BestFitAllocator::FreeImpl(Allocation* allocation) { InsertFreeNode(chunk_it); delete allocation; } -Allocation* BestFitAllocator::AllocateImpl(size_t size) { +pten::Allocation* BestFitAllocator::AllocateImpl(size_t size) { auto highest_set_bit = static_cast(HighestBitPos(size)); MapIt map_it; for (; highest_set_bit < free_chunks_.size(); ++highest_set_bit) { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index 42f69e6d70..297d876178 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -108,7 +108,7 @@ class BestFitAllocation : public Allocation { // the prev-chunk and the next-chunk when possible. class BestFitAllocator : public Allocator { public: - explicit BestFitAllocator(Allocation* allocation); + explicit BestFitAllocator(pten::Allocation* allocation); void* BasePtr() const { return allocation_->ptr(); } @@ -127,11 +127,11 @@ class BestFitAllocator : public Allocator { void InsertFreeNode(const ListIt& it); protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: - Allocation* allocation_; // not owned + pten::Allocation* allocation_; // not owned details::ChunkList chunks_; details::FreeChunkBin free_chunks_; }; diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc index 325cb010bf..11739ebba9 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator.cc @@ -46,12 +46,13 @@ void BufferedAllocator::FreeCache(size_t size) { bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; } -void BufferedAllocator::FreeImpl(Allocation *allocation) { +void BufferedAllocator::FreeImpl(pten::Allocation *allocation) { platform::LockGuardPtr guard(mtx_); - allocations_.emplace(allocation->size(), AllocationPtr(allocation)); + allocations_.emplace(allocation->size(), + AllocationPtr(allocation, Allocator::AllocationDeleter)); } -Allocation *BufferedAllocator::AllocateImpl(size_t size) { +pten::Allocation *BufferedAllocator::AllocateImpl(size_t size) { { platform::LockGuardPtr guard(mtx_); auto it = allocations_.lower_bound(size); diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h index 5e1733bd83..0ccccef573 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.h +++ b/paddle/fluid/memory/allocation/buffered_allocator.h @@ -45,8 +45,8 @@ class BufferedAllocator : public Allocator { void FreeCache(size_t size); protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc index 0bfa10a161..21c30efccd 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc @@ -27,7 +27,7 @@ namespace memory { namespace allocation { inline std::unique_ptr 
GetBufferedAllocator( - Allocation *allocation, bool thread_safe) { + pten::Allocation *allocation, bool thread_safe) { std::unique_ptr allocator(new BestFitAllocator(allocation)); if (thread_safe) { allocator.reset(new LockedAllocator(std::move(allocator))); @@ -68,7 +68,7 @@ class StubAllocator : public Allocator { size_t GetFreeCount() const { return destruct_count_; } protected: - void FreeImpl(Allocation *allocation) override { + void FreeImpl(pten::Allocation *allocation) override { auto *alloc = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL( alloc, platform::errors::InvalidArgument( @@ -77,7 +77,7 @@ class StubAllocator : public Allocator { ++destruct_count_; delete allocation; } - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { ++construct_count_; if (size == 0) { return new StubAllocation(nullptr, 0, platform::CPUPlace()); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index 128591f5a8..bf0bd891be 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -24,7 +24,7 @@ namespace allocation { bool CPUAllocator::IsAllocThreadSafe() const { return true; } -void CPUAllocator::FreeImpl(Allocation *allocation) { +void CPUAllocator::FreeImpl(pten::Allocation *allocation) { void *p = allocation->ptr(); #ifdef _WIN32 _aligned_free(p); @@ -34,7 +34,7 @@ void CPUAllocator::FreeImpl(Allocation *allocation) { delete allocation; } -Allocation *CPUAllocator::AllocateImpl(size_t size) { +pten::Allocation *CPUAllocator::AllocateImpl(size_t size) { void *p; #ifdef _WIN32 p = _aligned_malloc(size, kAlignment); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 058ff63381..a64089dd2d 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -37,8 +37,8 @@ class CPUAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; }; } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index b1a45afa99..ff9bbf4ab3 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -25,34 +25,34 @@ #include #include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" namespace paddle { namespace memory { namespace allocation { bool CUDAAllocator::IsAllocThreadSafe() const { return true; } -void CUDAAllocator::FreeImpl(Allocation* allocation) { +void CUDAAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, platform::errors::PermissionDenied( "GPU memory is freed in incorrect device. 
This may be a bug")); - platform::RecordedCudaFree(allocation->ptr(), allocation->size(), - place_.device); + platform::RecordedGpuFree(allocation->ptr(), allocation->size(), + place_.device); delete allocation; } -Allocation* CUDAAllocator::AllocateImpl(size_t size) { +pten::Allocation* CUDAAllocator::AllocateImpl(size_t size) { std::call_once(once_flag_, [this] { platform::SetDeviceId(place_.device); }); void* ptr; - auto result = platform::RecordedCudaMalloc(&ptr, size, place_.device); + auto result = platform::RecordedGpuMalloc(&ptr, size, place_.device); if (LIKELY(result == gpuSuccess)) { return new Allocation(ptr, size, platform::Place(place_)); } size_t avail, total, actual_avail, actual_total; - bool is_limited = platform::RecordedCudaMemGetInfo( + bool is_limited = platform::RecordedGpuMemGetInfo( &avail, &total, &actual_avail, &actual_total, place_.device); size_t allocated = total - avail; diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h index 5969d4d20d..57e85a3dc2 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -28,8 +28,8 @@ class CUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 3d6f1d7bcb..a6696634c1 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -41,8 +41,9 @@ namespace allocation { */ class CUDADeviceContextAllocation : public Allocation { public: - explicit CUDADeviceContextAllocation(AllocationPtr allocation) - : Allocation(allocation->ptr(), allocation->size(), allocation->place()), + explicit CUDADeviceContextAllocation(DecoratedAllocationPtr allocation) + : Allocation(allocation->ptr(), allocation->base_ptr(), + allocation->size(), allocation->place()), underlying_allocation_(std::move(allocation)) {} ~CUDADeviceContextAllocation() { @@ -55,7 +56,7 @@ class CUDADeviceContextAllocation : public Allocation { << p_allocation; dev_ctx_->AddStreamCallback([p_allocation] { VLOG(4) << "Delete CUDADeviceContextAllocation at " << p_allocation; - AllocationDeleter()(p_allocation); + Allocator::AllocationDeleter(p_allocation); }); } @@ -64,7 +65,7 @@ class CUDADeviceContextAllocation : public Allocation { } private: - AllocationPtr underlying_allocation_; + DecoratedAllocationPtr underlying_allocation_; const platform::CUDADeviceContext *dev_ctx_{nullptr}; }; @@ -81,10 +82,10 @@ class CUDADeviceContextAllocator : public Allocator { : place_(place), default_stream_(default_stream) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( hipEventCreateWithFlags(&event_, hipEventDisableTiming)); #else - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_ENFORCE_GPU_SUCCESS( cudaEventCreate(&event_, cudaEventDisableTiming)); #endif } @@ -93,35 +94,34 @@ class CUDADeviceContextAllocator : public Allocator { if (event_) { platform::CUDADeviceGuard guard(place_.device); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event_)); + 
PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event_)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event_)); #endif } } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { PADDLE_ENFORCE_NOT_NULL( default_stream_, platform::errors::PreconditionNotMet( "Default stream is not set for CUDADeviceContextAllocator")); platform::CUDADeviceGuard guard(place_.device); - auto allocation = - new CUDADeviceContextAllocation(memory::Alloc(place_, size)); + auto allocation = new CUDADeviceContextAllocation( + static_unique_ptr_cast(memory::Alloc(place_, size))); // Wait for the event on stream #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, default_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamWaitEvent(default_stream_, event_, 0)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event_, default_stream_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - cudaStreamWaitEvent(default_stream_, event_, 0)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, default_stream_)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(default_stream_, event_, 0)); #endif return allocation; } - void FreeImpl(Allocation *allocation) override { delete allocation; } + void FreeImpl(pten::Allocation *allocation) override { delete allocation; } private: platform::CUDAPlace place_; diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc index e69de29bb2..2ae2cf20ee 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.cc @@ -0,0 +1,228 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif + +#include +#include "paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h" +#include "paddle/fluid/platform/enforce.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/dynload/cuda_driver.h" +#endif +#if CUDA_VERSION >= 10020 + +namespace paddle { +namespace memory { +namespace allocation { + +CUDAVirtualMemAllocator::CUDAVirtualMemAllocator( + const platform::CUDAPlace& place) + : place_(place) { + CUmemAllocationProp prop = {}; + + // Setup the properties common for all the chunks + // The allocations will be device pinned memory. + // This property structure describes the physical location where the memory + // will be allocated via cuMemCreate allong with additional properties In this + // case, the allocation will be pinnded device memory local to a given device. 
+ prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = place.device; + prop_ = prop; + + // Prepare the access descriptor array indicating where and how the backings + // should be visible. + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { + if (place.device != dev_id) { + int capable = 0; + PADDLE_ENFORCE_GPU_SUCCESS( + cudaDeviceCanAccessPeer(&capable, place.device, dev_id)); + if (!capable) { + VLOG(1) << "device(" << place.device + << ") can not access peer to device(" << dev_id << ")"; + continue; + } + } + CUmemAccessDesc access_desc = {}; + // Specify which device we are adding mappings for. + access_desc.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access_desc.location.id = dev_id; + + // Specify both read and write access. + access_desc.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + access_desc_.push_back(access_desc); + } + + // Get the minimum granularity needed for all devices + // (the max of the minimum granularity of each participating device) + granularity_ = 0; + for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) { + size_t granularity; + prop.location.id = dev_id; + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cuMemGetAllocationGranularity( + &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM)); + granularity_ = std::max(granularity, granularity_); + } + + size_t actual_avail, actual_total; + paddle::platform::CUDADeviceGuard guard(place.device); + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); + + virtual_mem_size_ = AlignedSize(actual_total, granularity_); + + // Reserve the required contiguous virtual address space for the allocations + // The maximum video memory size we can apply for is the video memory size of + // GPU, + // so the virtual address space size we reserve is equal to the GPU video + // memory size + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cuMemAddressReserve( + &virtual_mem_base_, virtual_mem_size_, 0, 0, 0)); + + virtual_mem_alloced_offset_ = 0; +} + +bool CUDAVirtualMemAllocator::IsAllocThreadSafe() const { return false; } + +void CUDAVirtualMemAllocator::FreeImpl(pten::Allocation* allocation) { + PADDLE_ENFORCE_EQ( + BOOST_GET_CONST(platform::CUDAPlace, allocation->place()), place_, + platform::errors::PermissionDenied( + "GPU memory is freed in incorrect device. 
This may be a bug")); + + auto iter = virtual_2_physical_map_.find( + reinterpret_cast(allocation->ptr())); + if (iter == virtual_2_physical_map_.end()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Can not find virtual memory address at %s", allocation->ptr())); + } + + int prev_id; + cudaGetDevice(&prev_id); + if (prev_id != place_.device) { + cudaSetDevice(place_.device); + } + + auto result = + paddle::platform::dynload::cuMemUnmap(iter->first, iter->second.second); + if (result != CUDA_ERROR_DEINITIALIZED) { + PADDLE_ENFORCE_GPU_SUCCESS(result); + } + + if (result != CUDA_ERROR_DEINITIALIZED) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::RecordedGpuMemRelease( + iter->second.first, iter->second.second, place_.device)); + } + + if (prev_id != place_.device) { + cudaSetDevice(prev_id); + } + + virtual_2_physical_map_.erase(iter); + + delete allocation; +} + +pten::Allocation* CUDAVirtualMemAllocator::AllocateImpl(size_t size) { + size = AlignedSize(size, granularity_); + + CUdeviceptr ptr = virtual_mem_base_ + virtual_mem_alloced_offset_; + + if (ptr + size > virtual_mem_base_ + virtual_mem_size_) { + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on GPU Virtual Memory %d. " + "Cannot allocate %s memory on GPU Virtual Memory %d, %s memory has " + "been allocated and " + "available memory is only %s.\n\n" + "Please decrease the batch size of your model.\n\n", + place_.device, string::HumanReadableSize(size), place_.device, + string::HumanReadableSize(virtual_mem_alloced_offset_), + string::HumanReadableSize(virtual_mem_size_ - + virtual_mem_alloced_offset_), + place_.device)); + return nullptr; + } + + CUmemGenericAllocationHandle handle; + + paddle::platform::CUDADeviceGuard guard(place_.device); + + // Create physical memory backing allocation. + auto result = + platform::RecordedGpuMemCreate(&handle, size, &prop_, 0, place_.device); + + if (result != CUDA_SUCCESS) { + if (result == CUDA_ERROR_OUT_OF_MEMORY) { + size_t actual_avail, actual_total; + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemGetInfo(&actual_avail, &actual_total)); + size_t actual_allocated = actual_total - actual_avail; + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on GPU %d. " + "Cannot allocate %s memory on GPU %d, %s memory has been allocated " + "and " + "available memory is only %s.\n\n" + "Please check whether there is any other process using GPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n" + "2. If no, please decrease the batch size of your model.\n\n", + place_.device, string::HumanReadableSize(size), place_.device, + string::HumanReadableSize(actual_allocated), + string::HumanReadableSize(actual_avail), place_.device)); + } else { + PADDLE_ENFORCE_GPU_SUCCESS(result); + } + return nullptr; + } + + // Assign the chunk to the appropriate VA range and release the handle. + // After mapping the memory, it can be referenced by virtual address. + // The allocation will be kept live until it is unmapped. + result = paddle::platform::dynload::cuMemMap(ptr, size, 0, handle, 0); + + if (result != CUDA_SUCCESS) { + platform::RecordedGpuMemRelease(handle, size, place_.device); + PADDLE_ENFORCE_GPU_SUCCESS(result); + return nullptr; + } + + // Apply the access descriptors to the whole VA range. 
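CUDAVirtualMemAllocator is a thin wrapper over the CUDA driver's virtual memory management API (available from CUDA 10.2): reserve a large virtual range once, then back it piecewise with physical handles that are mapped and granted access on demand. A condensed sketch of that driver-API sequence using the raw cuMem* calls (Paddle routes them through dynload and the Recorded* wrappers); error checks and an existing CUDA context are assumed:

// Condensed sketch of the driver-API sequence the allocator wraps.
// Error checking omitted; assumes cuInit() and a current context.
#include <cuda.h>

CUdeviceptr ReserveAndMap(int device, size_t bytes) {
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = device;

  size_t granularity = 0;
  cuMemGetAllocationGranularity(&granularity, &prop,
                                CU_MEM_ALLOC_GRANULARITY_MINIMUM);
  size_t size = (bytes + granularity - 1) / granularity * granularity;

  CUdeviceptr va = 0;                       // reserve virtual address space
  cuMemAddressReserve(&va, size, 0, 0, 0);

  CUmemGenericAllocationHandle handle;      // create physical backing
  cuMemCreate(&handle, size, &prop, 0);
  cuMemMap(va, size, 0, handle, 0);         // map it into the reserved range

  CUmemAccessDesc access = {};              // grant read/write on this device
  access.location = prop.location;
  access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
  cuMemSetAccess(va, size, &access, 1);
  return va;                                // teardown: cuMemUnmap, cuMemRelease,
                                            // then cuMemAddressFree
}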
+ result = paddle::platform::dynload::cuMemSetAccess( + ptr, size, access_desc_.data(), access_desc_.size()); + + if (result != CUDA_SUCCESS) { + paddle::platform::dynload::cuMemUnmap(ptr, size); + platform::RecordedGpuMemRelease(handle, size, place_.device); + PADDLE_ENFORCE_GPU_SUCCESS(result); + return nullptr; + } + + virtual_2_physical_map_.emplace(ptr, std::make_pair(handle, size)); + + virtual_mem_alloced_offset_ += size; + + return new Allocation(reinterpret_cast(ptr), size, + platform::Place(place_)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h index e69de29bb2..0e1e59d200 100644 --- a/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_virtual_mem_allocator.h @@ -0,0 +1,62 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_CUDA +#include +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif + +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +#if CUDA_VERSION >= 10020 + +namespace paddle { +namespace memory { +namespace allocation { + +// Allocate memory using NVIDIA's virtual memory management technology +class CUDAVirtualMemAllocator : public Allocator { + public: + explicit CUDAVirtualMemAllocator(const platform::CUDAPlace& place); + + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; + + private: + platform::CUDAPlace place_; + + CUdeviceptr virtual_mem_base_; + size_t virtual_mem_size_; + size_t virtual_mem_alloced_offset_; + size_t granularity_; + + CUmemAllocationProp prop_; + std::vector access_desc_; + + std::map> + virtual_2_physical_map_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle + +#endif diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc index 6e8f870b23..a0c8efddbd 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.cc +++ b/paddle/fluid/memory/allocation/locked_allocator.cc @@ -37,12 +37,12 @@ LockedAllocator::LockedAllocator( } } -void LockedAllocator::FreeImpl(Allocation *allocation) { +void LockedAllocator::FreeImpl(pten::Allocation *allocation) { platform::LockGuardPtr guard(mtx_); underlying_allocator_->Free(allocation); } -Allocation *LockedAllocator::AllocateImpl(size_t size) { +pten::Allocation *LockedAllocator::AllocateImpl(size_t size) { platform::LockGuardPtr guard(mtx_); return underlying_allocator_->Allocate(size).release(); } diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h index 1b8418bc84..d17c8b24e2 100644 --- 
a/paddle/fluid/memory/allocation/locked_allocator.h +++ b/paddle/fluid/memory/allocation/locked_allocator.h @@ -29,8 +29,8 @@ class LockedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 2c00b34dd1..ffe7ccf919 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -20,9 +20,8 @@ #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/gpu_info.h" -#include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" @@ -30,9 +29,7 @@ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif -#ifdef PADDLE_WITH_XPU -#include "paddle/fluid/platform/xpu/xpu_header.h" -#endif +#include "paddle/fluid/platform/device/device_wrapper.h" PADDLE_DEFINE_EXPORTED_bool( init_allocated_mem, false, @@ -114,30 +111,43 @@ size_t Used(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); } +// For Graphcore IPU +template <> +void *Alloc(const platform::IPUPlace &place, size_t size) { + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + VLOG(10) << "IPUPlace, Allocate on cpu."; + + void *p = GetCPUBuddyAllocator()->Alloc(size); + if (FLAGS_init_allocated_mem) { + memset(p, 0xEF, size); + } + VLOG(10) << " pointer=" << p; + return p; +} +template <> +void Free(const platform::IPUPlace &place, void *p, + size_t size) { + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetCPUBuddyAllocator()->Free(p); +} +template <> +uint64_t Release(const platform::IPUPlace &place) { + return GetCPUBuddyAllocator()->Release(); +} +template <> +size_t Used(const platform::IPUPlace &place) { + return GetCPUBuddyAllocator()->Used(); +} + // For kunlun XPU template <> void *Alloc(const platform::XPUPlace &place, size_t size) { #ifdef PADDLE_WITH_XPU VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void *p = nullptr; - int dev_id = -1; - int ret = xpu_current_device(&dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - if (dev_id >= 64) { - // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id - dev_id -= 64; - } - ret = xpu_set_device(place.device); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - ret = xpu_malloc(reinterpret_cast(&p), size); + + platform::XPUDeviceGuard gurad(place.device); + int ret = xpu_malloc(reinterpret_cast(&p), size); if (ret != XPU_SUCCESS) { std::cout << "xpu memory malloc(" << size << ") failed, try again\n"; xpu_wait(); @@ -151,12 +161,6 @@ void *Alloc(const platform::XPUPlace 
&place, size_t size) { PADDLE_THROW(platform::errors::Unimplemented( "xpu memory FLAGS_init_allocated_mem is not implemented.")); } - ret = xpu_set_device(dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); VLOG(10) << " pointer=" << p; return p; #else @@ -172,30 +176,9 @@ void Free(const platform::XPUPlace &place, void *p, #ifdef PADDLE_WITH_XPU VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); - int dev_id = -1; - int ret = xpu_current_device(&dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); - if (dev_id >= 64) { - // if dev_id >= 64, the device is a simulator device, -64 to get real dev_id - dev_id -= 64; - } - ret = xpu_set_device(place.device); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); + + platform::XPUDeviceGuard gurad(place.device); xpu_free(p); - ret = xpu_set_device(dev_id); - PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - ret)); #else PADDLE_THROW( platform::errors::PermissionDenied("'XPUPlace' is not supported.")); @@ -327,8 +310,8 @@ void *Alloc(const platform::NPUPlace &place, size_t size) { size_t avail, total; platform::NPUMemoryUsage(&avail, &total); PADDLE_THROW(platform::errors::ResourceExhausted( - "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " - "%s, GpuMaxChunkSize %s, GPU memory used: %s.", + "Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize " + "%s, NpuMaxChunkSize %s, NPU memory used: %s.", string::HumanReadableSize(size), place.device, string::HumanReadableSize(avail), string::HumanReadableSize(total), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), @@ -336,7 +319,7 @@ void *Alloc(const platform::NPUPlace &place, size_t size) { string::HumanReadableSize(Used(place)))); } else { if (FLAGS_init_allocated_mem) { - aclrtMemset(ptr, size, 0xEF, size); + platform::NPUMemsetSync(ptr, 0xEF, size, size); } } VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); @@ -387,8 +370,7 @@ void *Alloc(const platform::NPUPinnedPlace &place, void *ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { - LOG(WARNING) << "aclrtMallocHost Cannot allocate " << size - << " bytes in NPUPinnedPlace"; + LOG(WARNING) << "Cannot allocate " << size << " bytes in NPUPinnedPlace"; } if (FLAGS_init_allocated_mem) { memset(ptr, 0xEF, size); @@ -622,6 +604,134 @@ uint64_t Release( #endif } +// For MLU +#ifdef PADDLE_WITH_MLU +class MLUBuddyAllocatorList { + private: + MLUBuddyAllocatorList() : devices_(platform::GetMLUSelectedDevices()) { + auto mlu_num = devices_.size(); + allocators_.resize(mlu_num); + init_flags_.reserve(mlu_num); + for (size_t i = 0; i < mlu_num; ++i) { + init_flags_.emplace_back(new std::once_flag()); + } + } + + static MLUBuddyAllocatorList *CreateNewInstance() { + return new MLUBuddyAllocatorList(); + } + + public: + static MLUBuddyAllocatorList *Instance() { + static auto *instance = CreateNewInstance(); + return instance; + } + + BuddyAllocator *Get(int mlu_id) { + 
auto pos = std::distance( + devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id)); + PADDLE_ENFORCE_LT(pos, devices_.size(), + platform::errors::OutOfRange( + "The index exceeds the size of devices, the size of " + "devices is %d, the index is %d", + devices_.size(), pos)); + + std::call_once(*init_flags_[pos], [this, pos] { + platform::SetMLUDeviceId(devices_[pos]); + allocators_[pos].reset(new BuddyAllocator( + std::unique_ptr( + new detail::MLUAllocator(devices_[pos])), + platform::MLUMinChunkSize(), platform::MLUMaxChunkSize())); + VLOG(10) << "\n\nNOTE:\n" + << "You can set GFlags environment variable " + << "(mlu reuse gpu GFlags) " + << "'FLAGS_fraction_of_gpu_memory_to_use' " + << "or 'FLAGS_initial_gpu_memory_in_mb' " + << "or 'FLAGS_reallocate_gpu_memory_in_mb' " + << "to change the memory size for MLU usage.\n" + << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is " + << FLAGS_fraction_of_gpu_memory_to_use + << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is " + << FLAGS_initial_gpu_memory_in_mb + << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " + << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; + }); + + return allocators_[pos].get(); + } + + private: + std::vector devices_; + std::vector> init_flags_; + std::vector> allocators_; +}; + +BuddyAllocator *GetMLUBuddyAllocator(int mlu_id) { + return MLUBuddyAllocatorList::Instance()->Get(mlu_id); +} +#endif + +template <> +size_t Used(const platform::MLUPlace &place) { +#ifdef PADDLE_WITH_MLU + return GetMLUBuddyAllocator(place.device)->Used(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'MLUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void *Alloc(const platform::MLUPlace &place, size_t size) { +#ifdef PADDLE_WITH_MLU + auto *buddy_allocator = GetMLUBuddyAllocator(place.device); + auto *ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + platform::MLUDeviceGuard(place.device); + size_t avail = 0, total = 0; + platform::MLUMemoryUsage(&avail, &total); + PADDLE_THROW(platform::errors::ResourceExhausted( + "Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize " + "%s, MLUMinChunkSize %s, MLU memory used: %s.", + string::HumanReadableSize(size), place.device, + string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), + string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), + string::HumanReadableSize(Used(place)))); + } else { + if (FLAGS_init_allocated_mem) { + cnrtMemset(ptr, 0xEF, size); + } + } + return ptr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'MLUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void Free(const platform::MLUPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_MLU + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetMLUBuddyAllocator(place.device)->Free(p); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'MLUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +uint64_t Release(const platform::MLUPlace &place) { +#ifdef PADDLE_WITH_MLU + return GetMLUBuddyAllocator(place.device)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'MLUPlace' is not supported in CPU only device.")); +#endif +} + struct AllocVisitor : public boost::static_visitor { inline explicit AllocVisitor(size_t size) : size_(size) {} @@ -680,7 +790,7 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace 
&cuda_pinned) const { namespace allocation { -Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { +pten::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); auto *tmp_alloc = new Allocation(ptr, size, place_); platform::MemEvenRecorder::Instance().PushMemRecord( @@ -688,7 +798,7 @@ Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) { return tmp_alloc; } -void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) { +void NaiveBestFitAllocator::FreeImpl(pten::Allocation *allocation) { boost::apply_visitor( legacy::FreeVisitor(allocation->ptr(), allocation->size()), allocation->place()); diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h index 474a308a06..b7b3647ff9 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -34,8 +34,8 @@ class NaiveBestFitAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; uint64_t ReleaseImpl(const platform::Place &place) override; private: diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc index faf7ae6221..d9fa7ec27f 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -14,15 +14,15 @@ #include "paddle/fluid/memory/allocation/npu_allocator.h" #include +#include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/npu_info.h" namespace paddle { namespace memory { namespace allocation { bool NPUAllocator::IsAllocThreadSafe() const { return true; } -void NPUAllocator::FreeImpl(Allocation* allocation) { +void NPUAllocator::FreeImpl(pten::Allocation* allocation) { PADDLE_ENFORCE_EQ( BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_, platform::errors::PermissionDenied( @@ -32,7 +32,7 @@ void NPUAllocator::FreeImpl(Allocation* allocation) { delete allocation; } -Allocation* NPUAllocator::AllocateImpl(size_t size) { +pten::Allocation* NPUAllocator::AllocateImpl(size_t size) { std::call_once(once_flag_, [this] { platform::SetNPUDeviceId(place_.device); }); diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h index bf66897350..88b0c9a24b 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.h +++ b/paddle/fluid/memory/allocation/npu_allocator.h @@ -28,8 +28,8 @@ class NPUAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; private: platform::NPUPlace place_; diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc index 9178825efa..2389973fa9 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc @@ -23,22 +23,22 @@ void NPUPinnedAllocator::ProcessEventsAndFree() { for (auto it = npu_events_.begin(); it != 
npu_events_.end();) { aclrtEvent event = it->second; aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; - PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status)); + platform::NPUEventQuery(event, &status); if (status == ACL_EVENT_STATUS_COMPLETE) { - Allocation *allocation = it->first; + auto *allocation = it->first; void *ptr = allocation->ptr(); free(ptr); npu_events_.erase(it++); delete allocation; - PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event)); + platform::NPUEventDestroy(event); } else { ++it; } } } -Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { +pten::Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { std::lock_guard lock(mtx_); ProcessEventsAndFree(); void *ptr; @@ -50,18 +50,29 @@ Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { return new Allocation(ptr, size, platform::NPUPinnedPlace()); } -void NPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void NPUPinnedAllocator::FreeImpl(pten::Allocation *allocation) { std::lock_guard lock(mtx_); void *ptr = allocation->ptr(); auto iter = npu_events_.find(allocation); + + // Managed by GC if not called RecordEvent. + if (iter == npu_events_.end()) { + // double free? No such problem has been found so far. + // Or maybe we need a set to record which + // Allocation managed by GC. + free(ptr); + delete allocation; + return; + } + aclrtEvent event = iter->second; aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; - PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, &status)); + platform::NPUEventQuery(event, &status); if (status == ACL_EVENT_STATUS_COMPLETE) { free(ptr); npu_events_.erase(allocation); delete allocation; - PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event)); + platform::NPUEventDestroy(event); } return; } @@ -72,12 +83,12 @@ uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) { return static_cast(0); } -void NPUPinnedAllocator::RecordEvent(Allocation *allocation, +void NPUPinnedAllocator::RecordEvent(pten::Allocation *allocation, aclrtStream stream) { std::lock_guard lock(mtx_); aclrtEvent event = nullptr; - PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event)); - PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event, stream)); + platform::NPUEventCreate(&event); + platform::NPUEventRecord(event, stream); npu_events_.insert({allocation, event}); } diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.h b/paddle/fluid/memory/allocation/npu_pinned_allocator.h index b330b6e352..716b12eea1 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.h +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.h @@ -21,7 +21,7 @@ #include "acl/acl.h" #include "paddle/fluid/memory/allocation/allocator.h" -#include "paddle/fluid/platform/npu_info.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -32,16 +32,16 @@ class NPUPinnedAllocator : public Allocator { public: bool IsAllocThreadSafe() const override { return true; } void ProcessEventsAndFree(); - void RecordEvent(Allocation *allocation, aclrtStream stream); + void RecordEvent(pten::Allocation *allocation, aclrtStream stream); constexpr static size_t kAlignment = 4096UL; protected: - Allocation *AllocateImpl(size_t size) override; - void FreeImpl(Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; uint64_t ReleaseImpl(const platform::Place &place) override; private: - std::unordered_map npu_events_; + std::unordered_map npu_events_; 
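NPUPinnedAllocator defers freeing a pinned host buffer until the ACL event recorded for its async copy has completed, and with this patch it also frees immediately when no event was ever recorded. A minimal sketch of that pattern, not the allocator itself, with error handling omitted:

// Sketch of the deferred-free pattern: the host buffer behind an async copy
// is only released once the event recorded on the copy stream has completed.
#include <cstdlib>
#include <unordered_map>
#include "acl/acl.h"

std::unordered_map<void*, aclrtEvent> g_pending;  // buffer -> completion event

void RecordPinnedCopy(void* host_buf, aclrtStream stream) {
  aclrtEvent event = nullptr;
  aclrtCreateEvent(&event);
  aclrtRecordEvent(event, stream);   // completes when the enqueued copy is done
  g_pending[host_buf] = event;
}

void TryFreePinned(void* host_buf) {
  auto it = g_pending.find(host_buf);
  if (it == g_pending.end()) {       // never recorded: safe to free right away
    free(host_buf);
    return;
  }
  aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
  aclrtQueryEvent(it->second, &status);
  if (status == ACL_EVENT_STATUS_COMPLETE) {
    free(host_buf);
    aclrtDestroyEvent(it->second);
    g_pending.erase(it);
  }                                  // otherwise retry on the next sweep
}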
mutable std::mutex mtx_; }; diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 5aa0514432..f1175fc437 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -18,20 +18,20 @@ namespace paddle { namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } -void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void CPUPinnedAllocator::FreeImpl(pten::Allocation *allocation) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipHostFree(allocation->ptr())); + PADDLE_ENFORCE_GPU_SUCCESS(hipHostFree(allocation->ptr())); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaFreeHost(allocation->ptr())); + PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr())); #endif delete allocation; } -Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { +pten::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) { void *ptr; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); + PADDLE_ENFORCE_GPU_SUCCESS(hipHostMalloc(&ptr, size, hipHostMallocPortable)); #else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); + PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); #endif return new Allocation(ptr, size, platform::CUDAPinnedPlace()); } diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h index 4f535ef337..800e3ff3bb 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.h +++ b/paddle/fluid/memory/allocation/pinned_allocator.h @@ -25,8 +25,8 @@ class CPUPinnedAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; - Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + pten::Allocation *AllocateImpl(size_t size) override; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index 1607af3808..856b6c2e9a 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -39,7 +39,7 @@ class WaitedAllocateSizeGuard { size_t requested_size_; }; -void RetryAllocator::FreeImpl(Allocation* allocation) { +void RetryAllocator::FreeImpl(pten::Allocation* allocation) { // Delete underlying allocation first. 
size_t size = allocation->size(); underlying_allocator_->Free(allocation); @@ -51,7 +51,7 @@ void RetryAllocator::FreeImpl(Allocation* allocation) { } } -Allocation* RetryAllocator::AllocateImpl(size_t size) { +pten::Allocation* RetryAllocator::AllocateImpl(size_t size) { auto alloc_func = [&, this]() { return underlying_allocator_->Allocate(size).release(); }; diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 031a5e2b97..b427a37907 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -45,8 +45,8 @@ class RetryAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - void FreeImpl(Allocation* allocation) override; - Allocation* AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation* allocation) override; + pten::Allocation* AllocateImpl(size_t size) override; uint64_t ReleaseImpl(const platform::Place& place) override { return underlying_allocator_->Release(place); } diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index 787f3d9dca..d636c73e07 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -98,12 +98,12 @@ class DummyAllocator : public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "Here is a test exception, always BadAlloc.")); } - void FreeImpl(Allocation *) override {} + void FreeImpl(pten::Allocation *) override {} }; TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index e69de29bb2..05c6a7adaf 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -0,0 +1,201 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
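The StreamSafeCUDAAllocation implemented next keeps one CUDA event per additional consumer stream: RecordStream (re)records that stream's event, and CanBeFreed only returns true once every recorded event has completed, at which point FreeImpl really releases the block. A condensed sketch of that bookkeeping with the CUDA runtime API (error checks omitted):

// Minimal sketch of the per-stream event bookkeeping implemented below.
#include <map>
#include <cuda_runtime.h>

struct StreamGuardedBlock {
  cudaStream_t owner;
  std::map<cudaStream_t, cudaEvent_t> events;  // one event per extra stream

  void RecordStream(cudaStream_t s) {
    if (s == owner) return;                    // owning stream needs no event
    auto it = events.find(s);
    if (it == events.end()) {
      cudaEvent_t e;
      cudaEventCreateWithFlags(&e, cudaEventDisableTiming);
      it = events.emplace(s, e).first;
    }
    cudaEventRecord(it->second, s);            // re-record on every use
  }

  bool CanBeFreed() {                          // true once all streams are done
    for (auto it = events.begin(); it != events.end();) {
      if (cudaEventQuery(it->second) == cudaErrorNotReady) return false;
      cudaEventDestroy(it->second);
      it = events.erase(it);
    }
    return true;
  }
};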
+ +#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +StreamSafeCUDAAllocation::StreamSafeCUDAAllocation( + DecoratedAllocationPtr underlying_allocation, gpuStream_t owning_stream) + : Allocation(underlying_allocation->ptr(), + underlying_allocation->base_ptr(), + underlying_allocation->size(), underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)), + owning_stream_(std::move(owning_stream)) {} + +void StreamSafeCUDAAllocation::RecordStream(const gpuStream_t& stream) { + VLOG(8) << "Try record stream " << stream << " for address " << ptr(); + if (stream == owning_stream_) { + VLOG(9) << "Record the same stream of " << stream; + return; + } + + std::lock_guard lock_guard(outstanding_event_map_lock_); + gpuEvent_t record_event; + auto it = outstanding_event_map_.find(stream); + if (it == outstanding_event_map_.end()) { + gpuEvent_t new_event; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS( + cudaEventCreateWithFlags(&new_event, cudaEventDisableTiming)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + hipEventCreateWithFlags(&new_event, hipEventDisableTiming)); +#endif + outstanding_event_map_[stream] = new_event; + record_event = new_event; + VLOG(9) << "Create a new event " << new_event; + } else { + record_event = it->second; + VLOG(9) << "Reuse event " << record_event; + } + +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(record_event, stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(hipEventRecord(record_event, stream)); +#endif + VLOG(8) << "Record event " << record_event << " to stream " << stream; +} + +bool StreamSafeCUDAAllocation::CanBeFreed() { + // NOTE(Ruibiao): This function will not execute concurrently, + // so outstanding_event_lock_ is not required here + for (auto it = outstanding_event_map_.begin(); + it != outstanding_event_map_.end(); ++it) { + gpuEvent_t& event = it->second; +#ifdef PADDLE_WITH_CUDA + gpuError_t err = cudaEventQuery(event); + if (err == cudaErrorNotReady) { + VLOG(9) << "Event " << event << " for " << ptr() << " is not completed"; + // Erase the completded event before "it" + outstanding_event_map_.erase(outstanding_event_map_.begin(), it); + return false; + } + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventDestroy(event)); +#else + gpuError_t err = hipEventQuery(event); + if (err == hipErrorNotReady) { + VLOG(9) << "Event " << event << " for " << ptr() << " is not completed"; + // Erase the completded event before "it" + outstanding_event_map_.erase(outstanding_event_map_.begin(), it); + return false; + } + PADDLE_ENFORCE_GPU_SUCCESS(err); + PADDLE_ENFORCE_GPU_SUCCESS(hipEventDestroy(event)); +#endif + VLOG(8) << "Destroy event " << event; + } + return true; +} + +const gpuStream_t& StreamSafeCUDAAllocation::GetOwningStream() const { + return owning_stream_; +} + +StreamSafeCUDAAllocator::StreamSafeCUDAAllocator( + std::shared_ptr underlying_allocator, platform::CUDAPlace place, + gpuStream_t default_stream) + : underlying_allocator_(std::move(underlying_allocator)), + place_(std::move(place)), + default_stream_(std::move(default_stream)) { + std::lock_guard lock_guard(allocator_map_lock_); + allocator_map_[place].emplace_back(this); +} + +StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() { + std::lock_guard lock_guard(allocator_map_lock_); + std::vector& allocators = allocator_map_[place_]; + allocators.erase(std::remove(allocators.begin(), allocators.end(), this), + 
allocators.end()); +} + +bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; } + +pten::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) { + ProcessUnfreedAllocations(); + VLOG(8) << "Try allocate " << size << " bytes"; + AllocationPtr underlying_allocation; + try { + underlying_allocation = underlying_allocator_->Allocate(size); + } catch (BadAlloc&) { + VLOG(4) << "Allocation failed when allocating " << size << " bytes"; + ReleaseImpl(place_); + try { + underlying_allocation = underlying_allocator_->Allocate(size); + } catch (...) { + VLOG(3) + << "Still allocation failed after release memory from all streams"; + throw; + } + } catch (...) { + throw; + } + StreamSafeCUDAAllocation* allocation = new StreamSafeCUDAAllocation( + static_unique_ptr_cast(std::move(underlying_allocation)), + default_stream_); + VLOG(8) << "Allocate " << allocation->size() << " bytes at address " + << allocation->ptr(); + return allocation; +} + +void StreamSafeCUDAAllocator::FreeImpl(pten::Allocation* allocation) { + StreamSafeCUDAAllocation* stream_safe_cuda_allocation = + dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation, + platform::errors::InvalidArgument( + "Failed to dynamic cast %p from Allocation* to " + "StreamSafeCUDAAllocation*", + allocation)); + VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr(); + std::lock_guard lock_guard(unfreed_allocation_lock_); + if (stream_safe_cuda_allocation->CanBeFreed()) { + VLOG(9) << "Directly delete allocation"; + delete stream_safe_cuda_allocation; + } else { + VLOG(9) << "Put into unfreed_allocation list"; + unfreed_allocations_.emplace_back(stream_safe_cuda_allocation); + } +} + +uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) { + std::lock_guard lock_guard(allocator_map_lock_); + std::vector& allocators = + allocator_map_[BOOST_GET_CONST(platform::CUDAPlace, place)]; + uint64_t released_size = 0; + for (StreamSafeCUDAAllocator* allocator : allocators) { + released_size += allocator->ProcessUnfreedAllocationsWithRelease(); + } + VLOG(8) << "Release " << released_size << " bytes memory from all streams"; + return released_size; +} + +void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() { + std::lock_guard lock_guard(unfreed_allocation_lock_); + for (auto it = unfreed_allocations_.begin(); + it != unfreed_allocations_.end();) { + if ((*it)->CanBeFreed()) { + delete *it; + it = unfreed_allocations_.erase(it); + } else { + ++it; + } + } +} + +uint64_t StreamSafeCUDAAllocator::ProcessUnfreedAllocationsWithRelease() { + ProcessUnfreedAllocations(); + return underlying_allocator_->Release(place_); +} + +std::map> + StreamSafeCUDAAllocator::allocator_map_; +SpinLock StreamSafeCUDAAllocator::allocator_map_lock_; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h index e69de29bb2..f54cdc7496 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h @@ -0,0 +1,81 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/allocation/spin_lock.h" +#include "paddle/fluid/platform/place.h" + +#ifdef PADDLE_WITH_CUDA +#include +#else +#include +#endif + +namespace paddle { +namespace memory { +namespace allocation { + +class StreamSafeCUDAAllocation : public Allocation { + public: + StreamSafeCUDAAllocation(DecoratedAllocationPtr underlying_allocation, + gpuStream_t owning_stream); + void RecordStream(const gpuStream_t &stream); + bool CanBeFreed(); + + const gpuStream_t &GetOwningStream() const; + + private: + DecoratedAllocationPtr underlying_allocation_; + std::map outstanding_event_map_; + gpuStream_t owning_stream_; + SpinLock outstanding_event_map_lock_; +}; + +class StreamSafeCUDAAllocator : public Allocator { + public: + StreamSafeCUDAAllocator(std::shared_ptr underlying_allocator, + platform::CUDAPlace place, + gpuStream_t default_stream); + ~StreamSafeCUDAAllocator(); + bool IsAllocThreadSafe() const override; + + protected: + pten::Allocation *AllocateImpl(size_t size) override; + void FreeImpl(pten::Allocation *allocation) override; + uint64_t ReleaseImpl(const platform::Place &place) override; + + private: + void ProcessUnfreedAllocations(); + uint64_t ProcessUnfreedAllocationsWithRelease(); + + static std::map> + allocator_map_; + static SpinLock allocator_map_lock_; + + std::shared_ptr underlying_allocator_; + platform::CUDAPlace place_; + gpuStream_t default_stream_; + std::list unfreed_allocations_; + SpinLock unfreed_allocation_lock_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/test_aligned_allocator.cc b/paddle/fluid/memory/allocation/test_aligned_allocator.cc index 3eb1f140ed..987c7ea772 100644 --- a/paddle/fluid/memory/allocation/test_aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/test_aligned_allocator.cc @@ -32,12 +32,12 @@ struct StubAllocator : public Allocator { size_t AllocNum() const { return alloc_num_; } protected: - Allocation *AllocateImpl(size_t size) override { + pten::Allocation *AllocateImpl(size_t size) override { ++alloc_num_; return new Allocation(new uint8_t[size], size, platform::CPUPlace()); } - void FreeImpl(Allocation *allocation) override { + void FreeImpl(pten::Allocation *allocation) override { delete[] static_cast(allocation->ptr()); delete allocation; --alloc_num_; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index 654fb3fe7b..9c93065170 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -20,7 +20,7 @@ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" -#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" namespace paddle { namespace memory { @@ -83,11 +83,11 @@ class ThreadLocalCUDAAllocator : 
public Allocator { bool IsAllocThreadSafe() const override { return true; } protected: - Allocation* AllocateImpl(size_t size) override { + pten::Allocation* AllocateImpl(size_t size) override { return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->AllocateImpl( size); } - void FreeImpl(Allocation* allocation) override { + void FreeImpl(pten::Allocation* allocation) override { auto* tl_allocation = static_cast(allocation); auto allocator_impl = tl_allocation->GetAllocator(); allocator_impl->FreeImpl(tl_allocation); diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc index e69de29bb2..face27debe 100644 --- a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.cc @@ -0,0 +1,256 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/memory/allocation/aligned_allocator.h" +#include "paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h" + +namespace paddle { +namespace memory { +namespace allocation { + +bool NeedSplit(size_t block_size, size_t alignment, size_t allock_size) { + return block_size > (allock_size * 2) || + (block_size - allock_size) > alignment; +} + +VirtualMemoryAutoGrowthBestFitAllocator:: + VirtualMemoryAutoGrowthBestFitAllocator( + const std::shared_ptr &underlying_allocator, + size_t alignment, const platform::CUDAPlace &place) + : underlying_allocator_( + std::make_shared(underlying_allocator, alignment)), + alignment_(alignment), + place_(place) {} + +pten::Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocateImpl( + size_t size) { + std::lock_guard guard(spinlock_); + size = AlignedSize(size, alignment_); + auto result = AllocFromFreeBlocks(size); + + if (!result) { + ExtendAndMerge(size); + result = AllocFromFreeBlocks(size); + } + + return result; +} + +void VirtualMemoryAutoGrowthBestFitAllocator::FreeImpl( + pten::Allocation *allocation) { + std::lock_guard guard(spinlock_); + auto block_it = static_cast(allocation)->block_it_; + TryMergeBlock2Blocks(block_it); + delete allocation; +} + +void VirtualMemoryAutoGrowthBestFitAllocator::TryMergeBlock2Blocks( + std::list::iterator block) { + if (block->ptr_ == all_blocks_.front().ptr_ && + block->ptr_ == all_blocks_.back().ptr_) { + block->is_free_ = true; + free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block); + } else if (block->ptr_ == all_blocks_.front().ptr_) { + auto next = std::next(block); + if (next->is_free_ && + reinterpret_cast(block->ptr_) + block->size_ == next->ptr_) { + // merge with next + block->size_ += next->size_; + block->is_free_ = true; + free_blocks_.erase(std::make_pair(next->size_, next->ptr_)); + all_blocks_.erase(next); + free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block); + } 
else { + block->is_free_ = true; + free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block); + } + } else if (block->ptr_ == all_blocks_.back().ptr_) { + auto pre = std::prev(block); + if (pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == block->ptr_) { + // merge with pre + free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_)); + pre->size_ += block->size_; + all_blocks_.erase(block); + free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre); + } else { + block->is_free_ = true; + free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block); + } + } else { + auto pre = std::prev(block); + auto next = std::next(block); + if (pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == block->ptr_ && + !(next->is_free_ && + reinterpret_cast(block->ptr_) + block->size_ == + next->ptr_)) { + // merge with pre + free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_)); + pre->size_ += block->size_; + all_blocks_.erase(block); + free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre); + } else if (next->is_free_ && + reinterpret_cast(block->ptr_) + block->size_ == + next->ptr_ && + !(pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == + block->ptr_)) { + // merge with next + block->size_ += next->size_; + block->is_free_ = true; + free_blocks_.erase(std::make_pair(next->size_, next->ptr_)); + all_blocks_.erase(next); + free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block); + } else if (pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == + block->ptr_ && + next->is_free_ && + reinterpret_cast(block->ptr_) + block->size_ == + next->ptr_) { + // merge with pre and next + free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_)); + free_blocks_.erase(std::make_pair(next->size_, next->ptr_)); + pre->size_ += (block->size_ + next->size_); + all_blocks_.erase(block); + all_blocks_.erase(next); + free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre); + } else { + block->is_free_ = true; + free_blocks_.emplace(std::make_pair(block->size_, block->ptr_), block); + } + } +} + +void VirtualMemoryAutoGrowthBestFitAllocator::ExtendAndMerge(size_t size) { + void *ptr = nullptr; + + auto allocateptr = underlying_allocator_->Allocate(size); + ptr = allocateptr->ptr(); + size = allocateptr->size(); + allocations_.push_back(std::move(allocateptr)); // hold allocation + + if (all_blocks_.empty()) { + all_blocks_.push_back(Block(ptr, size, true)); + free_blocks_.emplace(std::make_pair(size, ptr), all_blocks_.begin()); + return; + } + for (auto block_it = all_blocks_.begin(); block_it != all_blocks_.end(); + ++block_it) { + if (block_it->ptr_ > ptr) { + if (block_it == all_blocks_.begin()) { + // insert to front + if (block_it->is_free_ && + reinterpret_cast(ptr) + size == block_it->ptr_) { + // merge with next + free_blocks_.erase(std::make_pair(block_it->size_, block_it->ptr_)); + block_it->ptr_ = ptr; + block_it->size_ += size; + free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_), + block_it); + } else { + // do not merge + all_blocks_.push_front(Block(ptr, size, true)); + free_blocks_.emplace(std::make_pair(size, ptr), all_blocks_.begin()); + } + } else { + // insert to middle + auto next = block_it; + auto pre = std::prev(block_it); + if (pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == ptr && + !(next->is_free_ && + reinterpret_cast(ptr) + size == next->ptr_)) { + // merge with pre + free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_)); + pre->size_ += size; + 
free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre); + } else if (next->is_free_ && + reinterpret_cast(ptr) + size == next->ptr_ && + !(pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == + ptr)) { + // merge with next + free_blocks_.erase(std::make_pair(next->size_, next->ptr_)); + next->ptr_ = ptr; + next->size_ += size; + free_blocks_.emplace(std::make_pair(next->size_, next->ptr_), next); + } else if (pre->is_free_ && + reinterpret_cast(pre->ptr_) + pre->size_ == ptr && + next->is_free_ && + reinterpret_cast(ptr) + size == next->ptr_) { + // merge with pre and next + free_blocks_.erase(std::make_pair(pre->size_, pre->ptr_)); + free_blocks_.erase(std::make_pair(next->size_, next->ptr_)); + pre->size_ += (size + next->size_); + free_blocks_.emplace(std::make_pair(pre->size_, pre->ptr_), pre); + all_blocks_.erase(next); + } else { + // do not merge + auto iter = all_blocks_.insert(next, Block(ptr, size, true)); + free_blocks_.emplace(std::make_pair(size, ptr), iter); + } + } + return; + } + } + + // insert to back + auto block_it = all_blocks_.end(); + block_it--; + if (block_it->is_free_ && + reinterpret_cast(block_it->ptr_) + block_it->size_ == ptr) { + // merge with pre + free_blocks_.erase(std::make_pair(block_it->size_, block_it->ptr_)); + block_it->size_ += size; + free_blocks_.emplace(std::make_pair(block_it->size_, block_it->ptr_), + block_it); + } else { + // do not merge + all_blocks_.push_back(Block(ptr, size, true)); + auto block_it = all_blocks_.end(); + block_it--; + free_blocks_.emplace(std::make_pair(size, ptr), block_it); + } +} + +pten::Allocation *VirtualMemoryAutoGrowthBestFitAllocator::AllocFromFreeBlocks( + size_t size) { + auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr)); + if (iter != free_blocks_.end()) { + std::list::iterator block_it = iter->second; + free_blocks_.erase(iter); + if (NeedSplit(block_it->size_, alignment_, size)) { + size_t remaining_size = block_it->size_ - size; + auto remaining_free_block = all_blocks_.insert( + block_it, Block(block_it->ptr_, remaining_size, true)); + free_blocks_.emplace(std::make_pair(remaining_size, block_it->ptr_), + remaining_free_block); + block_it->ptr_ = + reinterpret_cast(block_it->ptr_) + remaining_size; + block_it->size_ = size; + } + + block_it->is_free_ = false; + return new BlockAllocation(block_it, place_); + } + + return nullptr; +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h index e69de29bb2..10bf0bbf49 100644 --- a/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/virtual_memory_auto_growth_best_fit_allocator.h @@ -0,0 +1,84 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
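The free-list bookkeeping above hinges on one data-structure choice: free blocks are indexed by a (size, ptr) pair, so std::map::lower_bound({size, nullptr}) yields the smallest block that still fits the request, i.e. best fit. A self-contained toy version of that lookup (plain STL, illustrative names, not Paddle code) is sketched below.

#include <cstddef>
#include <cstdio>
#include <iterator>
#include <list>
#include <map>
#include <utility>

struct ToyBlock {
  void* ptr;
  size_t size;
  bool is_free;
};

int main() {
  std::list<ToyBlock> all_blocks;
  std::map<std::pair<size_t, void*>, std::list<ToyBlock>::iterator> free_blocks;

  static char arena[1024];
  // Two free blocks: 256 bytes at arena, 768 bytes at arena + 256.
  all_blocks.push_back({arena, 256, true});
  free_blocks.emplace(std::pair<size_t, void*>(256, arena),
                      std::prev(all_blocks.end()));
  all_blocks.push_back({arena + 256, 768, true});
  free_blocks.emplace(std::pair<size_t, void*>(768, arena + 256),
                      std::prev(all_blocks.end()));

  // Best fit for a 300-byte request: lower_bound skips the 256-byte block
  // and lands on the 768-byte one, just like AllocFromFreeBlocks above.
  auto it = free_blocks.lower_bound(std::pair<size_t, void*>(300, nullptr));
  if (it != free_blocks.end()) {
    it->second->is_free = false;
    std::printf("picked a %zu-byte block\n", it->second->size);
    free_blocks.erase(it);
  }
  return 0;
}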
+ +#pragma once + +#include +#include +#include + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/allocation/spin_lock.h" + +namespace paddle { +namespace memory { +namespace allocation { + +struct Block { + Block(void *ptr, size_t size, bool is_free) + : ptr_(ptr), size_(size), is_free_(is_free) {} + + void *ptr_; + size_t size_; + bool is_free_; +}; + +struct BlockAllocation : public Allocation { + explicit BlockAllocation(const std::list::iterator &it, + platform::Place place) + : Allocation(it->ptr_, it->size_, place), block_it_(it) {} + + std::list::iterator block_it_; +}; + +/** + * Like AutoGrowthBestFitAllocator, VirtualMemoryAutoGrowthBestFitAllocator + * gradually requests more GPU memory as the model's memory footprint grows. + * The difference is that VirtualMemoryAutoGrowthBestFitAllocator uses + * NVIDIA's virtual memory management technology and works with virtual + * addresses. If two successively allocated regions happen to be contiguous + * in the virtual address space, they can later be merged into one block, + * which greatly reduces fragmentation. + */ +class VirtualMemoryAutoGrowthBestFitAllocator : public Allocator { + public: + VirtualMemoryAutoGrowthBestFitAllocator( + const std::shared_ptr &underlying_allocator, size_t alignment, + const platform::CUDAPlace &place); + + bool IsAllocThreadSafe() const override { return true; } + + protected: + pten::Allocation *AllocateImpl(size_t size) override; + + void FreeImpl(pten::Allocation *allocation) override; + + private: + pten::Allocation *AllocFromFreeBlocks(size_t size); + void ExtendAndMerge(size_t size); + void TryMergeBlock2Blocks(std::list::iterator iter); + + std::shared_ptr underlying_allocator_; + size_t alignment_; + + std::map, std::list::iterator> free_blocks_; + std::list all_blocks_; + std::list allocations_; + platform::Place place_; + SpinLock spinlock_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 3b8d07548e..8830c46a17 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -18,6 +18,7 @@ limitations under the License.
*/ #include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/stream.h" namespace paddle { @@ -27,7 +28,7 @@ class DeviceContext; namespace memory { -using allocation::Allocation; +using pten::Allocation; using allocation::Allocator; using allocation::AllocationPtr; @@ -40,5 +41,24 @@ extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size); extern uint64_t Release(const platform::Place& place); +extern std::shared_ptr AllocShared(const platform::Place& place, + size_t size, + const platform::Stream& stream); + +extern bool InSameStream(const std::shared_ptr& allocation, + const platform::Stream& stream); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +extern AllocationPtr Alloc(const platform::CUDAPlace& place, size_t size, + const gpuStream_t& stream); + +extern uint64_t Release(const platform::CUDAPlace& place, + const gpuStream_t& stream); + +void RecordStream(std::shared_ptr allocation, + const gpuStream_t& stream); + +const gpuStream_t& GetStream(const std::shared_ptr& allocation); +#endif } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index b9481f1c8e..6892f7ce4e 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -18,7 +18,8 @@ limitations under the License. */ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -286,10 +287,11 @@ class ConcatFunctor { const T** dev_ins_data = nullptr; if (!has_same_shape || in_num < 2 || in_num > 4) { tmp_dev_ins_data = memory::Alloc(context, in_num * sizeof(T*)); + auto* restored = + platform::RestoreHostMemIfCapturingCUDAGraph(inputs_data, in_num); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_data->ptr(), platform::CPUPlace(), - static_cast(inputs_data), in_num * sizeof(T*), - context.stream()); + tmp_dev_ins_data->ptr(), platform::CPUPlace(), restored, + in_num * sizeof(T*), context.stream()); dev_ins_data = reinterpret_cast(tmp_dev_ins_data->ptr()); } @@ -313,9 +315,11 @@ class ConcatFunctor { } else { auto tmp_dev_ins_col_data = memory::Alloc(context, inputs_col_num * sizeof(int64_t)); + + auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( + inputs_col, inputs_col_num); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), - static_cast(inputs_col), + tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), restored, inputs_col_num * sizeof(int64_t), context.stream()); int64_t* dev_ins_col_data = static_cast(tmp_dev_ins_col_data->ptr()); @@ -332,9 +336,8 @@ class ConcatFunctor { auto* data_alloc_released = data_alloc.release(); auto* col_alloc_released = col_alloc.release(); context.AddStreamCallback([data_alloc_released, col_alloc_released] { - memory::allocation::AllocationDeleter deleter; - deleter(data_alloc_released); - deleter(col_alloc_released); + memory::allocation::Allocator::AllocationDeleter(data_alloc_released); + memory::allocation::Allocator::AllocationDeleter(col_alloc_released); }); #endif } @@ 
-415,10 +418,11 @@ class SplitFunctor { T** dev_out_gpu_data = nullptr; if (!has_same_shape || o_num < 2 || o_num > 4) { tmp_dev_outs_data = memory::Alloc(context, o_num * sizeof(T*)); + auto* restored = + platform::RestoreHostMemIfCapturingCUDAGraph(outputs_data, o_num); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_outs_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs_data), o_num * sizeof(T*), - context.stream()); + tmp_dev_outs_data->ptr(), platform::CPUPlace(), restored, + o_num * sizeof(T*), context.stream()); dev_out_gpu_data = reinterpret_cast(tmp_dev_outs_data->ptr()); } @@ -442,9 +446,10 @@ class SplitFunctor { } else { auto tmp_dev_ins_col_data = memory::Alloc(context, outputs_cols_num * sizeof(int64_t)); + auto* restored = platform::RestoreHostMemIfCapturingCUDAGraph( + outputs_cols, outputs_cols_num); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), - tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), - reinterpret_cast(outputs_cols), + tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), restored, outputs_cols_num * sizeof(int64_t), context.stream()); int64_t* dev_outs_col_data = reinterpret_cast(tmp_dev_ins_col_data->ptr()); @@ -460,9 +465,8 @@ class SplitFunctor { auto* data_alloc_released = data_alloc.release(); auto* cols_alloc_released = cols_alloc.release(); context.AddStreamCallback([data_alloc_released, cols_alloc_released] { - memory::allocation::AllocationDeleter deleter; - deleter(data_alloc_released); - deleter(cols_alloc_released); + memory::allocation::Allocator::AllocationDeleter(data_alloc_released); + memory::allocation::Allocator::AllocationDeleter(cols_alloc_released); }); #endif } diff --git a/paddle/fluid/platform/device/mlu/device_context_allocator.h b/paddle/fluid/platform/device/mlu/device_context_allocator.h index e69de29bb2..408016c0f0 100644 --- a/paddle/fluid/platform/device/mlu/device_context_allocator.h +++ b/paddle/fluid/platform/device/mlu/device_context_allocator.h @@ -0,0 +1,162 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/device/mlu/device_context.h" +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { + +namespace platform { +class MLUDeviceContext; +} // namespace platform + +namespace memory { +namespace allocation { + +/** + * MLUDeviceContextAllocation is a wrapper of the underbeneath allocation. + * MLUDeviceContextAllocation adds a MLU stream callback for the underbeneath + * allocation so that MLUDeviceContextAllocation can be used in a MLU stream + * which deletes allocation in the callback. 
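Both the concat/split functors above and the MLU allocation described in this comment rely on the same keep-alive idiom: release() the smart pointer so the allocation outlives the current scope, then free it from a stream callback once the asynchronous work that reads it has drained. A condensed sketch of that idiom follows (illustrative helper name, CUDA context assumed, includes as in concat_and_split.cu); MLUDeviceContextAllocation's destructor does the equivalent via dev_ctx_->AddStreamCallback.

// Illustrative fragment (not part of the patch): keep a staging allocation
// alive until the device stream has drained the async work that reads it.
void CopyThroughStaging(const paddle::platform::CUDADeviceContext& ctx,
                        size_t bytes) {
  auto staging = paddle::memory::Alloc(ctx, bytes);  // unique AllocationPtr
  // ... enqueue asynchronous copies on ctx.stream() that read staging->ptr() ...
  auto* released = staging.release();  // give up scoped ownership
  ctx.AddStreamCallback([released] {
    // Runs only after the work queued on ctx.stream() so far has completed.
    paddle::memory::allocation::Allocator::AllocationDeleter(released);
  });
}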
+ */ +class MLUDeviceContextAllocation : public Allocation { + public: + explicit MLUDeviceContextAllocation(AllocationPtr allocation) + : Allocation(allocation->ptr(), allocation->size(), allocation->place()), + underlying_allocation_(std::move(allocation)) {} + + ~MLUDeviceContextAllocation() { + PADDLE_ENFORCE_NOT_NULL( + dev_ctx_, + platform::errors::PreconditionNotMet( + "Device context is not set for MLUDeviceContextAllocation")); + auto *p_allocation = underlying_allocation_.release(); + VLOG(4) << "Adding callback to delete MLUDeviceContextAllocation at " + << p_allocation; + dev_ctx_->AddStreamCallback([p_allocation] { + VLOG(4) << "Delete MLUDeviceContextAllocation at " << p_allocation; + Allocator::AllocationDeleter(p_allocation); + }); + } + + void SetMLUDeviceContext(const platform::MLUDeviceContext *dev_ctx) { + dev_ctx_ = dev_ctx; + } + + private: + AllocationPtr underlying_allocation_; + const platform::MLUDeviceContext *dev_ctx_{nullptr}; +}; + +/** + * MLUDeviceContextAllocator will allocate a MLUDeviceContextAllocation + * after waiting for a self-created event on the default stream. It does so to + * let the non-default stream be able to allocate GPU memory which will be + * released by stream callback + */ +class MLUDeviceContextAllocator : public Allocator { + public: + explicit MLUDeviceContextAllocator(platform::MLUPlace place, + mluStream default_stream) + : place_(place), default_stream_(default_stream) { + platform::MLUDeviceGuard guard(place_.device); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtNotifierCreate(&event_)); + } + + ~MLUDeviceContextAllocator() { + if (event_) { + platform::MLUDeviceGuard guard(place_.device); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtNotifierDestroy(event_)); + } + } + + protected: + pten::Allocation *AllocateImpl(size_t size) override { + PADDLE_ENFORCE_NOT_NULL( + default_stream_, + platform::errors::PreconditionNotMet( + "Default stream is not set for MLUDeviceContextAllocator")); + platform::MLUDeviceGuard guard(place_.device); + auto allocation = + new MLUDeviceContextAllocation(memory::Alloc(place_, size)); + // Wait for the event on stream + PADDLE_ENFORCE_MLU_SUCCESS(cnrtPlaceNotifier(event_, default_stream_)); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtWaitNotifier(event_)); + return allocation; + } + + void FreeImpl(pten::Allocation *allocation) override { delete allocation; } + + private: + platform::MLUPlace place_; + mluEventHandle event_{nullptr}; + mluStream default_stream_{nullptr}; +}; + +/** + * MLUDeviceContextAllocatorPool is a singletion stores mapping from + * MLUPlace(s) to std::shared_ptr. When a + * MLUDeviceContext's compute stream isn't default stream, it can call this + * class to allocate GPU memory which will be released by a callback after + * stream execution. 
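A hypothetical call site for the pool described above might look like the following; the function name and byte count are made up, and only the Instance()/Alloc(dev_ctx, size) interface defined below is used.

// Illustrative only: ask the singleton pool for memory tied to an MLU
// context whose compute stream is not the default stream.
void AllocateScratch(const paddle::platform::MLUDeviceContext& dev_ctx) {
  auto scratch =
      paddle::memory::allocation::MLUDeviceContextAllocatorPool::Instance()
          .Alloc(dev_ctx, 4096);
  void* workspace = scratch->ptr();
  (void)workspace;  // silence unused-variable warnings in this sketch
  // ... launch work on dev_ctx's stream that uses `workspace` ...
  // When `scratch` goes out of scope, MLUDeviceContextAllocation's destructor
  // registers a stream callback that frees the underlying memory later.
}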
+ */ +class MLUDeviceContextAllocatorPool { + public: + static MLUDeviceContextAllocatorPool &Instance() { + static MLUDeviceContextAllocatorPool pool; + return pool; + } + + AllocationPtr Alloc(const platform::MLUDeviceContext &dev_ctx, size_t size) { + auto iter = allocators_.find( + BOOST_GET_CONST(platform::MLUPlace, dev_ctx.GetPlace())); + PADDLE_ENFORCE_NE( + iter, allocators_.end(), + platform::errors::NotFound("No allocator found for MLUPlace.")); + auto &allocator = iter->second; + AllocationPtr allocation = allocator->Allocate(size); + static_cast(allocation.get()) + ->SetMLUDeviceContext(&dev_ctx); + return allocation; + } + + private: + MLUDeviceContextAllocatorPool() { + std::vector devices = platform::GetMLUSelectedDevices(); + for (int i : devices) { + auto place = platform::MLUPlace(i); + auto compute_stream = + platform::DeviceContextPool::Instance().GetByPlace(place)->stream(); + auto allocator = std::shared_ptr( + new MLUDeviceContextAllocator(place, compute_stream)); + allocators_.insert(make_pair(place, allocator)); + } + } + + std::map> + allocators_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.h b/paddle/fluid/platform/device/npu/npu_op_runner.h index e83057e682..c049da3b33 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.h +++ b/paddle/fluid/platform/device/npu/npu_op_runner.h @@ -158,8 +158,7 @@ void FillNpuTensorWithConstant(Tensor *tensor, T val) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(npu_pinned_place) .get()); - paddle::memory::allocation::Allocation *allocation = - npu_pinned_tensor.Holder().get(); + pten::Allocation *allocation = npu_pinned_tensor.Holder().get(); npu_pinned_allocator->RecordEvent(allocation, GetCurrentNPUStream()); } else { diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index e69de29bb2..659df6b9b4 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -0,0 +1,174 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +// disable numpy compile error +#include + +#include +#include + +#include "pybind11/numpy.h" +#include "pybind11/pybind11.h" + +#include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/all.h" +#include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/backward.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/pybind/eager.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/fluid/pybind/exception.h" +#include "paddle/pten/api/lib/utils/allocator.h" +#include "paddle/pten/api/lib/utils/storage.h" +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/common/data_type.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/core.h" + +namespace paddle { +namespace pybind { + +namespace py = ::pybind11; + +extern PyTypeObject* p_eager_tensor_type; + +size_t PyArray_Size_(PyObject* numpy_data) { + size_t res = 1; + auto dims = pybind11::detail::array_proxy(numpy_data)->dimensions; + auto nd = pybind11::detail::array_proxy(numpy_data)->nd; + while (nd--) { + res *= (*dims++); + } + return res; +} + +class EagerNumpyAllocation : public pten::Allocation { + public: + explicit EagerNumpyAllocation(PyObject* numpy_data, pten::DataType dtype) + : Allocation( + static_cast(pybind11::detail::array_proxy(numpy_data)->data), + pten::DataTypeSize(dtype) * PyArray_Size_(numpy_data), + paddle::platform::CPUPlace()), + arr_(numpy_data) { + PADDLE_ENFORCE_NOT_NULL(arr_, platform::errors::InvalidArgument( + "The underlying PyObject pointer of " + "numpy array cannot be nullptr")); + PADDLE_ENFORCE_NE( + arr_, Py_None, + platform::errors::PreconditionNotMet( + "The underlying PyObject pointer of numpy array cannot be None")); + Py_INCREF(arr_); + } + ~EagerNumpyAllocation() override { + py::gil_scoped_acquire gil; + Py_DECREF(arr_); + } + + private: + PyObject* arr_; +}; + +static PyObject* eager_api_set_expected_place(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0); + egr::Controller::Instance().SetExpectedPlace(place); + + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_get_expected_place(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + return ToPyObject(egr::Controller::Instance().GetExpectedPlace()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_scale(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + // TODO(jiabin): Sync Tensor and Variable here when we support + egr::EagerTensor ret = + egr::scale(reinterpret_cast(PyTuple_GET_ITEM(args, 0)) + ->eager_tensor, + CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 1), 1), + CastPyArg2AttrFloat(PyTuple_GET_ITEM(args, 2), 2), + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3), + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 4), 4)); + return ToPyObject(ret); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_run_backward(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto tensors = CastPyArg2VectorOfEagerTensor(PyTuple_GET_ITEM(args, 0), 0); + auto grad_tensors = + CastPyArg2VectorOfEagerTensor(PyTuple_GET_ITEM(args, 1), 1); + RunBackward(tensors, grad_tensors, + CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2)); + 
Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + egr::EagerTensor& src = + reinterpret_cast(PyTuple_GET_ITEM(args, 0)) + ->eager_tensor; + egr::EagerTensor& dst = + reinterpret_cast(PyTuple_GET_ITEM(args, 1)) + ->eager_tensor; + auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2); + bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); + + dst = src.copy_to(pten::TransToPtenBackend(place), blocking); + egr::EagerUtils::autograd_meta(&dst)->SetStopGradient( + egr::EagerUtils::autograd_meta(&(src))->StopGradient()); + egr::EagerUtils::autograd_meta(&dst)->SetPersistable( + egr::EagerUtils::autograd_meta(&(src))->Persistable()); + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +PyMethodDef variable_functions[] = { + {"scale", (PyCFunction)(void (*)(void))eager_api_scale, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_set_expected_place", + (PyCFunction)(void (*)(void))eager_api_set_expected_place, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_get_expected_place", + (PyCFunction)(void (*)(void))eager_api_get_expected_place, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"run_backward", (PyCFunction)(void (*)(void))eager_api_run_backward, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"tensor_copy", (PyCFunction)(void (*)(void))eager_api_tensor_copy, + METH_VARARGS | METH_KEYWORDS, NULL}, + {NULL, NULL, 0, NULL}}; + +void BindFunctions(PyObject* module) { + if (PyModule_AddFunctions(module, variable_functions) < 0) { + PADDLE_THROW(platform::errors::Fatal( + "Init Paddle erroe in BindFunctions(PyModule_AddFunctions).")); + return; + } +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/pten/api/lib/utils/CMakeLists.txt b/paddle/pten/api/lib/utils/CMakeLists.txt index 4a44ad7758..a4db8c4b19 100644 --- a/paddle/pten/api/lib/utils/CMakeLists.txt +++ b/paddle/pten/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(pten_api_utils SRCS allocator.cc storage.cc tensor_utils.cc DEPS +cc_library(pten_api_utils SRCS storage.cc tensor_utils.cc DEPS tensor_base convert_utils dense_tensor lod_tensor selected_rows place var_type_traits) diff --git a/paddle/pten/api/lib/utils/allocator.h b/paddle/pten/api/lib/utils/allocator.h new file mode 100644 index 0000000000..a8c05b7651 --- /dev/null +++ b/paddle/pten/api/lib/utils/allocator.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
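EagerNumpyAllocation above shows the general recipe for handing externally owned memory to pten: subclass pten::Allocation, forward (ptr, size, place) to the base, and drop the external reference in the destructor. A stripped-down variant of that recipe is sketched here; the class name and the on_release callback are invented (they stand in for the numpy array pointer and Py_DECREF), and the include mirrors what eager_functions.cc already pulls in.

#include <cstddef>
#include <functional>
#include <utility>

#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/place.h"

// Illustrative only: wrap a caller-owned CPU buffer in a pten::Allocation so
// that pten tensors can reference it without taking ownership of the memory.
class ExternalBufferAllocation : public pten::Allocation {
 public:
  ExternalBufferAllocation(void* data, size_t size,
                           std::function<void()> on_release)
      : Allocation(data, size, paddle::platform::CPUPlace()),
        on_release_(std::move(on_release)) {}

  ~ExternalBufferAllocation() override {
    // Tell the external owner we are done with the buffer.
    if (on_release_) on_release_();
  }

 private:
  std::function<void()> on_release_;
};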
*/ + +#pragma once + +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/pten/core/allocator.h" +#include "paddle/pten/core/storage.h" + +namespace paddle { +namespace experimental { + +class DefaultAllocator : public pten::deprecated::Allocator { + public: + using Allocation = pten::deprecated::Allocation; + explicit DefaultAllocator(const paddle::platform::Place& place) + : place_(place) {} + + static void Delete(Allocation* allocation) { + paddle::memory::allocation::Allocator::AllocationDeleter( + allocation->CastContextWithoutCheck()); + } + + Allocation Allocate(size_t bytes_size) override { + paddle::memory::AllocationPtr a = memory::Alloc(place_, bytes_size); + void* ptr = a->ptr(); + return Allocation(ptr, a.release(), &Delete, place_); + } + + const paddle::platform::Place& place() override { return place_; } + + private: + paddle::platform::Place place_; +}; + +} // namespace experimental +} // namespace paddle diff --git a/paddle/pten/api/lib/utils/storage.cc b/paddle/pten/api/lib/utils/storage.cc new file mode 100644 index 0000000000..6116a709d5 --- /dev/null +++ b/paddle/pten/api/lib/utils/storage.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/api/lib/utils/storage.h" + +namespace paddle { +namespace experimental { + +ExternalStorage::ExternalStorage(void* ptr, + size_t size, + const paddle::platform::Place& place) + : pten::Storage(std::make_shared(ptr, size, place)), + size_(size) {} + +ExternalStorage::ExternalStorage(const pten::intrusive_ptr& root, + size_t delta, + size_t size) + : Storage(std::make_shared( + static_cast(root->data()) + delta, size, root->place())), + size_(size) { + PADDLE_ENFORCE_LE(static_cast(delta + size), + root->size(), + paddle::platform::errors::InvalidArgument( + "The size of the external storage does " + "not meet the metadata requirements.")); +} + +} // namespace experimental +} // namespace paddle diff --git a/paddle/pten/api/lib/utils/tensor_utils.cc b/paddle/pten/api/lib/utils/tensor_utils.cc new file mode 100644 index 0000000000..0b6cb8d95c --- /dev/null +++ b/paddle/pten/api/lib/utils/tensor_utils.cc @@ -0,0 +1,521 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
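For the deprecated allocator path above, the Allocation returned by DefaultAllocator is a value type that carries its own deleter, so clean-up happens when it leaves scope. A minimal, hypothetical call site (function name and sizes are illustrative):

#include "paddle/pten/api/lib/utils/allocator.h"

// Illustrative only: allocate through DefaultAllocator and let the value-type
// pten::deprecated::Allocation free the memory via its registered deleter.
void UseDefaultAllocator() {
  paddle::platform::CPUPlace cpu;
  paddle::experimental::DefaultAllocator allocator(cpu);
  auto buffer = allocator.Allocate(64 * sizeof(float));
  auto* data = static_cast<float*>(buffer.ptr());
  data[0] = 1.0f;
}  // ~Allocation() -> Clear() -> DefaultAllocator::Delete on the fluid allocation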
*/ + +#include "paddle/pten/api/lib/utils/tensor_utils.h" + +#include +#include + +#include "paddle/pten/core/compat_utils.h" + +namespace paddle { +namespace experimental { + +template +void SetLoD(DstLoD* dst, const SrcLoD& src) { + dst->reserve(src.size()); + dst->clear(); + for (auto&& v : src) { + dst->emplace_back(v); + } +} + +std::unique_ptr MakePtenDenseTensor( + const paddle::framework::Tensor& src) { + VLOG(3) << "MakePtenDenseTensor based Tensor."; + pten::DenseTensorMeta meta{pten::TransToPtenDataType(src.type()), + src.dims(), + src.layout(), + src.offset()}; + auto shared_storage = pten::make_intrusive(src.Holder()); + return std::make_unique(std::move(shared_storage), + std::move(meta)); +} + +std::unique_ptr MakePtenDenseTensor( + const paddle::framework::LoDTensor& src) { + auto out = + MakePtenDenseTensor(static_cast(src)); + SetLoD(&(pten::CompatibleDenseTensorUtils::GetMutableMeta(out.get())->lod), + src.lod()); + return std::move(out); +} + +std::unique_ptr MakePtenDenseTensor( + const paddle::framework::Tensor& src, const pten::TensorArgDef& arg_def) { + pten::DenseTensorMeta meta{ + arg_def.dtype, src.dims(), src.layout(), src.offset()}; + + if (src.IsInitialized() && + src.place() == pten::TransToFluidPlace(arg_def.backend)) { + auto shared_storage = pten::make_intrusive(src.Holder()); + return std::make_unique(std::move(shared_storage), + std::move(meta)); + } else { + return std::make_unique( + std::move(pten::make_intrusive( + pten::TransToFluidPlace(arg_def.backend))), + std::move(meta)); + } +} + +std::unique_ptr MakePtenDenseTensor( + const paddle::framework::LoDTensor& src, + const pten::TensorArgDef& arg_def) { + auto out = MakePtenDenseTensor( + static_cast(src), arg_def); + SetLoD(&(pten::CompatibleDenseTensorUtils::GetMutableMeta(out.get())->lod), + src.lod()); + return std::move(out); +} + +pten::Scalar MakePtenScalar(const paddle::framework::LoDTensor& src) { + PADDLE_ENFORCE_EQ(src.numel(), + 1, + paddle::platform::errors::InvalidArgument( + "The Scalar only supports Tensor with 1 element, " + "but now Tensor has %d element.", + src.numel())); + switch (src.type()) { + case paddle::framework::proto::VarType::FP32: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::FP64: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::FP16: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::BF16: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::INT32: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::INT64: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::INT16: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::INT8: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::UINT8: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::BOOL: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::COMPLEX64: + return {src.template data()[0]}; + case paddle::framework::proto::VarType::COMPLEX128: + return {src.template data()[0]}; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Data type error. 
Don't support casting a %d LoDTensor to Scalar.", + src.type())); + } +} + +pten::Scalar MakePtenScalarFromVar(const framework::Variable& variable) { + auto expected_place = pten::TransToFluidPlace(pten::Backend::CPU); + if (variable.IsType()) { + const auto& tensor = variable.Get(); + if (!platform::is_same_place(tensor.place(), expected_place)) { + framework::LoDTensor tmp_tensor; + framework::TensorCopySync(tensor, expected_place, &tmp_tensor); + return MakePtenScalar(tmp_tensor); + } else { + return MakePtenScalar(tensor); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport casting input `%s` type to Scalar when call pt " + "kernel.", + framework::ToTypeName(variable.Type()))); + } +} + +pten::ScalarArray MakePtenScalarArray(const paddle::framework::LoDTensor& src) { + if (src.type() == paddle::framework::proto::VarType::INT64) { + return {src.data(), src.numel()}; + } else if (src.type() == paddle::framework::proto::VarType::INT32) { + return {src.data(), src.numel()}; + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Data type error. When cast a LoDTensor to ScalarArray, " + "the data type of LoDTensor must be int32 or int64, " + "but now data type is %s.", + src.type())); + } +} + +pten::ScalarArray MakePtenScalarArrayFromVar( + const framework::Variable& variable) { + auto expected_place = pten::TransToFluidPlace(pten::Backend::CPU); + if (variable.IsType()) { + const auto& tensor = variable.Get(); + if (!platform::is_same_place(tensor.place(), expected_place)) { + framework::LoDTensor tmp_tensor; + framework::TensorCopySync(tensor, expected_place, &tmp_tensor); + return MakePtenScalarArray(tmp_tensor); + } else { + return MakePtenScalarArray(tensor); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport casting input `%s` type to ScalarArray when call pt " + "kernel.", + framework::ToTypeName(variable.Type()))); + } +} + +pten::ScalarArray MakePtenScalarArrayFromVarList( + const std::vector& variable_list) { + if (variable_list.size() == 0) { + return pten::ScalarArray(); + } + auto expected_place = pten::TransToFluidPlace(pten::Backend::CPU); + + paddle::framework::proto::VarType::Type data_type; + auto* first_var = variable_list.front(); + if (first_var->IsType()) { + const auto& tensor = first_var->Get(); + data_type = tensor.type(); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport casting input `%s` type to VectorTensor when call pt " + "kernel.", + framework::ToTypeName(first_var->Type()))); + } + + std::vector vector_data; + vector_data.reserve(variable_list.size()); + + if (data_type == paddle::framework::proto::VarType::INT64) { + for (auto* var : variable_list) { + if (var->IsType()) { + const auto& tensor = var->Get(); + if (!platform::is_same_place(tensor.place(), expected_place)) { + framework::LoDTensor tmp_tensor; + framework::TensorCopySync(tensor, expected_place, &tmp_tensor); + vector_data.push_back(*tmp_tensor.data()); + } else { + vector_data.push_back(*tensor.data()); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport casting input `%s` type to VectorTensor when call pt " + "kernel.", + framework::ToTypeName(var->Type()))); + } + } + + } else if (data_type == paddle::framework::proto::VarType::INT32) { + for (auto* var : variable_list) { + if (var->IsType()) { + const auto& tensor = var->Get(); + if (!platform::is_same_place(tensor.place(), expected_place)) { + framework::LoDTensor tmp_tensor; + framework::TensorCopySync(tensor, expected_place, 
&tmp_tensor); + vector_data.push_back(*tmp_tensor.data()); + } else { + vector_data.push_back(*tensor.data()); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport casting input `%s` type to VectorTensor when call pt " + "kernel.", + framework::ToTypeName(var->Type()))); + } + } + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Data type error. When cast a LoDTensor to VectorTensor, " + "the data type of LoDTensor must be int32 or int64, " + "but now data type is %s.", + data_type)); + } + + return {vector_data}; +} + +std::unique_ptr MakePtenTensorBaseFromVar( + const framework::Variable& variable, const pten::TensorArgDef& arg_def) { + auto expected_place = pten::TransToFluidPlace(arg_def.backend); + + if (variable.IsType()) { + const auto& tensor = variable.Get(); + if (!platform::is_same_place(tensor.place(), expected_place)) { + framework::LoDTensor tmp_tensor; + framework::TensorCopySync(tensor, expected_place, &tmp_tensor); + return MakePtenDenseTensor(tmp_tensor); + } else { + return MakePtenDenseTensor(tensor); + } + } else if (variable.IsType()) { + // TODO(chenweihang): now we don't deal with row and height + // by xiaowei's advice + const auto& tensor = variable.Get(); + if (!platform::is_same_place(tensor.value().place(), expected_place)) { + framework::Tensor tmp_tensor; + TensorCopySync(tensor.value(), expected_place, &tmp_tensor); + // TODO(chenweihang): adapt SelectedRows by xiaowei's design + return MakePtenDenseTensor(tmp_tensor); + } else { + return MakePtenDenseTensor(tensor.value()); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported shared input `%s` type now when call pt kernel.", + framework::ToTypeName(variable.Type()))); + } + return {}; +} + +std::unique_ptr MakePtenTensorBaseFromVar( + framework::Variable* variable, const pten::TensorArgDef& arg_def) { + // mutable_data before run kernel, to avoid share output form + // KernelContext to original tensor + if (variable->template IsType()) { + auto* tensor = variable->template GetMutable(); + return MakePtenDenseTensor(*tensor, arg_def); + } else if (variable->template IsType()) { + auto* tensor = variable->template GetMutable(); + // TODO(chenweihang): adapt SelectedRows by xiaowei's design, + // here the row and height will lost in output! 
+ return MakePtenDenseTensor(tensor->value(), arg_def); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported shared output `%s` type now when call pt kernel.", + framework::ToTypeName(variable->Type()))); + } + return {}; +} + +void MovesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { + PADDLE_ENFORCE_NOT_NULL( + src, + platform::errors::InvalidArgument( + "The source DenseTensor is nullptr when move storage.")); + PADDLE_ENFORCE_NOT_NULL( + dst, + platform::errors::InvalidArgument( + "The destination Tensor is nullptr when move storage.")); + dst->Resize(src->dims()); + dst->set_type(pten::TransToProtoVarType(src->dtype())); + auto storage = src->release(); + std::shared_ptr holder( + new TensorStorage(std::move(storage))); + dst->ResetHolderWithType(holder, pten::TransToProtoVarType(src->dtype())); + dst->set_offset(src->meta().offset); +} + +void MovesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) { + MovesStorage(src, static_cast(dst)); + SetLoD(dst->mutable_lod(), src->lod()); +} + +void SharesStorage(pten::DenseTensor* src, paddle::framework::Tensor* dst) { + PADDLE_ENFORCE_NOT_NULL( + src, + platform::errors::InvalidArgument( + "The source DenseTensor is nullptr when move allocation.")); + PADDLE_ENFORCE_NOT_NULL( + dst, + platform::errors::InvalidArgument( + "The destination Tensor is nullptr when move allocation.")); + dst->Resize(src->dims()); + auto* storage = static_cast( + pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src)); + dst->ResetHolderWithType(storage->GetAllocation(), + pten::TransToProtoVarType(src->dtype())); + dst->set_offset(src->meta().offset); +} + +void SharesStorage(pten::DenseTensor* src, paddle::framework::LoDTensor* dst) { + SharesStorage(src, static_cast(dst)); + SetLoD(dst->mutable_lod(), src->lod()); +} + +void ReMakePtenDenseTensor(const paddle::framework::Tensor& src, + pten::DenseTensor* dst) { + VLOG(3) << "ReMakePtenDenseTensor based Tensor."; + auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); + meta->dims = src.dims(); + meta->dtype = pten::TransToPtenDataType(src.type()); + meta->layout = src.layout(); + meta->offset = src.offset(); + + auto* shared_storage = static_cast( + pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst)); + PADDLE_ENFORCE_NOT_NULL( + shared_storage, + platform::errors::NotFound( + "Target DenseTensor's shared storage is nullptr.")); + + PADDLE_ENFORCE_EQ(src.IsInitialized(), + true, + paddle::platform::errors::InvalidArgument( + "Source Tensor is not initialized.")); + shared_storage->ResetAllocation(src.Holder()); +} + +void ReMakePtenDenseTensor(const paddle::framework::LoDTensor& src, + pten::DenseTensor* dst) { + auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); + SetLoD(&meta->lod, src.lod()); + ReMakePtenDenseTensor(static_cast(src), + dst); +} + +void ReMakePtenDenseTensorByArgDef(const paddle::framework::Tensor& src, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { + VLOG(3) << "ReMakePtenDenseTensor based Tensor and TensorArgDef."; + auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); + meta->dims = src.dims(); + meta->dtype = arg_def.dtype; + meta->layout = src.layout(); + meta->offset = src.offset(); + + auto* shared_storage = static_cast( + pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(dst)); + PADDLE_ENFORCE_NOT_NULL( + shared_storage, + platform::errors::NotFound( + "Target DenseTensor's shared storage is nullptr.")); + + if (src.IsInitialized() && + 
src.place() == pten::TransToFluidPlace(arg_def.backend)) { + shared_storage->ResetAllocation(src.Holder()); + } else { + shared_storage->ResetAllocationPlace( + pten::TransToFluidPlace(arg_def.backend)); + } +} + +void ReMakePtenDenseTensorByArgDef(const paddle::framework::LoDTensor& src, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { + auto* meta = pten::CompatibleDenseTensorUtils::GetMutableMeta(dst); + SetLoD(&meta->lod, src.lod()); + ReMakePtenDenseTensorByArgDef( + static_cast(src), arg_def, dst); +} + +void ReMakePtenDenseTensorFromVar(const framework::Variable& variable, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { + auto expected_place = pten::TransToFluidPlace(arg_def.backend); + if (variable.IsType()) { + const auto& tensor = variable.Get(); + // check input dtype before ReMakePtenDenseTensor + PADDLE_ENFORCE( + (arg_def.dtype == pten::TransToPtenDataType(tensor.type())), + paddle::platform::errors::InvalidArgument( + "The type of input data is diffrent from the type of the " + "argument's definition in kernel.")); + if (!platform::is_same_place(tensor.place(), expected_place)) { + framework::LoDTensor tmp_tensor; + framework::TensorCopySync(tensor, expected_place, &tmp_tensor); + ReMakePtenDenseTensorByArgDef(tmp_tensor, arg_def, dst); + } else { + ReMakePtenDenseTensorByArgDef(tensor, arg_def, dst); + } + } else if (variable.IsType()) { + // TODO(chenweihang): now we don't deal with row and height + // by xiaowei's advice + const auto& tensor = variable.Get(); + PADDLE_ENFORCE( + (arg_def.dtype == pten::TransToPtenDataType(tensor.value().type())), + paddle::platform::errors::InvalidArgument( + "The type of input data is diffrent from the type of the " + "argument's definition in kernel.")); + if (!platform::is_same_place(tensor.value().place(), expected_place)) { + framework::Tensor tmp_tensor; + TensorCopySync(tensor.value(), expected_place, &tmp_tensor); + // TODO(chenweihang): adapt SelectedRows by xiaowei's design + ReMakePtenDenseTensorByArgDef(tmp_tensor, arg_def, dst); + } else { + ReMakePtenDenseTensorByArgDef(tensor.value(), arg_def, dst); + } + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported shared input `%s` type now when call pt kernel.", + framework::ToTypeName(variable.Type()))); + } +} + +void ReMakePtenDenseTensorFromVar(framework::Variable* variable, + const pten::TensorArgDef& arg_def, + pten::DenseTensor* dst) { + // mutable_data before run kernel, to avoid share output form + // KernelContext to original tensor + if (variable->template IsType()) { + auto* tensor = variable->template GetMutable(); + ReMakePtenDenseTensorByArgDef(*tensor, arg_def, dst); + } else if (variable->template IsType()) { + auto* tensor = variable->template GetMutable(); + // TODO(chenweihang): adapt SelectedRows by xiaowei's design, + // here the row and height will lost in output! 
+    ReMakePtenDenseTensorByArgDef(tensor->value(), arg_def, dst);
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared output `%s` type when calling the pt kernel.",
+        framework::ToTypeName(variable->Type())));
+  }
+}
+
+static bool IsSameAllocation(const std::shared_ptr<pten::Allocation>& a,
+                             const std::shared_ptr<pten::Allocation>& b) {
+  return a->ptr() == b->ptr() && a->size() == b->size() &&
+         platform::is_same_place(a->place(), b->place());
+}
+
+void MakeVariableFromPtenTensor(pten::DenseTensor* src,
+                                framework::Variable* variable) {
+  if (variable->IsType<framework::LoDTensor>()) {
+    auto* tensor = variable->GetMutable<framework::LoDTensor>();
+
+    auto dtype = pten::TransToProtoVarType(src->dtype());
+    tensor->Resize(src->dims());
+    SetLoD(tensor->mutable_lod(), src->lod());
+
+    // here dynamic_cast is slow
+    auto* storage = static_cast<SharedStorage*>(
+        pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src));
+
+    if (!tensor->IsInitialized() ||
+        (tensor->IsInitialized() &&
+         !IsSameAllocation(tensor->Holder(), storage->GetAllocation()))) {
+      tensor->ResetHolderWithType(std::move(storage->GetAllocation()), dtype);
+    } else {
+      // Even if the pten tensor and the Variable have the same Allocation
+      // (the same pointer address, size and place), it is possible that
+      // they do not have the same data_type.
+      // So, here we set the variable's type with the pten tensor's dtype.
+      tensor->set_type(dtype);
+    }
+
+  } else if (variable->IsType<framework::SelectedRows>()) {
+    auto* tensor = variable->GetMutable<framework::SelectedRows>();
+    auto dtype = pten::TransToProtoVarType(src->dtype());
+
+    if (!tensor->value().IsInitialized()) {
+      auto storage = dynamic_cast<SharedStorage*>(
+          pten::CompatibleDenseTensorUtils::UnsafeGetMutableStorage(src));
+      tensor->mutable_value()->ResetHolderWithType(
+          std::move(storage->GetAllocation()), dtype);
+    }
+  } else {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Unsupported shared input `%s` type when calling the pt kernel.",
+        framework::ToTypeName(variable->Type())));
+  }
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/pten/core/allocator.h b/paddle/pten/core/allocator.h
index e69de29bb2..2647490c9f 100644
--- a/paddle/pten/core/allocator.h
+++ b/paddle/pten/core/allocator.h
@@ -0,0 +1,153 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include "paddle/fluid/platform/place.h"
+#include "paddle/pten/core/candidate/allocator.h"
+
+namespace pten {
+namespace deprecated {
+
+/// \brief Encapsulates strategies for access/addressing, allocation/
+/// deallocation and construction/destruction of objects.
+class RawAllocator {
+ public:
+  using Place = paddle::platform::Place;
+
+  /// \brief Default destructor.
+  virtual ~RawAllocator() = default;
+
+  /// \brief Allocates storage suitable for an array object of n bytes
+  /// and creates the array, but does not construct array elements.
+  /// May throw exceptions.
+  /// \param bytes_size The number of bytes to allocate.
+  /// \return The first address allocated.
+  virtual void* Allocate(size_t bytes_size) = 0;
+
+  /// \brief Deallocates the storage pointed to by ptr, which must be a value
+  /// returned by a previous call to allocate that has not been
+  /// invalidated by an intervening call to deallocate. The bytes_size
+  /// must match the value previously passed to allocate.
+  /// \param ptr The first address to deallocate.
+  /// \param bytes_size The number of bytes to deallocate.
+  virtual void Deallocate(void* ptr, size_t bytes_size) = 0;
+
+  /// \brief Get the place value of the allocator and the allocation.
+  /// \return The place value of the allocator and the allocation.
+  virtual const Place& place() const = 0;
+};
+
+/// \brief Fancy pointer with context. The use of this data type
+/// is to be compatible with allocators from different frameworks
+/// without significant performance loss. This class does not
+/// support being inherited.
+class Allocation final {
+ public:
+  using Place = paddle::platform::Place;
+  using DeleterFnPtr = void (*)(Allocation*);
+
+  Allocation() = default;
+
+  // Don't own resources, only provide access.
+  Allocation(void* data, const Place& place) : data_(data), place_(place) {}
+
+  // Own resources.
+  Allocation(void* data, void* ctx, DeleterFnPtr deleter, const Place& place)
+      : data_(data), ctx_(ctx), deleter_(deleter), place_(place) {}
+
+  Allocation(Allocation&& other) { swap(*this, other); }
+  Allocation& operator=(Allocation&& other) {
+    // Exchange them explicitly so that a move does not
+    // degenerate into a copy.
+    swap(*this, other);
+    return *this;
+  }
+  ~Allocation() { Clear(); }
+
+  void* ptr() const noexcept { return data_; }
+  void* operator->() const noexcept { return data_; }
+  operator bool() const noexcept { return data_ || ctx_; }
+  const Place& place() const noexcept { return place_; }
+
+  void Clear() {
+    if (deleter_) {
+      deleter_(this);
+    }
+    ctx_ = nullptr;
+    deleter_ = nullptr;
+    data_ = nullptr;
+  }
+
+  DeleterFnPtr deleter() const noexcept { return deleter_; }
+
+  template <typename T>
+  T* CastContextWithoutCheck() const noexcept {
+    return static_cast<T*>(ctx_);
+  }
+
+  /// \brief Statically cast the void pointer of the context object to
+  /// the primitive type. Conversion of any pointer to void* and back
+  /// to pointer to the original cv type preserves its original value.
+  /// \param T The primitive type name of the context pointer.
+  /// \param expected_deleter The destructor passed in to enhance type
+  /// safety checking.
+  template <typename T>
+  T* CastContext(DeleterFnPtr expected_deleter) const {
+    PADDLE_ENFORCE_EQ(
+        deleter_ == expected_deleter,
+        true,
+        paddle::platform::errors::InvalidArgument(
+            "The deleter of the allocation does not match, so the pointer "
+            "cannot be safely removed."));
+    return CastContextWithoutCheck<T>();
+  }
+
+ private:
+  friend void swap(Allocation& a, Allocation& b) noexcept;
+  void* data_{nullptr};
+  void* ctx_{nullptr};
+  DeleterFnPtr deleter_{nullptr};
+  // TODO(Shixiaowei02): Enum needs to be used instead to reduce
+  // the construction overhead by more than 50%.
+  Place place_;
+};
+
+inline void swap(Allocation& a, Allocation& b) noexcept {
+  ::std::swap(a.data_, b.data_);
+  ::std::swap(a.ctx_, b.ctx_);
+  ::std::swap(a.deleter_, b.deleter_);
+  ::std::swap(a.place_, b.place_);
+}
+
+/// \brief Context compatible allocator interface. This allocator is
+/// mainly used for general data structures such as Tensor. The raw
+/// allocator is more universal and efficient.
+class Allocator {
+  using Place = paddle::platform::Place;
+
+ public:
+  virtual ~Allocator() = default;
+  virtual Allocation Allocate(size_t bytes_size) = 0;
+  virtual const Place& place() = 0;
+};
+
+inline Allocation Allocate(const std::shared_ptr<Allocator>& a, size_t n) {
+  CHECK(a);
+  return a->Allocate(n);
+}
+
+}  // namespace deprecated
+}  // namespace pten
diff --git a/paddle/pten/core/candidate/allocator.h b/paddle/pten/core/candidate/allocator.h
new file mode 100644
index 0000000000..75d42c4fd1
--- /dev/null
+++ b/paddle/pten/core/candidate/allocator.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+#include
+#include "paddle/fluid/platform/place.h"
+
+namespace pten {
+
+/// \brief Fancy pointer with deleter. The use of this data type
+/// is to be compatible with allocators from different frameworks
+/// without significant performance loss. This class does not
+/// support being inherited.
+class Allocation {
+ public:
+  using Place = paddle::platform::Place;
+  using DeleterFnPtr = void (*)(Allocation*);
+
+  Allocation() = default;
+
+  // Don't own resources, only provide access.
+  Allocation(void* data, size_t size, const Place& place)
+      : ptr_(data), size_(size), place_(place) {}
+
+  // Own resources.
+  Allocation(void* data, size_t size, DeleterFnPtr deleter, const Place& place)
+      : ptr_(data), size_(size), deleter_(deleter), place_(place) {}
+
+  Allocation(Allocation&& other) noexcept { swap(*this, other); }
+  Allocation& operator=(Allocation&& other) noexcept {
+    // Exchange them explicitly so that a move does not
+    // degenerate into a copy.
+    swap(*this, other);
+    return *this;
+  }
+
+  virtual ~Allocation() {
+    if (deleter_) {
+      deleter_(this);
+    }
+  }
+
+  // Returns the holding pointer.
+  // NOTE: For performance considerations, it is better not to make this
+  // method virtual. If we want to implement a `defragmentation` later,
+  // we might need to make the `ptr_` field protected, and add a virtual
+  // method like `defragmentation` to change `ptr_`.
+  void* ptr() const noexcept { return ptr_; }
+
+  // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the
+  // last valid element.
+  //
+  // NOTE: Some allocators may allocate more memory than requested. The size
+  // can be larger than the requested size. For example,
+  // the AlignedAllocator will always allocate memory as size + kAlignment.
+  // The raw pointer might not be aligned, so an offset might be added to
+  // the raw pointer. The size of this allocation will be
+  // `size + kAlignment - offset`.
+  size_t size() const noexcept { return size_; }
+
+  void* operator->() const noexcept { return ptr_; }
+  operator bool() const noexcept { return ptr_; }
+  const Place& place() const noexcept { return place_; }
+  DeleterFnPtr deleter() const noexcept { return deleter_; }
+
+ protected:
+  friend void swap(Allocation& a, Allocation& b) noexcept;
+  void* ptr_{nullptr};
+  size_t size_{};
+  DeleterFnPtr deleter_{nullptr};
+  // TODO(Shixiaowei02): Enum needs to be used instead to reduce
+  // the construction overhead by more than 50%.
+  Place place_;
+};
+
+inline void swap(Allocation& a, Allocation& b) noexcept {
+  ::std::swap(a.ptr_, b.ptr_);
+  ::std::swap(a.deleter_, b.deleter_);
+  ::std::swap(a.place_, b.place_);
+  ::std::swap(a.size_, b.size_);
+}
+
+class Allocator {
+ public:
+  using DeleterType = std::function<void(Allocation*)>;
+  using AllocationPtr = std::unique_ptr<Allocation, DeleterType>;
+
+  virtual ~Allocator() = default;
+  virtual AllocationPtr Allocate(size_t bytes_size) = 0;
+
+  virtual bool IsAllocThreadSafe() const { return false; }
+};
+
+}  // namespace pten
diff --git a/paddle/pten/core/dense_tensor.h b/paddle/pten/core/dense_tensor.h
index 1502accd19..1802a24611 100644
--- a/paddle/pten/core/dense_tensor.h
+++ b/paddle/pten/core/dense_tensor.h
@@ -60,6 +60,8 @@ class TensorInplaceVersion {
 class DenseTensor : public TensorBase,
                     public TypeInfoTraits<TensorBase, DenseTensor> {
  public:
+  using Allocator = deprecated::Allocator;
+
   /// \brief Construct a dense tensor and allocate space.
   /// \param a The allocator used to allocate space.
   /// \param meta The meta data of dense tensor.
diff --git a/paddle/pten/core/storage.h b/paddle/pten/core/storage.h
index fc56935eea..cf18dd9130 100644
--- a/paddle/pten/core/storage.h
+++ b/paddle/pten/core/storage.h
@@ -91,6 +91,7 @@ class Storage : public intrusive_ref_counter<Storage> {
 class TensorStorage : public Storage {
  public:
   using Place = paddle::platform::Place;
+  using Allocator = deprecated::Allocator;
 
   explicit TensorStorage(const std::shared_ptr<Allocator>& a) : alloc_(a) {}
diff --git a/paddle/pten/tests/core/allocator.h b/paddle/pten/tests/core/allocator.h
index e69de29bb2..c2c74e1aac 100644
--- a/paddle/pten/tests/core/allocator.h
+++ b/paddle/pten/tests/core/allocator.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include
+
+#include "paddle/pten/core/allocator.h"
+
+namespace pten {
+namespace tests {
+
+class HostAllocatorSample : public pten::deprecated::RawAllocator {
+ public:
+  using Place = paddle::platform::Place;
+  void* Allocate(size_t bytes_size) override {
+    return ::operator new(bytes_size);
+  }
+  void Deallocate(void* ptr, size_t bytes_size) override {
+    return ::operator delete(ptr);
+  }
+  const Place& place() const override { return place_; }
+
+ private:
+  Place place_{paddle::platform::CPUPlace()};
+};
+
+class FancyAllocator : public pten::deprecated::Allocator {
+ public:
+  using Allocation = pten::deprecated::Allocation;
+  static void Delete(Allocation* allocation) {
+    ::operator delete(allocation->ptr());
+  }
+
+  Allocation Allocate(size_t bytes_size) override {
+    void* data = ::operator new(bytes_size);
+    return Allocation(data, data, &Delete, place());
+  }
+
+  const paddle::platform::Place& place() override { return place_; }
+
+  paddle::platform::Place place_ = paddle::platform::CPUPlace();
+};
+
+template <typename T>
+struct CustomAllocator {
+  using value_type = T;
+  using Allocator = pten::deprecated::RawAllocator;
+
+  explicit CustomAllocator(const std::shared_ptr<Allocator>& a) noexcept
+      : alloc_(a) {}
+
+  CustomAllocator(const CustomAllocator&) noexcept = default;
+  T* allocate(std::size_t n) {
+    return static_cast<T*>(alloc_->Allocate(n * sizeof(T)));
+  }
+  void deallocate(T* p, std::size_t n) {
+    return alloc_->Deallocate(p, sizeof(T) * n);
+  }
+
+  template <typename U>
+  friend bool operator==(const CustomAllocator<U>&,
+                         const CustomAllocator<U>&) noexcept;
+  template <typename U>
+  friend bool operator!=(const CustomAllocator<U>&,
+                         const CustomAllocator<U>&) noexcept;
+
+ private:
+  std::shared_ptr<Allocator> alloc_;
+};
+
+template <typename T>
+inline bool operator==(const CustomAllocator<T>& lhs,
+                       const CustomAllocator<T>& rhs) noexcept {
+  return &lhs.alloc_ == &rhs.alloc_;
+}
+
+template <typename T>
+inline bool operator!=(const CustomAllocator<T>& lhs,
+                       const CustomAllocator<T>& rhs) noexcept {
+  return &lhs.alloc_ != &rhs.alloc_;
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/paddle/pten/tests/core/test_allocator.cc b/paddle/pten/tests/core/test_allocator.cc
index e69de29bb2..94ba9a1e1b 100644
--- a/paddle/pten/tests/core/test_allocator.cc
+++ b/paddle/pten/tests/core/test_allocator.cc
@@ -0,0 +1,95 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include
+#include
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/generator.h"
+#include "paddle/pten/tests/core/allocator.h"
+#include "paddle/pten/tests/core/random.h"
+#include "paddle/pten/tests/core/timer.h"
+
+namespace pten {
+namespace tests {
+
+using RawAllocator = pten::deprecated::RawAllocator;
+using Allocator = pten::deprecated::Allocator;
+using Allocation = pten::deprecated::Allocation;
+
+template <typename T>
+bool host_allocator_test(size_t vector_size) {
+  std::vector<T> src(vector_size);
+  std::generate(src.begin(), src.end(), make_generator(src));
+  std::vector<T, CustomAllocator<T>> dst(
+      src.begin(),
+      src.end(),
+      CustomAllocator<T>(std::make_shared<HostAllocatorSample>()));
+  return std::equal(src.begin(), src.end(), dst.begin());
+}
+
+TEST(raw_allocator, host) {
+  CHECK(host_allocator_test(1000));
+  CHECK(host_allocator_test(1000));
+  CHECK(host_allocator_test(1000));
+}
+
+class StorageRawAlloc {
+ public:
+  StorageRawAlloc(const std::shared_ptr<RawAllocator>& a, size_t size)
+      : alloc_(a), size_(size) {
+    data_ = alloc_->Allocate(size);
+  }
+  ~StorageRawAlloc() { alloc_->Deallocate(data_, size_); }
+
+ private:
+  void* data_;
+  size_t size_;
+  std::shared_ptr<RawAllocator> alloc_;
+};
+
+class StorageFancyAlloc {
+ public:
+  StorageFancyAlloc(const std::shared_ptr<Allocator>& a, size_t size)
+      : alloc_(a), allocation_(a->Allocate(size)) {}
+
+ private:
+  std::shared_ptr<Allocator> alloc_;
+  Allocation allocation_;
+};
+
+TEST(benchmark, allocator) {
+  std::shared_ptr<RawAllocator> raw_allocator(new HostAllocatorSample);
+  std::shared_ptr<Allocator> fancy_allocator(new FancyAllocator);
+  const size_t cycles = 100;
+  Timer timer;
+  double t1{}, t2{};
+  for (size_t i = 0; i < cycles; ++i) {
+    timer.tic();
+    for (size_t i = 0; i < cycles; ++i) {
+      StorageRawAlloc(raw_allocator, i * 100);
+    }
+    t1 += timer.toc();
+    timer.tic();
+    for (size_t i = 0; i < cycles; ++i) {
+      StorageFancyAlloc(fancy_allocator, i * 100);
+    }
+    t2 += timer.toc();
+  }
+  std::cout << "The cost of raw alloc is " << t1 << "ms.\n";
+  std::cout << "The cost of fancy alloc with place is " << t2 << "ms.\n";
+}
+
+}  // namespace tests
+}  // namespace pten
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index e0ae600819..caacecf446 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -226,7 +226,7 @@ if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
 HAS_MODIFIED_ALLOCATION=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/memory/allocation" || true`
 if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     echo_line="You must be approved by zhiqiu and Shixiaowei02 for paddle/fluid/memory/allocation.\nIt is being modularized and refactored. Thanks!\n"
-    check_approval 2 6888866 39303645
+    check_approval 1 6888866 39303645
 fi
 
 HAS_MODIFIED_TENSOR=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/framework/tensor" || true`
@@ -241,23 +241,6 @@ if [ "${HAS_MODIFIED_TENSOR}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     check_approval 1 22561442 22334008
 fi
 
-ALLOCSHARED_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH |grep -E "*\.(h|cc)" || true`
-if [ "${ALLOCSHARED_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
-    ERROR_LINES=""
-    for TEST_FILE in ${ALLOCSHARED_FILE_CHANGED};
-    do
-        HAS_SKIP_CHECK_ALLOC_CI=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${TEST_FILE} |grep "AllocShared" || true`
-        if [ "${HAS_SKIP_CHECK_ALLOC_CI}" != "" ]; then
-            ERROR_LINES="${ERROR_LINES}\n${TEST_FILE}\n${HAS_SKIP_CHECK_ALLOC_CI}\n"
-        fi
-    done
-    if [ "${ERROR_LINES}" != "" ]; then
-        ERROR_LINES=${ERROR_LINES//+/'\n+\t'}
-        echo_line="memory::AllocShared is not recommended, because it is being modularized and refactored. Please use memory::Alloc here. Otherwise, please request zhiqiu and Shixiaowei02 review and approve.\n"
-        check_approval 2 6888866 39303645
-    fi
-fi
-
 ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true`
 if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n"
-- Gitee
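
For readers who want to exercise the new interface outside the patch, the sketch below shows how a host-side allocator could implement pten::Allocator from paddle/pten/core/candidate/allocator.h. It is illustrative only and not part of the commit; ToyHostAllocator and FreeHostBuffer are hypothetical names, and it assumes AllocationPtr is std::unique_ptr<Allocation, DeleterType> with DeleterType = std::function<void(Allocation*)>, as reconstructed in the header above.

// Illustrative sketch only; not part of the patch above.
#include <iostream>

#include "paddle/fluid/platform/place.h"
#include "paddle/pten/core/candidate/allocator.h"

namespace {

// Frees the underlying buffer; invoked from pten::Allocation's destructor.
void FreeHostBuffer(pten::Allocation* allocation) {
  ::operator delete(allocation->ptr());
}

// Hypothetical host allocator implementing the pten::Allocator interface.
class ToyHostAllocator : public pten::Allocator {
 public:
  AllocationPtr Allocate(size_t bytes_size) override {
    void* data = ::operator new(bytes_size);
    // The unique_ptr deleter destroys the Allocation object; the Allocation
    // destructor then calls FreeHostBuffer to release the buffer.
    return AllocationPtr(
        new pten::Allocation(data, bytes_size, &FreeHostBuffer,
                             paddle::platform::CPUPlace()),
        [](pten::Allocation* allocation) { delete allocation; });
  }
};

}  // namespace

int main() {
  ToyHostAllocator allocator;
  auto allocation = allocator.Allocate(256);
  std::cout << "allocated " << allocation->size() << " bytes at "
            << allocation->ptr() << std::endl;
  return 0;  // Allocation and its buffer are released when `allocation` goes out of scope.
}

Splitting responsibilities this way keeps the buffer-release logic inside Allocation (through its DeleterFnPtr), while the smart pointer's deleter only controls the lifetime of the Allocation wrapper itself.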